In [71]:
# read clean data
import pandas as pd
import random
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
import numpy as np
import re

df = pd.read_csv('resdf_delivery_mode_2024-02-10.csv')


df['prvs_lscs'] = df['indication'].str.contains('LSCS', case=False, na=False).astype(int)
df['prvs_abortion'] = df['indication'].str.contains('ABORTION', case=False, na=False).astype(int)
df['oligo'] = df['indication'].str.contains('OLIGO', case=False, na=False).astype(int)
df['meconium'] = df['indication'].str.contains('MECONIUM', case=False, na=False).astype(int)
df['iugr'] = df['indication'].str.contains('IUGR', case=False, na=False).astype(int)
df['gdm'] = df['indication'].str.contains('GDM', case=False, na=False).astype(int)
df['hyperthyraidisim'] = df['diagnosis'].str.contains('HYPOTHYRAIDISIM', case=False, na=False).astype(int)

def extract_info(df, column_name, start_phrase, end_phrase=None):
    if end_phrase:
        # Extracting data between start_phrase and end_phrase
        df[column_name] = df['usg'].str.extract(f'{start_phrase}(.+?){end_phrase}')
    else:
        # Extracting data after start_phrase if no end_phrase is provided
        df[column_name] = df['usg'].str.extract(f'{start_phrase}(.+)')
    # Cleaning extracted data
    df[column_name] = df[column_name].str.strip()

def extract_efw(df):
    # Extracting only the first numeric part before the space
    df['E.F.W'] = df['usg'].str.extract('E.F.W- (\d+)').astype(str)
    
# Extracting each piece of information
extract_info(df, 'FHR', 'FHR-', '/MIN')
extract_info(df, 'Grade', 'GRADE-', ',')
extract_info(df, 'E.F.W', 'E.F.W-', ' GMS')
extract_info(df, 'A.F.I', 'A.F.I- ', ' CM')
extract_info(df, 'Placenta', 'PLACENTA- ', ',')
extract_info(df, 'Cervix_Length', 'CERVIX-', 'CM')
df['fetus_type'] = df['usg'].str.extract(r'(\b\w+)\s+LIVE')

df['fetus_type'] = np.where(df.fetus_type.isin(['SINGL', 'SINGLWE', 'WITH']),'SINGLE', df['fetus_type'])

extract_efw(df)

df = df[~df.usg.isna()]



In [72]:
def cal_lower_upper(df,min_range,max_range):
    # Generate random numbers and fill null values
    random_values = [random.uniform(min_range, max_range) if pd.isna(val) else val for val in df['kg']]
    
    return random_values

In [73]:
## Replace age and drop age with 16years

df['age'] = df['age'].replace({269.0: 26,257:25,16:21})



## Weight#############
## Replace missing weight with +/- in std deviation for weight and height
mean_weight = df['kg'].mean()
std_dev_weight=df['kg'].std()
# Define the range for random numbers
kg_upper_range=mean_weight+std_dev_weight
kg_lower_range=mean_weight-std_dev_weight
df['kg_upd']=cal_lower_upper(df,kg_lower_range,kg_upper_range)


##### Height ###########
## Replace missing weight with +/- in std deviation for weight and height
mean_height = df['height'].mean()
std_dev_height=df['height'].std()
# Define the range for random numbers
height_upper_range=mean_height+std_dev_height
height_lower_range=mean_height-std_dev_height
df['height_upd']=cal_lower_upper(df,height_lower_range,height_upper_range)

## Convert height from cms to metres
df['height_upd']=df['height_upd']/100



## Replace missing haemoglobin with mean
df['hb']=df['hb'].replace({'11..9':'11.9','10..2':'10.2','10 .1':'10.1'})
df['hb']=df['hb'].astype(float)
mean_hb = df['hb'].mean()
df['hb'].fillna(mean_hb, inplace=True)


##Looking into age and can be imputed ##TBD
#Replace for bp
df['bp']=df['bp'].replace({'10.2':'110/70'})
df['bp'].fillna('110/70', inplace=True)


## Drop rows who opted for LSCS option by default
df['maternal_request'] = df['indication'].str.contains('MATERNAL', case=False, na=False).astype(int)
df=df[df['maternal_request']==0]

df['hhh'].replace({'110/70':'NO- REACTIVE'},inplace=True)


df.fetus_type = df.fetus_type.fillna('SINGLE')


df['bmi']=((df['kg_upd'])/(df['height_upd']*df['height_upd']))


In [74]:
df['Placenta']=df['Placenta'].astype(str)


### Categories merged
FUNDAL:
FUNDAL
FUNDAL LEFT ANTERIOR
FUNDAL-POSTERIOR
FUNDAL-POSTERIOR .'
FUNDAL-ANTERIOR
FUNDAL ANTERIOR
FUNDAL . '
FUNDAL . '



FUNDO-ANTERIOR:FUNDO-ANTERIOR
FUNDO-ANTERIOR . '
FUNDO-ANTERIOR .'
FUNDO-ANTERIOR . CERVIX-3.2 CM
FUNDO- ANTERIOR . '
FUNDO- ANTERIOR
FUNDOANT IN LOCATION .'
FUNDO-ANTERIOR RIGHT LATERAL EXTENSION. NOT LOW LYING .'
FUNDO-ANTERIOR NOT LOW LYING


POSTERIOR:POSTERIOR
POSTERIOR NOT LOW LYING
POSTERIOR FUNDAL . A.F.I- 11.3 CM
POSTERIOR RTLATERAL
POSTERIOR LOW LYING
POSTERIOR . '
POSTERIOR . (USG IN L.R )'
POSTERIOR LEFT LATERAL
POSTERIOR & FUNDAL
POSTERIOR-FUNDAL . '


ANTERIOR:ANTERIOR
ANTERIOR .
ANTERIOR . '
ANTERIOR . A.F.I- 10 CM
ANTERIOR LEFT LATERAL
ANTERIOR RIGHT FUNDAL
ANTERIOR FUNDAL
ANTERIOR FUNDAL . GARDE-3 . '
ANTERIOR TO THE RIGHT NOT LOW LYING . '
ANTERIOR AND LEFT LATERAL
ANTERIOR LEFT LATERAL . '
ANTERIOR . '


NAN:nan

LEFT:LEFT
LEFT LATERAL
LEFT FUNDAL


RIGHT:RIGHT
RIGHT ANTERIOR
RIGHT LATERAL
RIGHT LATERAL POSTERIOR
(SINGLE)- RIGHT LATERAL '


OTHERS:NOT LOW LYINY POSTERIOR
NATERIOR
FUNDIC
FUNDO-POSTERIOR
FUNDO-POSTERIOR . '
FUNDO-''
FUNDO- POSTERIOR
FUNDIC POSTERIOR
FUNDIC PARTLY ANTERIOR
FUNDIC RIGHT LATERAL
FUNDIC-PARTLY ANTERIOR PARTLY'
FUNDP-POSTERIOR
SEPARATION
LATERAL
FUNDO POSTERIOR WITH LEFT LATERAL
LATERLAL POSTERIOR
FUN-ANTERIOR
POSTEROLATERALLY



In [75]:
## Code to merge categories in placenta and perform one hot encoding
mapping = {
    'FUNDAL': 'FUNDAL',
    'FUNDO-ANTERIOR': 'FUNDO-ANTERIOR',
    'POSTERIOR': 'POSTERIOR',
    'ANTERIOR': 'ANTERIOR',
    'nan': 'nan',
    'LEFT': 'LEFT',
    'RIGHT': 'RIGHT'
}

# Create new columns based on mapping
for key, value in mapping.items():
    df[value] = df['Placenta'].apply(lambda x: 1 if key in x else 0)


In [76]:
# Define the pattern to match "AFI" followed by 10 characters
pattern = r'A.F(.{13})'

# Function to extract the matched strings
def extract_matched_strings(text):
    match = re.search(pattern, text)
    if match:
        return match.group(1)  # Extract the 10 characters after "CER"
    else:
        return None

# Apply the function to the DataFrame column
df['Extracted_String_A.F.I'] = df['usg'].apply(extract_matched_strings)
df['Extracted_String_A.F.I']=df['Extracted_String_A.F.I'].astype(str)

df['A.F.I_upd']=df['A.F.I'].fillna(df['Extracted_String_A.F.I'])

# Regular expression to extract numerical values
pattern = r'(\d+\.?\d*)'

# Function to extract numerical values
def extract_numerical_values(text):
    matches = re.findall(pattern, str(text))
    return ' '.join(matches)

# Apply the function to the DataFrame column
df['A.F.I_numerical'] = df['A.F.I_upd'].apply(extract_numerical_values)

df['A.F.I_numerical']=df['A.F.I_numerical'].replace({'176 3 6.4':'6.4','14.0 4.2':'14.0','9 10':'9','10.3 3 2.8':'10.3',
                                                    '13 14':'13',
                                                    '2306 34':'ADEQUATE',
                                                    '112':'11.2','163':'16.3'})


In [77]:
# Define the pattern to match "CER" followed by 10 characters
pattern = r'CERVIX-(.{5})'

# Function to extract the matched strings
def extract_matched_strings(text):
    match = re.search(pattern, text)
    if match:
        return match.group(1)  # Extract the 10 characters after "CER"
    else:
        return None

# Apply the function to the DataFrame column
df['Extracted_String_cervix'] = df['usg'].apply(extract_matched_strings)
df['Extracted_String_cervix']=df['Extracted_String_cervix'].astype(str)

df['cervix_upd']=df['Cervix_Length'].fillna(df['Extracted_String_cervix'])

# Regular expression to extract numerical values
pattern = r'(\d+\.?\d*)'

# Function to extract numerical values
def extract_numerical_values(text):
    matches = re.findall(pattern, str(text))
    return ' '.join(matches)

# # Apply the function to the DataFrame column
df['cervix_numerical'] = df['cervix_upd'].apply(extract_numerical_values)

df['cervix_numerical']=df['cervix_numerical'].replace({'2838 12':'normal','32 1719 251 12.3':'32',
                                                      '35':'3.5','41':'4.1','45':'4.5','36':'3.6','34':'3.4',
                                                      '33':'3.3','29':'2.9','32':'3.2','30':'3.0'})


In [84]:
## Code to update FHR


pattern = r'FHR-(\d+)'
import re
# Function to extract FHR value from a string
def extract_fhr(column_value):
    # Convert non-string values to string
    if not isinstance(column_value, str):
        column_value = str(column_value)
    fhr_match = re.search(pattern, column_value)
    if fhr_match:
        return fhr_match.group(1)
    else:
        return None

# Apply the function to the column
df['new_fhr']=df['FHR'].apply(extract_fhr)
df['fhr_upd'] = df['new_fhr'].fillna(df['FHR'])

In [81]:
df.drop(columns=[ 'nan','Extracted_String_A.F.I', 'A.F.I_upd','Extracted_String_cervix', 'cervix_upd','new_fhr'],inplace=True)

In [83]:
df.head()

Unnamed: 0,patient_id,height,kg,lmp,age,edd,indication,diagnosis,usg,hb,bp,hhh,bg,ga_weeks,delivery_mode,filename,prvs_lscs,prvs_abortion,oligo,meconium,iugr,gdm,hyperthyraidisim,FHR,Grade,E.F.W,A.F.I,Placenta,Cervix_Length,fetus_type,kg_upd,height_upd,maternal_request,bmi,FUNDAL,FUNDO-ANTERIOR,POSTERIOR,ANTERIOR,LEFT,RIGHT,A.F.I_numerical,cervix_numerical,fhr_upd
0,1,147.0,77.0,9.7.20,39.0,16.4.21,"['PREV LSCS &', 'UN CONTROL B/P', 'nan', 'nan'...",['G2P1L1 WITH CHRONIC WITH HTN WITH PREV LSCS'...,"['GA-36 WKS 4 DAYS , E.F.W- 3044 +_457 GMS, FH...",10.6,110/70,NO- REACTIVE,O + VE,37.0,LSCS,APRIL-2021.xlsx,1,0,0,0,0,0,0,130.0,3.0,3044.0,15.4,FUNDAL,,SINGLE,77.0,0.77,0,129.87013,1,0,0,0,0,0,15.4,,130.0
1,2,152.0,72.0,29.6.20,25.0,5.4.21,"['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'na...",['G3P1L1A1 AT 39 WKS 4 DAYS POG IN LATENT LABO...,['SINGLE LIVE FETUS IN CEPHALIC PRESENTATION ....,14.3,110/70,NO- REACTIVE,B + VE,39.0,NVD,APRIL-2021.xlsx,0,0,0,0,0,0,0,133.0,3.0,3087.0,13.6,FUNDO-ANTERIOR . ',3.2,SINGLE,72.0,0.72,0,138.888889,0,1,0,1,0,0,13.6,3.2,133.0
2,3,153.0,70.0,17.7.20,32.0,24.4.21,"['PREV LSCS ', 'nan', 'nan', 'nan', 'nan', 'na...",['G2P1L1 AT 36 WKS 5 DAYS POG WITH PREV LSCS W...,"['GA- 36 WKS 1 DAYS , E.F.W- 2861 +_429 GMS, F...",11.6,110/70,NO- REACTIVE,B + VE,36.0,LSCS,APRIL-2021.xlsx,1,0,0,0,0,0,0,148.0,3.0,2861.0,4.5,POSTERIOR,3.1,SINGLE,70.0,0.7,0,142.857143,0,0,1,0,0,0,4.5,3.1,148.0
3,4,152.0,72.0,24.6.20,27.0,31.3.21,"['NON PROGRESS ', 'OF LABOUR', 'nan', 'nan', '...",['PRIMI AT 40 WKS 1 DAYS POG WITH OI CONCEPTIO...,['SINGLE LIVE INTRAUTERINE FETUS IN CEPHALIC P...,12.1,110/70,NO- REACTIVE,O + VE,40.0,LSCS,APRIL-2021.xlsx,0,0,0,0,0,0,1,156.0,,,,FUNDO-ANTERIOR,,SINGLE,72.0,0.72,0,138.888889,0,1,0,1,0,0,,3.5,156.0
4,5,163.0,82.0,20.6.20,28.0,27.3.21,"['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'na...",['PRIMI AT 39 WKS POG WITH THROMBCY WITH PEV...,['SINGLE LIVE FETUS IN CEPHALIC PRESENTATION ....,11.9,110/70,NO- REACTIVE,B + VE,39.0,LSCS,APRIL-2021.xlsx,0,0,0,0,0,0,0,,,,,FUNDO-ANTERIOR,,SINGLE,82.0,0.82,0,121.95122,0,1,0,1,0,0,16.2,,
