In [162]:
import pandas as pd
import warnings

In [163]:
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=FutureWarning)

## Load the dataset

In [164]:
# Read the dataset into a DataFrame; the path is relative, so the CSV must sit in
# the notebook's working directory.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column that looks like a
# previously saved row index — confirm, and consider `index_col=0` if so.
df = pd.read_csv('Synthetic-Infant-Health-Data.csv')

## Explore and Confirm Features and Labels

In [165]:
# Preview the first rows, leave a blank line, then list how many columns the
# frame has together with their names.
print(df.head(), end="\n\n")
print(df.shape[1], 'Features:', df.columns)

   Unnamed: 0 BirthAsphyxia HypDistrib HypoxiaInO2     CO2  ChestXray  \
0           0            no      Equal      Severe  Normal     Normal   
1           1            no      Equal    Moderate    High  Grd_Glass   
2           2            no      Equal      Severe  Normal  Plethoric   
3           3            no      Equal    Moderate  Normal  Plethoric   
4           4            no      Equal    Moderate  Normal  Plethoric   

  Grunting LVHreport LowerBodyO2 RUQO2  ...  XrayReport Disease  \
0      yes        no        5-12    <5  ...  Asy/Patchy     TGA   
1       no        no          <5  5-12  ...   Grd_Glass  Fallot   
2       no       yes        5-12  5-12  ...      Normal     PFC   
3       no        no        5-12    <5  ...   Plethoric   PAIVS   
4       no       yes         12+  5-12  ...   Plethoric   PAIVS   

  GruntingReport        Age  LVH  DuctFlow CardiacMixing LungParench LungFlow  \
0             no  4-10_days   no  Lt_to_Rt       Transp.      Normal   Normal

## Explore Size/Shape of Dataset

In [166]:
# Report the dimensions of the frame: (rows, columns).
n_rows, n_cols = df.shape
print('Shape:', df.shape)
print(n_rows, 'Samples and ', n_cols, 'Features')
print()
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() — doing so appends a stray "None" line.
df.info()

Shape: (15000, 21)
15000 Samples and  21 Features

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      15000 non-null  int64 
 1   BirthAsphyxia   15000 non-null  object
 2   HypDistrib      15000 non-null  object
 3   HypoxiaInO2     15000 non-null  object
 4   CO2             15000 non-null  object
 5   ChestXray       15000 non-null  object
 6   Grunting        15000 non-null  object
 7   LVHreport       15000 non-null  object
 8   LowerBodyO2     15000 non-null  object
 9   RUQO2           15000 non-null  object
 10  CO2Report       15000 non-null  object
 11  XrayReport      15000 non-null  object
 12  Disease         15000 non-null  object
 13  GruntingReport  15000 non-null  object
 14  Age             15000 non-null  object
 15  LVH             15000 non-null  object
 16  DuctFlow        9311 non-null   object
 17 

## Getting Familiar with the Features

In [167]:
# Summarise every column: always print its number of distinct values, and for
# low-cardinality columns (at most 20 distinct values) also list those values.
for col in df.columns:
    column = df[col]
    n_distinct = column.nunique()
    print(f"Feature '{col}': {n_distinct} unique value(s)")

    # High-cardinality columns: the count alone is enough.
    if n_distinct > 20:
        continue

    quoted = [f"'{value}'" for value in column.unique()]
    print('Unique Values:', f"[{', '.join(quoted)}]")
    print()

Feature 'Unnamed: 0': 15000 unique value(s)
Feature 'BirthAsphyxia': 2 unique value(s)
Unique Values: ['no', 'yes']

Feature 'HypDistrib': 2 unique value(s)
Unique Values: ['Equal', 'Unequal']

Feature 'HypoxiaInO2': 3 unique value(s)
Unique Values: ['Severe', 'Moderate', 'Mild']

Feature 'CO2': 3 unique value(s)
Unique Values: ['Normal', 'High', 'Low']

Feature 'ChestXray': 5 unique value(s)
Unique Values: ['Normal', 'Grd_Glass', 'Plethoric', 'Oligaemic', 'Asy/Patch']

Feature 'Grunting': 2 unique value(s)
Unique Values: ['yes', 'no']

Feature 'LVHreport': 2 unique value(s)
Unique Values: ['no', 'yes']

Feature 'LowerBodyO2': 3 unique value(s)
Unique Values: ['5-12', '<5', '12+']

Feature 'RUQO2': 3 unique value(s)
Unique Values: ['<5', '5-12', '12+']

Feature 'CO2Report': 2 unique value(s)
Unique Values: ['<7.5', '>=7.5']

Feature 'XrayReport': 5 unique value(s)
Unique Values: ['Asy/Patchy', 'Grd_Glass', 'Normal', 'Plethoric', 'Oligaemic']

Feature 'Disease': 6 unique value(s)
Unique

## Investigate Data Types of Features and Labels

In [168]:
# Print the dtype of every column (the recorded output shows all columns except
# 'Unnamed: 0' stored as generic `object`).
print(df.dtypes)

Unnamed: 0         int64
BirthAsphyxia     object
HypDistrib        object
HypoxiaInO2       object
CO2               object
ChestXray         object
Grunting          object
LVHreport         object
LowerBodyO2       object
RUQO2             object
CO2Report         object
XrayReport        object
Disease           object
GruntingReport    object
Age               object
LVH               object
DuctFlow          object
CardiacMixing     object
LungParench       object
LungFlow          object
Sick              object
dtype: object


## Check for NaN Values

In [169]:
# Flag each column that holds at least one missing value, then collect the names
# of the flagged columns (in their original column order).
any_nan_in_columns = df.isna().any(axis=0)
columns_with_nan = [name for name, has_nan in any_nan_in_columns.items() if has_nan]
print(columns_with_nan)

['DuctFlow', 'CardiacMixing']


In [170]:
# Number of missing entries in each column that was flagged as containing NaN.
nan_counts = df.loc[:, columns_with_nan].isna().sum()
print(nan_counts)

DuctFlow         5689
CardiacMixing     727
dtype: int64


In [171]:
# Frequency of each observed DuctFlow category (value_counts leaves NaN out by
# default), used to decide how to fill the gaps.
value_counts = df['DuctFlow'].value_counts()
print(f"Value Counts:\n {value_counts}")

Value Counts:
 DuctFlow
Lt_to_Rt    8209
Rt_to_Lt    1102
Name: count, dtype: int64


In [172]:
# Frequency of each observed CardiacMixing category (value_counts leaves NaN out
# by default), used to decide how to fill the gaps.
value_counts = df['CardiacMixing'].value_counts()
print(f"Value Counts:\n {value_counts}")

Value Counts:
 CardiacMixing
Complete    8442
Transp.     4839
Mild         992
Name: count, dtype: int64


## Handle the NaN Values

In [173]:
# Fill the NaN values of DuctFlow with the column's most frequent observed value
# ('Lt_to_Rt' for this dataset). The mode is computed rather than hard-coded so
# the cell stays correct if the CSV is ever refreshed.
# NOTE(review): 5689 of 15000 rows are imputed here; consider whether an explicit
# "Unknown" category would be more appropriate than the mode.
duct_flow_mode = df['DuctFlow'].mode().iloc[0]
df['DuctFlow'] = df['DuctFlow'].fillna(duct_flow_mode)

In [174]:
# Fill the NaN values of CardiacMixing with the column's most frequent observed
# value ('Complete' for this dataset). The mode is computed rather than hard-coded
# so the cell stays correct if the CSV is ever refreshed.
cardiac_mixing_mode = df['CardiacMixing'].mode().iloc[0]
df['CardiacMixing'] = df['CardiacMixing'].fillna(cardiac_mixing_mode)

In [175]:
# Re-count missing values in the previously affected columns; after imputation
# every count is expected to be zero.
remaining_missing = df[columns_with_nan].isna()
nan_counts = remaining_missing.sum()
print(nan_counts)

DuctFlow         0
CardiacMixing    0
dtype: int64


## Necessary Data Type Conversions

In [176]:
# Integer code for every label of every categorical column, keyed by column name.
# Ordered categories (HypoxiaInO2, LowerBodyO2, RUQO2, Age) are coded in their
# natural order; for the remaining columns the codes are arbitrary identifiers.
categorical_mappings = {
    'BirthAsphyxia': {'no': 0, 'yes': 1},
    'HypDistrib': {'Equal': 0, 'Unequal': 1},
    'HypoxiaInO2': {'Mild': 0, 'Moderate': 1, 'Severe': 2},
    'CO2': {'Normal': 0, 'High': 1, 'Low': 2},
    # Note the spelling: this column uses 'Asy/Patch', XrayReport uses 'Asy/Patchy'.
    'ChestXray': {'Normal': 0, 'Grd_Glass': 1, 'Plethoric': 2, 'Oligaemic': 3, 'Asy/Patch': 4},
    'Grunting': {'yes': 1, 'no': 0},
    'LVHreport': {'yes': 1, 'no': 0},
    'LowerBodyO2': {'<5': 0, '5-12': 1, '12+': 2},
    'RUQO2': {'<5': 0, '5-12': 1, '12+': 2},
    'CO2Report': {'<7.5': 0, '>=7.5': 1},
    'XrayReport': {'Asy/Patchy': 0, 'Grd_Glass': 1, 'Normal': 2, 'Plethoric': 3, 'Oligaemic': 4},
    'Disease': {'TGA': 0, 'Fallot': 1, 'PFC': 2, 'PAIVS': 3, 'TAPVD': 4, 'Lung': 5},
    'GruntingReport': {'yes': 1, 'no': 0},
    'Age': {'0-3_days': 0, '4-10_days': 1, '11-30_days': 2},
    'LVH': {'yes': 1, 'no': 0},
    'DuctFlow': {'Lt_to_Rt': 0, 'Rt_to_Lt': 1},
    'CardiacMixing': {'Transp.': 0, 'Mild': 1, 'Complete': 2},
    'LungParench': {'Normal': 0, 'Abnormal': 1, 'Congested': 2},
    'LungFlow': {'Normal': 0, 'High': 1, 'Low': 2},
    'Sick': {'yes': 1, 'no': 0}
}

# Replace each label with its integer code.
# Series.map() turns every value that is missing from the mapping into NaN without
# complaint, so the column is validated first instead of being silently corrupted.
for col, mapping in categorical_mappings.items():
    observed = set(df[col].dropna().unique())

    # The column already holds only codes (the cell was re-run): mapping it again
    # would wipe it out to NaN, so leave it untouched.
    if observed <= set(mapping.values()):
        continue

    unknown = observed - set(mapping)
    if unknown:
        raise ValueError(
            f"Column '{col}' contains values missing from its mapping: "
            f"{sorted(unknown, key=str)}"
        )

    df[col] = df[col].map(mapping)

### After Conversion

In [177]:
# Print the dtypes again after the categorical mappings were applied; the recorded
# output shows every column as int64, i.e. no label was left unmapped (NaN would
# have forced float64).
print(df.dtypes)

Unnamed: 0        int64
BirthAsphyxia     int64
HypDistrib        int64
HypoxiaInO2       int64
CO2               int64
ChestXray         int64
Grunting          int64
LVHreport         int64
LowerBodyO2       int64
RUQO2             int64
CO2Report         int64
XrayReport        int64
Disease           int64
GruntingReport    int64
Age               int64
LVH               int64
DuctFlow          int64
CardiacMixing     int64
LungParench       int64
LungFlow          int64
Sick              int64
dtype: object


In [178]:
# Display the first rows of the encoded frame to eyeball the integer codes.
df.head()

Unnamed: 0.1,Unnamed: 0,BirthAsphyxia,HypDistrib,HypoxiaInO2,CO2,ChestXray,Grunting,LVHreport,LowerBodyO2,RUQO2,...,XrayReport,Disease,GruntingReport,Age,LVH,DuctFlow,CardiacMixing,LungParench,LungFlow,Sick
0,0,0,0,2,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,1,1,1,0,0,0,1,...,1,1,0,0,0,1,1,1,1,0
2,2,0,0,2,0,2,0,1,1,1,...,2,2,0,0,0,0,2,0,1,0
3,3,0,0,1,0,2,0,0,1,0,...,3,3,0,0,0,0,2,0,2,0
4,4,0,0,1,0,2,0,1,2,1,...,3,3,0,0,1,0,2,0,0,1


## Calculate Memory Usage (Shallow vs. Deep)

In [179]:
# Per-column memory footprint in bytes: the shallow figure first, then the deep
# one (deep=True also counts the contents of object columns), separated by a
# blank line. The recorded output shows the same figures in both reports.
shallow_usage = df.memory_usage()
deep_usage = df.memory_usage(deep=True)
print(shallow_usage, deep_usage, sep="\n\n")

Index                132
Unnamed: 0        120000
BirthAsphyxia     120000
HypDistrib        120000
HypoxiaInO2       120000
CO2               120000
ChestXray         120000
Grunting          120000
LVHreport         120000
LowerBodyO2       120000
RUQO2             120000
CO2Report         120000
XrayReport        120000
Disease           120000
GruntingReport    120000
Age               120000
LVH               120000
DuctFlow          120000
CardiacMixing     120000
LungParench       120000
LungFlow          120000
Sick              120000
dtype: int64

Index                132
Unnamed: 0        120000
BirthAsphyxia     120000
HypDistrib        120000
HypoxiaInO2       120000
CO2               120000
ChestXray         120000
Grunting          120000
LVHreport         120000
LowerBodyO2       120000
RUQO2             120000
CO2Report         120000
XrayReport        120000
Disease           120000
GruntingReport    120000
Age               120000
LVH               120000
DuctFlow   

## Explore Statistical Facts

In [180]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): the columns hold integer category codes, so mean/std of nominal
# features such as Disease should be interpreted with care.
df.describe()

Unnamed: 0.1,Unnamed: 0,BirthAsphyxia,HypDistrib,HypoxiaInO2,CO2,ChestXray,Grunting,LVHreport,LowerBodyO2,RUQO2,...,XrayReport,Disease,GruntingReport,Age,LVH,DuctFlow,CardiacMixing,LungParench,LungFlow,Sick
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,...,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,7499.5,0.026533,0.027267,1.327533,0.2352,1.812267,0.1214,0.199133,0.6574,0.6872,...,2.459133,1.475933,0.159133,0.386467,0.188467,0.073467,1.288667,0.236467,1.352267,0.237733
std,4330.271354,0.16072,0.162865,0.608835,0.527541,1.360225,0.326602,0.399362,0.640562,0.646981,...,1.225301,1.377854,0.365813,0.667588,0.391097,0.26091,0.922277,0.526531,0.703378,0.425709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3749.75,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,7499.5,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
75%,11249.25,0.0,0.0,2.0,0.0,3.0,0.0,0.0,1.0,1.0,...,3.0,3.0,0.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0
max,14999.0,1.0,1.0,2.0,2.0,4.0,1.0,1.0,2.0,2.0,...,4.0,5.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0


## The Dataset is ready to work on ;)