# GenPredict: Model 

### Import the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

### Import the datasets

### **Train DataSet**

In [2]:
# loading the datset
train = pd.read_csv("../raw_data/train.csv")
train.head()

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,,Larre,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,,Brycen,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,,Nashon,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.70528,Jeffery,Hoelscher,Aayaan,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,,Yes,4.720703,Johanna,Stutzman,Suave,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


### Features Selection

In [3]:
# DROP UNNECESSARY COLUMNS
train.drop("Patient Id",axis=1,inplace=True)
train.drop("Family Name",axis=1,inplace=True)
train.drop("Patient First Name",axis=1,inplace=True)
train.drop("Father's name",axis=1,inplace=True)
train.drop("Institute Name",axis=1,inplace=True)
train.drop("Location of Institute",axis=1,inplace=True)
train.drop("Place of birth",axis=1,inplace=True)
train.drop("Parental consent",axis=1,inplace=True)
train.drop("Test 1",axis=1,inplace=True)
train.drop("Test 2",axis=1,inplace=True)
train.drop("Test 3",axis=1,inplace=True)
train.drop("Test 5",axis=1,inplace=True)

In [4]:
train.head()

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,2.0,Yes,No,Yes,No,4.760603,,,Alive,Normal (30-60),...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,,23.0,Deceased,Tachypnea,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,41.0,22.0,Alive,Normal (30-60),...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.70528,21.0,,Deceased,Tachypnea,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,,Yes,4.720703,32.0,,Alive,Tachypnea,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


### Rename columns

In [5]:
train.rename(columns={"Patient Age":"Patient_Age",
                    "Genes in mother's side":"Genes_Mother_Side","Inherited from father":"Inherited_from_father",
                    "Maternal gene":"Maternal_gene","Paternal gene":"Paternal_gene","Blood cell count (mcL)":"Blood_cell_count(mcL)",
                    "Mother's age":"Mother_Age","Father's age":"Father_Age",
                    "Respiratory Rate (breaths/min)":"Respiratory_Rate_Breaths_Min","Heart Rate (rates/min":"Heart_Rates","Follow-up":"Follow_Up", "Birth asphyxia":"Birth_Asphyxia",
                    "Autopsy shows birth defect (if applicable)":"Autopsy_Birth_Defect","Folic acid details (peri-conceptional)":"Folic_Acid",
                    "H/O serious maternal illness":"Maternal_Illness","H/O radiation exposure (x-ray)":"Radiation_Exposure",
                    "H/O substance abuse":"Substance_Abuse","Assisted conception IVF/ART":"Assisted_Conception", "History of anomalies in previous pregnancies":"History_Previous_Pregnancies",
                    "No. of previous abortion":"Previous_Abortion","Birth defects":"Birth_Defects","Test 4":"Test_4",
                    "White Blood cell count (thousand per microliter)":"White_Blood_Cell","Blood test result":"Blood_Test_Result",
                    "Symptom 1":"Symptom_1","Symptom 2":"Symptom_2","Symptom 3":"Symptom_3","Symptom 4":"Symptom_4",
                    "Symptom 5":"Symptom_5","Genetic Disorder":"Genetic_Disorder","Disorder Subclass":"Disorder_Subclass"},inplace=True)

### Check For Dublicates

In [6]:
train.duplicated().sum()

0

### Check For Nulls

In [7]:
(train.isnull().sum()/len(train)*100).sort_values(ascending=False)

Mother_Age                      27.333243
Father_Age                      27.106824
Maternal_gene                   12.724720
Symptom_2                       10.062039
Substance_Abuse                  9.939773
Gender                           9.840149
History_Previous_Pregnancies     9.835620
Disorder_Subclass                9.817507
Follow_Up                        9.808450
Previous_Abortion                9.790336
Symptom_1                        9.758638
Birth_Defects                    9.754109
Symptom_5                        9.749581
Radiation_Exposure               9.749581
Maternal_Illness                 9.745053
Respiratory_Rate_Breaths_Min     9.731468
White_Blood_Cell                 9.726939
Genetic_Disorder                 9.717883
Blood_Test_Result                9.713354
Test_4                           9.690712
Birth_Asphyxia                   9.686184
Assisted_Conception              9.609202
Folic_Acid                       9.586560
Heart_Rates                      9

### Replacing to NAN

In [8]:
train["Gender"] = train["Gender"].replace("Ambiguous",np.nan)

train["Birth_Asphyxia"] = train["Birth_Asphyxia"].replace("No record",np.nan)
train["Birth_Asphyxia"] = train["Birth_Asphyxia"].replace("Not available",np.nan)

train["Autopsy_Birth_Defect"] = train["Autopsy_Birth_Defect"].replace("Not applicable",np.nan)
train["Autopsy_Birth_Defect"] = train["Autopsy_Birth_Defect"].replace("None",np.nan)

train["Radiation_Exposure"] = train["Radiation_Exposure"].replace("Not applicable",np.nan)
train["Radiation_Exposure"] = train["Radiation_Exposure"].replace("-",np.nan)

train["Substance_Abuse"] = train["Substance_Abuse"].replace("Not applicable",np.nan)
train["Substance_Abuse"] = train["Substance_Abuse"].replace("-",np.nan)

In [9]:
(train.isnull().sum()/len(train)*100).sort_values(ascending=False)

Autopsy_Birth_Defect            70.076529
Birth_Asphyxia                  54.942716
Radiation_Exposure              54.784223
Substance_Abuse                 54.680071
Gender                          40.157587
Mother_Age                      27.333243
Father_Age                      27.106824
Maternal_gene                   12.724720
Symptom_2                       10.062039
History_Previous_Pregnancies     9.835620
Disorder_Subclass                9.817507
Follow_Up                        9.808450
Previous_Abortion                9.790336
Symptom_1                        9.758638
Birth_Defects                    9.754109
Symptom_5                        9.749581
Maternal_Illness                 9.745053
Respiratory_Rate_Breaths_Min     9.731468
White_Blood_Cell                 9.726939
Genetic_Disorder                 9.717883
Blood_Test_Result                9.713354
Test_4                           9.690712
Assisted_Conception              9.609202
Folic_Acid                       9

In [10]:
train.drop("Autopsy_Birth_Defect",axis=1,inplace=True)
train.drop("Birth_Asphyxia",axis=1,inplace=True)
train.drop("Radiation_Exposure",axis=1,inplace=True)
train.drop("Substance_Abuse",axis=1,inplace=True)

In [11]:
train = train.dropna(subset=['Test_4'])

In [12]:
train.shape

(19943, 29)

### Filling missing values in our target column

In [13]:
def fill_Genetic_Disorder(row):
    if pd.isnull(row['Genetic_Disorder']):
        if row['Disorder_Subclass'] == "Leber's hereditary op":
            return 'Mitochondrial genetic inheritance disorders'
        elif row['Disorder_Subclass'] == "Cystic fibrosis":
            return 'Single-gene inheritance diseases'
        elif row['Disorder_Subclass'] == "Diabetes":
            return 'Multifactorial genetic inheritance disorders'
        elif row['Disorder_Subclass'] == "Leigh syndrome":
            return 'Mitochondrial genetic inheritance disorders'
        elif row['Disorder_Subclass'] == "Cancer":
            return 'Multifactorial genetic inheritance disorders'
        elif row['Disorder_Subclass'] == "Tay-Sachs":
            return 'Single-gene inheritance diseases'
        elif row['Disorder_Subclass'] == "Mitochondrial myopathy":
            return 'Mitochondrial genetic inheritance disorders'
        elif row['Disorder_Subclass'] == "Hemochromatosis":
            return 'Single-gene inheritance diseases'
        elif row['Disorder_Subclass'] == "Leber's hereditary optic neuropathy":
            return 'Mitochondrial genetic inheritance disorders'
        elif row['Disorder_Subclass'] == "Alzheimer's":
            return 'Multifactorial genetic inheritance disorders'
    return row['Genetic_Disorder']

train['Genetic_Disorder'] = train.apply(fill_Genetic_Disorder, axis=1)

In [14]:
train['Genetic_Disorder'].isna().sum()

240

In [15]:
train = train.dropna(subset=['Genetic_Disorder'])

In [16]:
train.shape

(19703, 29)

### Define the features and target

In [17]:
X= train.drop(columns=[ "Genetic_Disorder", "Disorder_Subclass"],axis=1)
y_Genetic_Disorder= train["Genetic_Disorder"]

### Split the train to 80/20 

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_Genetic_Disorder, test_size=0.2, random_state=42)

### Filling Null values

In [19]:
# Filling a numeric columns
imputer = SimpleImputer(strategy='mean')
numeric_cols = X_train.select_dtypes(include='number').columns
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

In [20]:
# Filling a Categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = X_train.select_dtypes(exclude='number').columns
X_train[categorical_cols] = cat_imputer.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = cat_imputer.transform(X_test[categorical_cols])

In [21]:
(X_train.isnull().sum()/len(X_train)*100).sort_values(ascending=False)

Patient_Age                     0.0
Folic_Acid                      0.0
Symptom_4                       0.0
Symptom_3                       0.0
Symptom_2                       0.0
Symptom_1                       0.0
Blood_Test_Result               0.0
White_Blood_Cell                0.0
Birth_Defects                   0.0
Previous_Abortion               0.0
History_Previous_Pregnancies    0.0
Assisted_Conception             0.0
Maternal_Illness                0.0
Gender                          0.0
Genes_Mother_Side               0.0
Follow_Up                       0.0
Test_4                          0.0
Heart_Rates                     0.0
Respiratory_Rate_Breaths_Min    0.0
Status                          0.0
Father_Age                      0.0
Mother_Age                      0.0
Blood_cell_count(mcL)           0.0
Paternal_gene                   0.0
Maternal_gene                   0.0
Inherited_from_father           0.0
Symptom_5                       0.0
dtype: float64

In [22]:
(y_train.isnull().sum()/len(y_train)*100)

0.0

In [23]:
y_train.value_counts()

Mitochondrial genetic inheritance disorders     8061
Single-gene inheritance diseases                6086
Multifactorial genetic inheritance disorders    1615
Name: Genetic_Disorder, dtype: int64

#### Remove parenthetical text from the respiratory rate column

In [24]:
X_train["Respiratory_Rate_Breaths_Min"] = (X_train["Respiratory_Rate_Breaths_Min"].str.replace(r"\s*\(.*\)", "", regex=True))
X_test["Respiratory_Rate_Breaths_Min"] = (X_test["Respiratory_Rate_Breaths_Min"].str.replace(r"\s*\(.*\)", "", regex=True))

### Convert the datatypes

In [25]:
dtype_map = {
    'Patient_Age': 'int64',
    'Genes_Mother_Side': 'object',
    'Inherited_Father': 'object',
    'Maternal_Gene': 'object',
    'Paternal_Gene': 'object',
    'Blood_Cell_mcL': 'float64',
    'Mother_Age': 'int64',
    'Father_Age': 'int64',
    'Status': 'object',
    'Respiratory_Rate_Breaths_Min': 'object',
    'Heart_Rates_Min': 'object',
    'Test_4': 'bool',
    'Follow_Up': 'object',
    'Gender': 'object',
    'Birth_Asphyxia': 'object',
    'Autopsy_Birth_Defect': 'object',
    'Folic_Acid': 'object',
    'Maternal_Illness': 'object',
    'Radiation_Exposure': 'object',
    'Substance_Abuse': 'object',
    'Assisted_Conception': 'object',
    'History_Previous_Pregnancies': 'object',
    'Previous_Abortion': 'int64',
    'Birth_Defects': 'object',
    'White_Blood_Cell': 'float64',
    'Blood_Test_Result': 'object',
    'Symptom_1': 'bool',
    'Symptom_2': 'bool',
    'Symptom_3': 'bool',
    'Symptom_4': 'bool',
    'Symptom_5': 'bool'
}

for col, dtype in dtype_map.items():
    try:
        X_train[col] = X_train[col].astype(dtype)
    except Exception as e:
        print(f"Could not convert {col} to {dtype}: {e}")

Could not convert Inherited_Father to object: 'Inherited_Father'
Could not convert Maternal_Gene to object: 'Maternal_Gene'
Could not convert Paternal_Gene to object: 'Paternal_Gene'
Could not convert Blood_Cell_mcL to float64: 'Blood_Cell_mcL'
Could not convert Heart_Rates_Min to object: 'Heart_Rates_Min'
Could not convert Birth_Asphyxia to object: 'Birth_Asphyxia'
Could not convert Autopsy_Birth_Defect to object: 'Autopsy_Birth_Defect'
Could not convert Radiation_Exposure to object: 'Radiation_Exposure'
Could not convert Substance_Abuse to object: 'Substance_Abuse'


In [26]:
dtype_map = {
    'Patient_Age': 'int64',
    'Genes_Mother_Side': 'object',
    'Inherited_Father': 'object',
    'Maternal_Gene': 'object',
    'Paternal_Gene': 'object',
    'Blood_Cell_mcL': 'float64',
    'Mother_Age': 'int64',
    'Father_Age': 'int64',
    'Status': 'object',
    'Respiratory_Rate_Breaths_Min': 'object',
    'Heart_Rates_Min': 'object',
    'Test_4': 'bool',
    'Follow_Up': 'object',
    'Gender': 'object',
    'Birth_Asphyxia': 'object',
    'Autopsy_Birth_Defect': 'object',
    'Folic_Acid': 'object',
    'Maternal_Illness': 'object',
    'Radiation_Exposure': 'object',
    'Substance_Abuse': 'object',
    'Assisted_Conception': 'object',
    'History_Previous_Pregnancies': 'object',
    'Previous_Abortion': 'int64',
    'Birth_Defects': 'object',
    'White_Blood_Cell': 'float64',
    'Blood_Test_Result': 'object',
    'Symptom_1': 'bool',
    'Symptom_2': 'bool',
    'Symptom_3': 'bool',
    'Symptom_4': 'bool',
    'Symptom_5': 'bool'
}

for col, dtype in dtype_map.items():
    try:
        X_test[col] = X_test[col].astype(dtype)
    except Exception as e:
        print(f"Could not convert {col} to {dtype}: {e}")

Could not convert Inherited_Father to object: 'Inherited_Father'
Could not convert Maternal_Gene to object: 'Maternal_Gene'
Could not convert Paternal_Gene to object: 'Paternal_Gene'
Could not convert Blood_Cell_mcL to float64: 'Blood_Cell_mcL'
Could not convert Heart_Rates_Min to object: 'Heart_Rates_Min'
Could not convert Birth_Asphyxia to object: 'Birth_Asphyxia'
Could not convert Autopsy_Birth_Defect to object: 'Autopsy_Birth_Defect'
Could not convert Radiation_Exposure to object: 'Radiation_Exposure'
Could not convert Substance_Abuse to object: 'Substance_Abuse'


### **Test DataSet**

In [27]:
# loading the datset
test = pd.read_csv("../raw_data/test.csv")
test.head()

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,History of anomalies in previous pregnancies,No. of previous abortion,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
0,PID0x4175,6,No,Yes,No,No,4.981655,Charles,,Kore,...,-99,2,Multiple,-99.0,slightly abnormal,True,True,True,True,True
1,PID0x21f5,10,Yes,No,,Yes,5.11889,Catherine,,Homero,...,Yes,-99,Multiple,8.179584,normal,False,False,False,True,False
2,PID0x49b8,5,No,,No,No,4.876204,James,,Danield,...,No,0,Singular,-99.0,slightly abnormal,False,False,True,True,False
3,PID0x2d97,13,No,Yes,Yes,No,4.687767,Brian,,Orville,...,Yes,-99,Singular,6.884071,normal,True,False,True,False,True
4,PID0x58da,5,No,,,Yes,5.152362,Gary,,Issiah,...,No,-99,Multiple,6.195178,normal,True,True,True,True,False


### Features Selection

In [28]:
test.drop("Patient Id",axis=1,inplace=True)
test.drop("Family Name",axis=1,inplace=True)
test.drop("Patient First Name",axis=1,inplace=True)
test.drop("Father's name",axis=1,inplace=True)
test.drop("Institute Name",axis=1,inplace=True)
test.drop("Location of Institute",axis=1,inplace=True)
test.drop("Place of birth",axis=1,inplace=True)
test.drop("Parental consent",axis=1,inplace=True)
test.drop("Test 1",axis=1,inplace=True)
test.drop("Test 2",axis=1,inplace=True)
test.drop("Test 3",axis=1,inplace=True)
test.drop("Test 5",axis=1,inplace=True)

test.drop("Autopsy shows birth defect (if applicable)",axis=1,inplace=True)
test.drop("Birth asphyxia",axis=1,inplace=True)
test.drop("H/O radiation exposure (x-ray)",axis=1,inplace=True)
test.drop("H/O substance abuse",axis=1,inplace=True)

### Rename columns

In [29]:
test.rename(columns={"Patient Age":"Patient_Age",
                    "Genes in mother's side":"Genes_Mother_Side","Inherited from father":"Inherited_from_father",
                    "Maternal gene":"Maternal_gene","Paternal gene":"Paternal_gene","Blood cell count (mcL)":"Blood_cell_count(mcL)",
                    "Mother's age":"Mother_Age","Father's age":"Father_Age",
                    "Respiratory Rate (breaths/min)":"Respiratory_Rate_Breaths_Min","Heart Rate (rates/min":"Heart_Rates","Follow-up":"Follow_Up",
                    "Folic acid details (peri-conceptional)":"Folic_Acid",
                    "H/O serious maternal illness":"Maternal_Illness",
                    "Assisted conception IVF/ART":"Assisted_Conception", "History of anomalies in previous pregnancies":"History_Previous_Pregnancies",
                    "No. of previous abortion":"Previous_Abortion","Birth defects":"Birth_Defects","Test 4":"Test_4",
                    "White Blood cell count (thousand per microliter)":"White_Blood_Cell","Blood test result":"Blood_Test_Result",
                    "Symptom 1":"Symptom_1","Symptom 2":"Symptom_2","Symptom 3":"Symptom_3","Symptom 4":"Symptom_4",
                    "Symptom 5":"Symptom_5","Genetic Disorder":"Genetic_Disorder","Disorder Subclass":"Disorder_Subclass"},inplace=True)

### Check For Dublicates

In [30]:
test.duplicated().sum()

0

### Check For Nulls

In [31]:
(test.isnull().sum()/len(test)*100).sort_values(ascending=False)

Maternal_gene                   39.334390
Heart_Rates                     30.639197
Respiratory_Rate_Breaths_Min    30.491284
Inherited_from_father            5.821447
Patient_Age                      0.000000
Assisted_Conception              0.000000
Symptom_4                        0.000000
Symptom_3                        0.000000
Symptom_2                        0.000000
Symptom_1                        0.000000
Blood_Test_Result                0.000000
White_Blood_Cell                 0.000000
Birth_Defects                    0.000000
Previous_Abortion                0.000000
History_Previous_Pregnancies     0.000000
Gender                           0.000000
Maternal_Illness                 0.000000
Folic_Acid                       0.000000
Genes_Mother_Side                0.000000
Follow_Up                        0.000000
Test_4                           0.000000
Status                           0.000000
Father_Age                       0.000000
Mother_Age                       0

### Replacing to NAN

In [32]:
test["Respiratory_Rate_Breaths_Min"] = test["Respiratory_Rate_Breaths_Min"].replace(["-99", -99, -99.0], np.nan)
test["Respiratory_Rate_Breaths_Min"] = (test["Respiratory_Rate_Breaths_Min"].str.replace(r"\s*\(.*\)", "", regex=True))

test["Heart_Rates"] = test["Heart_Rates"].replace(["-99", -99, -99.0], np.nan)
test["Test_4"] = test["Test_4"].replace(["-99", -99, -99.0], np.nan)

test["Follow_Up"] = test["Follow_Up"].replace(["-99", -99, -99.0], np.nan)

test["Gender"] = test["Gender"].replace(["-99", -99, -99.0], np.nan)
test["Gender"] = test["Gender"].replace("Ambiguous",np.nan)

test["Folic_Acid"] = test["Folic_Acid"].replace(["-99", -99, -99.0], np.nan)
test["Maternal_Illness"] = test["Maternal_Illness"].replace(["-99", -99, -99.0], np.nan)
test["Assisted_Conception"] = test["Assisted_Conception"].replace(["-99", -99, -99.0], np.nan)

test["History_Previous_Pregnancies"] = test["History_Previous_Pregnancies"].replace(["-99", -99, -99.0], np.nan)
test["Previous_Abortion"] = test["Previous_Abortion"].replace(["-99", -99, -99.0], np.nan)
test["Birth_Defects"] = test["Birth_Defects"].replace(["-99", -99, -99.0], np.nan)
test["White_Blood_Cell"] = test["White_Blood_Cell"].replace(["-99", -99, -99.0], np.nan)
test["Blood_Test_Result"] = test["Blood_Test_Result"].replace(["-99", -99, -99.0], np.nan)

In [33]:
(test.isnull().sum()/len(test)*100).sort_values(ascending=False)

Respiratory_Rate_Breaths_Min    52.731115
Heart_Rates                     52.551506
Gender                          48.367670
Maternal_gene                   39.334390
White_Blood_Cell                22.208135
History_Previous_Pregnancies    22.155309
Previous_Abortion               22.144744
Birth_Defects                   22.134179
Folic_Acid                      22.102483
Follow_Up                       22.081352
Assisted_Conception             22.028526
Test_4                          21.996830
Blood_Test_Result               21.975700
Maternal_Illness                21.975700
Inherited_from_father            5.821447
Symptom_4                        0.000000
Symptom_3                        0.000000
Symptom_2                        0.000000
Symptom_1                        0.000000
Patient_Age                      0.000000
Genes_Mother_Side                0.000000
Status                           0.000000
Father_Age                       0.000000
Mother_Age                       0

### Filling Null values

In [34]:
# Filling a Numeric columns
imputer = SimpleImputer(strategy='mean')
numeric_cols = test.select_dtypes(include='number').columns
test[numeric_cols] = imputer.fit_transform(test[numeric_cols])

In [35]:
# Filling a Categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = test.select_dtypes(exclude='number').columns
test[categorical_cols] = cat_imputer.fit_transform(test[categorical_cols])

In [36]:
(test.isnull().sum()/len(test)*100).sort_values(ascending=False)

Patient_Age                     0.0
Folic_Acid                      0.0
Symptom_4                       0.0
Symptom_3                       0.0
Symptom_2                       0.0
Symptom_1                       0.0
Blood_Test_Result               0.0
White_Blood_Cell                0.0
Birth_Defects                   0.0
Previous_Abortion               0.0
History_Previous_Pregnancies    0.0
Assisted_Conception             0.0
Maternal_Illness                0.0
Gender                          0.0
Genes_Mother_Side               0.0
Follow_Up                       0.0
Test_4                          0.0
Heart_Rates                     0.0
Respiratory_Rate_Breaths_Min    0.0
Status                          0.0
Father_Age                      0.0
Mother_Age                      0.0
Blood_cell_count(mcL)           0.0
Paternal_gene                   0.0
Maternal_gene                   0.0
Inherited_from_father           0.0
Symptom_5                       0.0
dtype: float64

In [37]:
print("X_train shape:", X_train.shape)
print("X_test shape :", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape :", y_test.shape)

print("test shape :", test.shape)

X_train shape: (15762, 27)
X_test shape : (3941, 27)
y_train shape: (15762,)
y_test shape : (3941,)
test shape : (9465, 27)


###  **Data preprocessing**

In [38]:
from ml_logic.preprocessor import create_preprocessor, pipeline_smote, encode_target
preprocessor=create_preprocessor()
X_processed = preprocessor.fit_transform(X_train)
X_processed_test = preprocessor.transform(X_test)
y_encoded, le = encode_target(y_train)
y_test_encoded = le.transform(y_test)
X_preprocessed_smote, y_smote_encoded = pipeline_smote(X_processed, y_encoded)

In [39]:
X_processed.shape

(15762, 49)

In [40]:
X_processed_test.shape

(3941, 49)

In [41]:
y_train.value_counts()

Mitochondrial genetic inheritance disorders     8061
Single-gene inheritance diseases                6086
Multifactorial genetic inheritance disorders    1615
Name: Genetic_Disorder, dtype: int64

In [42]:
pd.Series(y_smote_encoded).value_counts()

0    8061
2    8061
1    8061
dtype: int64

### Modeling

In [43]:
from ml_logic.model import initialize_ensemble_model, train_ensemble_model
voting_model= initialize_ensemble_model()
voting_model, scores=train_ensemble_model(voting_model, X_preprocessed_smote, y_smote_encoded)

In [44]:
voting_model.score(X_processed_test, y_test_encoded)

0.5521441258563816

In [45]:
print("\nCross-Validation Results:")
print(f"Mean Accuracy : {scores['test_accuracy'].mean():.4f}")
print(f"Mean Precision: {scores['test_precision'].mean():.4f}")
print(f"Mean Recall   : {scores['test_recall'].mean():.4f}")
print(f"Mean F1-Score : {scores['test_f1'].mean():.4f}")


Cross-Validation Results:
Mean Accuracy : 0.7723
Mean Precision: 0.7809
Mean Recall   : 0.7723
Mean F1-Score : 0.7673


### Prediction

In [46]:
processed_test = preprocessor.transform(test)

In [47]:
from ml_logic.model import predict_ensemble_model
y_pred_labels = predict_ensemble_model(voting_model, processed_test)

In [48]:
y_pred_labels

array(['Single-gene inheritance diseases',
       'Single-gene inheritance diseases',
       'Mitochondrial genetic inheritance disorders', ...,
       'Mitochondrial genetic inheritance disorders',
       'Mitochondrial genetic inheritance disorders',
       'Mitochondrial genetic inheritance disorders'], dtype='<U44')

In [49]:
test_with_pred = test.copy()
test_with_pred["Predicted_Genetic_Disorder"] = y_pred_labels
test_with_pred

Unnamed: 0,Patient_Age,Genes_Mother_Side,Inherited_from_father,Maternal_gene,Paternal_gene,Blood_cell_count(mcL),Mother_Age,Father_Age,Status,Respiratory_Rate_Breaths_Min,...,Previous_Abortion,Birth_Defects,White_Blood_Cell,Blood_Test_Result,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Predicted_Genetic_Disorder
0,6.0,No,Yes,No,No,4.981655,38.0,61.0,Alive,Tachypnea,...,2.000000,Multiple,7.494913,slightly abnormal,True,True,True,True,True,Single-gene inheritance diseases
1,10.0,Yes,No,Yes,Yes,5.118890,33.0,53.0,Alive,Normal,...,2.017099,Multiple,8.179584,normal,False,False,False,True,False,Single-gene inheritance diseases
2,5.0,No,No,No,No,4.876204,48.0,60.0,Deceased,Normal,...,0.000000,Singular,7.494913,slightly abnormal,False,False,True,True,False,Mitochondrial genetic inheritance disorders
3,13.0,No,Yes,Yes,No,4.687767,25.0,55.0,Alive,Normal,...,2.017099,Singular,6.884071,normal,True,False,True,False,True,Mitochondrial genetic inheritance disorders
4,5.0,No,No,Yes,Yes,5.152362,41.0,38.0,Deceased,Tachypnea,...,2.017099,Multiple,6.195178,normal,True,True,True,True,False,Mitochondrial genetic inheritance disorders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9460,9.0,Yes,Yes,Yes,No,4.878335,28.0,63.0,Alive,Normal,...,2.000000,Multiple,7.234960,abnormal,False,True,True,True,False,Mitochondrial genetic inheritance disorders
9461,1.0,Yes,No,Yes,Yes,4.927151,37.0,62.0,Deceased,Normal,...,2.017099,Singular,4.859536,abnormal,False,True,False,True,True,Single-gene inheritance diseases
9462,2.0,No,Yes,No,No,4.898352,24.0,32.0,Deceased,Tachypnea,...,3.000000,Singular,5.696062,normal,False,True,False,False,True,Mitochondrial genetic inheritance disorders
9463,13.0,No,Yes,No,No,4.804840,36.0,56.0,Alive,Normal,...,1.000000,Singular,3.000000,inconclusive,True,True,True,True,True,Mitochondrial genetic inheritance disorders


In [50]:
test_with_pred[test_with_pred['Predicted_Genetic_Disorder'] != "Mitochondrial genetic inheritance disorders"]

Unnamed: 0,Patient_Age,Genes_Mother_Side,Inherited_from_father,Maternal_gene,Paternal_gene,Blood_cell_count(mcL),Mother_Age,Father_Age,Status,Respiratory_Rate_Breaths_Min,...,Previous_Abortion,Birth_Defects,White_Blood_Cell,Blood_Test_Result,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Predicted_Genetic_Disorder
0,6.0,No,Yes,No,No,4.981655,38.0,61.0,Alive,Tachypnea,...,2.000000,Multiple,7.494913,slightly abnormal,True,True,True,True,True,Single-gene inheritance diseases
1,10.0,Yes,No,Yes,Yes,5.118890,33.0,53.0,Alive,Normal,...,2.017099,Multiple,8.179584,normal,False,False,False,True,False,Single-gene inheritance diseases
5,9.0,No,No,Yes,No,4.942384,27.0,25.0,Alive,Normal,...,0.000000,Singular,6.478252,abnormal,True,True,True,True,True,Multifactorial genetic inheritance disorders
6,4.0,Yes,No,Yes,No,5.113778,19.0,22.0,Alive,Normal,...,1.000000,Singular,3.173918,slightly abnormal,True,True,True,True,False,Single-gene inheritance diseases
7,5.0,Yes,No,Yes,No,4.635096,48.0,24.0,Alive,Normal,...,2.000000,Multiple,8.864555,inconclusive,True,True,True,True,True,Single-gene inheritance diseases
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9452,11.0,No,No,Yes,No,5.250678,50.0,61.0,Deceased,Tachypnea,...,3.000000,Singular,5.180606,slightly abnormal,False,True,True,False,False,Single-gene inheritance diseases
9453,4.0,Yes,Yes,No,Yes,4.949049,18.0,54.0,Deceased,Normal,...,2.017099,Multiple,12.000000,inconclusive,True,True,False,True,True,Single-gene inheritance diseases
9454,13.0,Yes,Yes,Yes,Yes,5.058117,29.0,44.0,Alive,Normal,...,1.000000,Singular,11.209853,slightly abnormal,True,True,False,True,True,Multifactorial genetic inheritance disorders
9457,7.0,No,Yes,Yes,Yes,5.304645,51.0,20.0,Deceased,Tachypnea,...,2.017099,Multiple,7.008263,abnormal,True,True,True,True,True,Single-gene inheritance diseases


In [54]:
test_with_pred.iloc[0]

Patient_Age                                                  6.0
Genes_Mother_Side                                             No
Inherited_from_father                                        Yes
Maternal_gene                                                 No
Paternal_gene                                                 No
Blood_cell_count(mcL)                                   4.981655
Mother_Age                                                  38.0
Father_Age                                                  61.0
Status                                                     Alive
Respiratory_Rate_Breaths_Min                           Tachypnea
Heart_Rates                                               Normal
Test_4                                                       1.0
Follow_Up                                                    Low
Gender                                                      Male
Folic_Acid                                                   Yes
Maternal_Illness         

In [55]:
test_with_pred.iloc[2]

Patient_Age                                                             5.0
Genes_Mother_Side                                                        No
Inherited_from_father                                                    No
Maternal_gene                                                            No
Paternal_gene                                                            No
Blood_cell_count(mcL)                                              4.876204
Mother_Age                                                             48.0
Father_Age                                                             60.0
Status                                                             Deceased
Respiratory_Rate_Breaths_Min                                         Normal
Heart_Rates                                                          Normal
Test_4                                                                  1.0
Follow_Up                                                               Low
Gender      

In [51]:
test_with_pred.iloc[5]

Patient_Age                                                              9.0
Genes_Mother_Side                                                         No
Inherited_from_father                                                     No
Maternal_gene                                                            Yes
Paternal_gene                                                             No
Blood_cell_count(mcL)                                               4.942384
Mother_Age                                                              27.0
Father_Age                                                              25.0
Status                                                                 Alive
Respiratory_Rate_Breaths_Min                                          Normal
Heart_Rates                                                      Tachycardia
Test_4                                                                   1.0
Follow_Up                                                               High

In [52]:
import joblib
#Save the pipeline as a .pkl file
joblib.dump(preprocessor, "preprocessor.pkl")

['preprocessor.pkl']

In [53]:
joblib.dump(voting_model, "model.pkl")

['model.pkl']