**Loading Dataset and Preprocessing**

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

data = (pd.read_csv(r'train_genetic_disorders.csv'))

data.head()

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,,Larre,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,,Brycen,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,,Nashon,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.70528,Jeffery,Hoelscher,Aayaan,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,,Yes,4.720703,Johanna,Stutzman,Suave,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22083 entries, 0 to 22082
Data columns (total 45 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Patient Id                                        21011 non-null  object 
 1   Patient Age                                       19643 non-null  float64
 2   Genes in mother's side                            21011 non-null  object 
 3   Inherited from father                             20724 non-null  object 
 4   Maternal gene                                     18317 non-null  object 
 5   Paternal gene                                     21011 non-null  object 
 6   Blood cell count (mcL)                            21011 non-null  float64
 7   Patient First Name                                21011 non-null  object 
 8   Family Name                                       11771 non-null  object 
 9   Father's name    

In [3]:
#removing unnecessary features
data = data.drop(columns=['Patient Id','Patient First Name','Family Name','Father\'s name','Institute Name','Location of Institute','Test 1','Test 2','Test 3','Test 4',
'Test 5','Parental consent'])

**Handling null values**

In [4]:
#all null rows
data[data.isnull().all(1)].shape

(1072, 33)

In [5]:
#removing all null rows
data = data[data.isnull().all(1)!=True]

data.shape

(21011, 33)

In [6]:
#rows with missing values
data[data.isnull().any(axis=1)]

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,2.0,Yes,No,Yes,No,4.760603,,,Alive,Normal (30-60),...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,,23.0,Deceased,Tachypnea,...,Multiple,5.522560,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,41.0,22.0,Alive,Normal (30-60),...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.705280,21.0,,Deceased,Tachypnea,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,,Yes,4.720703,32.0,,Alive,Tachypnea,...,Multiple,4.098210,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21004,6.0,No,No,No,No,4.946398,34.0,34.0,Deceased,Normal (30-60),...,Singular,,normal,0.0,,0.0,1.0,1.0,Mitochondrial genetic inheritance disorders,
21005,2.0,Yes,No,,No,5.144151,19.0,42.0,Alive,Tachypnea,...,Singular,9.863374,slightly abnormal,1.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
21006,11.0,No,Yes,Yes,No,5.090495,46.0,,Alive,,...,Multiple,,normal,1.0,,0.0,0.0,0.0,Single-gene inheritance diseases,Tay-Sachs
21007,,No,No,No,Yes,5.214750,,38.0,Alive,Tachypnea,...,Singular,7.086312,slightly abnormal,1.0,,1.0,0.0,1.0,Mitochondrial genetic inheritance disorders,Leigh syndrome


In [7]:
#removing rows with null target variables
data = data[(data['Genetic Disorder'].isnull() != True) & (data['Disorder Subclass'].isnull() != True)]

#shape of dataset after removing rows with null target variables
data.shape

(17160, 33)

In [8]:
#missing value imputation
"""
data['Patient Age']= data['Patient Age'].fillna(data['Patient Age'].mean())
data['Genes in mother\'s side']= data['Genes in mother\'s side'].fillna(data['Genes in mother\'s side'].mode()[0])
data['Inherited from father']= data['Inherited from father'].fillna(data['Inherited from father'].mode()[0])
data['Maternal gene']= data['Maternal gene'].fillna(data['Maternal gene'].mode()[0])
"""

for i in data.columns:
    if data[i].dtype.name == 'object':
        data[i] = data[i].fillna(data[i].mode()[0])
    elif data[i].dtype.name != 'object':
        data[i] = data[i].fillna(data[i].mean())

data

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,2.000000,Yes,No,Yes,No,4.760603,34.57081,41.961292,Alive,Normal (30-60),...,Singular,9.857562,slightly abnormal,1.0,1.000000,1.0,1.0,1.000000,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
2,6.000000,Yes,No,No,No,4.893297,41.00000,22.000000,Alive,Normal (30-60),...,Singular,7.470663,normal,0.0,1.000000,1.0,1.0,1.000000,Multifactorial genetic inheritance disorders,Diabetes
3,12.000000,Yes,No,Yes,No,4.705280,21.00000,41.961292,Deceased,Tachypnea,...,Singular,7.919321,inconclusive,0.0,0.000000,1.0,0.0,0.000000,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.000000,Yes,No,Yes,Yes,4.720703,32.00000,41.961292,Alive,Tachypnea,...,Multiple,4.098210,slightly abnormal,0.0,0.000000,0.0,0.0,0.463115,Multifactorial genetic inheritance disorders,Cancer
5,14.000000,Yes,No,Yes,No,5.103188,34.57081,41.961292,Deceased,Normal (30-60),...,Multiple,10.272230,normal,1.0,0.000000,0.0,1.0,0.000000,Single-gene inheritance diseases,Cystic fibrosis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21005,2.000000,Yes,No,Yes,No,5.144151,19.00000,42.000000,Alive,Tachypnea,...,Singular,9.863374,slightly abnormal,1.0,0.000000,1.0,0.0,0.000000,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
21006,11.000000,No,Yes,Yes,No,5.090495,46.00000,41.961292,Alive,Normal (30-60),...,Multiple,7.470663,normal,1.0,0.553347,0.0,0.0,0.000000,Single-gene inheritance diseases,Tay-Sachs
21007,6.945683,No,No,No,Yes,5.214750,34.57081,38.000000,Alive,Tachypnea,...,Singular,7.086312,slightly abnormal,1.0,0.553347,1.0,0.0,1.000000,Mitochondrial genetic inheritance disorders,Leigh syndrome
21009,1.000000,Yes,No,No,Yes,5.224828,33.00000,24.000000,Deceased,Tachypnea,...,Multiple,12.000000,inconclusive,0.0,1.000000,1.0,1.0,0.000000,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy


 **Handling String Values**

In [9]:
#creating a mapping dictionary
cleanup_nums = {"Genes in mother's side": {"Yes": 1, "No": 0},
                "Inherited from father": {"Yes": 1, "No": 0},
                "Maternal gene": {"Yes": 1, "No": 0},
                "Paternal gene": {"Yes": 1, "No": 0},
                "Status": {"Alive": 1, "Deceased": 0},
                "Folic acid details (peri-conceptional)": {"Yes": 1, "No": 0},
                "H/O serious maternal illness": {"Yes": 1, "No": 0},
                "Assisted conception IVF/ART": {"Yes": 1, "No": 0},
                "History of anomalies in previous pregnancies": {"Yes": 1, "No": 0},
                "Follow-up": {"High": 1, "Low": 0},
               "Place of birth":{"Institute": 1, "Home": 0},
               "Birth defects":{"Singular": 1, "Multiple": 2}}
#replace words with numbers
data = data.replace(cleanup_nums)
data.head()

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,2.0,1,0,1,0,4.760603,34.57081,41.961292,1,Normal (30-60),...,1,9.857562,slightly abnormal,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
2,6.0,1,0,0,0,4.893297,41.0,22.0,1,Normal (30-60),...,1,7.470663,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,1,0,1,0,4.70528,21.0,41.961292,0,Tachypnea,...,1,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,1,0,1,1,4.720703,32.0,41.961292,1,Tachypnea,...,2,4.09821,slightly abnormal,0.0,0.0,0.0,0.0,0.463115,Multifactorial genetic inheritance disorders,Cancer
5,14.0,1,0,1,0,5.103188,34.57081,41.961292,0,Normal (30-60),...,2,10.27223,normal,1.0,0.0,0.0,1.0,0.0,Single-gene inheritance diseases,Cystic fibrosis


In [10]:
#one hot encoding
data = pd.get_dummies(data, columns=["Respiratory Rate (breaths/min)", "Gender", "Birth asphyxia", "Autopsy shows birth defect (if applicable)",
                            "H/O radiation exposure (x-ray)", "H/O substance abuse", "Heart Rate (rates/min"])

data

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Follow-up,...,H/O radiation exposure (x-ray)_-,H/O radiation exposure (x-ray)_No,H/O radiation exposure (x-ray)_Not applicable,H/O radiation exposure (x-ray)_Yes,H/O substance abuse_-,H/O substance abuse_No,H/O substance abuse_Not applicable,H/O substance abuse_Yes,Heart Rate (rates/min_Normal,Heart Rate (rates/min_Tachycardia
0,2.000000,1,0,1,0,4.760603,34.57081,41.961292,1,1,...,0,1,0,0,0,1,0,0,1,0
2,6.000000,1,0,0,0,4.893297,41.00000,22.000000,1,0,...,0,0,0,1,0,1,0,0,0,1
3,12.000000,1,0,1,0,4.705280,21.00000,41.961292,0,1,...,1,0,0,0,0,0,1,0,1,0
4,11.000000,1,0,1,1,4.720703,32.00000,41.961292,1,0,...,1,0,0,0,0,0,1,0,0,1
5,14.000000,1,0,1,0,5.103188,34.57081,41.961292,0,0,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21005,2.000000,1,0,1,0,5.144151,19.00000,42.000000,1,1,...,0,0,1,0,0,0,1,0,1,0
21006,11.000000,0,1,1,0,5.090495,46.00000,41.961292,1,1,...,0,1,0,0,0,0,1,0,0,1
21007,6.945683,0,0,0,1,5.214750,34.57081,38.000000,1,0,...,0,0,1,0,0,1,0,0,0,1
21009,1.000000,1,0,0,1,5.224828,33.00000,24.000000,0,0,...,0,1,0,0,1,0,0,0,0,1


In [11]:
"""
data["Respiratory Rate (breaths/min)"] = data["Respiratory Rate (breaths/min)"].astype('category').cat.codes
data["Gender"] = data["Gender"].astype('category').cat.codes
data["Birth asphyxia"] = data["Birth asphyxia"].astype('category').cat.codes
...
"""

#label encoding
data["Blood test result"] = data["Blood test result"].astype('category').cat.codes

#subsetting target variables and label encoding
target = data[['Genetic Disorder', 'Disorder Subclass']]
target["Genetic Disorder"] = target["Genetic Disorder"].astype('category').cat.codes
target["Disorder Subclass"] = target["Disorder Subclass"].astype('category').cat.codes

target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target["Genetic Disorder"] = target["Genetic Disorder"].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target["Disorder Subclass"] = target["Disorder Subclass"].astype('category').cat.codes


Unnamed: 0,Genetic Disorder,Disorder Subclass
0,0,5
2,1,3
3,0,6
4,1,1
5,2,2
...,...,...
21005,0,7
21006,2,8
21007,0,6
21009,0,7


In [12]:
#dropping target variables
data = data.drop(['Genetic Disorder', 'Disorder Subclass'], axis = 1)

In [13]:
#remove duplicates
data.drop_duplicates(inplace=True)

data

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Follow-up,...,H/O radiation exposure (x-ray)_-,H/O radiation exposure (x-ray)_No,H/O radiation exposure (x-ray)_Not applicable,H/O radiation exposure (x-ray)_Yes,H/O substance abuse_-,H/O substance abuse_No,H/O substance abuse_Not applicable,H/O substance abuse_Yes,Heart Rate (rates/min_Normal,Heart Rate (rates/min_Tachycardia
0,2.000000,1,0,1,0,4.760603,34.57081,41.961292,1,1,...,0,1,0,0,0,1,0,0,1,0
2,6.000000,1,0,0,0,4.893297,41.00000,22.000000,1,0,...,0,0,0,1,0,1,0,0,0,1
3,12.000000,1,0,1,0,4.705280,21.00000,41.961292,0,1,...,1,0,0,0,0,0,1,0,1,0
4,11.000000,1,0,1,1,4.720703,32.00000,41.961292,1,0,...,1,0,0,0,0,0,1,0,0,1
5,14.000000,1,0,1,0,5.103188,34.57081,41.961292,0,0,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21005,2.000000,1,0,1,0,5.144151,19.00000,42.000000,1,1,...,0,0,1,0,0,0,1,0,1,0
21006,11.000000,0,1,1,0,5.090495,46.00000,41.961292,1,1,...,0,1,0,0,0,0,1,0,0,1
21007,6.945683,0,0,0,1,5.214750,34.57081,38.000000,1,0,...,0,0,1,0,0,1,0,0,0,1
21009,1.000000,1,0,0,1,5.224828,33.00000,24.000000,0,0,...,0,1,0,0,1,0,0,0,0,1


In [14]:
#normalization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_norm = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

data_norm

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Follow-up,...,H/O radiation exposure (x-ray)_-,H/O radiation exposure (x-ray)_No,H/O radiation exposure (x-ray)_Not applicable,H/O radiation exposure (x-ray)_Yes,H/O substance abuse_-,H/O substance abuse_No,H/O substance abuse_Not applicable,H/O substance abuse_Yes,Heart Rate (rates/min_Normal,Heart Rate (rates/min_Tachycardia
0,0.142857,1.0,0.0,1.0,0.0,0.419769,0.502146,0.499120,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.428571,1.0,0.0,0.0,0.0,0.510432,0.696970,0.045455,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.857143,1.0,0.0,1.0,0.0,0.381970,0.090909,0.499120,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.785714,1.0,0.0,1.0,1.0,0.392507,0.424242,0.499120,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.000000,1.0,0.0,1.0,0.0,0.653839,0.502146,0.499120,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17155,0.142857,1.0,0.0,1.0,0.0,0.681827,0.030303,0.500000,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
17156,0.785714,0.0,1.0,1.0,0.0,0.645167,0.848485,0.499120,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
17157,0.496120,0.0,0.0,0.0,1.0,0.730064,0.502146,0.409091,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
17158,0.071429,1.0,0.0,0.0,1.0,0.736949,0.454545,0.090909,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


**Dataset Preparation and Splitting**

In [15]:
X = data.to_numpy()[:10000]
Y = target.to_numpy()[:10000]

#split dataset: 80% for train set and 20% for test set
num_of_rows = int(len(X) * 0.8)
X_train = X[:num_of_rows]
X_test = X[num_of_rows:]

#splitting target variables
Y_train = Y[:num_of_rows]
Y_train_1 = Y_train[:, -2] #for Genetic Disorder
Y_train_2 = Y_train[:, -1] #for Disorder Subclass

Y_test = Y[num_of_rows:]
Y_test_1 = Y_test[:, -2]
Y_test_2 = Y_test[:, -1]

**K Nearest Neighbors Function from Scratch**

In [16]:
def predict(X_test, k):
    m = X_train.shape[0]
    n = X_test.shape[0]
    y_pred = []
    
    # Calculating distances with Euclidean metric
    for i in range(n):
        distances = []  
        for j in range(m):  
            d = (np.sqrt(np.sum(np.square(X_test[i,:] - X_train[j,:]))))
            distances.append((d, Y_train_1[j])) #for Genetic Disorder
            #distances.append((d, Y_train_2[j])) #for Disorder Subclass
        distances = sorted(distances) #sort in ascending order
        
        neighbors = []
        for item in range(k):
            neighbors.append(distances[item][1]) #k nearest neighbors
        y_pred.append(stats.mode(neighbors)[0][0]) #prediction
    return y_pred

**Average Error Function**

In [17]:
def avgErr(pred, actual):
    err = 0
    for i in range(len(pred)):
        if pred[i] != actual[i]:
            err += 1
        else:
            err += 0
    avg_err = (1/(len(pred)))*err
    return avg_err

**Evaluation**

In [18]:
scratch_pred_5 = predict(X_test, 5)
print("Average Error(From Scratch) k = 5: ", avgErr(scratch_pred_5, Y_test_1)*100)

scratch_pred_20 = predict(X_test, 20)
print("Average Error(From Scratch) k = 20: ", avgErr(scratch_pred_20, Y_test_1)*100)

scratch_pred_50 = predict(X_test, 50)
print("Average Error(From Scratch) k = 50: ", avgErr(scratch_pred_50, Y_test_1)*100)

Average Error(From Scratch) k = 5:  53.300000000000004
Average Error(From Scratch) k = 20:  52.1
Average Error(From Scratch) k = 50:  51.800000000000004


**K Nearest Neighbors Algorithm and Evaluation**

In [19]:
#K Nearest Neighbors with library
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 5, metric='euclidean').fit(X_train, Y_train_1)
lib_pred = KNN.predict(X_test)
print("Average Error(library) k = 5: ", avgErr(lib_pred, Y_test_1)*100)

KNN = KNeighborsClassifier(n_neighbors = 20, metric='euclidean').fit(X_train, Y_train_1)
lib_pred = KNN.predict(X_test)
print("Average Error(library) k = 20: ", avgErr(lib_pred, Y_test_1)*100)

KNN_5 = KNeighborsClassifier(n_neighbors = 50, metric='euclidean').fit(X_train, Y_train_1)
lib_pred = KNN.predict(X_test)
print("Average Error(library) k = 50: ", avgErr(lib_pred, Y_test_1)*100)

Average Error(library) k = 5:  53.300000000000004
Average Error(library) k = 20:  52.1
Average Error(library) k = 50:  52.1
