# Dataset Preparation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
mkdir /content/drive/MyDrive/genetic-disorder

In [1]:
cd /content/drive/MyDrive/genetic-disorder/dataset

/content/drive/MyDrive/genetic-disorder/dataset


In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
!unzip *.zip

Archive:  85561676e53d11eb_2.zip
   creating: dataset/
  inflating: dataset/sample_submission.csv  
  inflating: dataset/train.csv       
  inflating: dataset/test.csv        


In [3]:
ls

sample_submission.csv  test.csv  train.csv


In [239]:
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
df

In [None]:
df.dtypes

In [240]:
numeric_columns_df = df.select_dtypes(include=['number']).columns
numeric_columns_df = numeric_columns_df.tolist()
cat_columns_df = df.select_dtypes(include=['object']).columns
cat_columns_df = cat_columns_df.tolist()

In [241]:
numeric_columns_test = test.select_dtypes(include=['number']).columns
numeric_columns_test = numeric_columns_test.tolist()
cat_columns_test = test.select_dtypes(include=['object']).columns
cat_columns_test = cat_columns_test.tolist()

In [242]:
modes_df = dict()
means_df = dict()
modes_test = dict()
means_test = dict()

In [243]:
for column in df[numeric_columns_df]:
    mode = df[column].mean()
    print(mode)
    modes[column] = mode
    # df[column] = df[column].fillna(mode)
    df[column] = df[column].replace(np.nan, mode)

6.974147947327653
4.898871078439609
34.526453542718265
41.9728520842393
0.0
0.0
0.0
1.0
0.0
2.0030620952763414
7.486223987209439
0.592482938578884
0.5518856049544333
0.5362326093484135
0.49774661992989483
0.46191670847967886


In [245]:
for column in test[numeric_columns_test]:
    mode = test[column].mean()
    print(mode)
    modes[column] = mode
    # df[column] = df[column].fillna(mode)
    test[column] = test[column].replace(np.nan, mode)

7.041838351822504
4.900206624717485
34.575488642366615
41.830744849445324
-22.1743264659271
-21.766402535657686
-21.954675118858955
-20.996830427892235
-21.870998415213947
-20.352879027997886
-16.155621275936376


In [246]:
for column in df[cat_columns_df]:
    mode = df[column].mode()[0]
    print(mode)
    means[column] = mode
    df[column] = df[column].replace(np.nan, mode)

PID0x1
Yes
No
Yes
No
James
Smith
Clardie
Not applicable
-
Alive
Normal (30-60)
Normal
Yes
Low
Ambiguous
Yes
Not applicable
Institute
Yes
No
Not applicable
-
Yes
Yes
Singular
slightly abnormal
Mitochondrial genetic inheritance disorders
Leigh syndrome


In [247]:
for column in test[cat_columns_test]:
    mode = test[column].mode()[0]
    print(mode)
    means[column] = mode
    test[column] = test[column].replace(np.nan, mode)

PID0x0
Yes
No
Yes
No
John
Jones
Aravind
-99
-
Deceased
Normal (30-60)
Tachycardia
Yes
Low
Ambiguous
-99
Not applicable
Home
No
Yes
-99
-99
No
No
Singular
-99


In [49]:
df.head(1)

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,Mother's age,Father's age,Institute Name,Location of Institute,Status,Respiratory Rate (breaths/min),Heart Rate (rates/min,Test 1,Test 2,Test 3,Test 4,Test 5,Parental consent,Follow-up,Gender,Birth asphyxia,Autopsy shows birth defect (if applicable),Place of birth,Folic acid details (peri-conceptional),H/O serious maternal illness,H/O radiation exposure (x-ray),H/O substance abuse,Assisted conception IVF/ART,History of anomalies in previous pregnancies,No. of previous abortion,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,Smith,Larre,34.526454,41.972852,Boston Specialty & Rehabilitation Hospital,"55 FRUIT ST\nCENTRAL, MA 02114\n(42.3624748574...",Alive,Normal (30-60),Normal,0.0,0.0,0.0,1.0,0.0,Yes,High,Ambiguous,Yes,Not applicable,Institute,No,No,No,No,No,Yes,2.003062,Singular,9.857562,slightly abnormal,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy


In [50]:
test.head(1)

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,Mother's age,Father's age,Institute Name,Location of Institute,Status,Respiratory Rate (breaths/min),Heart Rate (rates/min,Test 1,Test 2,Test 3,Test 4,Test 5,Parental consent,Follow-up,Gender,Birth asphyxia,Autopsy shows birth defect (if applicable),Place of birth,Folic acid details (peri-conceptional),H/O serious maternal illness,H/O radiation exposure (x-ray),H/O substance abuse,Assisted conception IVF/ART,History of anomalies in previous pregnancies,No. of previous abortion,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
0,PID0x4175,6,No,Yes,No,No,4.981655,Charles,Jones,Kore,38,61,St. Elizabeth's Hospital,"30 WARREN ST\nALLSTON/BRIGHTON, MA 02134\n(42....",Alive,Tachypnea,Normal,0,-99,0,1,0,-99,Low,Male,Yes,Not applicable,Institute,Yes,No,Yes,-,No,-99,2,Multiple,-99.0,slightly abnormal,True,True,True,True,True


In [None]:
df.dtypes

In [249]:
#Identify categorical and continuous variables
ID_col = ['Patient Id']
target_col = ["Genetic Disorder", "Disorder Subclass"]
# idx = 0

for var in target_col:
    number = LabelEncoder()
    number.fit(df[var])
    df[var] = number.transform(df[var])
    # test[var] = number.fit_transform(test[var])
    # idx += 1

# Using One Hot Encoding

In [327]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

In [328]:
cat_cols = cat_cols[1:5]+cat_cols[10:]

In [335]:
for var in cat_cols:
  onehotencoder = OneHotEncoder(handle_unknown='ignore')
  combined = df[var].values.tolist()#+test[var].values.tolist()
  X = onehotencoder.fit(np.asarray(combined).reshape(-1,1))
  X = onehotencoder.transform(df[var].values.reshape(-1,1)).toarray()
  names = [var+'_'+str(int(i)) for i in range(len(df[var].unique()))]
  # names_t = [var+str(int(i)) for i in range(len(test[var].unique())-1)]
  
  print(names)
  # print(names_t)
  
  # if (len(df[var].unique()) == 2):
    # names = [var+'_'+str(int(i)) for i in range(len(df[var].unique())-1)]
  dfOneHot = pd.DataFrame(X, columns = names)
  df = pd.concat([df, dfOneHot], axis=1)
  if len(names) > 1:
    df= df.drop([var+'_'+str(int(0))], axis=1)  
  df= df.drop([var], axis=1)
  X = onehotencoder.transform(test[var].values.reshape(-1,1)).toarray()
  # names = [var+str(int(i)) for i in range(len(test[var].unique())-1)]
  # if (len(test[var].unique()) == 2):
    # names = [var+str(int(i)) for i in range(len(test[var].unique()))]
  testOneHot = pd.DataFrame(X, columns = names) 
  test = pd.concat([test, testOneHot], axis=1)
  if len(names) > 1:
    test = test.drop([var+'_'+str(int(0))], axis=1)
  test = test.drop([var], axis=1)

["Genes in mother's side_0", "Genes in mother's side_1"]
['Inherited from father_0', 'Inherited from father_1']
['Maternal gene_0', 'Maternal gene_1']
['Paternal gene_0', 'Paternal gene_1']
['Status_0', 'Status_1']
['Respiratory Rate (breaths/min)_0', 'Respiratory Rate (breaths/min)_1']
['Heart Rate (rates/min_0', 'Heart Rate (rates/min_1']
['Parental consent_0']
['Follow-up_0', 'Follow-up_1']
['Gender_0', 'Gender_1', 'Gender_2']
['Birth asphyxia_0', 'Birth asphyxia_1', 'Birth asphyxia_2', 'Birth asphyxia_3']
['Autopsy shows birth defect (if applicable)_0', 'Autopsy shows birth defect (if applicable)_1', 'Autopsy shows birth defect (if applicable)_2', 'Autopsy shows birth defect (if applicable)_3']
['Place of birth_0', 'Place of birth_1']
['Folic acid details (peri-conceptional)_0', 'Folic acid details (peri-conceptional)_1']
['H/O serious maternal illness_0', 'H/O serious maternal illness_1']
['H/O radiation exposure (x-ray)_0', 'H/O radiation exposure (x-ray)_1', 'H/O radiation expos

In [336]:
df.head(1)

Unnamed: 0,Patient Id,Patient Age,Blood cell count (mcL),Patient First Name,Family Name,Father's name,Mother's age,Father's age,Institute Name,Location of Institute,Test 1,Test 2,Test 3,Test 4,Test 5,No. of previous abortion,White Blood cell count (thousand per microliter),Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass,Genes in mother's side_1,Inherited from father_1,Maternal gene_1,Paternal gene_1,Status_1,Respiratory Rate (breaths/min)_1,Heart Rate (rates/min_1,Parental consent_0,Follow-up_1,Gender_1,Gender_2,Birth asphyxia_1,Birth asphyxia_2,Birth asphyxia_3,Autopsy shows birth defect (if applicable)_1,Autopsy shows birth defect (if applicable)_2,Autopsy shows birth defect (if applicable)_3,Place of birth_1,Folic acid details (peri-conceptional)_1,H/O serious maternal illness_1,H/O radiation exposure (x-ray)_1,H/O radiation exposure (x-ray)_2,H/O radiation exposure (x-ray)_3,H/O substance abuse_1,H/O substance abuse_2,H/O substance abuse_3,Assisted conception IVF/ART_1,History of anomalies in previous pregnancies_1,Birth defects_1,Blood test result_1,Blood test result_2,Blood test result_3
0,PID0x6418,2.0,4.760603,Richard,Smith,Larre,34.526454,41.972852,Boston Specialty & Rehabilitation Hospital,"55 FRUIT ST\nCENTRAL, MA 02114\n(42.3624748574...",0.0,0.0,0.0,1.0,0.0,2.003062,9.857562,1.0,1.0,1.0,1.0,1.0,0,5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [52]:
#Identify categorical and continuous variables
ID_col = ['Patient Id']
target_col = ["Genetic Disorder", "Disorder Subclass"]
# cat_cols = []

In [220]:
df

Unnamed: 0,Patient Id,Patient Age,Blood cell count (mcL),Patient First Name,Family Name,Father's name,Mother's age,Father's age,Institute Name,Location of Institute,Test 1,Test 2,Test 3,Test 4,Test 5,No. of previous abortion,White Blood cell count (thousand per microliter),Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass,Genes in mother's side_0,Genes in mother's side_1,Inherited from father_0,Inherited from father_1,Maternal gene_0,Maternal gene_1,Paternal gene_0,Paternal gene_1,Status_0,Status_1,Respiratory Rate (breaths/min)_0,Respiratory Rate (breaths/min)_1,Heart Rate (rates/min_0,Heart Rate (rates/min_1,Parental consent_0,Follow-up_0,Follow-up_1,Gender_0,Gender_1,Gender_2,Birth asphyxia_0,Birth asphyxia_1,Birth asphyxia_2,Birth asphyxia_3,Autopsy shows birth defect (if applicable)_0,Autopsy shows birth defect (if applicable)_1,Autopsy shows birth defect (if applicable)_2,Autopsy shows birth defect (if applicable)_3,Place of birth_0,Place of birth_1,Folic acid details (peri-conceptional)_0,Folic acid details (peri-conceptional)_1,H/O serious maternal illness_0,H/O serious maternal illness_1,H/O radiation exposure (x-ray)_0,H/O radiation exposure (x-ray)_1,H/O radiation exposure (x-ray)_2,H/O radiation exposure (x-ray)_3,H/O substance abuse_0,H/O substance abuse_1,H/O substance abuse_2,H/O substance abuse_3,Assisted conception IVF/ART_0,Assisted conception IVF/ART_1,History of anomalies in previous pregnancies_0,History of anomalies in previous pregnancies_1,Birth defects_0,Birth defects_1,Blood test result_0,Blood test result_1,Blood test result_2,Blood test result_3
0,PID0x6418,2.0,4.760603,Richard,Smith,Larre,34.526454,41.972852,Boston Specialty & Rehabilitation Hospital,"55 FRUIT ST\nCENTRAL, MA 02114\n(42.3624748574...",0.0,0.0,0.0,1.0,0.0,2.003062,9.857562,1.0,1.000000,1.0,1.0,1.000000,0,5,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,PID0x25d5,4.0,4.910669,Mike,Smith,Brycen,34.526454,23.000000,St. Margaret's Hospital For Women,"1515 COMMONWEALTH AV\nALLSTON/BRIGHTON, MA 021...",0.0,0.0,0.0,1.0,0.0,2.003062,5.522560,1.0,0.551886,1.0,1.0,0.000000,0,2,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,PID0x4a82,6.0,4.893297,Kimberly,Smith,Nashon,41.000000,22.000000,Not applicable,-,0.0,0.0,0.0,1.0,0.0,4.000000,7.486224,0.0,1.000000,1.0,1.0,1.000000,1,3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,PID0x4ac8,12.0,4.705280,Jeffery,Hoelscher,Aayaan,21.000000,41.972852,Not applicable,"55 FRUIT ST\nCENTRAL, MA 02114\n(42.3624748574...",0.0,0.0,0.0,1.0,0.0,1.000000,7.919321,0.0,0.000000,1.0,0.0,0.000000,0,6,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,PID0x1bf7,11.0,4.720703,Johanna,Stutzman,Suave,32.000000,41.972852,Carney Hospital,"300 LONGWOOD AV\nFENWAY/KENMORE, MA 02115\n(42...",0.0,0.0,0.0,1.0,0.0,4.000000,4.098210,0.0,0.000000,0.0,0.0,0.461917,1,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22078,PID0x5598,4.0,5.258298,Lynn,Smith,Alhassane,35.000000,64.000000,Franciscan Children's Hospital,"1153 CENTRE ST\nJAMAICA PLAIN, MA 02130\n(42.3...",0.0,0.0,0.0,1.0,0.0,3.000000,6.584811,0.0,0.000000,1.0,0.0,0.000000,0,6,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
22079,PID0x19cb,8.0,4.974220,Matthew,Farley,Dartanion,34.526454,56.000000,Faulkner Hospital,"170 MORTON ST\nROSLINDALE, MA 02130\n(42.30025...",0.0,0.0,0.0,1.0,0.0,2.000000,7.041556,1.0,1.000000,1.0,1.0,0.000000,1,3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
22080,PID0x3c4f,8.0,5.186470,John,Smith,Cavani,35.000000,51.000000,Not applicable,-,0.0,0.0,0.0,1.0,0.0,2.000000,7.715464,0.0,0.000000,0.0,1.0,0.461917,0,7,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
22081,PID0x13a,7.0,4.858543,Sharon,Smith,Bomer,19.000000,41.972852,Not applicable,-,0.0,0.0,0.0,1.0,0.0,1.000000,8.437670,1.0,1.000000,1.0,0.0,0.000000,0,6,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [338]:
df.head(1)

Unnamed: 0,Patient Id,Patient Age,Blood cell count (mcL),Patient First Name,Family Name,Father's name,Mother's age,Father's age,Institute Name,Location of Institute,Test 1,Test 2,Test 3,Test 4,Test 5,No. of previous abortion,White Blood cell count (thousand per microliter),Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass,Genes in mother's side_1,Inherited from father_1,Maternal gene_1,Paternal gene_1,Status_1,Respiratory Rate (breaths/min)_1,Heart Rate (rates/min_1,Parental consent_0,Follow-up_1,Gender_1,Gender_2,Birth asphyxia_1,Birth asphyxia_2,Birth asphyxia_3,Autopsy shows birth defect (if applicable)_1,Autopsy shows birth defect (if applicable)_2,Autopsy shows birth defect (if applicable)_3,Place of birth_1,Folic acid details (peri-conceptional)_1,H/O serious maternal illness_1,H/O radiation exposure (x-ray)_1,H/O radiation exposure (x-ray)_2,H/O radiation exposure (x-ray)_3,H/O substance abuse_1,H/O substance abuse_2,H/O substance abuse_3,Assisted conception IVF/ART_1,History of anomalies in previous pregnancies_1,Birth defects_1,Blood test result_1,Blood test result_2,Blood test result_3
0,PID0x6418,2.0,4.760603,Richard,Smith,Larre,34.526454,41.972852,Boston Specialty & Rehabilitation Hospital,"55 FRUIT ST\nCENTRAL, MA 02114\n(42.3624748574...",0.0,0.0,0.0,1.0,0.0,2.003062,9.857562,1.0,1.0,1.0,1.0,1.0,0,5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [339]:
feature_col = set([i for i in range(df.shape[1])]).difference(set([0, 3, 4, 5, 8, 9, 22, 23]))
feature_col = list(feature_col)

target_col = [22, 23]

test_feature = set([i for i in range(test.shape[1])]).difference(set([0, 3, 4, 5, 8, 9]))
test_feature = list(test_feature)

X_train = df.iloc[:, feature_col].values
y_train = df.iloc[:, target_col].values
X_test = test.iloc[:, test_feature].values

# Logistic Regression

In [340]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
model = MultiOutputRegressor(LogisticRegression(multi_class='ovr', max_iter=1000))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# KNN

In [341]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [342]:
y_pred_knn = y_pred_knn.astype(int)

# Decision Tree

In [343]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train.astype(int))
y_pred_dt = dt.predict(X_test)

In [344]:
y_pred_dt = y_pred_dt.astype(int)

# Inverse Tranform Prediction

In [None]:
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
numeric_columns_df = df.select_dtypes(include=['number']).columns
numeric_columns_df = numeric_columns_df.tolist()
cat_columns_df = df.select_dtypes(include=['object']).columns
cat_columns_df = cat_columns_df.tolist()
numeric_columns_test = test.select_dtypes(include=['number']).columns
numeric_columns_test = numeric_columns_test.tolist()
cat_columns_test = test.select_dtypes(include=['object']).columns
cat_columns_test = cat_columns_test.tolist()
modes_df = dict()
means_df = dict()
modes_test = dict()
means_test = dict()
for column in df[numeric_columns_df]:
    mode = df[column].mean()
    print(mode)
    modes[column] = mode
    # df[column] = df[column].fillna(mode)
    df[column] = df[column].replace(np.nan, mode)
for column in test[numeric_columns_test]:
    mode = test[column].mean()
    print(mode)
    modes[column] = mode
    # df[column] = df[column].fillna(mode)
    test[column] = test[column].replace(np.nan, mode)
for column in df[cat_columns_df]:
    mode = df[column].mode()[0]
    print(mode)
    means[column] = mode
    df[column] = df[column].replace(np.nan, mode)
for column in test[cat_columns_test]:
    mode = test[column].mode()[0]
    print(mode)
    means[column] = mode
    test[column] = test[column].replace(np.nan, mode)

In [None]:
y_pred_0 = [i[0] for i in y_pred]
y_pred_1 = [i[1] for i in y_pred]
#Identify categorical and continuous variables
ID_col = ['Patient Id']
target_col = ["Genetic Disorder", "Disorder Subclass"]
# idx = 0

# for var in target_col:
number = LabelEncoder()
number.fit(df[target_col[0]])
y_pred_0 = list(number.inverse_transform(y_pred_0))
number.fit(df[target_col[1]])
y_pred_1 = list(number.inverse_transform(y_pred_1))
    # pred = number.transform(df[var])
    # test[var] = number.fit_transform(test[var])
    # idx += 1
y_pred = np.asarray([np.asarray([y_pred_0[i], y_pred_1[i]]) for i in range(len(y_pred))])
y_pred

In [None]:
y_pred_0 = [i[0] for i in y_pred_knn]
y_pred_1 = [i[1] for i in y_pred_knn]
#Identify categorical and continuous variables
ID_col = ['Patient Id']
target_col = ["Genetic Disorder", "Disorder Subclass"]
# idx = 0

# for var in target_col:
number = LabelEncoder()
number.fit(df[target_col[0]])
y_pred_0 = list(number.inverse_transform(y_pred_0))
number.fit(df[target_col[1]])
y_pred_1 = list(number.inverse_transform(y_pred_1))
    # pred = number.transform(df[var])
    # test[var] = number.fit_transform(test[var])
    # idx += 1
y_pred_knn = np.asarray([np.asarray([y_pred_0[i], y_pred_1[i]]) for i in range(len(y_pred_knn))])
y_pred_knn

In [None]:
y_pred_0 = [i[0] for i in y_pred_dt]
y_pred_1 = [i[1] for i in y_pred_dt]
#Identify categorical and continuous variables
ID_col = ['Patient Id']
target_col = ["Genetic Disorder", "Disorder Subclass"]
# idx = 0

# for var in target_col:
number = LabelEncoder()
number.fit(df[target_col[0]])
y_pred_0 = list(number.inverse_transform(y_pred_0))
number.fit(df[target_col[1]])
y_pred_1 = list(number.inverse_transform(y_pred_1))
    # pred = number.transform(df[var])
    # test[var] = number.fit_transform(test[var])
    # idx += 1
y_pred_dt = np.asarray([np.asarray([y_pred_0[i], y_pred_1[i]]) for i in range(len(y_pred_dt))])
y_pred_dt

In [None]:
test[target_col] = y_pred
test = test[ID_col+target_col]
test.to_csv('out.csv', index=False) 
res  = pd.read_csv('out.csv')
res

In [None]:
test[target_col] = y_pred_knn
test = test[ID_col+target_col]
test.to_csv('out_knn.csv', index=False) 
res  = pd.read_csv('out_knn.csv')
res

In [351]:
test[target_col] = y_pred_dt
test = test[ID_col+target_col]
test.to_csv('out_dt.csv', index=False) 
res  = pd.read_csv('out_dt.csv')
res

Unnamed: 0,Patient Id,Genetic Disorder,Disorder Subclass
0,PID0x4175,Single-gene inheritance diseases,Cystic fibrosis
1,PID0x21f5,Single-gene inheritance diseases,Tay-Sachs
2,PID0x49b8,Single-gene inheritance diseases,Tay-Sachs
3,PID0x2d97,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x58da,Single-gene inheritance diseases,Cystic fibrosis
...,...,...,...
9460,PID0x81e1,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
9461,PID0x3514,Single-gene inheritance diseases,Cystic fibrosis
9462,PID0x5408,Single-gene inheritance diseases,Tay-Sachs
9463,PID0x2017,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
