# Notebook to Implement Model Training 

---

### 1) Setup

In [44]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [21]:
SEED = 42

In [2]:
TRAIN_CLINICAL_FILENAME = "train_set_clinical.csv"
TEST_CLINICAL_FILENAME = "test_set_clinical.csv"

---

### 2) Read and Preprocess Data

In [14]:
train = pd.read_csv(TRAIN_CLINICAL_FILENAME, sep=";", index_col="ID")

In [15]:
train.shape

(132, 650)

In [16]:
train.head()

Unnamed: 0_level_0,Age (Y),Sex,Fever,Cough,Headache,Sore throat,Muscle or Body Aches,Fadigue,Congestion or runny nose,Shortness of breath or difficulty breathing,...,Freq.8943.76551923189,Freq.9058.85825530971,Freq.9098.58510797401,Freq.9437.74469644083,Freq.9593.90405666006,Freq.9799.842201746,Freq.10432.4853106264,Freq.11006.9514551194,Freq.11161.31855876,Group
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,53,F,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.7e-05,5e-06,2e-05,2.9e-05,3.7e-05,2.4e-05,1.6e-05,3e-05,3e-05,MILD
2,21,F,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,2.1e-05,1.3e-05,2e-06,4e-06,5e-06,2e-06,8e-06,0.000103,2e-06,MILD
5,62,F,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,9.9e-05,7.9e-05,1.3e-05,1.2e-05,1e-06,6.2e-05,3e-06,8.9e-05,2.8e-05,MILD
6,34,F,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,2.5e-05,2.1e-05,1.7e-05,1e-05,2.2e-05,0.000156,1.5e-05,5.3e-05,7e-06,SEVERE
8,42,M,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,9e-06,0.000153,6e-06,4.6e-05,3e-06,1.7e-05,8e-06,0.00011,7e-06,SEVERE


In [17]:
##### Preprocessing

train_mod = train.copy()
# Drop NaN Values 
train_mod.dropna(inplace=True)
# Convert Sex column to boolean (Female: 1, Male: 0)
train_mod["Sex"] = np.where(train_mod["Sex"]=="F", 1, 0)

In [19]:
train_mod.shape

(131, 650)

In [18]:
train_mod.head()

Unnamed: 0_level_0,Age (Y),Sex,Fever,Cough,Headache,Sore throat,Muscle or Body Aches,Fadigue,Congestion or runny nose,Shortness of breath or difficulty breathing,...,Freq.8943.76551923189,Freq.9058.85825530971,Freq.9098.58510797401,Freq.9437.74469644083,Freq.9593.90405666006,Freq.9799.842201746,Freq.10432.4853106264,Freq.11006.9514551194,Freq.11161.31855876,Group
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,53,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.7e-05,5e-06,2e-05,2.9e-05,3.7e-05,2.4e-05,1.6e-05,3e-05,3e-05,MILD
2,21,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,2.1e-05,1.3e-05,2e-06,4e-06,5e-06,2e-06,8e-06,0.000103,2e-06,MILD
5,62,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,9.9e-05,7.9e-05,1.3e-05,1.2e-05,1e-06,6.2e-05,3e-06,8.9e-05,2.8e-05,MILD
6,34,1,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,2.5e-05,2.1e-05,1.7e-05,1e-05,2.2e-05,0.000156,1.5e-05,5.3e-05,7e-06,SEVERE
8,42,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,9e-06,0.000153,6e-06,4.6e-05,3e-06,1.7e-05,8e-06,0.00011,7e-06,SEVERE


---

### 3) Baseline Model Training and CV

In [37]:
# Define Classifier (or pipeline)
clf = RandomForestClassifier(random_state=SEED)

In [38]:
# Get Features and Target
X, y = train_mod.drop("Group", axis=1), train_mod["Group"]

In [39]:
# Defining RepeatedKFold Cross Validator
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)

In [40]:
# Define metric scorer
metric_scorer = make_scorer(balanced_accuracy_score)

In [41]:
# Cross validate model
scores = cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)

In [42]:
# Score from each CV Iteration
scores

array([0.6       , 0.7518797 , 0.76875   , 0.66339869, 0.77777778,
       0.61813187, 0.72222222, 0.74836601, 0.62727273, 0.69444444,
       0.61111111, 0.76923077, 0.54761905, 0.73308271, 0.6372549 ,
       0.75      , 0.65384615, 0.84722222, 0.67261905, 0.78571429,
       0.7       , 0.69230769, 0.7593985 , 0.66666667, 0.61111111,
       0.75      , 0.63636364, 0.55625   , 0.77272727, 0.65972222,
       0.74117647, 0.52857143, 0.81875   , 0.64285714, 0.72727273,
       0.75      , 0.55357143, 0.75      , 0.61111111, 0.65413534,
       0.72222222, 0.89166667, 0.67857143, 0.71875   , 0.64848485,
       0.58223684, 0.70606061, 0.70833333, 0.75238095, 0.64285714])

In [43]:
# Mean Metric Value
np.mean(scores)

0.692270009574189

---

### 4) Experiments

##### 4.1) Hyper Parameter Optimization with GridSearchCV

In [46]:
# Define hyper parameters search space
parameters = {
    "n_estimators": [100, 500, 1000],
    "max_depth": [5, 10, 20]
}

In [50]:
# Define new model (or pipeline) and Grid Search (could be Random Search or Optune as well) Object for HP Optimization
clf = RandomForestClassifier(random_state=SEED)
search = GridSearchCV(clf, parameters, scoring=metric_scorer, cv=rkf, verbose=3)

In [None]:
search.fit(X, y)

In [53]:
# Best Score
search.best_score_

0.6998807091171644

In [52]:
# Best Params
search.best_params_

{'max_depth': 5, 'n_estimators': 100}