# Machine Learning Module

The objective of this module is to proceed to a Machine Learning experiment.

In order to run this notebook, you must have executed the *Extraction* and *Input* notebooks first.

The following CSV files are required for this notebook:

- In the *csv/original_data* folder:
    - *patients.csv*
- In the *csv/static* folder (generated by the Input notebook):
    - *holdout_time_point_1.csv*
    - *holdout_time_point_2.csv*
    - *holdout_time_point_3.csv*
    - *holdout_time_point_4.csv*
    - *pred_holdout.csv*
    - *pred_train.csv*
    - *train_time_point_1.csv*
    - *train_time_point_2.csv*
    - *train_time_point_3.csv*
    - *train_time_point_4.csv*

In [1]:
# Imports
import os

import pandas as pd
from pycaret.classification import ClassificationExperiment
from sklearn.model_selection import StratifiedGroupKFold

os.chdir('../src')
import machine_learning

## Get data (from original and static CSV files)

In [2]:
# Set the working directory
os.chdir('../csv')

In [3]:
# Read train CSV data from static
df_train_time_point_1 = pd.read_csv('static/train_time_point_1.csv')
df_train_time_point_2 = pd.read_csv('static/train_time_point_2.csv')
#df_train_time_point_3 = pd.read_csv('static/train_time_point_3.csv')
df_train_time_point_4 = pd.read_csv('static/train_time_point_4.csv')
df_target_train = pd.read_csv('static/pred_train.csv')

## Prepare experiment data

In [4]:
# Create experimental dataframe
df_experimental = pd.merge(df_train_time_point_1, df_train_time_point_2, how='outer', on=['PatientID', 'Date'])
#df_experimental = df_experimental.merge(df_train_time_point_3, how='outer', on=['PatientID', 'Date'])
df_experimental = df_experimental.merge(df_train_time_point_4, how='outer', on=['PatientID', 'Date'])
machine_learning.set_target_column_in_df(df_experimental, df_target_train, 'PatientID', 'subject_id', 'dod')

## Set PyCaret experiment

In [5]:
# Setup
exp = ClassificationExperiment()
exp.setup(df_experimental, target='target', train_size=0.8, fold_strategy=StratifiedGroupKFold(), fold_groups='PatientID', fold=5)

Unnamed: 0,Description,Value
0,Session id,1298
1,Target,target
2,Target type,Binary
3,Original data shape,"(13506, 1117)"
4,Transformed data shape,"(13506, 1117)"
5,Transformed train set shape,"(10804, 1117)"
6,Transformed test set shape,"(2702, 1117)"
7,Numeric features,1115
8,Categorical features,1
9,Rows with missing values,100.0%


<pycaret.classification.oop.ClassificationExperiment at 0x7f4c4965d460>

In [6]:
# Create model
model = exp.create_model('xgboost')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7284,0.5143,0.0,0.0,0.0,0.0,0.0
1,0.5704,0.4815,0.3282,0.2641,0.2927,-0.0107,-0.0108
2,0.7294,0.7122,0.5714,0.5022,0.5346,0.345,0.3464
3,0.446,0.4137,0.3168,0.1884,0.2363,-0.1558,-0.1688
4,0.7262,0.5471,0.0,0.0,0.0,-0.0037,-0.0263
Mean,0.6401,0.5337,0.2433,0.1909,0.2127,0.035,0.0281
Std,0.1146,0.0996,0.2185,0.1872,0.2005,0.1657,0.1705


In [7]:
# Tune model
tuning_grid = {'max_depth': [5, 6, 7, 8],
               'n_estimators': [200, 300],
               'learning_rate': [0.3, 0.1, 0.05]
               }
model = exp.tune_model(model, custom_grid=tuning_grid, optimize='Accuracy')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7284,0.5305,0.0,0.0,0.0,0.0,0.0
1,0.6009,0.5152,0.4444,0.3262,0.3763,0.0929,0.0953
2,0.7234,0.6971,0.335,0.4876,0.3972,0.2256,0.2323
3,0.4641,0.4415,0.4572,0.2412,0.3158,-0.0594,-0.0677
4,0.7262,0.566,0.0,0.0,0.0,-0.0037,-0.0263
Mean,0.6486,0.5501,0.2473,0.211,0.2178,0.0511,0.0467
Std,0.1042,0.084,0.2064,0.1896,0.1799,0.1,0.1072


Fitting 5 folds for each of 10 candidates, totalling 50 fits


## Predict Holdout set values

In [8]:
# Read holdout CSV data from static
df_holdout_time_point_1 = pd.read_csv('static/holdout_time_point_1.csv')
df_holdout_time_point_2 = pd.read_csv('static/holdout_time_point_2.csv')
#df_holdout_time_point_3 = pd.read_csv('static/holdout_time_point_3.csv')
df_holdout_time_point_4 = pd.read_csv('static/holdout_time_point_4.csv')
df_target_holdout = pd.read_csv('static/pred_holdout.csv')

In [9]:
# Create holdout dataframe
df_holdout = pd.merge(df_holdout_time_point_1, df_holdout_time_point_2, how='outer', on=['PatientID', 'Date'])
#df_holdout = df_holdout.merge(df_holdout_time_point_3, how='outer', on=['PatientID', 'Date'])
df_holdout = df_holdout.merge(df_holdout_time_point_4, how='outer', on=['PatientID', 'Date'])
machine_learning.set_target_column_in_df(df_holdout, df_target_holdout, 'PatientID', 'subject_id', 'dod')

In [12]:
# Prediction
# Hidden cell to preserve data confidentiality
# exp.predict_model(model, data=df_holdout, raw_score=True)