# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 3. Train Model

# Setup Notebook

In [24]:
# Import 3rd party libraries
import os
import sys
import ast
import time
import json
import numpy as np
import pandas as pd

# Import local Libraries
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml import DATA_PATH
from kardioml.models.physionet2017.training.xgboost_model import Model
from kardioml.data.data_loader import load_challenge_data

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import Data
### Meta Data

In [25]:
# Read the meta data table from the 'physionet_2017' training folder
meta_data_path = os.path.join(DATA_PATH, 'training', 'physionet_2017', 'meta_data.csv')
meta_data = pd.read_csv(meta_data_path)

# Preview the first rows
meta_data.head()

Unnamed: 0,index,dataset,filename,age,sex,fs,length,labels_SNOMEDCT,labels,labels_full,labels_int,labels_train
0,0,A,A0001,74.0,male,500,7500,[59118001],['RBBB'],['right bundle branch block'],[18],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,A,A0002,49.0,female,500,5000,[426783006],['SNR'],['sinus rhythm'],[21],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,A,A0003,81.0,female,500,5000,[164889003],['AF'],['atrial fibrillation'],[1],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,A,A0004,45.0,male,500,5974,[164889003],['AF'],['atrial fibrillation'],[1],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,A,A0006,29.0,male,500,7000,[59118001],['RBBB'],['right bundle branch block'],[18],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Features

In [26]:
# Read the feature table from the 'physionet_2017' training folder
features_path = os.path.join(DATA_PATH, 'training', 'physionet_2017', 'features.csv')
features = pd.read_csv(features_path)

# Preview the first rows
features.head()

Unnamed: 0,dataset,filename,lead,full_waveform_min,full_waveform_max,full_waveform_mean,full_waveform_median,full_waveform_std,full_waveform_skew,full_waveform_kurtosis,...,rpeak_entropy,rpeak_higuchi_fractal_dimension,template_corr_coeff_mean,template_corr_coeff_std,qrs_corr_coeff_mean,qrs_corr_coeff_std,p_wave_corr_coeff_mean,p_wave_corr_coeff_std,t_wave_corr_coeff_mean,t_wave_corr_coeff_std
0,A,A0001,I,-0.976816,1.061664,0.000247,-0.001915,0.267624,0.035163,4.168482,...,3.317816,2.465088,0.987405,0.003816,0.995713,0.002541,0.866373,0.067781,0.942713,0.022492
1,A,A0002,I,-0.625051,1.044477,-0.000199,-0.005681,0.167347,2.358342,15.457311,...,3.091042,,0.979281,0.006906,0.990972,0.006404,0.886879,0.045675,0.688547,0.088916
2,A,A0003,I,-0.360254,1.081835,0.000477,-0.025866,0.196894,3.331854,12.586657,...,3.135494,2.286132,0.67216,0.178956,0.984642,0.007182,0.471329,0.307302,0.060507,0.327653
3,A,A0004,I,-0.595681,1.099412,0.000537,-0.015885,0.208745,2.669416,9.934822,...,2.877468,2.719416,0.849063,0.125237,0.977497,0.01182,0.376888,0.213236,0.121306,0.238374
4,A,A0006,I,-0.671019,1.047365,0.000157,-0.010219,0.150394,2.383495,18.528395,...,2.862201,,0.98607,0.006084,0.992082,0.005623,0.945174,0.030141,0.94174,0.030778


### Labels

In [27]:
# Read the label table from the 'physionet_2017' training folder
labels_path = os.path.join(DATA_PATH, 'training', 'physionet_2017', 'labels.csv')
labels = pd.read_csv(labels_path)

# Preview the first rows
labels.head()

Unnamed: 0,270492004,164889003,164890007,426627000,713427006,713426002,445118002,39732003,164909002,251146004,...,47665007,59118001,427393009,426177001,426783006,427084000,63593006,164934002,59931005,17338001
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Hyper-Parameter Tuning

In [None]:
# Bounds of the search space, one (lower, upper) pair per tuned parameter
param_bounds = dict(
    learning_rate=(0.01, 1.0),
    n_estimators=(500, 1500),
    max_depth=(2, 8),
    subsample=(0.5, 1.0),
    colsample_bytree=(0.5, 1.0),
    gamma=(0.001, 2.0),
    min_child_weight=(0, 10),
    max_delta_step=(0, 10),
)

# Number of search iterations
n_iter = 40

# Number of cross-validation folds
cv_folds = 4

# The 'labels' column stores each label list as text; parse it and keep the
# first entry so every row has a single value to stratify on
stratifier = meta_data['labels'].map(lambda raw: ast.literal_eval(raw)[0])

# Drop the dataset/filename/lead columns before handing the features to the model
model_features = features.drop(columns=['dataset', 'filename', 'lead'])

# Initialize model
model = Model(features=model_features, labels=labels, cv_folds=cv_folds, stratifier=stratifier)

# Run hyper-parameter search
model.tune_hyper_parameters(param_bounds=param_bounds, n_iter=n_iter)

# Save model
model.save()

|   iter    |  target   | colsam... |   gamma   | learni... | max_de... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------


# Test Inference

In [None]:
# Read one raw recording together with its header
record_path = os.path.join(DATA_PATH, 'raw', 'Training_WFDB', 'A0100.mat')
data, header_data = load_challenge_data(filename=record_path)

# Run the challenge prediction on that recording with the `model` object
model.challenge_prediction(data=data, header_data=header_data)