In [None]:
!pip install -q PyAthena

## Prepare Datasets for Predictor Training, Validation and Testing

#### TS

Import the modules that build the patient cohort and extract demographic and lab-event data:

In [1]:
from dataproc.cohort import query_esbl_pts, remove_dups, observation_window
from dataproc.sampling import generate_samples
from dataproc.roc_auc_curves import plt_roc_auc_curve, plt_precision_recall_curve
from dataproc.sampling import stratify_set
from dataproc.create_dataset import dataset_creation
from hyper_params import HyperParams

import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

In [2]:
# load hyperparams instance
# (later cells read params.observation_window_hours from this object)
params = HyperParams()

Patients cohort:

In [8]:
# Select esbl microbiology test
esbl_admits = query_esbl_pts()
# Remove dups
esbl_admits = remove_dups(esbl_admits)
# Create observation window
esbl_admits_window = observation_window(esbl_admits, window_size=params.observation_window_hours)
# Subset columns
pts_labels = esbl_admits_window[['hadm_id', 'index_date','RESISTANT_YN']]
# NOTE(review): this writes data/patient_labels.pkl, but the loading cell
# further down reads data/patient_labels_multiple.pkl -- confirm which file
# is the intended hand-off between the two cells.
pts_labels.to_pickle('data/patient_labels.pkl')
# Class balance of the label column (shown as the cell output)
pts_labels['RESISTANT_YN'].value_counts()

0    3894
1     742
Name: RESISTANT_YN, dtype: int64

Import cohort/labels data from the .pkl file:

In [3]:
# NOTE(review): the cohort cell above saves data/patient_labels.pkl, whereas
# this cell loads data/patient_labels_multiple.pkl -- confirm the file name.
# pd.read_pickle must only be used on trusted files (it can execute code).
pts_labels = pd.read_pickle('data/patient_labels_multiple.pkl')
print(pts_labels.shape)
pts_labels.head()

(4636, 3)


Unnamed: 0,hadm_id,index_date,RESISTANT_YN
14564,101757,2132-12-31 16:30:00,0
14608,186474,2155-02-25 18:45:00,1
14612,194730,2170-12-22 06:12:00,0
14625,112086,2147-04-05 14:00:00,1
14634,158569,2142-04-01 18:34:00,1


Patient feature data:

In [None]:
# Build the feature table for every admission in the cohort, attach the
# resistance label by admission id, and cache the result on disk.
label_columns = pts_labels[['hadm_id','RESISTANT_YN']]
features = (
    dataset_creation(pts_labels['hadm_id'], params.observation_window_hours)
    .merge(label_columns, on='hadm_id')
)
features.to_pickle('data/features.pkl')

Import features data from the .pkl file:

In [None]:
# Reload the cached feature table and list its column labels.
features = pd.read_pickle('data/features.pkl')
feature_columns = list(features.columns)
print(feature_columns)

In [None]:
# Lab (LOINC) columns = everything left after removing the identifier columns
# and the trailing eight columns.
# NOTE(review): the [:-8] slice is positional -- presumably it skips the
# remaining demographic columns plus RESISTANT_YN; confirm against the
# column order printed above.
non_lab_columns = ['hadm_id', 'subject_id', 'admittime','admission_type']
remaining_columns = features.drop(columns=non_lab_columns).columns
loinc_codes = list(remaining_columns)[:-8]
print(list(loinc_codes))

In [None]:
# Descriptive statistics of the lab columns as stored in the feature table.
features_summary = features[loinc_codes].describe()

In [None]:
# Embedding the features
import re

from dataproc.embeddings import loinc_values

# Free-text results that carry no usable value; rows whose value is exactly
# one of these strings are discarded.
UNUSABLE_LAB_VALUES = [
    'UNABLE TO ANALYZE',
    'MOLYSIS FALSELY DECREASES THIS RESULT',
    'COMPUTER NETWORK FAILURE. TEST NOT RESULTED.',
    'UNABLE TO DETERMINE',
    ':UNABLE TO DETERMINE',
    'UNABLE TO QUANTITATE',
    'UNABLE TO REPORT',
]

# One or more leading comparators: '<', '>', 'LESS THAN', 'GREATER THAN'.
_COMPARATOR_PREFIX = re.compile(r'^(?:\s*(?:[<>]|LESS THAN|GREATER THAN))+\s*')
# Trailing unit annotation.
_UNIT_SUFFIX = re.compile(r'\s*NG/ML$')


def clean_lab_value(raw):
    """
    Normalise one raw lab result string.

    Removes leading comparators ('<', '>', 'LESS THAN', 'GREATER THAN'),
    a trailing ' NG/ML' unit and replaces decimal commas with dots.

    The previous implementation used str.lstrip / str.rstrip with these
    phrases, but those methods strip any run of the given *characters*
    rather than the phrase itself, so unrelated values were mangled
    (e.g. 'NEG' -> 'G', 'NONE' -> 'ONE'). Anchored regular expressions
    remove only the intended prefix / suffix.
    """
    text = raw.strip()
    text = _COMPARATOR_PREFIX.sub('', text)
    text = _UNIT_SUFFIX.sub('', text)
    return text.replace(',', '.')


loinc_vals = loinc_values(loinc_codes)
# Re-assign instead of mutating in place so re-running the cell is safe.
loinc_vals = loinc_vals.dropna(subset=['value'])
loinc_vals = loinc_vals.astype({'value': 'string', 'loinc_code': 'category'})
loinc_vals['value'] = loinc_vals['value'].map(clean_lab_value)
# A boolean mask removes exactly the offending rows; dropping by index label
# would also remove any other row that happens to share the same label.
loinc_vals = loinc_vals.loc[~loinc_vals['value'].isin(UNUSABLE_LAB_VALUES)]

In [None]:
# Sort every lab code into one of three buckets based on how many of its
# recorded values fail numeric parsing:
#   numeric      - fewer than 5% of values are non-numeric
#   categorical  - more than 5% non-numeric and fewer than 100 distinct values
#   weird        - everything else (mixed-type tests)
numeric, categorical, weird = [], [], []
for code in loinc_codes:
    code_values = loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value']
    size = len(code_values)
    size_unique = len(code_values.unique())
    sum_na = pd.to_numeric(code_values, errors='coerce').isna().sum()
    na_fraction = sum_na / size
    if na_fraction < 0.05:
        bucket = numeric
    elif na_fraction > 0.05 and size_unique < 100:
        bucket = categorical
    else:
        bucket = weird
    bucket.append(code)

In [None]:
# Lab codes excluded by hand. Filtering (instead of list.remove) keeps the
# cell idempotent: re-running it, or running it when a code was never
# classified into the list, no longer raises ValueError.
EXCLUDED_NUMERIC = {
    '26498-6',   # lab column that contains only 'inf' and 'Nan'
}
EXCLUDED_CATEGORICAL = {
    '33914-3',   # lab column that contains phrase 'See comments'
    '13362-9',   # lab column that contains phrase 'Random'
}
numeric = [code for code in numeric if code not in EXCLUDED_NUMERIC]
categorical = [code for code in categorical if code not in EXCLUDED_CATEGORICAL]

In [None]:
# Report how many lab codes ended up in each group.
group_sizes = (
    ('All:', loinc_codes),
    ('Numeric: ', numeric),
    ('Categorical: ', categorical),
    ('Weird:', weird),
)
for group_label, group_codes in group_sizes:
    print(group_label, len(group_codes))

Summary statistics for numeric lab codes:

In [None]:
# One describe() summary per numeric lab code, computed from the cleaned raw
# values; the summaries are placed side by side with the code as column label.
numeric_stats = [
    pd.to_numeric(
        loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value'],
        errors='coerce',
    ).describe()
    for code in numeric
]
numeric_stats_df = pd.concat(numeric_stats, axis=1, keys=numeric)

In [None]:
# Display the per-code summary statistics (rows: statistic, columns: lab code).
numeric_stats_df

In [None]:
# for now, ignoring the mixed type lab tests
# errors='ignore' skips any code in `weird` that is not a column of `features`.
dataset = features.drop(columns=weird, errors='ignore')

In [None]:
# Sanity check: (rows, columns) after dropping the mixed-type lab columns.
dataset.shape

### Data Preprocessing  and Embeddings:

In [None]:
# List the columns that remain available for preprocessing.
print(list(dataset.columns))

#### Clean lab numeric variables:

In [None]:
# Convert to numeric selected columns; values that cannot be parsed become NaN.
# pd.to_numeric is applied column by column (the default axis). The previous
# axis=1 call invoked it once per row, which is far slower and gives each
# column no chance to get its own numeric dtype.
dataset[numeric] = dataset[numeric].apply(pd.to_numeric, errors='coerce')

Since many lab measurements contain outliers, the median and interquartile range are used to standardize the numeric variables:   
- value = (value – median) / (p75 – p25)

In [None]:
def stanardize_numeric_values(df, list_of_clms, ref_df):
    """
    Robust-scale numeric columns with the median and interquartile range:

        value = (value - median) / (p75 - p25)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are rescaled. It is modified in place and also
        returned.
    list_of_clms : iterable
        Labels of the columns to rescale; each must exist in both `df` and
        `ref_df`.
    ref_df : pandas.DataFrame
        Reference statistics with one column per label and the rows
        '25%', '50%' and '75%' (the layout produced by `describe()`).

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the selected columns rescaled.

    Notes
    -----
    When the interquartile range of a column is zero or undefined (constant
    or empty reference data) the column is only centred on the median.
    Dividing by such a range would fill the column with inf/NaN values.
    """
    for code in list_of_clms:
        stats = ref_df[code]
        median = stats['50%']
        iqr = stats['75%'] - stats['25%']
        # `not iqr > 0` is True for both 0 and NaN.
        if not iqr > 0:
            iqr = 1.0
        df[code] = (df[code] - median) / iqr
    return df
    

In [None]:
# Rescale the numeric lab columns using the median / quartiles stored in
# numeric_stats_df. NOTE: re-running this cell rescales the columns again.
dataset = stanardize_numeric_values(dataset, numeric, numeric_stats_df)

Imputation of missing values using scikit-learn https://scikit-learn.org/stable/modules/impute.html#impute

In [None]:
from sklearn.impute import SimpleImputer

def replace_missing_val(df, list_of_clms, how='median'):
    """
    Impute missing values in the selected columns with SimpleImputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    list_of_clms : list
        Labels of the columns to impute.
    how : str, default 'median'
        Passed to SimpleImputer as `strategy`.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only `list_of_clms`, with missing values filled
        and the same row index as `df`.

    Notes
    -----
    Per the scikit-learn documentation, SimpleImputer discards columns that
    contain no observed value at all. In that case the column count no longer
    matches `list_of_clms` and the DataFrame construction below raises.
    """
    imp = SimpleImputer(strategy=how)
    imputed = imp.fit_transform(df[list_of_clms])
    # Keep the caller's row index so that index-aligned operations such as
    # pd.concat(axis=1) pair each imputed row with the right source row.
    return pd.DataFrame(imputed, columns=list_of_clms, index=df.index)


In [None]:
# Median-impute the (already rescaled) numeric lab columns.
numlabvars_df = replace_missing_val(dataset, numeric, how='median')

In [None]:
# Sanity check: (rows, numeric lab columns).
numlabvars_df.shape

#### Clean lab categorical variables:

In [None]:
def _recode_lab_column(column, rules):
    """
    Replace raw values of dataset[column] according to `rules`.

    `rules` is an ordered list of (raw_values, replacement) pairs. Every
    condition is evaluated against the column as it was before recoding, and
    an earlier rule takes precedence over a later one; values matching no
    rule are kept unchanged.
    """
    source = dataset[column]
    recoded = source
    # Apply the lowest-priority rule first so that higher-priority rules
    # overwrite it, mirroring a nested np.where chain.
    for raw_values, replacement in reversed(rules):
        recoded = np.where(source.isin(raw_values), replacement, recoded)
    dataset[column] = recoded


# Harmonise spelling variants and censored values of the categorical labs.
LAB_RECODING_RULES = {
    '30089-7': [
        (['<1','1','2'], '0-2'),
        (['3','4'], '3-5'),
    ],
    '5767-9': [
        (['CLEAR'], 'Clear'),
        (['SLHAZY'], 'SlHazy'),
        (['HAZY'], 'Hazy'),
        (['SlCloudy'], 'SlCldy'),
        (['CLOUDY'], 'Cloudy'),
    ],
    '5769-5': [
        (['0'], 'NEG'),
        (['NOTDONE'], 'NONE'),
        (['LRG'], 'MANY'),
    ],
    '5778-6': [
        (['YELLOW','YEL'], 'Yellow'),
        (['STRAW'], 'Straw'),
        (['AMBER','AMB'], 'Amber'),
        (['RED'], 'Red'),
        (['ORANGE'], 'Orange'),
        (['DKAMB','DKAMBER'], 'DkAmb'),
        ([' '], np.nan),
    ],
    '5797-6': [
        (['>80'], '80'),
    ],
    '5804-0': [
        (['>300'], '300'),
        ([' '], np.nan),
    ],
    '5818-0': [
        (['.2'], '0.2'),
        (['>8','>8.0'], '8'),
        (['>12'], '12'),
        (['NotDone'], np.nan),
    ],
    '5822-2': [
        (['0', 'N'], 'NONE'),
        (['NOTDONE'], np.nan),
    ],
    '778-1': [
        (['UNABLE TO ESTIMATE DUE TO PLATELET CLUMPS'], 'NOTDETECTED'),
    ],
}

for lab_code, lab_rules in LAB_RECODING_RULES.items():
    _recode_lab_column(lab_code, lab_rules)


In [None]:
# Show the frequency of every level of each categorical lab variable.
SECTION_RULE = '----------------------------------'
for lab_column in categorical:
    print(SECTION_RULE)
    print('Column name: ', lab_column)
    print(dataset[lab_column].value_counts())

In [None]:
# replace 'Nan' values in categorical variables by 'UNKNOWN'
# DataFrame.update writes the filled values back into `dataset` in place.
dataset.update(dataset[categorical].fillna('UNKNOWN'))

In [None]:
# Preview the cleaned categorical lab columns.
dataset[categorical].head()

Use a one-hot encoder for the categorical lab features:

In [None]:
# Fit an encoder only to inspect the levels found in the first four columns;
# the encoded frame itself is built with pd.get_dummies in a later cell.
enc = OneHotEncoder()
enc.fit(dataset[categorical])
enc.categories_[0:4]

In [None]:
#onehotlabvars = enc.transform(dataset[categorical]).toarray()

In [None]:
# One indicator column per (lab column, level) pair.
onehotlabvars_df = pd.get_dummies(dataset[categorical])
print(onehotlabvars_df.columns)

To reduce the correlation among variables, remove one feature column from the one-hot encoded array:

In [None]:
# Use the 'UNKNOWN' level of every lab variable as the reference category:
# drop each indicator column whose name contains '_UNKNOWN'.
unknown_indicator_columns = onehotlabvars_df.filter(regex='_UNKNOWN').columns
col_list = list(unknown_indicator_columns)
onehotlabvars_df = onehotlabvars_df.drop(columns=col_list)

In [None]:
# Sanity check: (rows, indicator columns) after removing the reference levels.
onehotlabvars_df.shape

#### Clean demographic static variables:

In [None]:
# Demographic / admission columns that are cleaned and one-hot encoded below.
staticvars = ['admission_type', 'admission_location', 'insurance', 'language', 
               'religion', 'marital_status', 'ethnicity', 'gender']

In [None]:
# Each column is recoded from a snapshot of its original values. Rules are
# applied from lowest to highest priority, so a later np.where overrides an
# earlier one; values that match no rule are kept unchanged.

# admission_location: merge the transfer sources; treat missing info as ER admit
admission_location = dataset['admission_location']
transfer_sources = ['TRANSFER FROM SKILLED NUR','TRANSFER FROM OTHER HEALT',
                    'TRANSFER FROM HOSP/EXTRAM']
location_recoded = np.where(admission_location.isin(transfer_sources),
                            'TRANSFER FROM MED FACILITY', admission_location)
location_recoded = np.where(admission_location.isin(['** INFO NOT AVAILABLE **']),
                            'EMERGENCY ROOM ADMIT', location_recoded)
dataset['admission_location'] = location_recoded

# language: keep English and Spanish, everything else becomes OTHER
language = dataset['language']
dataset['language'] = np.where(language.isin(['ENGL','SPAN']), language, 'OTHER')

# religion: keep the listed groups (UNOBTAINABLE folded into NOT SPECIFIED),
# everything else becomes OTHER
religion = dataset['religion']
retained_religions = ['CATHOLIC','NOT SPECIFIED','UNOBTAINABLE','PROTESTANT QUAKER','JEWISH']
religion_recoded = np.where(religion.isin(['UNOBTAINABLE']), 'NOT SPECIFIED', religion)
dataset['religion'] = np.where(religion.isin(retained_religions), religion_recoded, 'OTHER')

# ethnicity: collapse detailed sub-groups into their parent group
ethnicity = dataset['ethnicity']
asian_groups = ['ASIAN - CHINESE',
                'ASIAN - ASIAN INDIAN',
                'ASIAN - VIETNAMESE',
                'ASIAN - OTHER',
                'ASIAN - FILIPINO',
                'ASIAN - CAMBODIAN']
white_groups = ['WHITE - RUSSIAN',
                'WHITE - BRAZILIAN',
                'WHITE - OTHER EUROPEAN']
black_groups = ['BLACK/CAPE VERDEAN',
                'BLACK/HAITIAN',
                'BLACK/AFRICAN']
hispanic_groups = ['HISPANIC/LATINO - PUERTO RICAN',
                   'HISPANIC/LATINO - DOMINICAN',
                   'HISPANIC/LATINO - SALVADORAN',
                   'HISPANIC/LATINO - CUBAN',
                   'HISPANIC/LATINO - MEXICAN']
other_groups = ['MULTI RACE ETHNICITY',
                'MIDDLE EASTERN',
                'PORTUGUESE',
                'AMERICAN INDIAN/ALASKA NATIVE',
                'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
                'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE']
unknown_groups = ['UNABLE TO OBTAIN',
                  'PATIENT DECLINED TO ANSWER']

ethnicity_recoded = np.where(ethnicity.isin(unknown_groups), 'UNKNOWN/NOT SPECIFIED', ethnicity)
ethnicity_recoded = np.where(ethnicity.isin(other_groups), 'OTHER', ethnicity_recoded)
ethnicity_recoded = np.where(ethnicity.isin(hispanic_groups), 'HISPANIC OR LATINO', ethnicity_recoded)
ethnicity_recoded = np.where(ethnicity.isin(black_groups), 'BLACK/AFRICAN AMERICAN', ethnicity_recoded)
ethnicity_recoded = np.where(ethnicity.isin(white_groups), 'WHITE', ethnicity_recoded)
ethnicity_recoded = np.where(ethnicity.isin(asian_groups), 'ASIAN', ethnicity_recoded)
dataset['ethnicity'] = ethnicity_recoded

In [None]:
# Show the frequency of every level of each demographic variable.
SECTION_RULE = '----------------------------------'
for static_column in staticvars:
    print(SECTION_RULE)
    print('Column name: ', static_column)
    print(dataset[static_column].value_counts())

#### Use one hot encoder for demographic features:

In [None]:
# Fit an encoder only to inspect the levels of each demographic column;
# the encoded frame itself is built with pd.get_dummies in a later cell.
enc = OneHotEncoder()
enc.fit(dataset[staticvars])
enc.categories_

In [None]:
#onehotstaticvars = enc.transform(dataset[staticvars]).toarray()

In [None]:
# One indicator column per (demographic column, level) pair.
onehotstaticvars_df = pd.get_dummies(dataset[staticvars])
print(onehotstaticvars_df.columns)

To reduce the correlation among variables, remove one feature column from the one-hot encoded array:

In [None]:
# Reference level of each demographic variable; its indicator column is
# dropped so the remaining indicators are not perfectly collinear.
reference_levels = {
    'admission_type': 'URGENT',
    'admission_location': 'TRANSFER FROM MED FACILITY',
    'insurance': 'Self Pay',
    'language': 'OTHER',
    'religion': 'NOT SPECIFIED',
    'marital_status': 'UNKNOWN (DEFAULT)',
    'ethnicity': 'UNKNOWN/NOT SPECIFIED',
    'gender': 'M',
}
col_list = [f'{variable}_{level}' for variable, level in reference_levels.items()]
onehotstaticvars_df = onehotstaticvars_df.drop(columns=col_list)

In [None]:
# Sanity check: (rows, indicator columns) after removing the reference levels.
onehotstaticvars_df.shape

#### Combine all features and construct the full dataset

In [None]:
# response variable
#response = np.array([dataset['RESISTANT_YN']])
#response = response.T
#response.shape

In [None]:
# the last variable is a target variable 
#fulldata = np.concatenate((numlabvars_df, onehotlabvars_df, onehotstaticvars_df, response), axis=1)
#fulldata.shape

In [None]:
# The three feature blocks must have the same number of rows before they are
# placed side by side.
print(numlabvars_df.shape, onehotlabvars_df.shape, onehotstaticvars_df.shape)

In [None]:
# Place the feature blocks and the target side by side; the target column
# RESISTANT_YN comes last.
# NOTE(review): pd.concat(axis=1) aligns rows by index label, so all four
# pieces must share the same row index -- confirm that the row count of
# fulldata equals the row count of dataset.
fulldata = pd.concat([numlabvars_df, onehotlabvars_df, onehotstaticvars_df, dataset['RESISTANT_YN']], axis=1)
fulldata.shape

In [None]:
# Save to a file
#np.save('data/fulldata.npy', fulldata)
# The row index is not written, so it is rebuilt as 0..n-1 when reloading.
fulldata.to_csv('data/fulldata.csv', sep=',', index=False)

In [None]:
# Load data
# Reading the CSV back lets the modelling section start from here without
# re-running the preprocessing cells above.
#fulldata = np.load('data/fulldata.npy')
fulldata = pd.read_csv('data/fulldata.csv')
#fulldata = pd.read_csv('data/fulldata_multiple.csv')
fulldata.head()

### Machine Learning Model Development

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
# Seed shared by the train/test split and the random-forest models below.
RANDOM_STATE = 42

# Scorers used for model selection: class-frequency-weighted F1 and precision.
score_f1 = make_scorer(f1_score, average='weighted')
score_pr = make_scorer(precision_score, average='weighted')

In [None]:
# Separate the target from the predictors and hold out 30% of the rows,
# keeping the class proportions equal in both parts.
target_column = 'RESISTANT_YN'
y = fulldata[target_column]
X = fulldata.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_STATE,
)
# summarize class distribution (full data first, then the training part)
for label_series in (y, y_train):
    print(Counter(label_series))

In [None]:
# Oversample minority class (training rows only, so the test set stays untouched).
# random_state makes the resampling reproducible across kernel restarts;
# without it every run draws a different oversampled training set.
oversample = RandomOverSampler(sampling_strategy = 'minority', random_state=RANDOM_STATE)
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)
print(Counter(y_over))

In [None]:
# Undersample majority class until the minority/majority ratio reaches 0.5.
# random_state makes the resampling reproducible across kernel restarts.
undersample = RandomUnderSampler(sampling_strategy = 0.5, random_state=RANDOM_STATE)
# fit and apply the transform
X_under, y_under = undersample.fit_resample(X_train, y_train)
print(Counter(y_under))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Class counts before and after oversampling the training set.
original_cnt = Counter(y_train)
oversmpl_cnt = Counter(y_over)

# Long-format table: one row per (training set, class) pair.
d = {'training set': ['original','original','oversampled','oversampled'],
     'class': [0,1,0,1], 
     'count': [original_cnt[0] , original_cnt[1], oversmpl_cnt[0], oversmpl_cnt[1]]}
df = pd.DataFrame(data=d)
 
# Grouped bar plot: class counts per training set
sns.barplot(x = 'training set', y = 'count', hue='class', data = df)

 
# Show the plot
plt.title('Original vs. Oversampled Training Set')
# NOTE(review): the y-axis limit is hard-coded; raise it if a class exceeds 4000 rows.
plt.ylim([0, 4000])
plt.show()

Learning curve plot:

In [None]:
# Learning curve as function of sample size
# NOTE(review): X_over was oversampled before cross-validation, so copies of
# the same minority row can presumably land in both the training and the
# validation fold, which would inflate the validation scores -- confirm, or
# resample inside each fold.
pipe_forest = make_pipeline(RandomForestClassifier(random_state=RANDOM_STATE, 
                               class_weight='balanced_subsample',
                               n_estimators=150,
                               max_depth=20,
                               max_leaf_nodes=70,
                               max_features=40,
                               max_samples=0.9,
                               min_samples_leaf=2,
                               min_samples_split=10))

# Five training-set sizes from 10% to 100%, each scored with 3-fold CV.
train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_forest,
                              X=X_over,
                              y=y_over,
                              train_sizes = np.linspace(0.1, 1.0, 5),
                              scoring=score_f1,   
                              cv=3)
# Mean and spread over the CV folds (axis 1) for every training-set size.
train_mean= np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean= np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Shaded bands show +/- one standard deviation; the 'test' curve is the
# cross-validation score, not the held-out test set.
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training')
plt.fill_between(train_sizes, train_mean+train_std, train_mean-train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='orange', linestyle='--', marker='s', markersize=5, label='test')
plt.fill_between(train_sizes, test_mean+test_std, test_mean-test_std, alpha=0.15, color='orange')
plt.grid()
plt.xlabel('Sample size')
plt.ylabel('F1-score')
plt.title('Learning Curve')
plt.legend(loc='upper right')
plt.show()

Validation curve:

In [None]:
# Change one of the hyperparameters and track the weighted F1 score
pipe_forest = make_pipeline(RandomForestClassifier(random_state=RANDOM_STATE, 
                                                   class_weight='balanced_subsample',
                                                   n_estimators=150,
                                                   max_depth=20,
                                                   max_leaf_nodes=70,
                                                   max_features=40,
                                                   max_samples=0.9,
                                                   min_samples_leaf=2,
                                                   min_samples_split=10))
# Set parameter range
param_name = 'max_leaf_nodes'
param_range = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# The 'randomforestclassifier__' prefix addresses the forest step that
# make_pipeline named automatically.
train_scores, test_scores = \
                validation_curve(estimator=pipe_forest,
                              X=X_over,
                              y=y_over,
                              param_name='randomforestclassifier__'+param_name,
                              param_range =param_range,
                              scoring=score_f1,   
                              cv=3)
# Mean and spread over the CV folds (axis 1) for every parameter value.
train_mean= np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean= np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Shaded bands show +/- one standard deviation; the 'test' curve is the
# cross-validation score, not the held-out test set.
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5, label='training')
plt.fill_between(param_range, train_mean+train_std, train_mean-train_std, alpha=0.15, color='blue')
plt.plot(param_range, test_mean, color='orange', linestyle='--', marker='s', markersize=5, label='test')
plt.fill_between(param_range, test_mean+test_std, test_mean-test_std, alpha=0.15, color='orange')
plt.grid()
plt.xlabel(param_name)
plt.ylabel('F1-Score')
plt.title('Validation Curve: ' + param_name)
plt.legend(loc='upper right')
plt.show()

Grid Search:

In [None]:
# Exhaustive search over 3*3*3*2*4*3 = 648 parameter combinations, each
# evaluated with 2-fold CV (1296 forest fits) -- expect a long run time.
# Selection uses weighted precision (score_pr); the curves above use weighted F1.
gs = GridSearchCV(estimator = RandomForestClassifier(random_state=RANDOM_STATE, 
                               class_weight='balanced_subsample', n_estimators=100),
                  param_grid={'max_depth': [10, 20, 30],
                             'max_leaf_nodes': [30, 50, 70],
                             'max_features': [20, 40, 60],
                             'max_samples': [0.7, 0.9],
                             'min_samples_leaf':[2, 5, 7, 10],
                             'min_samples_split':[5, 10, 15]},
                  scoring = score_pr,
                  cv=2)

# Fit on the oversampled training set and report the best combination.
gs = gs.fit(X_over, y_over)
print(gs.best_params_)
#scores = cross_val_score(gs, X_over, y_over, scoring=score_pr, cv=2)
#print('CV precision: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Random Forest Model:

In [None]:
# Baseline forest trained on the original (not resampled) training set.
# NOTE(review): max_leaf_nodes=90 is not one of the values searched in the
# grid-search cell above -- confirm where this setting comes from.
forest = RandomForestClassifier(random_state=RANDOM_STATE, 
                               class_weight='balanced_subsample',
                               n_estimators=100,
                               max_depth=10,
                               max_leaf_nodes=90,
                               max_features=20,
                               max_samples=0.9,
                               min_samples_leaf=5,
                               min_samples_split=10)
# Train model
forest.fit(X_train, y_train)
# Prediction
y_true, y_pred = y_test, forest.predict(X_test)
# Classification report (recall, precision, f-score, accuracy):
print(classification_report(y_true, y_pred))
print()
# For binary labels ravel() yields the counts in the order TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
print('TN:',tn, 'FP:',fp, 'FN:',fn, 'TP:',tp )
print()
# 5-fold CV on the training set; cross_val_score refits clones of `forest`.
scores = cross_val_score(forest, X_train, y_train, scoring=score_f1, cv=5)
print('CV F1-score: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Retrain the model on the oversampled training set
forest = RandomForestClassifier(random_state=RANDOM_STATE, 
                               class_weight='balanced_subsample',
                               n_estimators=150,
                               max_depth=20,
                               max_leaf_nodes=70,
                               max_features=40,
                               max_samples=0.9,
                               min_samples_leaf=2,
                               min_samples_split=10)

# Train model
forest.fit(X_over, y_over)
# Prediction
y_true, y_pred = y_test, forest.predict(X_test)
# Classification report (recall, precision, f-score, accuracy)
print(classification_report(y_true, y_pred))
print()
# For binary labels ravel() yields the counts in the order TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
print('TN:',tn, 'FP:',fp, 'FN:',fn, 'TP:',tp )
print()
# NOTE(review): the model above is fitted on X_over, but this CV refits
# clones on the original X_train -- confirm which training set the reported
# CV score is meant to describe.
scores = cross_val_score(forest, X_train, y_train, scoring=score_f1, cv=5)
print('CV F1-score: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
# ROC_AUC curve
print()
plt_roc_auc_curve(forest, X_test, y_test, model_name='Random Forest')
# Precision_Recall curve
print()
plt_precision_recall_curve(forest, X_test, y_test, model_name='Random Forest')  

In [None]:
# Persist the classification report of the most recent predictions:
# one row per class / average, one column per metric.
report_by_metric = classification_report(y_true, y_pred, output_dict=True)
clsf_report = pd.DataFrame(report_by_metric).transpose()
clsf_report.to_csv('rand_forest_summary_report.csv', index= True)

#### Ensemble Learning:
- Logistic Regression
- Random Forest
- k-nearest Neighbors

In [None]:
# Base learners of the ensemble. C=0.0001 makes the L2 penalty of the
# logistic regression very strong.
clf1 = LogisticRegression(penalty='l2', C=0.0001, random_state=RANDOM_STATE, max_iter=8000)
clf2 = RandomForestClassifier(random_state=RANDOM_STATE, 
                               class_weight='balanced_subsample',
                               n_estimators=150,
                               max_depth=20,
                               max_leaf_nodes=70,
                               max_features=40,
                               max_samples=0.9,
                               min_samples_leaf=2,
                               min_samples_split=10)
# p=2 with the minkowski metric is the Euclidean distance.
clf3 = KNeighborsClassifier(n_neighbors=10, p=2, metric='minkowski')

# 5-fold weighted-F1 CV of every base learner on the oversampled training set.
clf_labels = ['Logistic Reg', 'Random Forest', 'KNN']
for clf, label in zip([clf1, clf2, clf3], clf_labels):
    scores = cross_val_score(estimator = clf,
                            X=X_over,
                            y=y_over,
                            cv=5,
                            scoring=score_f1)
    print('f-1 score:', scores.mean(), scores.std(), label)

In [None]:
# Majority-vote ensemble of the three base learners, trained on the
# oversampled training set and evaluated on the untouched test set.
eclf = VotingClassifier(estimators=[
         ('lr', clf1), ('rf', clf2), ('knn', clf3)], voting='hard')
eclf = eclf.fit(X_over, y_over)
# Prediction
y_true, y_pred = y_test, eclf.predict(X_test)
# Classification report (recall, precision, f-score, accuracy)
print(classification_report(y_true, y_pred))
print()
# For binary labels ravel() yields the counts in the order TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
print('TN:',tn, 'FP:',fp, 'FN:',fn, 'TP:',tp )
print()
# Cross-validate the ensemble itself. The previous code passed `forest`, so
# the score printed under the ensemble was actually the random forest's.
scores = cross_val_score(eclf, X_train, y_train, scoring=score_f1, cv=5)
print('CV F1-score: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))