In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local star imports are kept ahead of the explicit imports below so that
# an explicitly imported name always wins over anything a star import brings in.
from heart_categories_new import *
from generic_my_ds_utilities import *
from woe import *

from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectFwe
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

from tpot.builtins import StackingEstimator
from tpot import TPOTClassifier


In [30]:
# Paths of the three CSV inputs, relative to the working directory.
trainFeaturesFile = 'HeartDisease/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Values.csv'
trainLabelsFile = 'HeartDisease/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Labels.csv'
testFile = 'HeartDisease/Warm_Up_Machine_Learning_with_a_Heart_-_Test_Values.csv'

# Read the training features and labels, then join them on the shared patient id.
features, labels = (pd.read_csv(path) for path in (trainFeaturesFile, trainLabelsFile))
df = pd.merge(features, labels, on='patient_id')

In [61]:
def prepareData(data, mappingDict=None, km=None):
    """Build the model-ready feature matrix from raw patient records.

    Pipeline: label-encode ``thal``; discretise the continuous measurements with
    the project helper functions; cast every column to ``category``; replace the
    categories using ``mappingDict``; add a KMeans cluster label (``kmlabel``);
    add two interaction features; keep only the selected predictors.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. It is copied, never modified. When ``mappingDict`` is None
        the frame is handed to ``getWoe`` as-is (presumably it needs the
        ``heart_disease_present`` column as the target -- confirm in ``woe``).
    mappingDict : dict, optional
        ``{column: {category: replacement}}``. When None it is created with
        ``getWoe(df)`` (training mode); pass the returned dict back in to encode
        new data the same way.
    km : fitted KMeans-like object, optional
        When None a 2-cluster KMeans is fitted on the encoded data (training
        mode); otherwise ``km.predict`` assigns the cluster labels.

    Returns
    -------
    tuple
        ``(features, mappingDict, km)`` -- the selected feature columns plus the
        two fitted artefacts needed to prepare further data.
    """
    target = 'heart_disease_present'
    toDrop = ['patient_id', 'resting_blood_pressure', 'serum_cholesterol_mg_per_dl',
              'oldpeak_eq_st_depression', 'age', 'max_heart_rate_achieved']
    # Column left out of the clustering input (but kept as a predictor).
    clusterExcluded = ['disc_oldpeak_eq_st_depression']

    df = data.copy()

    # NOTE(review): the encoder is re-fitted on every call, so the integer codes only
    # agree between train and test when both contain the same set of `thal` values.
    df['thal'] = LabelEncoder().fit_transform(df['thal'])

    # Discretisation of the continuous features; each helper receives a full row.
    # A list (not a dict) keeps the original evaluation order explicit.
    discretizers = [
        ('disc_resting_blood_pressure', get_resting_blood_pressure_categories),
        ('disc_serum_cholesterol_mg_per_dl', get_serum_cholesterol_mg_per_dl_categ),
        ('disc_oldpeak_eq_st_depression', get_oldpeak_eq_st_depression_categ),
        ('disc_age', getAgeGroup),
        ('disc_max_heart_rate_achieved', get_max_heart_rate_achieved_categories),
    ]
    for newColumn, rowFunction in discretizers:
        df[newColumn] = df.apply(rowFunction, axis=1)
    df = df.drop(toDrop, axis=1)

    # Make all features categorical.
    df = df.astype('category')

    # Training mode: derive the per-feature mapping from the data itself.
    if mappingDict is None:
        mappingDict = getWoe(df)

    # Replace the feature values with their mapped values. The result is assigned
    # back (a chained `inplace=True` on `df[feat]` is not guaranteed to reach `df`),
    # and the column is turned into plain objects first because value replacement
    # on categorical dtype is deprecated in recent pandas.
    for feat, attributes in mappingDict.items():
        df[feat] = df[feat].astype(object).replace(attributes)

    # The label must never be a clustering input; dropping it in both modes keeps
    # the columns identical between `km.fit` and `km.predict`.
    df = df.drop(columns=[target], errors='ignore')

    # Clustering: fit the model on training data, reuse it for anything else.
    clusterInput = df.drop(clusterExcluded, axis=1)
    if km is None:
        # Fixed seed so the arbitrary 0/1 cluster ids are stable between runs.
        km = KMeans(n_clusters=2, random_state=42)
        km.fit(clusterInput)
        df['kmlabel'] = km.labels_
    else:
        df['kmlabel'] = km.predict(clusterInput)

    # Selected interaction ("poly") features. The explicit float cast keeps the
    # product defined even if a column is still categorical/object at this point.
    df['fasting_blood_sugar_gt_120_mg_per_dl * disc_resting_blood_pressure'] = (
        df['fasting_blood_sugar_gt_120_mg_per_dl'].astype(float)
        * df['disc_resting_blood_pressure'].astype(float)
    )
    df['disc_serum_cholesterol_mg_per_dl * disc_age'] = (
        df['disc_serum_cholesterol_mg_per_dl'].astype(float)
        * df['disc_age'].astype(float)
    )

    # Features selected based on earlier analysis.
    selectedFeatures = [
        'kmlabel',
        'disc_serum_cholesterol_mg_per_dl * disc_age',
        'fasting_blood_sugar_gt_120_mg_per_dl * disc_resting_blood_pressure',
        'disc_max_heart_rate_achieved',
        'disc_oldpeak_eq_st_depression',
        'chest_pain_type',
        'num_major_vessels',
    ]
    df = df[selectedFeatures]

    return df, mappingDict, km

def auc(X, y, clf):
    """Fit ``clf`` on ``(X, y)`` and return its ROC AUC on that same data.

    The score is computed from column 1 of ``clf.predict_proba(X)``, so it is an
    in-sample figure, not a held-out estimate. Note that ``clf`` is fitted in place.
    """
    clf.fit(X, y)
    probaColumn1 = clf.predict_proba(X)[:, 1]
    return roc_auc_score(y, probaColumn1)



In [63]:
# Prepare the training data. Besides the feature matrix, prepareData hands back the
# WoE mapping and the KMeans model, both of which are needed to prepare the test data.
df_prepared, mappingWoeModel, km = prepareData(df)
X, y = df_prepared, df['heart_disease_present']

In [64]:
# Dimensions of the prepared feature matrix: (rows, selected features).
X.shape

(180, 7)

In [65]:
# See which classifier TPOT suggests for the selected predictors.
# (A single, fully configured instance: the earlier default-constructed
# TPOTClassifier() was overwritten before use and has been removed.)
tpoti = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2)

tpoti.fit(X, y)


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.85
Generation 2 - Current best internal CV score: 0.861111111111111
Generation 3 - Current best internal CV score: 0.861111111111111
Generation 4 - Current best internal CV score: 0.861111111111111
Generation 5 - Current best internal CV score: 0.861111111111111

Best pipeline: LogisticRegression(SelectFwe(input_matrix, alpha=0.013000000000000001), C=25.0, dual=False, penalty=l1)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
        disable_update_check=False, early_stop=None, generations=5,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=20,
        random_state=42, scoring=None, subsample=1.0, use_dask=False,
        verbosity=2, warm_start=False)

In [None]:
# Export the pipeline found by the TPOT search above to a Python script file.
tpoti.export('tpot_exported_pipeline2.py')

In [73]:
# Pipeline suggested by TPOT: family-wise-error feature selection followed by an
# L1-penalised logistic regression. The classes used here are imported in the
# notebook's import cell rather than in this cell.
pipeline = make_pipeline(
    SelectFwe(alpha=0.013),
    # random_state pins the shuffling done by the liblinear solver so that
    # repeated runs give the same coefficients.
    LogisticRegression(solver='liblinear', C=25.0, dual=False, penalty='l1', random_state=42)
)

In [74]:
# Stratified split: 25% of the rows are held out for evaluation.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
pipeline.fit(X1_train, y1_train)

# Hard class predictions and class probabilities for both partitions.
y1_pred_train = pipeline.predict(X1_train)
y1_pred_test = pipeline.predict(X1_test)
y1_train_predp = pipeline.predict_proba(X1_train)
y1_test_predp = pipeline.predict_proba(X1_test)

# NOTE(review): the helper receives the full X, y rather than the split above;
# presumably it does its own train/test split -- confirm in generic_my_ds_utilities.
# Transposed so that each metric is displayed as a row.
getMetricsForPreictions([pipeline], X, y).T

Unnamed: 0,0
precisionTrain0,0.839506
precisionTrain1,0.87037
precisionTest0,1.0
precisionTest1,0.8
recallTrain0,0.906667
recallTrain1,0.783333
recallTest0,0.8
recallTest1,1.0
f1Train0,0.871795
f1Train1,0.824561


In [75]:
# Analyse the mismatches: split the held-out rows into the four confusion-matrix
# groups, based on the actual label and on whether the prediction agrees with it.
predictedRight = y1_test == y1_pred_test
predictedWrong = y1_test != y1_pred_test
actualFalse = y1_test == False
actualTrue = y1_test == True

fp = X1_test.loc[predictedWrong & actualFalse]
tp = X1_test.loc[predictedRight & actualTrue]
tn = X1_test.loc[predictedRight & actualFalse]
fn = X1_test.loc[predictedWrong & actualTrue]

In [76]:
# False positives: held-out rows whose actual label is False but were predicted otherwise.
fp

Unnamed: 0,kmlabel,disc_serum_cholesterol_mg_per_dl * disc_age,fasting_blood_sugar_gt_120_mg_per_dl * disc_resting_blood_pressure,disc_max_heart_rate_achieved,disc_oldpeak_eq_st_depression,chest_pain_type,num_major_vessels
122,1,0.002086,0.168242,-0.820131,-1.275023,0.696209,-1.198313
4,1,0.000917,0.107615,-0.101223,2.140748,-0.987386,-1.198313
127,1,0.2184,0.19221,-0.101223,-1.275023,-1.42635,-1.198313
166,1,0.003105,0.19221,-1.446476,-1.275023,0.696209,-1.198313
141,1,0.002086,0.19221,-0.820131,0.049249,-0.987386,-1.198313


In [77]:
# True positives: held-out rows whose actual label is True and were predicted correctly.
tp

Unnamed: 0,kmlabel,disc_serum_cholesterol_mg_per_dl * disc_age,fasting_blood_sugar_gt_120_mg_per_dl * disc_resting_blood_pressure,disc_max_heart_rate_achieved,disc_oldpeak_eq_st_depression,chest_pain_type,num_major_vessels
79,1,0.000917,0.19221,-0.101223,-0.11384,0.696209,0.137198
14,1,0.739499,0.19221,-1.446476,2.140748,-0.987386,-1.198313
56,1,0.739499,0.305386,0.278142,2.140748,0.696209,0.137198
31,1,0.003105,0.168242,1.319355,-0.11384,0.696209,1.371036
81,1,0.496782,0.19221,1.319355,-0.11384,0.696209,0.137198
147,0,0.003105,0.19221,-0.101223,2.140748,-0.987386,0.764789
57,1,0.000917,0.19221,1.319355,0.049249,0.696209,0.764789
176,1,0.2184,0.107615,1.319355,2.140748,0.696209,-1.198313
163,1,0.2184,0.168242,-0.101223,-1.275023,0.696209,1.371036
78,1,0.000917,0.168242,-0.101223,2.140748,0.696209,0.764789


In [78]:
# True negatives: held-out rows whose actual label is False and were predicted correctly.
tn

Unnamed: 0,kmlabel,disc_serum_cholesterol_mg_per_dl * disc_age,fasting_blood_sugar_gt_120_mg_per_dl * disc_resting_blood_pressure,disc_max_heart_rate_achieved,disc_oldpeak_eq_st_depression,chest_pain_type,num_major_vessels
150,0,0.2184,0.305386,-0.820131,-1.275023,-1.922357,-1.198313
90,0,0.2184,0.19221,-0.820131,-1.275023,-1.922357,-1.198313
92,0,0.2184,0.183313,-1.446476,-1.275023,-1.42635,-1.198313
145,0,0.002086,0.168242,-1.446476,-1.275023,-1.922357,0.764789
151,0,0.2184,0.19221,-0.820131,-1.275023,-1.42635,0.137198
35,0,0.739499,0.19221,-1.446476,-1.275023,0.696209,-1.198313
63,0,0.496782,0.305386,-0.101223,-1.275023,-1.922357,-1.198313
24,0,0.000917,0.168242,-0.101223,-1.275023,-1.922357,-1.198313
1,0,0.739499,0.305386,-0.820131,-0.11384,-1.42635,-1.198313
160,0,0.2184,0.19221,-1.446476,-1.275023,-1.922357,-1.198313


In [79]:
# False negatives: held-out rows whose actual label is True but were predicted otherwise.
fn

Unnamed: 0,kmlabel,disc_serum_cholesterol_mg_per_dl * disc_age,fasting_blood_sugar_gt_120_mg_per_dl * disc_resting_blood_pressure,disc_max_heart_rate_achieved,disc_oldpeak_eq_st_depression,chest_pain_type,num_major_vessels


In [59]:
# Score the test file and write the submission CSV.
test = pd.read_csv(testFile)
test1 = test.copy()

# prepareData returns three values (features, mappingDict, km). Both artefacts fitted
# on the training data are passed in so the test rows get the same WoE encoding and
# the same cluster assignment; without `km` the function would try to fit a new
# KMeans on the test set, which has no label column to drop.
XT, _, _ = prepareData(test1, mappingDict=mappingWoeModel, km=km)

# Column 1 of predict_proba is used as the submitted probability.
a5 = pipeline.predict_proba(XT)[:, 1]
new = pd.DataFrame({'patient_id': test1['patient_id'].values,
                    'heart_disease_present': a5})
new.to_csv('heartdisease_tpot_suggesion_4.csv', index=False)
