In [1]:
import numpy as np
import logging
import pandas as pd
from copy import deepcopy as cp

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import heart_disease.plotting as hdp
import heart_disease.preprocessing as hdpp
import heart_disease.parameter_optimization as hdpo
from heart_disease.globals import code_path, raw_data_path, data_path, output_path, plot_path

import seaborn as sns

# Plot-wide font and line sizing on top of the seaborn "paper" context.
# set_context only applies rc keys that belong to seaborn's context
# definition, so tick-label sizes are given through the real matplotlib keys
# 'xtick.labelsize' / 'ytick.labelsize' ('axes.ticksize' is not an rcParam
# and had no effect).
sns.set_context("paper", rc={"font.size":28,
                             "axes.titlesize":32,
                             "axes.labelsize":25,
                             "xtick.labelsize":20,
                             "ytick.labelsize":20,
                             'lines.markersize':30,
                             'lines.linewidth':10,
                             'legend.fontsize': 20,
                             })

# 'legend.loc' is a matplotlib rcParam but not part of the seaborn context,
# so it must be set on matplotlib directly to take effect.
plt.rcParams['legend.loc'] = 'lower right'

# presumably the axis-label font size for manual plotting calls; it is not
# referenced by any of the cells visible in this notebook -- TODO confirm.
label_axissize = 25

%load_ext autoreload
%autoreload 2

In [2]:
# Read the feature metadata table from the data directory and preview it.
meta_data_path = data_path + 'meta_data.csv'
meta_data_df = pd.read_csv(meta_data_path)
meta_data_df.head()

Unnamed: 0,name,datatype,description,id
0,age,numeric,age in years,3
1,sex,bool,sex (1 = male; 0 = female),4
2,cp,categorical,chest pain type\r -- Value 1: typical angina...,9
3,trestbps,numeric,resting blood pressure (in mm Hg on admission ...,10
4,chol,numeric,serum cholestoral,12


In [3]:
# Load the Cleveland train/validation file through the project loader and
# preview the first rows.
data_file_name_str = 'train_validation.cleveland.csv'
train_validation_path = data_path + data_file_name_str
data_df = hdpp.load_data(train_validation_path)
data_df.head()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,ispos_truth
pat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
162,54.0,0.0,3.0,110.0,214.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,False
18,48.0,0.0,3.0,130.0,275.0,0.0,0.0,139.0,0.0,0.2,1.0,0.0,3.0,False
100,45.0,1.0,4.0,115.0,260.0,0.0,2.0,185.0,0.0,0.0,1.0,0.0,3.0,False
78,48.0,1.0,2.0,130.0,245.0,0.0,2.0,180.0,0.0,0.2,2.0,0.0,3.0,False
262,60.0,0.0,1.0,150.0,240.0,0.0,0.0,171.0,0.0,0.9,1.0,0.0,3.0,False


In [4]:
# Load the grid-search summary table produced by the parameter-optimization
# module and preview its first rows.
summary_df = hdpo.load_grid_search_summary()
summary_df.head()

Unnamed: 0,mean_test_score,std_test_score,mean_train_score,std_train_score,params,algorithum,sigma_low_test_score,sigma_low_train_score,rank
488,0.863158,0.038115,0.874353,0.013401,"{'classifier__C': 1.0, 'classifier__gamma': 0....",SVC,0.825043,0.860952,0
495,0.863158,0.038115,0.874353,0.013401,"{'classifier__C': 1.0, 'classifier__gamma': 0....",SVC,0.825043,0.860952,1
494,0.863158,0.038115,0.874353,0.013401,"{'classifier__C': 1.0, 'classifier__gamma': 0....",SVC,0.825043,0.860952,2
661,0.862456,0.038654,0.871294,0.014051,"{'classifier__C': 10.0, 'classifier__gamma': 0...",SVC,0.823802,0.857243,3
830,0.862456,0.038971,0.874118,0.012561,"{'classifier__C': 100.0, 'classifier__gamma': ...",SVC,0.823485,0.861556,4


In [5]:
# Positional row of summary_df (used with .iloc below) whose configuration is
# evaluated in the rest of the notebook; 0 would be the first row.
pick=1

In [14]:
# Standard deviation of the test score recorded for the picked summary row.
# NOTE(review): this cell's execution count (14) is out of sequence with its
# neighbours; restart the kernel and run all cells before sharing.
summary_df.iloc[pick]['std_test_score']

0.038115019267369175

In [6]:
# Show the key facts of the picked summary row: its lower-bound test score,
# the algorithm name, and the hyper-parameter set.
picked_row = summary_df.iloc[pick]
for field in ('sigma_low_test_score', 'algorithum', 'params'):
    print(picked_row[field])
print('-----')

0.825042875469
SVC
{'classifier__C': 1.0, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf', 'feature__n_components': 24}
-----


In [7]:
# Build the pipeline for the picked summary row: look up the pipeline
# registered for that row's algorithm, then apply the row's hyper-parameters.
# Note that the row is chosen by `pick`, which is not necessarily the
# top-ranked one.
parameter_grids = hdpo.get_paramater_grids(data_path)
picked_row = summary_df.iloc[pick]
pipe = parameter_grids[picked_row['algorithum']]['pipeline']
# set_params returns the pipeline itself, so the configured pipeline is shown
# as the cell output.
pipe.set_params(**picked_row['params'])



Pipeline(memory=None,
     steps=[('cleaner', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_pipe', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key_list=['age', 'trestbps', 'chol', 'thalach', 'oldpeak'])), ('Imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('s...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

# X (every column except the last) and y (the label column) are only used to
# drive the stratified split; the pipeline itself is fed DataFrame rows below.
X = np.array(data_df.iloc[:, :-1])
y = np.array(data_df['ispos_truth'])

# Single stratified split holding out 25% for validation, with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=10)
# n_splits=1, so take the first (and only) split directly instead of looping
# just to leak the loop variables.
train_idx, validation_idx = next(sss.split(X, y))

train_df = data_df.iloc[train_idx]
validation_df = data_df.iloc[validation_idx]

# NOTE(review): the frames handed to the pipeline still contain the
# 'ispos_truth' column; this presumably relies on the pipeline's selectors
# picking feature columns by name -- confirm that no step sees the label.
pipe.fit(train_df, train_df['ispos_truth'])
train_score = pipe.score(train_df, train_df['ispos_truth'])
validation_score = pipe.score(validation_df, validation_df['ispos_truth'])


In [9]:
# Load the held-out Cleveland test file through the project loader and
# preview the first rows. Re-uses (and overwrites) data_file_name_str.
data_file_name_str = 'test.cleveland.csv'
test_data_path = data_path + data_file_name_str
test_df = hdpp.load_data(test_data_path)
test_df.head()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,ispos_truth
pat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
21,58.0,0.0,1.0,150.0,283.0,1.0,2.0,162.0,0.0,1.0,1.0,0.0,3.0,False
188,54.0,1.0,2.0,192.0,283.0,0.0,2.0,195.0,0.0,0.0,1.0,1.0,7.0,True
24,60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,True
198,50.0,0.0,2.0,120.0,244.0,0.0,0.0,162.0,0.0,1.1,1.0,0.0,3.0,False
39,61.0,1.0,3.0,150.0,243.0,1.0,0.0,137.0,1.0,1.0,2.0,0.0,3.0,False


In [10]:
# Score the fitted pipeline on the Cleveland test set and report the train,
# validation and test scores side by side.
test_labels = test_df['ispos_truth']
test_score = pipe.score(test_df, test_labels)

score_report = "Train {}, Validation {}, Test {}".format(
    train_score, validation_score, test_score)
print(score_report)

Train 0.888235294117647, Validation 0.8771929824561403, Test 0.8157894736842105


In [11]:
# Convert one raw dataset file into a CSV in the data directory, writing the
# index under the 'pat_id' label.
# NOTE(review): only the dataset named here is converted per run, while the
# scoring loop further down reads CSVs for three datasets; the other CSVs
# must already exist on disk -- confirm how they were produced.
name = 'switzerland'
raw_file_path = raw_data_path + 'processed.{}.data.txt'.format(name)
data = hdpp.load_raw_data(raw_file_path, data_path + 'meta_data.csv')
csv_file_path = data_path + '{}.csv'.format(name)
data.to_csv(csv_file_path, header=True, index_label='pat_id')

In [12]:
# Score the pipeline (fitted on the Cleveland training split) on each of the
# other datasets. Dedicated loop-local names are used so the Cleveland
# `test_df` / `test_score` computed earlier are not overwritten; otherwise
# re-running the score report cell would print the wrong test score.
# NOTE(review): 'hungacian' looks like a misspelling of 'hungarian'; it is
# kept unchanged because it is used as the on-disk CSV file name.
for dataset_name in ['va','switzerland', 'hungacian']:
    external_df = hdpp.load_data(data_path+'{}.csv'.format(dataset_name))
    external_score = pipe.score(external_df, external_df['ispos_truth'])
    print("{}, Test {}".format(dataset_name, external_score))

va, Test 0.58
switzerland, Test 0.5691056910569106
hungacian, Test 0.8231292517006803
