In [1]:
import numpy as np
import logging
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import heart_disease.plotting as hdp
import heart_disease.preprocessing as pre_proc
import seaborn as sns
import pandas as pd
from copy import deepcopy as cp
%load_ext autoreload

from heart_disease.globals import code_path, raw_data_path, data_path, output_path


from sklearn.pipeline import Pipeline
from sklearn.linear_model import perceptron
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedShuffleSplit, validation_curve, GridSearchCV

%autoreload 2

In [2]:
LOG_FMT = '%(asctime)s|%(name)s|%(levelname)s|%(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FMT)
logger = logging.getLogger(__name__)

In [3]:
# Get Metadata
meta_data_df = pd.read_csv(data_path+'meta_data.csv')
meta_data_df.head()

Unnamed: 0,name,datatype,description,id
0,age,numeric,age in years,3
1,sex,bool,sex (1 = male; 0 = female),4
2,cp,categorical,chest pain type\r -- Value 1: typical angina...,9
3,trestbps,numeric,resting blood pressure (in mm Hg on admission ...,10
4,chol,numeric,serum cholestoral,12


In [4]:
# Import Raw Data
data_file_name_str = 'train_validation.cleveland.csv'
data_df = pre_proc.load_data(data_path+data_file_name_str)
data_df.head()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,ispos_truth
pat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
162,54.0,0.0,3.0,110.0,214.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,False
18,48.0,0.0,3.0,130.0,275.0,0.0,0.0,139.0,0.0,0.2,1.0,0.0,3.0,False
100,45.0,1.0,4.0,115.0,260.0,0.0,2.0,185.0,0.0,0.0,1.0,0.0,3.0,False
78,48.0,1.0,2.0,130.0,245.0,0.0,2.0,180.0,0.0,0.2,2.0,0.0,3.0,False
262,60.0,0.0,1.0,150.0,240.0,0.0,0.0,171.0,0.0,0.9,1.0,0.0,3.0,False


In [5]:
pipeline = Pipeline([
    ('cleaner',pre_proc.DataCleaner(data_path+'meta_data.csv').CleaningPipeline),
    ('feature',PCA(n_components=20)),
#     ('feature2',LinearDiscriminantAnalysis()),
#     ('classifier',perceptron.Perceptron())
    ('classifier',SVC())

])

In [6]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)

In [7]:
# param_range = range(1,20)
param_range = np.logspace(-6, -0.1, 25)

In [8]:
# 4 jobs is best for my machines

In [9]:
parameters = {'classifier__kernel':('linear', 'rbf'), 
              'classifier__C':np.linspace(1,10,5),
              'classifier__gamma':np.logspace(-6, -0.1, 10)}

cv_estimator = GridSearchCV(pipeline, parameters, cv=cv, n_jobs=4)

In [10]:
logger.info("Grid Search Started")
out = cv_estimator.fit(data_df, data_df['ispos_truth'])
logger.info("Grid Search Complete")

2017-11-14 18:06:59,210|__main__|INFO|Grid Search Started
2017-11-14 18:07:12,239|__main__|INFO|Grid Search Complete


In [11]:
out.best_score_

0.86315789473684212

In [12]:
out.best_params_

{'classifier__C': 3.25,
 'classifier__gamma': 0.00041900791057866693,
 'classifier__kernel': 'rbf'}

In [13]:
pd.DataFrame(out.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_classifier__C,param_classifier__gamma,param_classifier__kernel,params,rank_test_score,split0_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.014005,0.005470,0.821053,0.890000,1,1e-06,linear,"{'classifier__C': 1.0, 'classifier__gamma': 1e...",63,0.877193,...,0.859649,0.894118,0.771930,0.870588,0.789474,0.911765,0.002970,0.001366,0.053097,0.014893
1,0.017996,0.005944,0.543860,0.541176,1,1e-06,rbf,"{'classifier__C': 1.0, 'classifier__gamma': 1e...",84,0.543860,...,0.543860,0.541176,0.543860,0.541176,0.543860,0.541176,0.005484,0.001042,0.000000,0.000000
2,0.020797,0.008051,0.821053,0.890000,1,4.52434e-06,linear,"{'classifier__C': 1.0, 'classifier__gamma': 4....",63,0.877193,...,0.859649,0.894118,0.771930,0.870588,0.789474,0.911765,0.005069,0.002704,0.053097,0.014893
3,0.018510,0.008274,0.543860,0.541176,1,4.52434e-06,rbf,"{'classifier__C': 1.0, 'classifier__gamma': 4....",84,0.543860,...,0.543860,0.541176,0.543860,0.541176,0.543860,0.541176,0.005899,0.004811,0.000000,0.000000
4,0.026272,0.006969,0.821053,0.890000,1,2.04697e-05,linear,"{'classifier__C': 1.0, 'classifier__gamma': 2....",63,0.877193,...,0.859649,0.894118,0.771930,0.870588,0.789474,0.911765,0.008410,0.003339,0.053097,0.014893
5,0.013680,0.004735,0.543860,0.541176,1,2.04697e-05,rbf,"{'classifier__C': 1.0, 'classifier__gamma': 2....",84,0.543860,...,0.543860,0.541176,0.543860,0.541176,0.543860,0.541176,0.003476,0.000101,0.000000,0.000000
6,0.014987,0.005410,0.821053,0.890000,1,9.26119e-05,linear,"{'classifier__C': 1.0, 'classifier__gamma': 9....",63,0.877193,...,0.859649,0.894118,0.771930,0.870588,0.789474,0.911765,0.004025,0.002265,0.053097,0.014893
7,0.014812,0.005542,0.543860,0.541176,1,9.26119e-05,rbf,"{'classifier__C': 1.0, 'classifier__gamma': 9....",84,0.543860,...,0.543860,0.541176,0.543860,0.541176,0.543860,0.541176,0.001689,0.000803,0.000000,0.000000
8,0.017257,0.005792,0.821053,0.890000,1,0.000419008,linear,"{'classifier__C': 1.0, 'classifier__gamma': 0....",63,0.877193,...,0.859649,0.894118,0.771930,0.870588,0.789474,0.911765,0.003039,0.001029,0.053097,0.014893
9,0.017133,0.006950,0.656140,0.668235,1,0.000419008,rbf,"{'classifier__C': 1.0, 'classifier__gamma': 0....",78,0.631579,...,0.649123,0.652941,0.666667,0.652941,0.684211,0.711765,0.006343,0.003862,0.029565,0.038501
