In [1]:
'''
1. age: age in years 
2. sex: sex (1 = male; 0 = female)
3. cp: chest pain type 
        -- Value 1: typical angina 
        -- Value 2: atypical angina 
        -- Value 3: non-anginal pain 
        -- Value 4: asymptomatic 
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
5. chol: serum cholestoral in mg/dl 
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
7. restecg: resting electrocardiographic results 
        -- Value 0: normal 
        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
        -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
8. thalach: maximum heart rate achieved  
9. exang: exercise induced angina (1 = yes; 0 = no) 
10. oldpeak = ST depression induced by exercise relative to rest 
11. slope: the slope of the peak exercise ST segment 
        -- Value 1: upsloping 
        -- Value 2: flat 
        -- Value 3: downsloping 
12. ca: number of major vessels (0-3) colored by flourosopy 
13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 

'''

"\n1. age: age in years \n2. sex: sex (1 = male; 0 = female)\n3. cp: chest pain type \n        -- Value 1: typical angina \n        -- Value 2: atypical angina \n        -- Value 3: non-anginal pain \n        -- Value 4: asymptomatic \n4. trestbps: resting blood pressure (in mm Hg on admission to the hospital) \n5. chol: serum cholestoral in mg/dl \n6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) \n7. restecg: resting electrocardiographic results \n        -- Value 0: normal \n        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) \n        -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria \n8. thalach: maximum heart rate achieved  \n9. exang: exercise induced angina (1 = yes; 0 = no) \n10. oldpeak = ST depression induced by exercise relative to rest \n11. slope: the slope of the peak exercise ST segment \n        -- Value 1: upsloping \n        -- Value 2: flat \n       

In [4]:
import pandas as pd
import numpy as np


In [5]:
df = pd.read_csv('heart.csv')

In [6]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [69]:
from sklearn.model_selection import train_test_split

random_state = 0

X, y = df.drop(columns=['target']), df.loc[:, 'target']


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=random_state)

In [43]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, recall_score, roc_auc_score


In [71]:
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
params = {
    "max_depth":[3, 4, 5],
    "n_estimators":[60, 70, 50, 40]
}

grid = GridSearchCV(RandomForestClassifier(random_state=random_state), param_grid=params,cv=3,n_jobs=-1, scoring="recall", ).fit(X_train_scaled, y_train)

In [78]:
def stats(clf, X_train, X_test, y_test=y_test):
    print("TRAIN",clf.score(X_train,y_train))
    print("TEST",clf.score(X_test,y_test))
    # print(clf.best_params_)
    y_pred = clf.predict(X_test)
    print("AUC",roc_auc_score(y_test, y_pred))
    print("RECALL",recall_score(y_test, y_pred))
    print(confusion_matrix(y_test,y_pred))
#stats(grid, X_train_scaled, X_test_scaled)


In [19]:
classifier = RandomForestClassifier(random_state=random_state,max_depth= 5, n_estimators=40).fit(X_train_scaled, y_train)
# stats(classifier, X_train_scaled, X_test_scaled)

for v in sorted(list(zip(X.columns, classifier.feature_importances_)), key= lambda x: -x[1]):
    print(v)


('ca', 0.14237629813281696)
('thal', 0.13282729861186368)
('oldpeak', 0.13251528910778326)
('cp', 0.11051004601643066)
('thalach', 0.1030216129165023)
('exang', 0.08784621295776358)
('trestbps', 0.06097909314300931)
('chol', 0.06047266566285679)
('sex', 0.056787044018219775)
('slope', 0.052320488868388366)
('age', 0.041812459056067744)
('restecg', 0.011144289583806154)
('fbs', 0.007387201924491325)


In [16]:
params = {
    "learning_rate":[ 0.1, 1, 0.5],
    "max_depth":[2],
    "n_estimators":[10,50]
}

random_clf = GridSearchCV(GradientBoostingClassifier(random_state=random_state), param_grid=params,cv=3,n_jobs=-1, scoring="recall").fit(X_train_scaled, y_train)

stats(random_clf, X_train_scaled, X_test_scaled)

0.9426229508196722
0.9302325581395349
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 10}
0.8932926829268292
[[23 10]
 [ 3 40]]


In [24]:
classifier = GradientBoostingClassifier(max_depth= 2, n_estimators=10, learning_rate=0.1).fit(X_train_scaled, y_train)
# stats(classifier, X_train_scaled, X_test_scaled)

for v in sorted(list(zip(X.columns, classifier.feature_importances_)), key= lambda x: -x[1]):
    print(v)

('cp', 0.3669285424474521)
('ca', 0.2325037315911726)
('thal', 0.22510508859981584)
('oldpeak', 0.07045607584546569)
('exang', 0.0363000283288606)
('thalach', 0.03490177534447)
('sex', 0.033804757842763065)
('age', 0.0)
('trestbps', 0.0)
('chol', 0.0)
('fbs', 0.0)
('restecg', 0.0)
('slope', 0.0)


In [83]:
from sklearn.svm import SVC

# params = {
#     "gamma":[0.1, 0.3, ],
#     "C":[0.1, 0.5, 1, 2, 3],
#     "degree":[2,3]
# }

# svc_clf = GridSearchCV(SVC(random_state=random_state), param_grid=params,cv=5,n_jobs=-1, scoring="recall").fit(X_train_scaled, y_train)
# stats(svc_clf, X_train_scaled, X_test_scaled)
for c in [5, 10, 15]: 
    for g in[0.1, 0.3]:
        res = SVC(random_state=random_state, C=c, gamma=g).fit(X_train_scaled, y_train)
        print('\n',c,g)
        stats(res, X_train_scaled, X_test_scaled)


 5 0.1
TRAIN 0.801762114537445
TEST 0.7631578947368421
AUC 0.7519379844961241
RECALL 0.8372093023255814
[[22 11]
 [ 7 36]]

 5 0.3
TRAIN 0.788546255506608
TEST 0.7631578947368421
AUC 0.7519379844961241
RECALL 0.8372093023255814
[[22 11]
 [ 7 36]]

 10 0.1
TRAIN 0.7929515418502202
TEST 0.7894736842105263
AUC 0.7822410147991544
RECALL 0.8372093023255814
[[24  9]
 [ 7 36]]

 10 0.3
TRAIN 0.7973568281938326
TEST 0.7631578947368421
AUC 0.7519379844961241
RECALL 0.8372093023255814
[[22 11]
 [ 7 36]]

 15 0.1
TRAIN 0.7929515418502202
TEST 0.7763157894736842
AUC 0.7670894996476393
RECALL 0.8372093023255814
[[23 10]
 [ 7 36]]

 15 0.3
TRAIN 0.801762114537445
TEST 0.75
AUC 0.7403100775193799
RECALL 0.813953488372093
[[22 11]
 [ 8 35]]


In [73]:
svc_clf.cv_results_

{'mean_fit_time': array([0.01213121, 0.00975895, 0.03023944, 0.0499527 , 0.01895947,
        0.00646739, 0.015766  , 0.00945015, 0.00953155, 0.00626922,
        0.00638318, 0.00405846, 0.00586791, 0.00358458, 0.00464287,
        0.00523076, 0.00476923, 0.00640683, 0.00783429, 0.005862  ]),
 'std_fit_time': array([0.00668342, 0.00324633, 0.0420394 , 0.03366899, 0.01607006,
        0.00188859, 0.01258104, 0.00689702, 0.0038314 , 0.00332274,
        0.00324205, 0.00055382, 0.00206949, 0.00074781, 0.00201183,
        0.0030918 , 0.00336307, 0.00339509, 0.00375151, 0.00289012]),
 'mean_score_time': array([0.00739617, 0.0189003 , 0.065657  , 0.0559267 , 0.0099503 ,
        0.00764875, 0.01173735, 0.00647202, 0.00569162, 0.00361409,
        0.00602031, 0.00589414, 0.00428009, 0.0054276 , 0.00472236,
        0.00453582, 0.00444264, 0.00617123, 0.00500908, 0.00359941]),
 'std_score_time': array([0.00266159, 0.02265772, 0.07145332, 0.04586347, 0.00533791,
        0.00313982, 0.01032396, 0.002407

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

params = {
    "penalty":['l1', 'l2'],
    "C":[0.1, 0.5, 1, 2, 5, 10],
    "solver":['liblinear' , 'saga']
}

poly = PolynomialFeatures().fit(X_train_scaled)
X_train_scaled_poly = poly.transform(X_train_scaled)
X_test_scaled_poly = poly.transform(X_test_scaled)

lin_reg_clf = GridSearchCV(LogisticRegression(random_state=random_state), param_grid=params,cv=5,n_jobs=-1, scoring="recall").fit(X_train_scaled_poly, y_train)
stats(lin_reg_clf, X_train_scaled_poly, X_test_scaled_poly)


0.9180327868852459
0.9069767441860465
{'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
0.8939999999999999
[[23 10]
 [ 4 39]]


In [14]:
from sklearn.neural_network import MLPClassifier

params = {
    "activation":['logistic'],
    "hidden_layer_sizes":[(500,1000,500,50),(100,1000,200),(500,1500,500,60),(100,500,70)]
}

neural = GridSearchCV(MLPClassifier(random_state=random_state), param_grid=params,cv=5,n_jobs=-1, scoring="recall").fit(X_train_scaled_poly, y_train)

stats(neural, X_train_scaled_poly, X_test_scaled_poly)


0.8934426229508197
0.8837209302325582
{'activation': 'logistic', 'hidden_layer_sizes': (500, 1500, 500, 60)}
0.9359999999999999
[[24  9]
 [ 5 38]]


In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, recall_score, roc_auc_score
import pandas as pd
import numpy as np

In [67]:
df = pd.read_csv('heart.csv')

In [68]:
df = df.loc[:, ['cp','exang','thalach','sex','age','target']]

In [65]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

df = pd.read_csv('heart.csv')

df = df.loc[:, ['cp','exang','thalach','sex','age','target']]

random_state = 0

X, y = df.drop(columns=['target']), df.loc[:, 'target']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=random_state)


scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


params = [{
    "max_depth":[3, 4, 5],
    "n_estimators":[60, 70, 50, 40]
},
{
    "learning_rate":[ 0.1, 1, 0.5],
    "max_depth":[2],
    "n_estimators":[10,50]
},{
    "gamma":[0.1, 0.5, 1, 2, 5, 10],
    "C":[0.1, 0.5, 1, 2, 5, 10],
    "degree":[2,3,6,9]
},{
    "activation":['logistic'],
    "hidden_layer_sizes":[(100,50),(100,100),(200,50),(100,150,50),(100)]
},{
    "penalty":['l1', 'l2'],
    "C":[0.1, 0.5, 1, 2, 5, 10],
    "solver":['liblinear' , 'saga']
}]
clfs = [RandomForestClassifier,GradientBoostingClassifier,SVC,MLPClassifier,LogisticRegression]

for clf, prms in zip(clfs, params):
    if clf == LogisticRegression:
        poly = PolynomialFeatures().fit(X_train_scaled)
        X_train_scaled = poly.transform(X_train_scaled)
        X_test_scaled = poly.transform(X_test_scaled)


    grid = GridSearchCV(clf(random_state=random_state), param_grid=prms,cv=5,n_jobs=-1, scoring="recall").fit(X_train_scaled, y_train)

    print('\n',clf.__name__)

    stats(grid, X_train_scaled, X_test_scaled)



 RandomForestClassifier
TRAIN 0.8852459016393442
TEST 0.813953488372093
{'max_depth': 3, 'n_estimators': 40}
AUC 0.7251585623678647
RECALL 0.813953488372093
[[21 12]
 [ 8 35]]

 GradientBoostingClassifier
TRAIN 0.8032786885245902
TEST 0.7209302325581395
{'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 10}
AUC 0.7089499647639184
RECALL 0.7209302325581395
[[23 10]
 [12 31]]

 SVC
TRAIN 0.8770491803278688
TEST 0.9302325581395349
{'C': 0.1, 'degree': 2, 'gamma': 0.1}
AUC 0.7075405214940098
RECALL 0.9302325581395349
[[16 17]
 [ 3 40]]

 MLPClassifier
TRAIN 1.0
TEST 1.0
{'activation': 'logistic', 'hidden_layer_sizes': (100, 150, 50)}
AUC 0.5
RECALL 1.0
[[ 0 33]
 [ 0 43]]

 LogisticRegression
TRAIN 0.860655737704918
TEST 0.9069767441860465
{'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
AUC 0.6959126145172656
RECALL 0.9069767441860465
[[16 17]
 [ 4 39]]


In [None]:
['cp','ca','thal','oldpeak','exang','thalach','sex','age','target'] - RandomForest, SVC,  Logistic, 

 RandomForestClassifier
TRAIN 0.983216237314598
TEST 0.9154334038054968
{'max_depth': 5, 'n_estimators': 70}
AUC 0.828752642706131
RECALL 0.9302325581395349
[[24  9]
 [ 3 40]]


 SVC
TRAIN 0.950975800156128
TEST 0.8992248062015503
{'C': 5, 'degree': 2, 'gamma': 0.5}
AUC 0.832276250880902
RECALL 0.9069767441860465
[[25  8]
 [ 4 39]]


 LogisticRegression
TRAIN 0.9549570647931304
TEST 0.8886539816772374
{'C': 5, 'penalty': 'l1', 'solver': 'liblinear'}
AUC 0.835799859055673
RECALL 0.8837209302325582
[[26  7]
 [ 5 38]]

 ++++++++++++++++++==++++++++==++++++++*+++++==++++++*++++++++==++++++++++==++++++++++++++++++++++

 ['cp','exang','thalach','sex','age','target'] - GradientBoostingClassifier,  SVC, MLPClassifier

 GradientBoostingClassifier
TRAIN 0.8956674473067915
TEST 0.835799859055673
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 10}
AUC 0.7822410147991544
RECALL 0.8372093023255814
[[24  9]
 [ 7 36]]

 SVC
TRAIN 0.877751756440281
TEST 0.8266384778012684
{'C': 10, 'degree': 2, 'gamma': 0.1} <<<<<++++++++++++++++
AUC 0.7822410147991544
RECALL 0.8372093023255814
[[24  9]
 [ 7 36]]

 MLPClassifier
TRAIN 0.8658079625292741
TEST 0.7935165609584215
{'activation': 'logistic', 'hidden_layer_sizes': (100, 50)}
AUC 0.758985200845666
RECALL 0.7906976744186046
[[24  9]
 [ 9 34]]

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  ['cp','thal','exang','thalach','sex','age','target'] - MLPClassifier, RandomForestClassifier, SVC

RandomForestClassifier
TRAIN 0.9493364558938329
TEST 0.8784355179704016
{'max_depth': 4, 'n_estimators': 70}
AUC 0.8090204369274137
RECALL 0.8604651162790697
[[25  8]
 [ 6 37]]

 SVC
TRAIN 0.9007025761124122
TEST 0.8689217758985202
{'C': 2, 'degree': 2, 'gamma': 0.5}
AUC 0.7973925299506696
RECALL 0.8372093023255814
[[25  8]
 [ 7 36]]

 MLPClassifier
TRAIN 0.8755659640905542
TEST 0.8231148696264976
{'activation': 'logistic', 'hidden_layer_sizes': (100, 100)}
AUC 0.8171247357293868
RECALL 0.9069767441860465
[[24  9]
 [ 4 39]]


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
['cp','ca','exang','thalach','sex','age','target'] - LogisticRegression,  GradientBoostingClassifier, SVC


 GradientBoostingClassifier
TRAIN 0.9450819672131148
TEST 0.8393234672304439
{'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 10}
AUC 0.7938689217758985
RECALL 0.8604651162790697
[[24  9]
 [ 6 37]]

 SVC
TRAIN 0.9450429352068697
TEST 0.8724453840732911
{'C': 2, 'degree': 2, 'gamma': 2}
AUC 0.8125440451021847
RECALL 0.8372093023255814
[[26  7]
 [ 7 36]]



 LogisticRegression
TRAIN 0.9259953161592507
TEST 0.8618745595489782
{'C': 5, 'penalty': 'l1', 'solver': 'saga'}
AUC 0.8090204369274137
RECALL 0.8604651162790697
[[25  8]
 [ 6 37]]

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

df = pd.read_csv('heart.csv')
df = df.loc[:, ['cp','exang','thalach','sex','age','target']]

random_state = 0

X, y = df.drop(columns=['target']), df.loc[:, 'target']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=random_state)

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(random_state=random_state, C=10, gamma=0.1, degree=2).fit(X_train_scaled, y_train)
clf.predict(X_test_scaled)
clf.score(X_test_scaled, y_test)


0.7894736842105263

In [2]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, clf.predict(X_test_scaled))

array([[24,  9],
       [ 7, 36]])

In [11]:
clf.predict(scaler.transform([[1,1,180,1,50]]))

array([1])