In [90]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from imblearn.over_sampling import SMOTE

In [91]:
# Load the preprocessed heart dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df=pd.read_pickle("./pickle_file/PreprocessedHeartDataset.pkl")
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBloodPressure,SerumCholestoral,fastingBloodSugar,RestingECGResults,MaxHeartRate,ExerciseInducedAngina,ST_Depression,Slope,ColoredVessels,Thal_level,target
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873,1
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922,1
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922,1
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922,1
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922,1


## Functions

In [92]:
def get_scores(y_pred,y_test):
    """Print and return the accuracy and F1 score of a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        Ground-truth labels.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` as computed by ``accuracy_score`` and ``f1_score``.
    """
    # Both metrics are called with the ground truth first, then the predictions.
    scores = (accuracy_score(y_test,y_pred), f1_score(y_test,y_pred))
    for label, value in zip(("Accuracy", "F1- Score:"), scores):
        print(label, value)
    return scores

In [93]:
# Optional environment setup, left disabled: pinned versions of the two
# modelling packages this notebook imports.
#!pip install scikit-learn==0.23.1
#!pip install imbalanced-learn==0.7.0

In [94]:
def balance(xtrain,ytrain,random_state=None):
    """Oversample a training set with SMOTE and report the class counts.

    The class distribution is printed before and after resampling.

    Parameters
    ----------
    xtrain : array-like
        Training features, passed to ``SMOTE.fit_resample``.
    ytrain : array-like
        Training labels, passed to ``SMOTE.fit_resample``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``SMOTE``. Pass an integer to make the synthetic samples
        reproducible between runs; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(xtrain_sm, ytrain_sm)``: the resampled features and labels.
    """
    def _class_distribution(labels):
        # Map each distinct label to the number of times it occurs.
        unique,count=np.unique(labels,return_counts=True)
        return dict(zip(unique,count))

    print("Initial class distribution in training set")
    print("-------------------------------------------")
    print(_class_distribution(ytrain))
    print("Balancing using SMOTE")
    print("-"*20)
    smt=SMOTE(random_state=random_state)
    xtrain_sm,ytrain_sm=smt.fit_resample(xtrain,ytrain)
    print("Balancing COMPLETE!\n Distribution after balancing")
    print("-"*20)
    print(_class_distribution(ytrain_sm))
    return xtrain_sm,ytrain_sm
    

### Dividing the data into test and train sets

In [95]:
# Separate the features from the label by column name instead of by
# hard-coded position, so the split does not silently break if the column
# order of the frame changes.
x=df.drop(columns=["target"]).values
y=df["target"].values

In [96]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
xtrain,xtest,ytrain,ytest=train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.3,
)

In [97]:
# Sanity check: display the (rows, columns) size of the held-out features.
xtest.shape

(91, 13)

## Models

### Logistic Regression

In [98]:
# Baseline logistic regression, constructed with no arguments (library defaults).
log_reg=LogisticRegression()


In [99]:
# Train on the training split; the bare call also shows the fitted estimator
# as the cell output.
log_reg.fit(xtrain,ytrain)

LogisticRegression()

In [100]:
# Predict labels for the held-out test features.
y_pred=log_reg.predict(xtest)

In [101]:
print("Evaluating Logistic Regression ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_lr,f1_lr = get_scores(y_pred,ytest)

Evaluating Logistic Regression 
----------
Accuracy 0.7802197802197802
F1- Score: 0.803921568627451


### Random Forest Classifier

In [102]:
from sklearn.ensemble import RandomForestClassifier


In [103]:
# Random forest of 100 trees; random_state is fixed so the fitted forest is
# the same on every run.
rf=RandomForestClassifier(n_estimators=100,random_state=0)
# Train on the training split (the bare call displays the fitted estimator).
rf.fit(xtrain,ytrain)

RandomForestClassifier(random_state=0)

In [104]:
# Predict labels for the held-out test features with the random forest.
y_pred_rf=rf.predict(xtest)

In [105]:
print("Evaluating Random Forest Classifier ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_rf,f1_rf = get_scores(y_pred_rf,ytest)

Evaluating Random Forest Classifier 
----------
Accuracy 0.7362637362637363
F1- Score: 0.76


### Support Vector Classifier

In [106]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel; probability=True asks the
# estimator to also support probability estimates.
sv_classifier = SVC(kernel="linear",probability=True)
# Train on the training split (the bare call displays the fitted estimator).
sv_classifier.fit(xtrain,ytrain)

SVC(kernel='linear', probability=True)

In [107]:
# Predict labels for the held-out test features with the SVC.
y_pred_svc=sv_classifier.predict(xtest)

In [108]:
print("Evaluating Support Vector Classifier ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_svc,f1_svc = get_scores(y_pred_svc,ytest)

Evaluating Support Vector Classifier 
----------
Accuracy 0.7802197802197802
F1- Score: 0.8076923076923077


### K-Nearest Neighbours

In [109]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that votes over the 7 closest training points.
knn=KNeighborsClassifier(n_neighbors=7)
# Train on the training split (the bare call displays the fitted estimator).
knn.fit(xtrain,ytrain)


KNeighborsClassifier(n_neighbors=7)

In [110]:
# Predict labels for the held-out test features with KNN.
y_pred_knn=knn.predict(xtest)

In [111]:
print("Evaluating KNN Classifier ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_knn,f1_knn = get_scores(y_pred_knn,ytest)

Evaluating KNN Classifier 
----------
Accuracy 0.8241758241758241
F1- Score: 0.8399999999999999


## Balancing Classes using SMOTE

In [112]:
# Oversample the training split only; xtest/ytest are not passed in, so the
# evaluation data stays as it was split.
xtrain_sm,ytrain_sm=balance(xtrain,ytrain)

Initial class distribution in training set
-------------------------------------------
{0: 97, 1: 115}
Balancing using SMOTE
--------------------
Balancing COMPLETE!
 Distribution after balancing
--------------------
{0: 115, 1: 115}


### Retraining the models on balanced data

In [113]:
# Fresh logistic regression (library defaults) for the SMOTE-balanced data.
log_reg_bal=LogisticRegression()
# Train on the resampled training set (the bare call displays the estimator).
log_reg_bal.fit(xtrain_sm,ytrain_sm)

LogisticRegression()

In [114]:
# Predict on the same held-out test features used for the unbalanced model.
ypred_bal=log_reg_bal.predict(xtest)

In [115]:
# The label previously read "Logical Regression"; the model is logistic regression.
print("Evaluating Logistic Regression on Balanced Dataset ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_lr_bal,f1_lr_bal = get_scores(ypred_bal,ytest)

Evaluating Logical Regression on Balanced Dataset 
----------
Accuracy 0.7692307692307693
F1- Score: 0.796116504854369


In [116]:
# Same random forest configuration as before (100 trees, fixed seed), this
# time for the SMOTE-balanced data.
rf_bal=RandomForestClassifier(n_estimators=100,random_state=0)
# Train on the resampled training set (the bare call displays the estimator).
rf_bal.fit(xtrain_sm,ytrain_sm)

RandomForestClassifier(random_state=0)

In [117]:
# Predict on the same held-out test features used for the unbalanced model.
ypred_bal_rf=rf_bal.predict(xtest)

In [118]:
print("Evaluating Random Forest Classifier on Balanced Dataset ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_rf_bal,f1_rf_bal = get_scores(ypred_bal_rf,ytest)

Evaluating Random Forest Classifier on Balanced Dataset 
----------
Accuracy 0.7362637362637363
F1- Score: 0.7551020408163266


In [119]:
# Same linear-kernel SVC configuration as before, for the SMOTE-balanced data.
sv_classifier_bal = SVC(kernel="linear",probability=True)
# Train on the resampled training set (the bare call displays the estimator).
sv_classifier_bal.fit(xtrain_sm,ytrain_sm)

SVC(kernel='linear', probability=True)

In [120]:
# Predict on the same held-out test features used for the unbalanced model.
ypred_svc_bal = sv_classifier_bal.predict(xtest)

In [121]:
print("Evaluating Support Vector Classifier on Balanced Dataset ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_svc_bal,f1_svc_bal = get_scores(ypred_svc_bal,ytest)

Evaluating Support Vector Classifier on Balanced Dataset 
----------
Accuracy 0.7912087912087912
F1- Score: 0.8155339805825242


In [128]:
# Same 7-neighbour KNN configuration as before, for the SMOTE-balanced data.
knn_bal=KNeighborsClassifier(n_neighbors=7)
# Train on the resampled training set (the bare call displays the estimator).
knn_bal.fit(xtrain_sm,ytrain_sm)

KNeighborsClassifier(n_neighbors=7)

In [129]:
# Predict on the same held-out test features used for the unbalanced model.
ypred_knn_bal=knn_bal.predict(xtest)

In [130]:
# This cell scores the KNN model; the heading previously named logistic
# regression because of a copy-paste slip.
print("Evaluating KNN Classifier on Balanced Dataset ")
print("-"*10)
# get_scores is declared as get_scores(y_pred, y_test): predictions first,
# ground-truth labels second. The arguments were previously passed swapped.
acc_knn_bal,f1_knn_bal = get_scores(ypred_knn_bal,ytest)

Evaluating Logical Regression on Balanced Dataset 
----------
Accuracy 0.8021978021978022
F1- Score: 0.816326530612245


### Results

In [132]:
# Accuracies in a fixed model order: logistic regression, random forest,
# support vector classifier, KNN. The first list is for the original training
# data, the second for the SMOTE-balanced data.
acc=[acc_lr,acc_rf,acc_svc,acc_knn]
acc_bal=[acc_lr_bal,acc_rf_bal,acc_svc_bal,acc_knn_bal]
acc_bal

[0.7692307692307693,
 0.7362637362637363,
 0.7912087912087912,
 0.8021978021978022]

In [134]:
# F1 scores in the same model order: logistic regression, random forest,
# support vector classifier, KNN. The first list is for the original training
# data, the second for the SMOTE-balanced data.
f1=[f1_lr,f1_rf,f1_svc,f1_knn]
f1_bal=[f1_lr_bal,f1_rf_bal,f1_svc_bal,f1_knn_bal]
f1_bal


[0.796116504854369, 0.7551020408163266, 0.8155339805825242, 0.816326530612245]

In [136]:
# Summary table for the models trained on the original training data:
# one row per model, one column per metric.
# NOTE(review): the first row label reads "Logical Regression"; it is kept
# as-is in case other code looks rows up by label. Likely meant "Logistic".
results = pd.DataFrame(
    {"Accuracy": acc, "f1_Score": f1},
    index=[
        "Logical Regression",
        "Random Forest Classifier",
        "Support Vector Classifier",
        "KNN Classifier",
    ],
)
results

Unnamed: 0,Accuracy,f1_Score
Logical Regression,0.78022,0.803922
Random Forest Classifier,0.736264,0.76
Support Vector Classifier,0.78022,0.807692
KNN Classifier,0.824176,0.84


In [137]:
# Summary table for the models trained on the SMOTE-balanced data:
# one row per model, one column per metric.
# NOTE(review): the first row label reads "Logical Regression"; it is kept
# as-is in case other code looks rows up by label. Likely meant "Logistic".
results_bal = pd.DataFrame(
    {"Accuracy": acc_bal, "f1_Score": f1_bal},
    index=[
        "Logical Regression",
        "Random Forest Classifier",
        "Support Vector Classifier",
        "KNN Classifier",
    ],
)
results_bal

Unnamed: 0,Accuracy,f1_Score
Logical Regression,0.769231,0.796117
Random Forest Classifier,0.736264,0.755102
Support Vector Classifier,0.791209,0.815534
KNN Classifier,0.802198,0.816327
