In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import RobustScaler

# Train Test Split
from sklearn.model_selection import train_test_split

# Models
import torch
import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
print('Packages imported...')

Packages imported...


In [4]:
df = pd.read_csv('heart.csv')
df = df.rename(columns={'age':'Age', 'sex':'Sex', 'cp':'Chest_pain', 'trestbps':'Resting_blood_pressure','chol':'Cholesterol','fbs':'Fasting_blood_sugar',
                           'restecg':'ECG_results_rest', 'thalach':'Maximum_HR','exang':'Exercise_induced_agina','oldpeak':'ST_depression_by_exercise',
                           'ca':'no_major_vessels','thal':'Thalassemia_types','target':'Heart_atack','slope':'ST_slope'})
df

Unnamed: 0,Age,Sex,Chest_pain,Resting_blood_pressure,Cholesterol,Fasting_blood_sugar,ECG_results_rest,Maximum_HR,Exercise_induced_agina,ST_depression_by_exercise,ST_slope,no_major_vessels,Thalassemia_types,Heart_atack
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


 Show which columns have unique values, to identify categorical and continuos columns

In [5]:
dict ={}
for i in list(df.columns):
    dict[i] = df[i].value_counts().shape[0]
pd.DataFrame(dict, index=['unique_count']).T

Unnamed: 0,unique_count
Age,41
Sex,2
Chest_pain,4
Resting_blood_pressure,49
Cholesterol,152
Fasting_blood_sugar,2
ECG_results_rest,3
Maximum_HR,91
Exercise_induced_agina,2
ST_depression_by_exercise,40


In [12]:
dict

{'Age': 41,
 'Sex': 2,
 'Chest_pain': 4,
 'Resting_blood_pressure': 49,
 'Cholesterol': 152,
 'Fasting_blood_sugar': 2,
 'ECG_results_rest': 3,
 'Maximum_HR': 91,
 'Exercise_induced_agina': 2,
 'ST_depression_by_exercise': 40,
 'ST_slope': 3,
 'no_major_vessels': 5,
 'Thalassemia_types': 4,
 'Heart_atack': 2}

In [6]:
continuos = ['Age', 'Resting_blood_pressure','Cholesterol','Maximum_HR', 'ST_depression_by_exercise']
categorical = ['Sex', 'Chest_pain', 'Fasting_blood_sugar', 'ECG_results_rest', 'Exercise_induced_agina','ST_slope','no_major_vessels','Thalassemia_types']
target = ['Heart_atack']

In [7]:
df[continuos].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,303.0,54.366337,9.082101,29.0,47.5,55.0,61.0,77.0
Resting_blood_pressure,303.0,131.623762,17.538143,94.0,120.0,130.0,140.0,200.0
Cholesterol,303.0,246.264026,51.830751,126.0,211.0,240.0,274.5,564.0
Maximum_HR,303.0,149.646865,22.905161,71.0,133.5,153.0,166.0,202.0
ST_depression_by_exercise,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


### Scaling and ecoding features

In [8]:
# Get dummies convert the categorical data into 0's and 1's 
df1 = df
df1 = pd.get_dummies(df, columns = categorical, drop_first = True)
df1.head()

Unnamed: 0,Age,Resting_blood_pressure,Cholesterol,Maximum_HR,ST_depression_by_exercise,Heart_atack,Sex_1,Chest_pain_1,Chest_pain_2,Chest_pain_3,...,Exercise_induced_agina_1,ST_slope_1,ST_slope_2,no_major_vessels_1,no_major_vessels_2,no_major_vessels_3,no_major_vessels_4,Thalassemia_types_1,Thalassemia_types_2,Thalassemia_types_3
0,63,145,233,150,2.3,1,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,37,130,250,187,3.5,1,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,41,130,204,172,1.4,1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
3,56,120,236,178,0.8,1,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,57,120,354,163,0.6,1,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0


In [9]:
x = df1.drop(['Heart_atack'], axis=1)
y = df1[['Heart_atack']]

# Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)
scaler = RobustScaler()


x[continuos] = scaler.fit_transform(x[continuos])

x.head()

Unnamed: 0,Age,Resting_blood_pressure,Cholesterol,Maximum_HR,ST_depression_by_exercise,Sex_1,Chest_pain_1,Chest_pain_2,Chest_pain_3,Fasting_blood_sugar_1,...,Exercise_induced_agina_1,ST_slope_1,ST_slope_2,no_major_vessels_1,no_major_vessels_2,no_major_vessels_3,no_major_vessels_4,Thalassemia_types_1,Thalassemia_types_2,Thalassemia_types_3
0,0.592593,0.75,-0.110236,-0.092308,0.9375,1,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1,-1.333333,0.0,0.15748,1.046154,1.6875,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,-1.037037,0.0,-0.566929,0.584615,0.375,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,0.074074,-0.5,-0.062992,0.769231,0.0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0.148148,-0.5,1.795276,0.307692,-0.125,0,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0


### Split Dataset

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=3)
print("The shape of X_train is      ", x_train.shape)
print("The shape of X_test is       ",x_test.shape)
print("The shape of y_train is      ",y_train.shape)
print("The shape of y_test is       ",y_test.shape)


The shape of X_train is       (242, 22)
The shape of X_test is        (61, 22)
The shape of y_train is       (242, 1)
The shape of y_test is        (61, 1)


### Support Vector Machines SVM

In [11]:
clf = SVC(kernel='linear', C=1, random_state=3).fit(x_train, y_train)
# Prediction*
y_predict = clf.predict(x_test)

accuracy_score(y_test, y_predict)

0.8688524590163934

In [25]:
randon = np.arange(1,25)
predictions = []
for i, r in enumerate(randon):
    clf = SVC(kernel='linear', C=r, random_state=1).fit(x_train, y_train)
# Prediction*
    y_predict = clf.predict(x_test)
    predictions.append(accuracy_score(y_test, y_predict))
print(f'Highest prediction {np.max(predictions)}, index: {1+predictions.index(np.max(predictions))}')

Highest prediction 0.8688524590163934, index: 1


### Logistic regression

In [23]:
logreg = LogisticRegression()

logreg.fit(x_train, y_train)
# calculating the probabilities
y_pred_proba = logreg.predict_log_proba(x_test)
y_pred = np.argmax(y_pred_proba, axis=1)
print('Logistic Regression Accuracy', accuracy_score(y_test, y_predict))

Logistic Regression Accuracy 0.8688524590163934


In [22]:
# instantiating the object
logreg = LogisticRegression()

# fitting the object
logreg.fit(x_train, y_train)

# calculating the probabilities
y_pred_proba = logreg.predict_proba(x_test)

# finding the predicted valued
y_pred = np.argmax(y_pred_proba,axis=1)

# printing the test accuracy
print("The test accuracy score of Logistric Regression is ", accuracy_score(y_test, y_pred))

The test accuracy score of Logistric Regression is  0.8852459016393442


In [29]:
random_state = np.arange(1,25)
prediction = []

train_accuracy = []
test_accuracy =[]

for i, r_s in enumerate(random_state):
    hr_model = RandomForestClassifier(random_state=r_s)
    hr_model.fit(x_train, y_train)
    yi_predict = hr_model.predict(x_test)
    
    
    prediction.append(accuracy_score(y_test, yi_predict))
print(f'Hisghest prediction, {np.max(prediction)}, Random State: {1+prediction.index(np.max(prediction))}')

Hisghest prediction, 0.8688524590163934, Random State: 1
