# IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import math as mt

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


# CONNECTING TO DRIVE

In [2]:
# Mount Google Drive inside the Colab filesystem so files under it can be read.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# IMPORT DATASET

In [3]:
# Read the heart-disease CSV from the mounted Drive and preview its last rows.
csv_path = "/content/drive/MyDrive/heart.csv"
df = pd.read_csv(csv_path)
df.tail(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0



# PREPROCESSING THE DATA

In [4]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
None


In [5]:
# Count the missing entries in each column (isna is the canonical name of isnull).
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

# Splitting the data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df, train_size=0.9, test_size=0.1, random_state=152)

# Detach the label column from each partition; what remains are the features.
y_train, y_test = df_train.pop('target'), df_test.pop('target')

# The feature frames are kept transposed: the cells that use them call .T
# again before handing them to an estimator.
x_train, x_test = df_train.T, df_test.T

# Transposing a Series yields the Series itself, so the labels are unchanged.
y_train, y_test = y_train.T, y_test.T

# Training the Model Using Logistic Regression

In [None]:
# Fit a logistic-regression baseline on the features exactly as loaded
# (no scaling or one-hot encoding has been applied at this point).
# NOTE(review): the default iteration budget may be too small for unscaled
# inputs -- check for a ConvergenceWarning when this cell runs.
lr = LogisticRegression()
lr.fit(x_train.T, y_train.T)

# Accuracy on each partition, expressed as a percentage.
acc = 100 * lr.score(x_train.T, y_train.T)
acc1 = 100 * lr.score(x_test.T, y_test.T)

# Per-model score books: `accuracies` holds training accuracy,
# `accuracies1` holds testing accuracy.
accuracies = {'Logistic Regression': acc}
accuracies1 = {'Logistic Regression': acc1}

In [8]:
# Report the train/test accuracy computed for the current model.
for template, value in (
    ("Training Accuracy {:.2f}%", acc),
    ("Testing Accuracy {:.2f}%", acc1),
):
    print(template.format(value))

Training Accuracy 85.29%
Testing Accuracy 93.55%


In [9]:
import math as mt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error metrics of the logistic-regression model on the held-out TEST rows.
prediction = lr.predict(x_test.T)

# y_test is a Series, so y_test.T is the same data; the MSE is computed once
# and reused for the root-mean-square error.
mse = mean_squared_error(y_test, prediction)
rmse = mt.sqrt(mse)
mae = mean_absolute_error(y_test, prediction)

print('Root mean square error', rmse)
print('Mean square error', mse)
print('Mean absolute error', mae)

Root mean square error 0.254000254000381
Mean square error 0.06451612903225806
Mean absolute error 0.06451612903225806


In [10]:
# Predict the test-set labels with the fitted logistic-regression model.
prediction = lr.predict(x_test.transpose())

In [11]:
# Side-by-side table of the true test labels and the model's predictions.
comparison_columns = {
    'Actual values': y_test.T,
    'Predicted values': prediction,
}
d = pd.DataFrame(comparison_columns)
d

Unnamed: 0,Actual values,Predicted values
131,1,1
162,1,1
45,1,1
255,0,0
258,0,1
2,1,1
21,1,1
1,1,1
213,0,0
38,1,1


In [12]:
import pickle

# Persist the fitted logistic-regression model to lr.pkl.
# The context manager guarantees the file is flushed and closed; the previous
# open(...) call was never closed, leaving that to garbage collection.
with open('lr.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)


# Preprocessing

**DUMMY VARIABLE ENCODING**

In [13]:
# 'cp', 'thal' and 'slope' are categorical codes, so each one is expanded into
# indicator (dummy) columns named "<column>_<value>".
a, b, c = (
    pd.get_dummies(df[column], prefix=column)
    for column in ('cp', 'thal', 'slope')
)

In [14]:
# Append the indicator columns to the right of the existing columns.
# NOTE: this rebinds `df`, so re-running the cell would append the same
# indicator columns a second time.
frames = [df] + [a, b, c]
df = pd.concat(frames, axis='columns')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2
0,63,1,3,145,233,1,0,150,0,2.3,...,0,0,1,0,1,0,0,1,0,0
1,37,1,2,130,250,0,1,187,0,3.5,...,0,1,0,0,0,1,0,1,0,0
2,41,0,1,130,204,0,0,172,0,1.4,...,1,0,0,0,0,1,0,0,0,1
3,56,1,1,120,236,0,1,178,0,0.8,...,1,0,0,0,0,1,0,0,0,1
4,57,0,0,120,354,0,1,163,1,0.6,...,0,0,0,0,0,1,0,0,0,1


In [15]:
# The original categorical columns are now redundant with their indicator
# columns, so remove them.
encoded_sources = ['cp', 'thal', 'slope']
df = df.drop(encoded_sources, axis=1)
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2
0,63,1,145,233,1,0,150,0,2.3,0,...,0,0,1,0,1,0,0,1,0,0
1,37,1,130,250,0,1,187,0,3.5,0,...,0,1,0,0,0,1,0,1,0,0
2,41,0,130,204,0,0,172,0,1.4,0,...,1,0,0,0,0,1,0,0,0,1
3,56,1,120,236,0,1,178,0,0.8,0,...,1,0,0,0,0,1,0,0,0,1
4,57,0,120,354,0,1,163,1,0.6,0,...,0,0,0,0,0,1,0,0,0,1


# Normalisation

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Continuous columns to rescale.
num_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak',]

# Fit the scaler on the TRAINING rows only. Fitting on the whole frame lets the
# min/max of rows that later form the test set leak into the features.
# The row positions are obtained with the same train_size / test_size /
# random_state as the train/test split performed in the next cell, which
# therefore selects the same rows -- keep the two sets of parameters in sync.
train_pos, holdout_pos = train_test_split(
    np.arange(len(df)), train_size = 0.9, test_size = 0.1, random_state = 152
)

scaler = MinMaxScaler()
scaler.fit(df.iloc[train_pos][num_vars])

# Apply the training-derived scaling to every row; held-out rows may fall
# slightly outside [0, 1], which is expected.
df[num_vars] = scaler.transform(df[num_vars])
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2
0,0.708333,1,0.481132,0.244292,1,0,0.603053,0,0.370968,0,...,0,0,1,0,1,0,0,1,0,0
1,0.166667,1,0.339623,0.283105,0,1,0.885496,0,0.564516,0,...,0,1,0,0,0,1,0,1,0,0
2,0.250000,0,0.339623,0.178082,0,0,0.770992,0,0.225806,0,...,1,0,0,0,0,1,0,0,0,1
3,0.562500,1,0.245283,0.251142,0,1,0.816794,0,0.129032,0,...,1,0,0,0,0,1,0,0,0,1
4,0.583333,0,0.245283,0.520548,0,1,0.702290,1,0.096774,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.583333,0,0.433962,0.262557,0,1,0.396947,1,0.032258,0,...,0,0,0,0,0,0,1,0,1,0
299,0.333333,1,0.150943,0.315068,0,1,0.465649,0,0.193548,0,...,0,0,1,0,0,0,1,0,1,0
300,0.812500,1,0.471698,0.152968,1,1,0.534351,0,0.548387,2,...,0,0,0,0,0,0,1,0,1,0
301,0.583333,1,0.339623,0.011416,0,1,0.335878,1,0.193548,1,...,0,0,0,0,0,0,1,0,1,0


In [17]:
from sklearn.model_selection import train_test_split

# Re-split the encoded, normalised frame with the same proportions and seed as
# the first split.
df_train, df_test = train_test_split(
    df,
    train_size=0.9,
    test_size=0.1,
    random_state=152,
)

# Pull the labels out of each partition (a Series is its own transpose).
y_train = df_train.pop('target').T
y_test = df_test.pop('target').T

# Features stay transposed; the model cells below transpose them back with .T.
x_train = df_train.T
x_test = df_test.T

# TRAIN THE MODEL

**KNN**

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier and predict the held-out rows.
knn3 = KNeighborsClassifier(n_neighbors = 3)
knn3.fit(x_train.T, y_train.T)
prediction = knn3.predict(x_test.T)

# Accuracy on each partition, as a percentage (same convention as the other
# model cells).
acc = knn3.score(x_train.T, y_train.T)*100
acc1 = knn3.score(x_test.T, y_test.T)*100

# Record the scores. The previous code stored the raw prediction array under
# accuracies1['KNN'] and never recorded the training accuracy at all.
accuracies['KNN'] = acc
accuracies1['KNN'] = acc1

print("{} NN Training Score: {:.2f}%".format(knn3.n_neighbors, acc))
print("{} NN Testing Score: {:.2f}%".format(knn3.n_neighbors, acc1))

3 NN Training Score: 88.97%
3 NN Testing Score: 93.55%


**ERROR METRICS CALCULATION**

In [19]:
# Test-set error metrics for the values currently held in `prediction`
# (set by the KNN cell directly above when cells are run in order).
mse = mean_squared_error(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
rmse = mt.sqrt(mse)

for label, value in (
    ('Root mean square error', rmse),
    ('Mean square error', mse),
    ('Mean absolute error', mae),
):
    print(label, value)


Root mean square error 0.254000254000381
Mean square error 0.06451612903225806
Mean absolute error 0.06451612903225806


**DECISION TREE**

In [20]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree. random_state is fixed (matching the value used
# for the random forest) because the tree builder permutes features at every
# split; without a seed the fitted tree and its scores can differ between runs.
dtc = DecisionTreeClassifier(max_depth=5, random_state=1)
dtc.fit(x_train.T, y_train.T)

# Accuracy on each partition, as a percentage.
acc = dtc.score(x_train.T, y_train.T)*100
acc1 = dtc.score(x_test.T, y_test.T)*100

# Record training accuracy in `accuracies`, testing accuracy in `accuracies1`.
accuracies['Decision Tree'] = acc
accuracies1['Decision Tree'] = acc1

print("Decision Tree Training Accuracy {:.2f}%".format(acc))
print("Decision Tree Testing Accuracy {:.2f}%".format(acc1))

Decision Tree Training Accuracy 92.28%
Decision Tree Testing Accuracy 77.42%


**ERROR METRICS CALCULATION**

In [21]:
# Test-set error metrics for the decision tree.
prediction = dtc.predict(x_test.T)

# One MSE computation feeds both the MSE and RMSE figures.
mse = mean_squared_error(y_test, prediction)
rmse = mt.sqrt(mse)
print('Root mean square error', rmse)
print('Mean square error', mse)

mae = mean_absolute_error(y_test, prediction)
print('Mean absolute error', mae)

Root mean square error 0.47519096331149147
Mean square error 0.22580645161290322
Mean absolute error 0.22580645161290322


**RANDOM FOREST**

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest of 1000 trees, each limited to depth 5.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=5,
    random_state=1,
)
rf.fit(x_train.T, y_train.T)

# Accuracy on each partition, as a percentage.
acc = 100 * rf.score(x_train.T, y_train.T)
acc1 = 100 * rf.score(x_test.T, y_test.T)

# Training accuracy goes to `accuracies`, testing accuracy to `accuracies1`.
accuracies.update({'Random Forest': acc})
accuracies1.update({'Random Forest': acc1})

print("Random Forest Algorithm Training Accuracy Score : {:.2f}%".format(acc))
print("Random Forest Algorithm Testing Accuracy Score : {:.2f}%".format(acc1))

Random Forest Algorithm Training Accuracy Score : 93.01%
Random Forest Algorithm Testing Accuracy Score : 90.32%


In [23]:
# Test-set error metrics for the random forest.
prediction = rf.predict(x_test.T)

# Compute everything first, then report in a fixed order.
mse = mean_squared_error(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
rmse = mt.sqrt(mse)

print('Root mean square error', rmse)
print('Mean square error', mse)
print('Mean absolute error', mae)

Root mean square error 0.3110855084191276
Mean square error 0.0967741935483871
Mean absolute error 0.0967741935483871
