
# IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import math as mt

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# CONNECTING TO DRIVE

In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so files under it can be read by path.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


# LOAD THE DATA

In [3]:
# Read heart_cleveland_upload.csv from the mounted Drive and preview the first five rows.
csv_path = "/content/drive/MyDrive/heart_cleveland_upload.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB
None


# DATA PREPROCESSING

**NULL VALUES TREATMENT**

In [5]:
# Count the missing entries in each column.
df.isna().sum()

age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64

In [6]:
# Module-level list kept for existing references; after each call it holds the
# outliers found by that call only (it no longer accumulates across calls).
outliers = []


def detect_outliers_zscore(df, thres=3):
    """Return the elements of ``df`` whose absolute z-score exceeds ``thres``.

    The z-score of an element is ``(element - mean) / std``, where ``mean`` and
    ``std`` are ``np.mean(df)`` and ``np.std(df)``.

    Parameters
    ----------
    df : sized iterable of numbers
        The values to screen. The function iterates ``df`` element by element,
        so it is meant for a one-dimensional collection (e.g. a list or a
        single DataFrame column), not a whole DataFrame.
    thres : float, default 3
        Elements with ``|z-score| > thres`` are reported.

    Returns
    -------
    list
        The outlying elements, in their original order. An empty list is
        returned for empty input or when the standard deviation is zero
        (or not a number), because no z-score can be formed in those cases.

    Notes
    -----
    Each call starts from a clean result. The module-level ``outliers`` list is
    overwritten with the result of the latest call instead of growing forever.
    """
    if len(df) == 0:
        outliers[:] = []
        return []

    mean = np.mean(df)
    std = np.std(df)

    # `not (std > 0)` is true for both 0 and NaN: in either case dividing by
    # std would yield NaN/inf z-scores, so there is nothing to report.
    if not (std > 0):
        outliers[:] = []
        return []

    found = [value for value in df if np.abs((value - mean) / std) > thres]

    # Keep the module-level list in sync with the latest call only.
    outliers[:] = found
    return found

**DUMMY (ONE-HOT) ENCODING OF CATEGORICAL VARIABLES**

In [7]:
# 'cp', 'thal' and 'slope' are categorical, so each one is expanded into
# indicator (dummy) columns named "<column>_<category>".
a, b, c = (
    pd.get_dummies(df[col], prefix=col)
    for col in ("cp", "thal", "slope")
)

In [8]:
# Place the three dummy-column frames to the right of the existing columns.
frames = [df, a, b, c]
df = pd.concat(frames, axis="columns")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,cp_0,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,slope_0,slope_1,slope_2
0,69,1,0,160,234,1,2,131,0,0.1,...,1,0,0,0,1,0,0,0,1,0
1,69,0,0,140,239,0,0,151,0,1.8,...,1,0,0,0,1,0,0,1,0,0
2,66,0,0,150,226,0,0,114,0,2.6,...,1,0,0,0,1,0,0,0,0,1
3,65,1,0,138,282,1,2,174,0,1.4,...,1,0,0,0,1,0,0,0,1,0
4,64,1,0,110,211,0,2,144,1,1.8,...,1,0,0,0,1,0,0,0,1,0


In [9]:
# The coded 'cp', 'thal' and 'slope' columns are dropped; their dummy columns remain.
df = df.drop(['cp', 'thal', 'slope'], axis=1)
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_0,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,slope_0,slope_1,slope_2
0,69,1,160,234,1,2,131,0,0.1,1,...,1,0,0,0,1,0,0,0,1,0
1,69,0,140,239,0,0,151,0,1.8,2,...,1,0,0,0,1,0,0,1,0,0
2,66,0,150,226,0,0,114,0,2.6,0,...,1,0,0,0,1,0,0,0,0,1
3,65,1,138,282,1,2,174,0,1.4,1,...,1,0,0,0,1,0,0,0,1,0
4,64,1,110,211,0,2,144,1,1.8,0,...,1,0,0,0,1,0,0,0,1,0


**NORMALISATION**

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale; the remaining columns are left untouched.
num_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed further down, so the test rows influence the
# minima/maxima used for scaling. Confirm whether that is acceptable here.
scaler.fit(df[num_vars])
df[num_vars] = scaler.transform(df[num_vars])
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_0,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,slope_0,slope_1,slope_2
0,0.833333,1,0.622642,0.246575,1,2,0.458015,0,0.016129,1,...,1,0,0,0,1,0,0,0,1,0
1,0.833333,0,0.433962,0.257991,0,0,0.610687,0,0.290323,2,...,1,0,0,0,1,0,0,1,0,0
2,0.770833,0,0.528302,0.228311,0,0,0.328244,0,0.419355,0,...,1,0,0,0,1,0,0,0,0,1
3,0.750000,1,0.415094,0.356164,1,2,0.786260,0,0.225806,1,...,1,0,0,0,1,0,0,0,1,0
4,0.729167,1,0.150943,0.194064,0,2,0.557252,1,0.290323,0,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,0.229167,1,0.547170,0.221461,0,0,0.839695,0,0.000000,0,...,0,0,0,1,0,0,1,1,0,0
293,0.208333,1,0.226415,0.212329,0,0,0.526718,0,0.193548,0,...,0,0,0,1,0,0,1,0,1,0
294,0.125000,1,0.245283,0.164384,0,0,0.450382,1,0.258065,0,...,0,0,0,1,0,0,1,0,1,0
295,0.125000,0,0.415094,0.130137,0,0,0.847328,0,0.225806,0,...,0,0,0,1,1,0,0,1,0,0


# SPLITTING THE DATA

In [11]:
from sklearn.model_selection import train_test_split

# 90% of the rows go to training and 10% to testing; the fixed random_state
# makes the split repeatable.
df_train, df_test = train_test_split(df, train_size=0.9, test_size=0.1, random_state=152)

# pop() removes the 'condition' target from each split, so df_train / df_test
# keep only the feature columns afterwards.
y_train, y_test = df_train.pop('condition'), df_test.pop('condition')

# The splits are stored transposed; the model cells transpose them back with
# .T before fitting and scoring, so this orientation must be kept.
x_train, x_test = df_train.T, df_test.T
y_train, y_test = y_train.T, y_test.T



# TRAINING

**LOGISTIC REGRESSION**

In [12]:
# Accuracy per model, in percent: `accuracies` holds training scores,
# `accuracies1` holds testing scores.
accuracies, accuracies1 = {}, {}

# x_train / x_test are stored transposed, hence the .T on every use.
lr = LogisticRegression().fit(x_train.T, y_train.T)

acc = 100 * lr.score(x_train.T, y_train.T)
accuracies['Logistic Regression'] = acc
print("Training Accuracy {:.2f}%".format(acc))

acc1 = 100 * lr.score(x_test.T, y_test.T)
accuracies1['Logistic Regression'] = acc1
print("Testing Accuracy {:.2f}%".format(acc1))

Training Accuracy 85.77%
Testing Accuracy 83.33%


**ERROR METRICS CALCULATION**

In [13]:
# Error metrics of the logistic-regression predictions on the test split.
prediction = lr.predict(x_test.T)

# MSE is computed once and reused for the RMSE.
mse = mean_squared_error(y_test, prediction)
rmse = mt.sqrt(mse)
mae = mean_absolute_error(y_test, prediction)

print('Root mean square error', rmse)
print('Mean square error', mse)
print('Mean absolute error', mae)

Root mean square error 0.408248290463863
Mean square error 0.16666666666666666
Mean absolute error 0.16666666666666666


**KNN**

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors is the "k" of k-NN: how many nearest training samples are consulted.
knn3 = KNeighborsClassifier(n_neighbors = 3)
knn3.fit(x_train.T, y_train.T)
prediction = knn3.predict(x_test.T)

# Score each split once, and record the results in the shared accuracy
# dictionaries like the other model cells do, so that k-NN is not missing
# from any later comparison of the models.
acc = knn3.score(x_train.T, y_train.T)*100
acc1 = knn3.score(x_test.T, y_test.T)*100
accuracies['KNN'] = acc
accuracies1['KNN'] = acc1
print("{} NN Training Score: {:.2f}%".format(3, acc))
print("{} NN Testing Score: {:.2f}%".format(3, acc1))

3 NN Training Score: 89.89%
3 NN Testing Score: 90.00%


**ERROR METRICS CALCULATION**

In [15]:
# Error metrics of the k-NN predictions on the test split.
# The predictions are recomputed here instead of relying on whatever the
# `prediction` variable currently holds: every model cell overwrites that
# name, so running cells out of order would otherwise score another model.
prediction = knn3.predict(x_test.T)

# MSE is computed once and reused for the RMSE.
mse = mean_squared_error(y_test, prediction)
rmse = mt.sqrt(mse)
mae = mean_absolute_error(y_test, prediction)

print('Root mean square error', rmse)
print('Mean square error', mse)
print('Mean absolute error', mae)

Root mean square error 0.31622776601683794
Mean square error 0.1
Mean absolute error 0.1


**DECISION TREE**

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree; random_state fixes the tree's randomised choices.
dtc = DecisionTreeClassifier(max_depth=5, random_state=1).fit(x_train.T, y_train.T)

acc = 100 * dtc.score(x_train.T, y_train.T)
accuracies['Decision Tree'] = acc
print("Decision Tree Training Accuracy {:.2f}%".format(acc))

acc1 = 100 * dtc.score(x_test.T, y_test.T)
accuracies1['Decision Tree'] = acc1
print("Decision Tree Testing Accuracy {:.2f}%".format(acc1))

Decision Tree Training Accuracy 92.88%
Decision Tree Testing Accuracy 76.67%


**ERROR METRICS CALCULATION**

In [17]:
# Error metrics of the decision-tree predictions on the test split.
prediction = dtc.predict(x_test.T)

# MSE is computed once and reused for the RMSE.
mse = mean_squared_error(y_test, prediction)
rmse = mt.sqrt(mse)
mae = mean_absolute_error(y_test, prediction)

print('Root mean square error', rmse)
print('Mean square error', mse)
print('Mean absolute error', mae)

Root mean square error 0.48304589153964794
Mean square error 0.23333333333333334
Mean absolute error 0.23333333333333334


**RANDOM FOREST**

In [18]:
from sklearn.ensemble import RandomForestClassifier

# 1000 depth-limited trees; random_state makes the forest repeatable.
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1).fit(x_train.T, y_train.T)

acc = 100 * rf.score(x_train.T, y_train.T)
accuracies['Random Forest'] = acc
print("Random Forest Algorithm Training Accuracy Score : {:.2f}%".format(acc))

acc1 = 100 * rf.score(x_test.T, y_test.T)
accuracies1['Random Forest'] = acc1
print("Random Forest Algorithm Testing Accuracy Score : {:.2f}%".format(acc1))

Random Forest Algorithm Training Accuracy Score : 93.26%
Random Forest Algorithm Testing Accuracy Score : 83.33%


**ERROR METRICS CALCULATION**

In [19]:
# Error metrics of the random-forest predictions on the test split.
prediction = rf.predict(x_test.T)

# MSE is computed once and reused for the RMSE.
mse = mean_squared_error(y_test, prediction)
rmse = mt.sqrt(mse)
mae = mean_absolute_error(y_test, prediction)

print('Root mean square error', rmse)
print('Mean square error', mse)
print('Mean absolute error', mae)

Root mean square error 0.408248290463863
Mean square error 0.16666666666666666
Mean absolute error 0.16666666666666666
