In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
#loading the data into a dataframe
df = pd.read_csv("heart.csv")
df.shape

(303, 14)

In [3]:
#checking for duplicates 
df.duplicated().sum()

1

In [4]:
#dropping the duplicates
df.drop_duplicates()
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
#dropping null entries
df.dropna()
df.shape

(303, 14)

In [6]:
#splitting the dataframe into input features and targets
X = df.drop(["output"], axis= 1)  #input features
y = df["output"]

m, n = X.shape
print(f"Dimensions of X and y: {X.shape, y.shape}")

Dimensions of X and y: ((303, 13), (303,))


In [7]:
#scaling down the features and converting X&y to numpy.ndarray
con_cols = ["age","trtbps","chol","thalachh","oldpeak"]
X[con_cols] = RobustScaler().fit_transform(X[con_cols])

X.head

<bound method NDFrame.head of           age  sex  cp  trtbps      chol  fbs  restecg  thalachh  exng  \
0    0.592593    1   3    0.75 -0.110236    1        0 -0.092308     0   
1   -1.333333    1   2    0.00  0.157480    0        1  1.046154     0   
2   -1.037037    0   1    0.00 -0.566929    0        0  0.584615     0   
3    0.074074    1   1   -0.50 -0.062992    0        1  0.769231     0   
4    0.148148    0   0   -0.50  1.795276    0        1  0.307692     1   
..        ...  ...  ..     ...       ...  ...      ...       ...   ...   
298  0.148148    0   0    0.50  0.015748    0        1 -0.923077     1   
299 -0.740741    1   3   -1.00  0.377953    0        1 -0.646154     0   
300  0.962963    1   0    0.70 -0.740157    1        1 -0.369231     0   
301  0.148148    1   0    0.00 -1.716535    0        1 -1.169231     1   
302  0.148148    0   1    0.00 -0.062992    0        0  0.646154     0   

     oldpeak  slp  caa  thall  
0     0.9375    0    0      1  
1     1.6875    0

In [8]:
X = X.values
y = y.values

In [9]:
#splitting X and y into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

print(f"Shape of X_train and X_test: {X_train.shape, X_test.shape}")
print(f"Shape of y_train and y_test: {y_train.shape, y_test.shape}")

Shape of X_train and X_test: ((242, 13), (61, 13))
Shape of y_train and y_test: ((242,), (61,))


In [10]:
#fitting the model to the training set
model = LogisticRegression().fit(X_train, y_train)

In [11]:
pred_test = model.predict(X_test)

k = pred_test.shape[0]
c = 0.

for i in range(k):
    if(pred_test[i]!=y_test[i]):
        c += 1

print(f"Mispredictions rate on test set: {(c/k)*100}%")

Mispredictions rate on test set: 14.754098360655737%
