In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('heart.csv')

In [4]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [5]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [7]:
# Changing column datatype

df['oldpeak'] = df['oldpeak'].astype(float)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [10]:
# Changing multiple columns datatype 

change_col = ['oldpeak']

df[change_col] = df[change_col].astype(int)

df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0,2,0,2,1


In [11]:
df.dtypes

age         int64
sex         int64
cp          int64
trestbps    int64
chol        int64
fbs         int64
restecg     int64
thalach     int64
exang       int64
oldpeak     int64
slope       int64
ca          int64
thal        int64
target      int64
dtype: object

In [12]:
# Counting Males and Females

countMale = len(df[df.sex == 1])
countFemale = len(df[df.sex == 0])

total = countMale + countFemale

percentMale = ((countMale/total) * 100)
percentFemale = ((countFemale/total) * 100)

print("Percentage of males are: {:.2f}%".format(percentMale))
print("Percentage of females are: {:.2f}%".format(percentFemale))

Percentage of males are: 69.56%
Percentage of females are: 30.44%


In [13]:
healthy = len(df[df.target == 0])
unhealthy = len(df[df.target == 1])

total = healthy + unhealthy

percentHealthy = (healthy/total) * 100
percentUnhealthy = (unhealthy/total) * 100

print("Pecentage of healthy people are: {:.2f}%".format(percentHealthy))
print("Pecentage of unhealthy people are: {:.2f}%".format(percentUnhealthy))

Pecentage of healthy people are: 48.68%
Pecentage of unhealthy people are: 51.32%


In [14]:
df.groupby('target').mean()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,56.569138,0.827655,0.482966,134.106212,251.292585,0.164329,0.456914,139.130261,0.549098,1.274549,1.166333,1.158317,2.539078
1,52.408745,0.570342,1.378327,129.245247,240.979087,0.134981,0.598859,158.585551,0.134981,0.342205,1.593156,0.370722,2.119772


In [15]:
summary_df = df['age'].agg(['mean', 'std', 'min', 'max'])

# We first target the age column. Then, we calculate the mean, standard deviation, min. value and max value
# of the age column

summary_df

mean    54.434146
std      9.072290
min     29.000000
max     77.000000
Name: age, dtype: float64

In [16]:
a = pd.get_dummies(df['cp'], prefix = "cp")
b = pd.get_dummies(df['thal'], prefix = "thal")
c = pd.get_dummies(df['slope'], prefix = "slope")

frames = [df, a, b, c]

df = pd.concat(frames, axis = 1)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2
0,52,1,0,125,212,0,1,168,0,1,...,0,0,0,0,0,0,1,0,0,1
1,53,1,0,140,203,1,0,155,1,3,...,0,0,0,0,0,0,1,1,0,0
2,70,1,0,145,174,0,1,125,1,2,...,0,0,0,0,0,0,1,1,0,0
3,61,1,0,148,203,0,1,161,0,0,...,0,0,0,0,0,0,1,0,0,1
4,62,0,0,138,294,1,1,106,0,1,...,0,0,0,0,0,1,0,0,1,0


In [17]:
df = df.drop(['cp', 'thal', 'slope'], axis=1)

In [18]:
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2
0,52,1,125,212,0,1,168,0,1,2,...,0,0,0,0,0,0,1,0,0,1
1,53,1,140,203,1,0,155,1,3,0,...,0,0,0,0,0,0,1,1,0,0
2,70,1,145,174,0,1,125,1,2,0,...,0,0,0,0,0,0,1,1,0,0
3,61,1,148,203,0,1,161,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,62,0,138,294,1,1,106,0,1,3,...,0,0,0,0,0,1,0,0,1,0


DATA MODEL BUILDING

In [22]:
y = df.target.values
x_data = df.drop(['target'], axis = 1)

In [23]:
x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data)).values

  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=0)

In [25]:
clf = LogisticRegression()
clf.fit(x_train, y_train)

In [26]:
y_pred = clf.predict(x_test)

In [27]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8634146341463415
