# Heart Failure Dataset

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns


# Data Prepration and cleaning
''' 1. Load the file using Pandas
    2. Look at some information about the data & the columns
    3. Fix any missing values or incoreect values'''

In [2]:
df = pd.read_csv('Downloads/heart.csv')

In [3]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
''' 1. Normal BP range is between 90/60 and 120/180
    2. Normal Cholestrol range is 150 mg
    3. Normal Fasting Blood Sugar level is 5.6 to 6.0 mml
    4. Old peak range is between 0-6
    5. Heart disease is between 25-110'''

' 1. Normal BP range is between 90/60 and 120/180\n    2. Normal Cholestrol range is 150 mg\n    3. Normal Fasting Blood Sugar level is 5.6 to 6.0 mml\n    4. Old peak range is between 0-6\n    5. Heart disease is between 25-110'

In [6]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [7]:
Counter(df['Sex'])

Counter({'M': 725, 'F': 193})

In [8]:
Counter(df['ChestPainType'])

Counter({'ATA': 173, 'NAP': 203, 'ASY': 496, 'TA': 46})

In [9]:
Counter(df["ExerciseAngina"])

Counter({'N': 547, 'Y': 371})

In [10]:
Counter(df['RestingECG'])

Counter({'Normal': 552, 'ST': 178, 'LVH': 188})

In [11]:
Counter(df['ST_Slope'])

Counter({'Up': 395, 'Flat': 460, 'Down': 63})

In [12]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

## Categorical Columns
1. Sex
2. ChestPainType
3. RestingECG
4. ExcerciseAngnia
5. ST_Slope

In [13]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [14]:
#Feature Engineering
# Converting Male and Female to 0 to 1.
df_sex = pd.get_dummies(df['Sex'],drop_first=True)
df_sex

Unnamed: 0,M
0,1
1,0
2,1
3,0
4,1
...,...
913,1
914,1
915,1
916,0


In [15]:
df['Sex']=df_sex

In [16]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    uint8  
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(4), uint8(1)
memory usage: 79.9+ KB


In [18]:
# Converting Object type to integer
df['Sex']= df['Sex'].astype(int)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int32(1), int64(6), object(4)
memory usage: 82.6+ KB


In [20]:
# Using Map function Maping categorical to numerical feature for machine learning alogorithmn
df['ChestPainType'] = df['ChestPainType'].map({'ATA': 0, 'NAP': 1, 'ASY': 2, 'TA': 3})

In [21]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,0,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,1,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,0,130,283,0,ST,98,N,0.0,Up,0
3,48,0,2,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,1,150,195,0,Normal,122,N,0.0,Up,0


In [22]:
df['ChestPainType'] = df['ChestPainType'].astype(int)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int32(2), int64(6), object(3)
memory usage: 79.0+ KB


In [24]:
# Droping first coloumn as we need only one feature.
df_excersise = pd.get_dummies(df['ExerciseAngina'],drop_first=True)
df_excersise

Unnamed: 0,Y
0,0
1,0
2,0
3,1
4,0
...,...
913,0
914,0
915,1
916,0


In [25]:
df['ExerciseAngina']=  df_excersise
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,0,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,1,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,0,130,283,0,ST,98,0,0.0,Up,0
3,48,0,2,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,1,150,195,0,Normal,122,0,0.0,Up,0


In [26]:
Counter(df['RestingECG'])

Counter({'Normal': 552, 'ST': 178, 'LVH': 188})

In [27]:
df['RestingECG']= df['RestingECG'].map({'Normal':0,'ST':1,'LVH':2})

In [28]:
Counter(df['RestingECG'])

Counter({0: 552, 1: 178, 2: 188})

In [29]:
df['RestingECG']= df['RestingECG'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    uint8  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int32(3), int64(6), object(1), uint8(1)
memory usage: 69.2+ KB


In [30]:
df['ST_Slope'] = df['ST_Slope'].map({'Up': 0, 'Flat': 1, 'Down': 2})

In [31]:
Counter(df['ST_Slope'])

Counter({0: 395, 1: 460, 2: 63})

In [32]:
df['ST_Slope'] = df['ST_Slope'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    uint8  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int32  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int32(4), int64(6), uint8(1)
memory usage: 65.6 KB


In [33]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,0,140,289,0,0,172,0,0.0,0,0
1,49,0,1,160,180,0,0,156,0,1.0,1,1
2,37,1,0,130,283,0,1,98,0,0.0,0,0
3,48,0,2,138,214,0,0,108,1,1.5,1,1
4,54,1,1,150,195,0,0,122,0,0.0,0,0


In [34]:
# Converted all categorical feature to Numerical Feature
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [35]:
# Creating training , test, dataset for dependent and independent variables
scaler = StandardScaler()
scaler.fit_transform(df)

array([[-1.4331398 ,  0.51595242, -1.70557305, ..., -0.83243239,
        -1.05211381, -1.11311472],
       [-0.47848359, -1.93816322, -0.53099236, ...,  0.10566353,
         0.59607813,  0.89837999],
       [-1.75135854,  0.51595242, -1.70557305, ..., -0.83243239,
        -1.05211381, -1.11311472],
       ...,
       [ 0.37009972,  0.51595242,  0.64358833, ...,  0.29328271,
         0.59607813,  0.89837999],
       [ 0.37009972, -1.93816322, -1.70557305, ..., -0.83243239,
         0.59607813,  0.89837999],
       [-1.64528563,  0.51595242, -0.53099236, ..., -0.83243239,
        -1.05211381, -1.11311472]])

In [36]:
X = df.drop("HeartDisease", axis = 1)
y = df['HeartDisease']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [38]:
model = LogisticRegression()

In [39]:
# Fitting the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [40]:
pred = model.predict(X_test)

In [41]:
print("Training set score:", model.score(X_train, y_train))
print("Test set score:", model.score(X_test, y_test))

Training set score: 0.8691860465116279
Test set score: 0.8304347826086956


In [42]:
result = confusion_matrix(y_test, pred)
print('Result of the confusion matrix is:\n', result)

Result of the confusion matrix is:
 [[ 69  23]
 [ 16 122]]


In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [44]:
parameters = {'n_neighbors':[3,5,11,19],
             'weights':['uniform','distance'],
             'metric':['minkowski','manhattan']}

In [45]:
clf = GridSearchCV(KNeighborsClassifier(),parameters,cv=3,
                  verbose=1,n_jobs=-1)

In [46]:
clf.fit(X,y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['minkowski', 'manhattan'],
                         'n_neighbors': [3, 5, 11, 19],
                         'weights': ['uniform', 'distance']},
             verbose=1)

In [47]:
clf.best_score_

0.7309368191721132

In [48]:
clf.best_estimator_

KNeighborsClassifier(metric='manhattan', n_neighbors=19, weights='distance')

In [49]:
clf.best_params_

{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'distance'}

1. It seems that KNieghborsClassifier is not fitted for this model.
2. Accuracy seems to be low may be we need to evaluate it one more.
3. LogisticRegressor performs well with this dataset.