## Importing the necessary libraries 

In [134]:
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

In [135]:
data= pd.read_csv('heart.csv')

In [136]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [137]:
data.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [138]:
data.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [139]:
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [140]:
data.shape

(918, 12)

In [141]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [142]:
data.HeartDisease.value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

## Removing the outliers 

In [143]:
def remove_outliers(data, columns, threshold=3):
    for column in columns:
        z_scores = (data[column] - data[column].mean()) / data[column].std()
        data[column] = data[column][abs(z_scores) < threshold]
    return data

columns_to_remove_outliers = ['Cholesterol', 'RestingBP', 'MaxHR', 'Oldpeak']
data = remove_outliers(data, columns_to_remove_outliers, threshold=3)
data.describe()


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,910.0,915.0,918.0,917.0,911.0,918.0
mean,53.510893,132.052747,197.597814,0.233115,136.89313,0.864544,0.553377
std,9.432617,17.177564,107.51165,0.423046,25.347358,1.010323,0.497414
min,28.0,80.0,0.0,0.0,63.0,-2.0,0.0
25%,47.0,120.0,173.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,222.0,0.0,138.0,0.5,1.0
75%,60.0,140.0,266.5,0.0,156.0,1.5,1.0
max,77.0,185.0,518.0,1.0,202.0,4.0,1.0


## Label Encoding - Converting catogorical variables into numerical variable

In [144]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

columns_to_encode = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for i in columns_to_encode:
    data[i]= le.fit_transform(data[i])
    # Print mapping
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print("Label Encoding Mapping:",i ,  mapping)
data.head()

Label Encoding Mapping: Sex {'F': 0, 'M': 1}
Label Encoding Mapping: ChestPainType {'ASY': 0, 'ATA': 1, 'NAP': 2, 'TA': 3}
Label Encoding Mapping: RestingECG {'LVH': 0, 'Normal': 1, 'ST': 2}
Label Encoding Mapping: ExerciseAngina {'N': 0, 'Y': 1}
Label Encoding Mapping: ST_Slope {'Down': 0, 'Flat': 1, 'Up': 2}


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140.0,289.0,0,1,172.0,0,0.0,2,0
1,49,0,2,160.0,180.0,0,1,156.0,0,1.0,1,1
2,37,1,1,130.0,283.0,0,2,98.0,0,0.0,2,0
3,48,0,0,138.0,214.0,0,1,108.0,1,1.5,1,1
4,54,1,2,150.0,195.0,0,1,122.0,0,0.0,2,0


In [99]:
X = data.drop(['HeartDisease'] , axis=1)
y= data['HeartDisease']

## Removing Null values 

In [100]:
data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         8
Cholesterol       3
FastingBS         0
RestingECG        0
MaxHR             1
ExerciseAngina    0
Oldpeak           7
ST_Slope          0
HeartDisease      0
dtype: int64

In [101]:
X['Oldpeak'].fillna(data['Oldpeak'].mean(), inplace=True)
X['RestingBP'].fillna(data['RestingBP'].mean(), inplace=True)
X['Cholesterol'].fillna(data['Cholesterol'].mean(), inplace=True)
X['MaxHR'].fillna(data['MaxHR'].mean(), inplace=True)

In [102]:
X.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64

## Scaling the features in the data

In [103]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.4331398 ,  0.51595242,  0.22903206, ..., -0.8235563 ,
        -0.85946411,  1.05211381],
       [-0.47848359, -1.93816322,  1.27505906, ..., -0.8235563 ,
         0.13465956, -0.59607813],
       [-1.75135854,  0.51595242,  0.22903206, ..., -0.8235563 ,
        -0.85946411,  1.05211381],
       ...,
       [ 0.37009972,  0.51595242, -0.81699495, ...,  1.21424608,
         0.3334843 , -0.59607813],
       [ 0.37009972, -1.93816322,  0.22903206, ..., -0.8235563 ,
        -0.85946411, -0.59607813],
       [-1.64528563,  0.51595242,  1.27505906, ..., -0.8235563 ,
        -0.85946411,  1.05211381]])

In [104]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=20)

In [105]:
X_train.shape

(734, 11)

In [106]:
X_test.shape

(184, 11)

In [107]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

scores = cross_val_score(SVC(), X, y, cv=5)
scores.mean()

0.6873307198859587

In [108]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(base_estimator=SVC(), n_estimators=100, max_samples=0.8, random_state=0)
scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

0.6873129009265859

As you can see above, using bagging in case of SVM doesn't make much difference in terms of model accuracy. Bagging is effective when we have high variance and instable model such as decision tree. Let's explore how bagging changes the performance for a decision tree classifier.

In [109]:
## Train a model using decision tree and then using bagging
from sklearn.tree import DecisionTreeClassifier

scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=5)
scores.mean()

0.7472440009503444

In [110]:
## Use bagging now with decision tree

bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(random_state=0), 
    n_estimators=100, 
    max_samples=0.9, 
    oob_score=True,
    random_state=0
)

scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

0.806016868614873

You can see that with bagging the score improved from 74.72% to 80.60%

In [111]:
## Train a model using Random Forest which itself uses bagging underneath
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(), X, y, cv=5)
scores.mean()

0.8321632216678546

Random forest gave even a better performance with 83% as score. Underneath it used bagging where it sampled not only data rows but also the columns (or features)

## Using boosting model

In [112]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [113]:
## Decision Tree classifier as the weak learner
weak_learner = DecisionTreeClassifier(max_depth=1)

In [114]:
Ada_boost = AdaBoostClassifier(base_estimator=weak_learner, n_estimators=10, learning_rate=1.0, random_state=42)

In [115]:
# Train the AdaBoost model
Ada_boost.fit(X_train, y_train)

In [116]:
y_pred = Ada_boost.predict(X_test)

In [117]:
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 88.59%


## Exporting the model 

In [43]:
import pickle


# Export the model
with open('ada_boost_model.pkl', 'wb') as file:
    pickle.dump(Ada_boost, file)

In [85]:
with open('ada_boost_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [132]:
model.predict([[-1.64528563,  0.51595242,  1.27505906, -0.99763612, -1.84193734,
       -0.55134134,  0.01725451, -0.35123318,  1.21424608, -0.85946411,
       -0.59607813]])

array([1], dtype=int64)