In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
from ast import Raise
def load_data(path_file):
  """Read a CSV source into a pandas DataFrame.

  Args:
    path_file: Location of the CSV, handed directly to ``pd.read_csv``.

  Returns:
    The DataFrame produced by ``pd.read_csv``.
  """
  frame = pd.read_csv(path_file)
  return frame

def split_data(X, y, test_size, random_state=42):
  """Partition features and labels into train and test subsets.

  Args:
    X: Feature matrix.
    y: Labels aligned with ``X``.
    test_size: Passed to ``train_test_split`` as ``test_size``.
    random_state: Seed for the shuffle (default 42).

  Returns:
    Whatever ``train_test_split`` returns for two inputs; callers unpack it
    as ``X_train, X_test, y_train, y_test``.
  """
  parts = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
  )
  return parts

def preprocess_data(df, categorical_features, numerical_features):
  """Impute, encode and scale the feature columns of ``df``.

  Categorical columns are filled with their most frequent value and then
  ordinal-encoded; numerical columns are filled with their mean and then
  standardized. Every transformer is fit on the rows of ``df`` itself and is
  discarded afterwards, so the same fitted transformation cannot be
  re-applied to other data.

  Args:
    df: Feature DataFrame. It is left unmodified; a copy is transformed.
    categorical_features: Names of the categorical columns (may be empty).
    numerical_features: Names of the numerical columns (may be empty).

  Returns:
    A new DataFrame with the listed columns replaced by their transformed
    values; columns not listed are carried over unchanged.
  """
  # Work on a copy so the caller's frame (and anything already displayed
  # from it) is not silently overwritten.
  processed = df.copy()
  cat_cols = list(categorical_features)
  num_cols = list(numerical_features)

  # sklearn transformers reject zero-column input, so skip empty groups.
  if cat_cols:
    cat_imputer = SimpleImputer(strategy="most_frequent")
    processed[cat_cols] = cat_imputer.fit_transform(processed[cat_cols])
    # OrdinalEncoder numbers categories in sorted order, not by any
    # domain-specific ranking.
    processed[cat_cols] = OrdinalEncoder().fit_transform(processed[cat_cols])

  if num_cols:
    num_imputer = SimpleImputer(strategy="mean")
    processed[num_cols] = num_imputer.fit_transform(processed[num_cols])
    processed[num_cols] = StandardScaler().fit_transform(processed[num_cols])

  return processed

def train_model(X_train, y_train, n_estimators, random_state=42):
  """Fit a random-forest classifier on the given training data.

  Args:
    X_train: Training feature matrix.
    y_train: Training labels.
    n_estimators: Number of trees in the forest.
    random_state: Seed for the forest (default 42).

  Returns:
    The fitted ``RandomForestClassifier``.
  """
  forest = RandomForestClassifier(
    n_estimators=n_estimators,
    random_state=random_state,
  )
  # fit() returns the estimator itself, so the fitted model can be returned directly.
  return forest.fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
  """Print accuracy and a per-class classification report for ``model``.

  Args:
    model: Fitted estimator exposing ``predict``.
    X_test: Feature matrix to predict on.
    y_test: True labels for ``X_test``.

  Returns:
    None. The metrics are written to stdout only.
  """
  predictions = model.predict(X_test)

  # Compute every metric before printing so nothing is emitted if one fails.
  report_lines = [
    f'Accuracy: {accuracy_score(y_test, predictions):.2f}',
    'Classification Report:',
    classification_report(y_test, predictions),
  ]
  print(*report_lines, sep='\n')

def sample_data(X_train, y_train, strategy, random_state=42):
  """Rebalance a dataset by random over- or under-sampling.

  Args:
    X_train: Feature matrix to resample.
    y_train: Labels aligned with ``X_train``.
    strategy: ``'oversample'`` uses ``RandomOverSampler`` with
      ``sampling_strategy='minority'``; ``'undersample'`` uses
      ``RandomUnderSampler`` with ``sampling_strategy='majority'``.
    random_state: Seed passed to the sampler. Defaults to 42, the value that
      was previously hard-coded, so existing calls behave the same.

  Returns:
    The ``(X_resampled, y_resampled)`` pair returned by ``fit_resample``.

  Raises:
    ValueError: If ``strategy`` is neither ``'oversample'`` nor
      ``'undersample'``.
  """
  if strategy == 'oversample':
    sampler = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
  elif strategy == 'undersample':
    sampler = RandomUnderSampler(sampling_strategy='majority', random_state=random_state)
  else:
    raise ValueError("Strategy must be 'oversample' or 'undersample'.")

  # Both samplers expose the same fit_resample interface, so resample once here.
  return sampler.fit_resample(X_train, y_train)

In [3]:
# Load the heart-disease CSV; the path is relative to the notebook's working directory.
# NOTE(review): the outputs below show 66310 rows with exactly one row missing
# its trailing columns — possibly a truncated file; verify against the source data.
df = load_data("dataset/heart_2020_cleaned.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,66310.0,66310.0,66310.0,66309.0
mean,27.989062,3.541683,3.978118,7.103772
std,6.305444,8.106946,8.027488,1.512356
min,12.13,0.0,0.0,1.0
25%,23.69,0.0,0.0,6.0
50%,26.96,0.0,0.0,7.0
75%,31.09,2.0,3.0,8.0
max,87.05,30.0,30.0,24.0


In [5]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66310 entries, 0 to 66309
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HeartDisease      66310 non-null  object 
 1   BMI               66310 non-null  float64
 2   Smoking           66310 non-null  object 
 3   AlcoholDrinking   66310 non-null  object 
 4   Stroke            66310 non-null  object 
 5   PhysicalHealth    66310 non-null  float64
 6   MentalHealth      66310 non-null  float64
 7   DiffWalking       66310 non-null  object 
 8   Sex               66310 non-null  object 
 9   AgeCategory       66310 non-null  object 
 10  Race              66309 non-null  object 
 11  Diabetic          66309 non-null  object 
 12  PhysicalActivity  66309 non-null  object 
 13  GenHealth         66309 non-null  object 
 14  SleepTime         66309 non-null  float64
 15  Asthma            66309 non-null  object 
 16  KidneyDisease     66309 non-null  object

In [6]:
# Missing values per column; preprocess_data imputes these later.
df.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                1
Diabetic            1
PhysicalActivity    1
GenHealth           1
SleepTime           1
Asthma              1
KidneyDisease       1
SkinCancer          1
dtype: int64

In [7]:
# (rows, columns) of the raw frame.
df.shape

(66310, 18)

In [8]:
# List the column names of the raw frame.
# NOTE(review): `columns` is not referenced again in the visible cells.
columns = df.columns
columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [9]:
# Feature matrix: every column except the target.
X_heart = df.drop(columns='HeartDisease')
X_heart.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [10]:
# Missing values per feature column, before imputation.
X_heart.isna().sum()

BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                1
Diabetic            1
PhysicalActivity    1
GenHealth           1
SleepTime           1
Asthma              1
KidneyDisease       1
SkinCancer          1
dtype: int64

In [11]:
# Target labels: the 'HeartDisease' column, kept as its original string values.
y_heart = df['HeartDisease']
y_heart.head()

0    No
1    No
2    No
3    No
4    No
Name: HeartDisease, dtype: object

In [12]:
# Object-dtype columns are treated as the categorical features.
categorical_features = X_heart.select_dtypes(include=['object']).columns.tolist()
categorical_features

['Smoking',
 'AlcoholDrinking',
 'Stroke',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [13]:
# Select every numeric dtype, not only float64, so integer-typed columns
# would also be imputed and scaled instead of silently falling out of both
# feature lists.
numerical_features = X_heart.select_dtypes(include=['number']).columns.tolist()
numerical_features

['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

In [14]:
# Impute, ordinal-encode and standardize the features.
# NOTE(review): this runs on the full dataset before the train/test split in a
# later cell, so the imputer and scaler statistics are fit on rows that end up
# in the test set — confirm whether that is acceptable for this comparison.
X_preprocess = preprocess_data(X_heart, categorical_features, numerical_features)
X_preprocess.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,-1.80624,1.0,0.0,0.0,-0.066818,3.241622,0.0,0.0,7.0,5.0,2.0,1.0,4.0,-1.391077,1.0,0.0,1.0
1,-1.213098,0.0,0.0,1.0,-0.436873,-0.495566,0.0,0.0,12.0,5.0,0.0,1.0,4.0,-0.068617,0.0,0.0,0.0
2,-0.223469,1.0,0.0,0.0,2.030165,3.241622,0.0,1.0,9.0,5.0,2.0,1.0,1.0,0.592613,1.0,0.0,0.0
3,-0.599338,0.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,11.0,5.0,0.0,0.0,2.0,-0.729847,0.0,0.0,1.0
4,-0.678635,0.0,0.0,0.0,3.016981,-0.495566,1.0,0.0,4.0,5.0,0.0,1.0,4.0,0.592613,0.0,0.0,0.0


In [15]:
# Confirm that imputation left no missing values.
X_preprocess.isna().sum()

BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

## **Without Oversampling and Undersampling**

In [16]:
# Hold out 20% of the rows for testing (split_data defaults to random_state=42).
X_train, X_test, y_train, y_test = split_data(X_preprocess, y_heart, test_size=0.2)

In [17]:
# Preview the training features.
X_train.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
45443,-0.900667,1.0,0.0,0.0,0.179886,-0.370993,0.0,0.0,9.0,5.0,0.0,1.0,1.0,-1.391077,0.0,0.0,1.0
8718,0.225353,1.0,0.0,1.0,0.179886,2.618757,1.0,1.0,12.0,3.0,2.0,1.0,3.0,0.592613,0.0,0.0,0.0
52402,0.782019,0.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,5.0,3.0,2.0,1.0,2.0,0.592613,0.0,0.0,0.0
9679,-0.015711,0.0,0.0,0.0,-0.436873,-0.495566,0.0,1.0,7.0,5.0,0.0,1.0,4.0,0.592613,0.0,0.0,0.0
40406,-0.743658,1.0,0.0,0.0,-0.436873,-0.495566,0.0,1.0,4.0,3.0,0.0,0.0,1.0,-0.068617,0.0,0.0,0.0


In [18]:
# Preview the test features.
X_test.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
61146,-0.370962,0.0,0.0,0.0,3.263685,-0.495566,1.0,0.0,12.0,5.0,0.0,0.0,1.0,-1.391077,0.0,1.0,1.0
21558,-0.347173,1.0,1.0,0.0,-0.436873,-0.495566,0.0,1.0,6.0,1.0,0.0,1.0,0.0,0.592613,0.0,0.0,0.0
44964,0.488619,0.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,8.0,5.0,0.0,1.0,4.0,0.592613,0.0,0.0,0.0
2338,-0.803924,0.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,8.0,5.0,3.0,1.0,0.0,-0.729847,0.0,0.0,1.0
21317,-0.083906,0.0,0.0,0.0,-0.436873,1.995892,0.0,0.0,7.0,3.0,0.0,1.0,2.0,0.592613,0.0,0.0,0.0


In [19]:
# Preview the training labels.
y_train.head()

45443     No
8718     Yes
52402     No
9679      No
40406     No
Name: HeartDisease, dtype: object

In [20]:
# Preview the test labels.
y_test.head()

61146    No
21558    No
44964    No
2338     No
21317    No
Name: HeartDisease, dtype: object

In [21]:
# Baseline: 100-tree random forest trained on the unresampled training split.
model = train_model(X_train, y_train, n_estimators=100)

In [22]:
# Score the baseline on the test split. The classes are imbalanced, so read the
# per-class recall in the report rather than the headline accuracy.
evaluate_model(model, X_test, y_test)

Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

          No       0.92      0.99      0.95     12103
         Yes       0.42      0.10      0.16      1159

    accuracy                           0.91     13262
   macro avg       0.67      0.54      0.56     13262
weighted avg       0.88      0.91      0.88     13262



## **With Oversampling**

In [23]:
X_over, y_over = sample_data(X_preprocess, y_heart, "oversample")

In [24]:
X_train_over, X_test_over, y_train_over, y_test_over = split_data(X_over, y_over, 0.2)

In [25]:
# Preview the training features for the oversampling experiment.
X_train_over.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
84036,0.03504,1.0,0.0,0.0,-0.436873,-0.121847,0.0,0.0,10.0,4.0,2.0,1.0,2.0,-1.391077,0.0,0.0,0.0
4243,0.680518,0.0,0.0,0.0,-0.19017,-0.495566,0.0,0.0,9.0,5.0,0.0,1.0,0.0,-0.068617,0.0,0.0,0.0
87461,0.219009,1.0,0.0,1.0,-0.066818,-0.24642,0.0,1.0,8.0,5.0,0.0,1.0,4.0,-0.729847,1.0,0.0,0.0
7200,-0.215539,0.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,10.0,5.0,0.0,1.0,0.0,-0.068617,0.0,0.0,0.0
94271,0.318923,1.0,1.0,0.0,2.030165,3.241622,1.0,1.0,3.0,5.0,0.0,0.0,3.0,-2.052307,1.0,0.0,1.0


In [26]:
# Preview the test features for the oversampling experiment.
X_test_over.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
61933,-0.09818,0.0,0.0,0.0,-0.436873,-0.495566,0.0,1.0,8.0,1.0,0.0,0.0,4.0,-0.729847,1.0,0.0,0.0
44834,-0.459775,1.0,1.0,0.0,-0.436873,-0.495566,0.0,1.0,11.0,5.0,0.0,1.0,0.0,0.592613,0.0,0.0,0.0
36823,1.630498,1.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,3.0,4.0,0.0,1.0,4.0,0.592613,0.0,0.0,0.0
111759,0.206321,0.0,0.0,0.0,-0.19017,-0.495566,1.0,0.0,12.0,5.0,0.0,1.0,4.0,0.592613,0.0,0.0,1.0
21941,1.135684,0.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,1.0,3.0,0.0,1.0,2.0,0.592613,0.0,0.0,0.0


In [27]:
# Preview the training labels for the oversampling experiment.
y_train_over.head()

84036    Yes
4243      No
87461    Yes
7200      No
94271    Yes
Name: HeartDisease, dtype: object

In [28]:
# Preview the test labels for the oversampling experiment.
y_test_over.head()

61933      No
44834      No
36823      No
111759    Yes
21941      No
Name: HeartDisease, dtype: object

In [29]:
# Same 100-tree forest, trained on the oversampling experiment's training data.
model_over = train_model(X_train_over, y_train_over, n_estimators=100)

In [30]:
# Score the oversampling model on X_test_over / y_test_over.
evaluate_model(model_over, X_test_over, y_test_over)

Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

          No       1.00      0.95      0.97     12237
         Yes       0.95      1.00      0.98     11990

    accuracy                           0.98     24227
   macro avg       0.98      0.98      0.98     24227
weighted avg       0.98      0.98      0.98     24227



## **With Undersampling**

In [31]:
X_under, y_under = sample_data(X_preprocess, y_heart, "undersample")

In [32]:
X_train_under, X_test_under, y_train_under, y_test_under = split_data(X_under, y_under, 0.2)

In [33]:
# Preview the training features for the undersampling experiment.
X_train_under.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
7188,-0.377306,0.0,0.0,0.0,1.290054,0.127299,0.0,0.0,12.0,5.0,0.0,0.0,1.0,0.592613,0.0,0.0,1.0
5441,0.981848,0.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,1.0,3.0,0.0,1.0,2.0,-0.068617,0.0,0.0,0.0
2406,-0.244086,1.0,0.0,0.0,-0.436873,-0.495566,0.0,1.0,6.0,3.0,0.0,0.0,2.0,0.592613,0.0,0.0,0.0
6143,0.927926,1.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,11.0,5.0,0.0,1.0,2.0,-0.729847,0.0,0.0,0.0
5354,-1.443059,1.0,0.0,0.0,-0.436873,-0.495566,0.0,0.0,10.0,5.0,0.0,1.0,4.0,0.592613,0.0,1.0,0.0


In [34]:
# Preview the test features for the undersampling experiment.
X_test_under.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
4048,-0.342415,0.0,0.0,0.0,-0.436873,1.373028,0.0,0.0,6.0,5.0,0.0,1.0,4.0,0.592613,0.0,0.0,0.0
8402,0.401392,1.0,1.0,0.0,3.263685,1.373028,0.0,1.0,8.0,5.0,1.0,0.0,4.0,-0.068617,0.0,0.0,0.0
3704,0.453729,0.0,0.0,0.0,-0.436873,-0.495566,0.0,1.0,10.0,5.0,1.0,1.0,0.0,-0.068617,0.0,0.0,1.0
7685,0.6488,1.0,0.0,0.0,3.263685,3.241622,1.0,1.0,4.0,5.0,2.0,0.0,2.0,-2.052307,0.0,0.0,0.0
2847,-0.106109,0.0,0.0,0.0,-0.436873,-0.495566,0.0,1.0,2.0,5.0,0.0,1.0,0.0,-0.068617,0.0,0.0,0.0


In [35]:
# Preview the training labels for the undersampling experiment.
y_train_under.head()

7188    Yes
5441     No
2406     No
6143    Yes
5354     No
Name: HeartDisease, dtype: object

In [36]:
# Preview the test labels for the undersampling experiment.
y_test_under.head()

4048     No
8402    Yes
3704     No
7685    Yes
2847     No
Name: HeartDisease, dtype: object

In [37]:
# Same 100-tree forest, trained on the undersampling experiment's training data.
model_under = train_model(X_train_under, y_train_under, n_estimators=100)

In [38]:
# Score the undersampling model on X_test_under / y_test_under.
evaluate_model(model_under, X_test_under, y_test_under)

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

          No       0.77      0.73      0.75      1178
         Yes       0.73      0.78      0.75      1120

    accuracy                           0.75      2298
   macro avg       0.75      0.75      0.75      2298
weighted avg       0.75      0.75      0.75      2298

