In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [15]:
from ast import Raise
def load_data(path_file):
  return pd.read_csv(path_file)

def split_data(X, y, test_size, random_state=42):
  return train_test_split(X, y, test_size=test_size, random_state=random_state)

def preprocess_data(df, categorical_features, numerical_features):
  imputer = SimpleImputer(strategy="most_frequent")
  imputer_mean = SimpleImputer(strategy="mean")
  df[categorical_features] = imputer.fit_transform(df[categorical_features])
  df[numerical_features] = imputer_mean.fit_transform(df[numerical_features])

  ordinal_enc = OrdinalEncoder()
  df[categorical_features] = ordinal_enc.fit_transform(df[categorical_features])

  scaler = StandardScaler()
  df[numerical_features] = scaler.fit_transform(df[numerical_features])

  return df

def train_model(X_train, y_train, n_estimators):
  model = XGBClassifier(n_estimators=n_estimators)
  model.fit(X_train, y_train)
  return model

def evaluate_model(model, X_test, y_test):
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  classification_rep = classification_report(y_test, y_pred)

  print(f'Accuracy: {accuracy:.2f}')
  print('Classification Report:')
  print(classification_rep)

def sample_data(X_train, y_train, strategy):
  if strategy == 'oversample':
    sampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

  elif strategy == 'undersample':
    sampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

  else:
    raise ValueError("Strategy must be 'oversample' or 'undersample'.")

  return X_resampled, y_resampled

In [16]:
df = load_data("dataset/heart_2020_cleaned.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [17]:
df.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [18]:
df.shape

(319795, 18)

In [38]:
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
categorical_features

['HeartDisease',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [39]:
numerical_features = df.select_dtypes(include=['float64']).columns.tolist()
numerical_features

['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

In [43]:
X_preprocess = preprocess_data(df, categorical_features, numerical_features)
X_preprocess.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0.0,-1.84475,1.0,0.0,0.0,-0.046751,3.281069,0.0,0.0,7.0,5.0,2.0,1.0,4.0,-1.460354,1.0,0.0,1.0
1,0.0,-1.256338,0.0,0.0,1.0,-0.42407,-0.490039,0.0,0.0,12.0,5.0,0.0,1.0,4.0,-0.067601,0.0,0.0,0.0
2,0.0,-0.274603,1.0,0.0,0.0,2.091388,3.281069,0.0,1.0,9.0,5.0,2.0,1.0,1.0,0.628776,1.0,0.0,0.0
3,0.0,-0.647473,0.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,11.0,5.0,0.0,0.0,2.0,-0.763977,0.0,0.0,1.0
4,0.0,-0.726138,0.0,0.0,0.0,3.097572,-0.490039,1.0,0.0,4.0,5.0,0.0,1.0,4.0,0.628776,0.0,0.0,0.0


In [44]:
X_preprocess.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [51]:
X_heart = X_preprocess.drop(columns=['HeartDisease'], axis=1)
X_heart.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,-1.84475,1.0,0.0,0.0,-0.046751,3.281069,0.0,0.0,7.0,5.0,2.0,1.0,4.0,-1.460354,1.0,0.0,1.0
1,-1.256338,0.0,0.0,1.0,-0.42407,-0.490039,0.0,0.0,12.0,5.0,0.0,1.0,4.0,-0.067601,0.0,0.0,0.0
2,-0.274603,1.0,0.0,0.0,2.091388,3.281069,0.0,1.0,9.0,5.0,2.0,1.0,1.0,0.628776,1.0,0.0,0.0
3,-0.647473,0.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,11.0,5.0,0.0,0.0,2.0,-0.763977,0.0,0.0,1.0
4,-0.726138,0.0,0.0,0.0,3.097572,-0.490039,1.0,0.0,4.0,5.0,0.0,1.0,4.0,0.628776,0.0,0.0,0.0


In [52]:
y_heart = X_preprocess['HeartDisease']
y_heart.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: HeartDisease, dtype: float64

## **Without Oversampling and Undersampling**

In [53]:
X_train, X_test, y_train, y_test = split_data(X_heart, y_heart, 0.2)

In [54]:
X_train.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
95877,-0.785923,1.0,0.0,0.0,-0.42407,-0.490039,0.0,1.0,11.0,5.0,0.0,1.0,4.0,-0.067601,0.0,0.0,0.0
228939,-0.136153,1.0,0.0,1.0,3.349118,-0.490039,0.0,1.0,7.0,5.0,0.0,1.0,2.0,-0.763977,0.0,0.0,0.0
260256,0.68668,0.0,0.0,0.0,-0.172524,-0.238631,0.0,1.0,6.0,3.0,0.0,0.0,4.0,0.628776,0.0,0.0,0.0
84785,0.471139,0.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,1.0,5.0,0.0,1.0,0.0,0.628776,0.0,0.0,0.0
83845,-0.581395,1.0,0.0,0.0,-0.172524,0.766997,0.0,1.0,12.0,5.0,2.0,1.0,2.0,-0.067601,0.0,0.0,0.0


In [55]:
X_test.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
271884,-0.109407,1.0,0.0,0.0,-0.42407,2.652551,0.0,0.0,1.0,3.0,0.0,1.0,4.0,-0.067601,0.0,0.0,0.0
270361,-1.003038,0.0,0.0,0.0,-0.42407,2.024033,0.0,0.0,2.0,5.0,0.0,1.0,0.0,-0.763977,0.0,0.0,1.0
219060,0.471139,1.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,4.0,5.0,0.0,1.0,4.0,-0.763977,1.0,0.0,0.0
24010,1.891824,0.0,0.0,0.0,3.349118,-0.490039,0.0,0.0,9.0,5.0,0.0,0.0,2.0,0.628776,0.0,0.0,0.0
181930,1.146082,1.0,0.0,0.0,3.349118,3.281069,1.0,0.0,8.0,5.0,0.0,0.0,1.0,-2.15673,1.0,0.0,1.0


In [56]:
y_train.head()

95877     0.0
228939    1.0
260256    0.0
84785     0.0
83845     1.0
Name: HeartDisease, dtype: float64

In [57]:
y_test.head()

271884    0.0
270361    0.0
219060    0.0
24010     0.0
181930    0.0
Name: HeartDisease, dtype: float64

In [58]:
model = train_model(X_train, y_train, 100)

In [59]:
evaluate_model(model, X_test, y_test)

Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95     58367
         1.0       0.50      0.10      0.16      5592

    accuracy                           0.91     63959
   macro avg       0.71      0.54      0.56     63959
weighted avg       0.88      0.91      0.88     63959



## **With Oversampling**

In [60]:
X_over, y_over = sample_data(X_heart, y_heart, "oversample")

In [61]:
X_train_over, X_test_over, y_train_over, y_test_over = split_data(X_over, y_over, 0.2)

In [62]:
X_train_over.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
35797,-0.719845,0.0,0.0,0.0,-0.298297,-0.490039,0.0,1.0,0.0,5.0,0.0,1.0,4.0,-0.763977,0.0,0.0,0.0
364365,-0.505877,1.0,0.0,0.0,-0.42407,-0.490039,0.0,1.0,11.0,5.0,0.0,1.0,4.0,-0.067601,0.0,0.0,1.0
353877,-0.177058,0.0,0.0,0.0,-0.42407,-0.490039,0.0,1.0,10.0,5.0,0.0,1.0,4.0,0.628776,0.0,0.0,0.0
369932,-0.391026,1.0,0.0,1.0,0.079022,0.138479,1.0,1.0,8.0,5.0,0.0,0.0,3.0,-0.763977,0.0,0.0,0.0
347938,-0.243137,0.0,0.0,0.0,-0.42407,-0.490039,0.0,1.0,11.0,5.0,1.0,1.0,2.0,-0.067601,0.0,0.0,1.0


In [63]:
X_test_over.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
498280,0.779505,1.0,0.0,1.0,0.833659,-0.490039,0.0,0.0,8.0,5.0,0.0,1.0,2.0,-1.460354,0.0,0.0,0.0
103928,0.628468,1.0,0.0,0.0,3.349118,-0.490039,0.0,1.0,10.0,2.0,2.0,0.0,1.0,-0.763977,0.0,0.0,0.0
134719,-0.592408,0.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,1.0,3.0,0.0,1.0,2.0,3.414282,0.0,0.0,0.0
6075,1.435568,0.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,5.0,4.0,2.0,1.0,4.0,-0.763977,0.0,0.0,0.0
547424,-0.276176,1.0,0.0,1.0,-0.172524,-0.112928,0.0,1.0,11.0,5.0,2.0,1.0,1.0,0.628776,0.0,1.0,1.0


In [64]:
y_train_over.head()

35797     0.0
364365    1.0
353877    1.0
369932    1.0
347938    1.0
Name: HeartDisease, dtype: float64

In [65]:
y_test_over.head()

498280    1.0
103928    0.0
134719    0.0
6075      0.0
547424    1.0
Name: HeartDisease, dtype: float64

In [66]:
model_over = train_model(X_train_over, y_train_over, 100)

In [67]:
evaluate_model(model_over, X_test_over, y_test_over)

Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.74      0.78     58485
         1.0       0.76      0.83      0.79     58484

    accuracy                           0.79    116969
   macro avg       0.79      0.79      0.79    116969
weighted avg       0.79      0.79      0.79    116969



## **With Undersampling**

In [68]:
X_under, y_under = sample_data(X_heart, y_heart, "undersample")

In [69]:
X_train_under, X_test_under, y_train_under, y_test_under = split_data(X_under, y_under, 0.2)

In [70]:
model_under = train_model(X_train_under, y_train_under, 100)

In [71]:
evaluate_model(model_under, X_test_under, y_test_under)

Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.73      0.75      5434
         1.0       0.75      0.79      0.77      5516

    accuracy                           0.76     10950
   macro avg       0.76      0.76      0.76     10950
weighted avg       0.76      0.76      0.76     10950

