In [38]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

In [39]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df

def preprocessing_data(df, categorical_features, numerical_features):
  imputer = SimpleImputer(strategy="most_frequent")
  imputer_mean = SimpleImputer(strategy="mean")
  df[categorical_features] = imputer.fit_transform(df[categorical_features])
  df[numerical_features] = imputer_mean.fit_transform(df[numerical_features])

  ordinal_enc = OrdinalEncoder()
  df[categorical_features] = ordinal_enc.fit_transform(df[categorical_features])

  scaler = StandardScaler()
  df[numerical_features] = scaler.fit_transform(df[numerical_features])

  return df

def train_logistic_regression(X_train, y_train):
  model = LogisticRegression()
  model.fit(X_train, y_train)
  return model

def evaluate_model(model, X_test, y_test):
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  classification_rep = classification_report(y_test, y_pred)

  print(f'Accuracy: {accuracy:.2f}')
  print('Classification Report:')
  print(classification_rep)

def undersample_data(X_train, y_train):
  sampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
  X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
  return X_resampled, y_resampled

def oversample_data(X_train, y_train):
  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
  return X_resampled, y_resampled

# **Heart Disease**

In [None]:
file_path = "dataset/heart_2020_cleaned.csv"
df = load_data(file_path)
df.head()

In [41]:
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [42]:
df.shape

(319795, 18)

In [43]:
df.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [45]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [53]:
# seq_categorical_value = ['AgeCategory', 'GenHealth']
# unseq_categorical_value = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'Race', 'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
# numerical_value = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

categorical_features = df.select_dtypes(include=['object']).columns.tolist()
categorical_features

['HeartDisease',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [54]:
numerical_features = df.select_dtypes(include=['float64']).columns.tolist()
numerical_features

['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

In [58]:
X_preprocess = preprocessing_data(df, categorical_features, numerical_features)

In [63]:
X_preprocess.shape

(319795, 18)

In [59]:
X = X_preprocess.drop(columns=['HeartDisease'], axis=1)
X.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,-1.84475,1.0,0.0,0.0,-0.046751,3.281069,0.0,0.0,7.0,5.0,2.0,1.0,4.0,-1.460354,1.0,0.0,1.0
1,-1.256338,0.0,0.0,1.0,-0.42407,-0.490039,0.0,0.0,12.0,5.0,0.0,1.0,4.0,-0.067601,0.0,0.0,0.0
2,-0.274603,1.0,0.0,0.0,2.091388,3.281069,0.0,1.0,9.0,5.0,2.0,1.0,1.0,0.628776,1.0,0.0,0.0
3,-0.647473,0.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,11.0,5.0,0.0,0.0,2.0,-0.763977,0.0,0.0,1.0
4,-0.726138,0.0,0.0,0.0,3.097572,-0.490039,1.0,0.0,4.0,5.0,0.0,1.0,4.0,0.628776,0.0,0.0,0.0


In [60]:
y = X_preprocess['HeartDisease']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: HeartDisease, dtype: float64

In [71]:
X_train_resampled, y_train_resampled = oversample_data(X, y)

In [72]:
X_train_resampled.shape

(584844, 17)

In [73]:
X_train_resampled.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,-1.84475,1.0,0.0,0.0,-0.046751,3.281069,0.0,0.0,7.0,5.0,2.0,1.0,4.0,-1.460354,1.0,0.0,1.0
1,-1.256338,0.0,0.0,1.0,-0.42407,-0.490039,0.0,0.0,12.0,5.0,0.0,1.0,4.0,-0.067601,0.0,0.0,0.0
2,-0.274603,1.0,0.0,0.0,2.091388,3.281069,0.0,1.0,9.0,5.0,2.0,1.0,1.0,0.628776,1.0,0.0,0.0
3,-0.647473,0.0,0.0,0.0,-0.42407,-0.490039,0.0,0.0,11.0,5.0,0.0,0.0,2.0,-0.763977,0.0,0.0,1.0
4,-0.726138,0.0,0.0,0.0,3.097572,-0.490039,1.0,0.0,4.0,5.0,0.0,1.0,4.0,0.628776,0.0,0.0,0.0


In [74]:
y_train_resampled.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: HeartDisease, dtype: float64

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42, shuffle=True)

In [76]:
model = train_logistic_regression(X_train, y_train)

In [77]:
evaluate_model(model, X_test, y_test)

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75     58485
         1.0       0.74      0.77      0.75     58484

    accuracy                           0.75    116969
   macro avg       0.75      0.75      0.75    116969
weighted avg       0.75      0.75      0.75    116969

