In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
import joblib

# Load the dataset (replace 'healthcare-dataset-stroke-data.csv' with your file path)
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Drop 'id' column if present
if 'id' in data.columns:
    data = data.drop('id', axis=1)

# Handle missing values
data['bmi'].fillna(data['bmi'].median(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Create derived features
data['age_glucose'] = data['age'] * data['avg_glucose_level']
data['comorbidity'] = data['hypertension'] | data['heart_disease']
data['age_group'] = pd.cut(data['age'], bins=[0, 40, 60, 120], labels=[0, 1, 2], include_lowest=True)
data['age_group'] = data['age_group'].astype(int)

# Define features and target
X = data[['age', 'gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type',
          'avg_glucose_level', 'bmi', 'smoking_status', 'age_glucose', 'comorbidity', 'age_group']]
y = data['stroke']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].median(), inplace=True)


In [2]:
# Initialize and train the Decision Tree
dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt_model.fit(X_train_smote_scaled, y_train_smote)

# Predict on the test set (default threshold of 0.5)
dt_pred = dt_model.predict(X_test_scaled)

# Evaluate the model (default threshold)
print("\nDecision Tree (SMOTE, Default Threshold) Test Set Performance:")
print(classification_report(y_test, dt_pred))


Decision Tree (SMOTE, Default Threshold) Test Set Performance:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       972
           1       0.16      0.28      0.20        50

    accuracy                           0.89      1022
   macro avg       0.56      0.60      0.57      1022
weighted avg       0.92      0.89      0.90      1022



In [3]:
# Get the predicted probabilities for the test set
dt_prob = dt_model.predict_proba(X_test_scaled)[:, 1]

# Adjust the decision threshold to 0.3
threshold = 0.3
dt_pred_adjusted = (dt_prob >= threshold).astype(int)

# Evaluate the model (adjusted threshold)
print("\nDecision Tree (SMOTE, Threshold=0.3) Test Set Performance:")
print(classification_report(y_test, dt_pred_adjusted))


Decision Tree (SMOTE, Threshold=0.3) Test Set Performance:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       972
           1       0.16      0.28      0.20        50

    accuracy                           0.89      1022
   macro avg       0.56      0.60      0.57      1022
weighted avg       0.92      0.89      0.90      1022



In [4]:
from sklearn.naive_bayes import GaussianNB

# Initialize and train the Naive Bayes classifier
nb_model = GaussianNB()
nb_model.fit(X_train_smote_scaled, y_train_smote)

# Predict on the test set (default threshold of 0.5)
nb_pred = nb_model.predict(X_test_scaled)

# Evaluate the model (default threshold)
print("\nNaive Bayes (SMOTE, Default Threshold) Test Set Performance:")
print(classification_report(y_test, nb_pred))


Naive Bayes (SMOTE, Default Threshold) Test Set Performance:
              precision    recall  f1-score   support

           0       0.98      0.73      0.84       972
           1       0.12      0.72      0.21        50

    accuracy                           0.73      1022
   macro avg       0.55      0.73      0.52      1022
weighted avg       0.94      0.73      0.81      1022

