In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load and shuffle the dataset
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and target
X = data.drop(columns=['id', 'stroke'])
y = data['stroke']

# Define numeric and categorical features
numeric_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Preprocessing pipeline with imputation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill NaNs in numeric columns with mean
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill NaNs in categorical columns with most frequent value
    ('encoder', OneHotEncoder())
])

# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

# Apply SMOTE on preprocessed data
smote = SMOTE(random_state=42, sampling_strategy=0.3)
X_res, y_res = smote.fit_resample(X_preprocessed, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, stratify=y_res, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (5055, 19)
Testing set shape: (1264, 19)


In [2]:
# Print count of instances with stroke = 1 in training set after split
print("Count of instances with stroke = 1 in training set:", y_train.sum())

# Print count of instances with stroke = 1 in testing set after split
print("Count of instances with stroke = 1 in testing set:", y_test.sum())

Count of instances with stroke = 1 in training set: 1166
Count of instances with stroke = 1 in testing set: 292


In [10]:
# 1 Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.83      0.89      0.86       972
#            1       0.52      0.41      0.46       292

#     accuracy                           0.78      1264
#    macro avg       0.68      0.65      0.66      1264
# weighted avg       0.76      0.78      0.77      1264


# Confusion Matrix:
# [[862 110]
#  [171 121]]

# The Logistic Regression model shows high accuracy for non-stroke cases but struggles with low recall for stroke cases, indicating it misses many true stroke instances.

In [13]:
# 2 Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.92      0.91      0.91       972
#            1       0.71      0.74      0.72       292

#     accuracy                           0.87      1264
#    macro avg       0.81      0.82      0.82      1264
# weighted avg       0.87      0.87      0.87      1264


# Confusion Matrix:
# [[882  90]
#  [ 76 216]]

#The Decision Tree model achieves an accuracy of 87%, with strong performance on non-stroke cases and improved recall for stroke cases, indicating a more balanced detection across both classes.


In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.96      0.89      0.92       972
#            1       0.71      0.88      0.78       292

#     accuracy                           0.89      1264
#    macro avg       0.83      0.89      0.85      1264
# weighted avg       0.90      0.89      0.89      1264


# Confusion Matrix:
# [[866 106]
#  [ 35 257]]

# The K-Nearest Neighbors model achieves an accuracy of 89%, with high recall for stroke cases (88%) and good overall precision, indicating balanced performance across both classes.

In [19]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.83      0.91      0.87       972
#            1       0.57      0.39      0.46       292

#     accuracy                           0.79      1264
#    macro avg       0.70      0.65      0.67      1264
# weighted avg       0.77      0.79      0.78      1264


# Confusion Matrix:
# [[888  84]
#  [179 113]]


# The SVM model achieves an accuracy of 79%, performing well on non-stroke cases but with low recall (39%) for stroke cases, meaning it misses a significant number of true stroke instances.

In [22]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.94      0.95      0.95       972
#            1       0.84      0.81      0.83       292

#     accuracy                           0.92      1264
#    macro avg       0.89      0.88      0.89      1264
# weighted avg       0.92      0.92      0.92      1264


# Confusion Matrix:
# [[928  44]
#  [ 55 237]]

# The XGBoost model achieves a high accuracy of 92%, with strong recall (81%) and precision (84%) for stroke cases, indicating balanced and reliable performance across both classes.

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.94      0.96      0.95       972
#            1       0.86      0.81      0.83       292

#     accuracy                           0.92      1264
#    macro avg       0.90      0.89      0.89      1264
# weighted avg       0.92      0.92      0.92      1264


# Confusion Matrix:
# [[932  40]
#  [ 55 237]]

# The Random Forest model achieved an accuracy of 92%, with strong precision (86%) and recall (81%) for stroke cases, demonstrating balanced and reliable performance across both stroke and non-stroke cases.


In [26]:
# Neural network
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.95      0.90      0.92       972
#            1       0.72      0.85      0.78       292

#     accuracy                           0.89      1264
#    macro avg       0.83      0.87      0.85      1264
# weighted avg       0.90      0.89      0.89      1264


# Confusion Matrix:
# [[874  98]
#  [ 44 248]]

# The MLPClassifier achieved an accuracy of 89%, with good recall (85%) for stroke cases, meaning it effectively identifies most actual stroke cases while maintaining high performance on non-stroke cases as well.

In [45]:
# Deep Learning

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model architecture
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    # Dropout(0.2),
    Dense(32, activation='relu'),
    # Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])


# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.95      0.90      0.92       972
#            1       0.72      0.84      0.77       292

#     accuracy                           0.89      1264
#    macro avg       0.83      0.87      0.85      1264
# weighted avg       0.90      0.89      0.89      1264


# Confusion Matrix:
# [[877  95]
#  [ 48 244]]

# The adjusted neural network model achieved an accuracy of 89%, with improved recall (86%) for stroke cases, indicating better balance between accuracy and generalization across both classes.

Epoch 1/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7205 - loss: 0.5449 - val_accuracy: 0.7923 - val_loss: 0.3944
Epoch 2/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 886us/step - accuracy: 0.8223 - loss: 0.3561 - val_accuracy: 0.8170 - val_loss: 0.3791
Epoch 3/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 907us/step - accuracy: 0.8188 - loss: 0.3587 - val_accuracy: 0.8190 - val_loss: 0.3692
Epoch 4/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8275 - loss: 0.3448 - val_accuracy: 0.8348 - val_loss: 0.3613
Epoch 5/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8285 - loss: 0.3421 - val_accuracy: 0.8279 - val_loss: 0.3537
Epoch 6/50
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 961us/step - accuracy: 0.8400 - loss: 0.3396 - val_accuracy: 0.8328 - val_loss: 0.3474
Epoch 7/50
[1m127/127

In [2]:
# Hyperparameters 

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid,
                               n_iter=50, cv=3, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

# Best parameters
print(rf_random.best_params_)
y_pred = rf_random.predict(X_test)

# Classification Report:
#               precision    recall  f1-score   support

#            0       0.95      0.96      0.95       972
#            1       0.85      0.82      0.83       292

#     accuracy                           0.92      1264
#    macro avg       0.90      0.89      0.89      1264
# weighted avg       0.92      0.92      0.92      1264


# Confusion Matrix:
# [[929  43]
#  [ 53 239]]

# Hyperparameter tuning improved stroke case recall slightly (from 81% to 82%) while maintaining overall accuracy at 92%, resulting in fewer missed stroke cases.

{'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}


In [58]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       972
           1       0.85      0.82      0.83       292

    accuracy                           0.92      1264
   macro avg       0.90      0.89      0.89      1264
weighted avg       0.92      0.92      0.92      1264


Confusion Matrix:
[[929  43]
 [ 53 239]]


In [4]:
import joblib
joblib.dump(rf_random.best_estimator_, 'random_forest_model-2.joblib')

['random_forest_model-2.joblib']

In [9]:
joblib.dump(preprocessor, 'preprocessor.joblib')


['preprocessor.joblib']