In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier

# Load the dataset (replace 'healthcare-dataset-stroke-data.csv' with your file path)
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Drop 'id' column if present
if 'id' in data.columns:
    data = data.drop('id', axis=1)

# Handle missing values
data['bmi'].fillna(data['bmi'].median(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Create derived features
data['age_glucose'] = data['age'] * data['avg_glucose_level']
data['comorbidity'] = data['hypertension'] | data['heart_disease']
data['age_group'] = pd.cut(data['age'], bins=[0, 40, 60, 120], labels=[0, 1, 2], include_lowest=True)
data['age_group'] = data['age_group'].astype(int)

# Define features and target
X = data[['age', 'gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type',
          'avg_glucose_level', 'bmi', 'smoking_status', 'age_glucose', 'comorbidity', 'age_group']]
y = data['stroke']

# Split the data (before balancing to avoid data leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].median(), inplace=True)


In [2]:
# Apply Random Over Sampling to the training set
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Check the class distribution after ROS
print("Class distribution after ROS:", pd.Series(y_train_ros).value_counts())

Class distribution after ROS: stroke
0    3889
1    3889
Name: count, dtype: int64


In [3]:
# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_ros, y_train_ros)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("\nRandom Forest Test Set Performance:")
print(classification_report(y_test, y_pred))


Random Forest Test Set Performance:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.94      1022
   macro avg       0.48      0.49      0.48      1022
weighted avg       0.90      0.94      0.92      1022



<h1> Logistic regression</h1>

In [6]:
from sklearn.linear_model import LogisticRegression

# Initialize Logistic Regression with class weights
lr_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

# Train the model
lr_model.fit(X_train_ros, y_train_ros)

# Predict on the test set
lr_pred = lr_model.predict(X_test)

# Evaluate the model
print("\nLogistic Regression Test Set Performance:")
print(classification_report(y_test, lr_pred))


Logistic Regression Test Set Performance:
              precision    recall  f1-score   support

           0       0.99      0.76      0.86       972
           1       0.14      0.80      0.25        50

    accuracy                           0.76      1022
   macro avg       0.57      0.78      0.55      1022
weighted avg       0.95      0.76      0.83      1022



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86       972
           1       0.14      0.80      0.25        50

    accuracy                           0.76      1022
   macro avg       0.57      0.78      0.55      1022
weighted avg       0.95      0.76      0.83      1022



In [7]:
import xgboost as xgb
# Calculate scale_pos_weight (ratio of negative to positive samples in the original training set)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Initialize XGBoost with scale_pos_weight
xgb_model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)

# Train the model
xgb_model.fit(X_train_ros, y_train_ros)

# Predict on the test set
xgb_pred = xgb_model.predict(X_test)

# Evaluate the model
print("\nXGBoost Test Set Performance:")
print(classification_report(y_test, xgb_pred))


XGBoost Test Set Performance:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       972
           1       0.20      0.26      0.23        50

    accuracy                           0.91      1022
   macro avg       0.58      0.60      0.59      1022
weighted avg       0.92      0.91      0.92      1022



In [None]:
# import pandas as pd

# # Create a dictionary with the metrics
# metrics_data = {
#     'Model': ['Random Forest', 'Logistic Regression', 'XGBoost'],
#     'F1-Score (Class 1)': [0.00, 0.25, 0.23],
#     'Recall (Class 1)': [0.00, 0.80, 0.26],
#     'Precision (Class 1)': [0.00, 0.14, 0.20],
#     'Weighted F1-Score': [0.92, 0.83, 0.92],
#     'Accuracy': [0.94, 0.76, 0.91]
# }

# # Convert to a DataFrame and save to CSV
# metrics_df = pd.DataFrame(metrics_data)
# metrics_df.to_csv('model_performance_metrics.csv', index=False)
# print("Model performance metrics saved to 'model_performance_metrics.csv'")

Model performance metrics saved to 'model_performance_metrics.csv'


In [None]:
# import joblib

# # Save the Random Forest model
# joblib.dump(rf_model, 'models/random_forest_model.pkl')

# # Save the Logistic Regression model (with scaled data, if you re-trained it)
# joblib.dump(lr_model, 'models/logistic_regression_model.pkl')

# # Save the XGBoost model
# joblib.dump(xgb_model, 'models/xgboost_model.pkl')

# print("Models saved as 'random_forest_model.pkl', 'logistic_regression_model.pkl', and 'xgboost_model.pkl'")

Models saved as 'random_forest_model.pkl', 'logistic_regression_model.pkl', and 'xgboost_model.pkl'
