In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import joblib

<h1>Smote Logistic regression

In [10]:


# Load the dataset (replace 'healthcare-dataset-stroke-data.csv' with your file path)
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Drop 'id' column if present
if 'id' in data.columns:
    data = data.drop('id', axis=1)

# Handle missing values
data['bmi'].fillna(data['bmi'].median(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Create derived features
data['age_glucose'] = data['age'] * data['avg_glucose_level']
data['comorbidity'] = data['hypertension'] | data['heart_disease']
data['age_group'] = pd.cut(data['age'], bins=[0, 40, 60, 120], labels=[0, 1, 2], include_lowest=True)
data['age_group'] = data['age_group'].astype(int)

# Define features and target
X = data[['age', 'gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type',
          'avg_glucose_level', 'bmi', 'smoking_status', 'age_glucose', 'comorbidity', 'age_group']]
y = data['stroke']

# Split the data (before balancing to avoid data leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
print("Class distribution after SMOTE:", pd.Series(y_train_smote).value_counts())

# Scale the features
scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

Class distribution after SMOTE: stroke
0    3889
1    3889
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].median(), inplace=True)


In [None]:
# import joblib
# joblib.dump(scaler, 'scalers/Logistic_scaler_smote_new.pkl')
# print("New scaler saved as 'scaler_smote_new.pkl'")

New scaler saved as 'scaler_smote_new.pkl'


In [2]:
# Initialize and train Logistic Regression with scaled data
lr_model_smote = LogisticRegression(class_weight='balanced', random_state=42, max_iter=2000)
lr_model_smote.fit(X_train_smote_scaled, y_train_smote)

# Predict on the test set
lr_pred_smote = lr_model_smote.predict(X_test_scaled)

# Evaluate the model
print("\nLogistic Regression (SMOTE) Test Set Performance:")
print(classification_report(y_test, lr_pred_smote))


Logistic Regression (SMOTE) Test Set Performance:
              precision    recall  f1-score   support

           0       0.98      0.83      0.90       972
           1       0.17      0.68      0.28        50

    accuracy                           0.82      1022
   macro avg       0.58      0.76      0.59      1022
weighted avg       0.94      0.82      0.87      1022



In [None]:
# # Save the tuned Logistic Regression model
# joblib.dump(lr_model_smote, 'models1/logistic_regression_smote_model.pkl')
# print("Tuned Logistic Regression (SMOTE) model saved as 'logistic_regression_smote_tuned_model.pkl'")

Tuned Logistic Regression (SMOTE) model saved as 'logistic_regression_smote_tuned_model.pkl'


C (Inverse of Regularization Strength)
Controls the trade-off between fitting the training data and keeping the model simple (to prevent overfitting).

Smaller values of C increase regularization (simpler model, less overfitting), while larger values reduce regularization (more complex model, better fit to training data).

We’ll test a range of values to find the best balance.

solver (different fitters)

class_weight


In [4]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Range of regularization strengths
    'solver': ['lbfgs', 'liblinear', 'saga'],  # Different solvers
    'penalty': ['l1', 'l2'],  # L1 and L2 penalties (we'll handle solver-penalty compatibility in GridSearchCV)
    'class_weight': ['balanced', {0: 1, 1: 10}]  # Balanced vs. custom weight for minority class
}

# Handle solver-penalty compatibility (e.g., 'lbfgs' only supports 'l2')
# GridSearchCV will automatically skip incompatible combinations

# Initialize Logistic Regression
lr_model = LogisticRegression(max_iter=2000, random_state=42)

# Define a custom scorer for F1-score of the minority class (class 1)
f1_scorer = make_scorer(f1_score, pos_label=1)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    scoring=f1_scorer,  # Optimize for F1-score of the stroke class
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores
    verbose=1
)

# Fit GridSearchCV
grid_search.fit(X_train_smote_scaled, y_train_smote)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score (Class 1) on Cross-Validation:", grid_search.best_score_)



Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'saga'}
Best F1-Score (Class 1) on Cross-Validation: 0.8407496018409976


50 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Shubham\Documents\ds_mini\mini\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Shubham\Documents\ds_mini\mini\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Shubham\Documents\ds_mini\mini\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
         

In [6]:
# Get the best model from GridSearchCV
best_lr_model = grid_search.best_estimator_

# Predict on the test set
lr_pred_tuned = best_lr_model.predict(X_test_scaled)

# Evaluate the model
print("\nBest hyperparameters Logistic Regression (SMOTE, Tuned) Test Set Performance:")
print(classification_report(y_test, lr_pred_tuned))


Best hyperparameters Logistic Regression (SMOTE, Tuned) Test Set Performance:
              precision    recall  f1-score   support

           0       0.98      0.83      0.90       972
           1       0.17      0.70      0.27        50

    accuracy                           0.82      1022
   macro avg       0.58      0.76      0.59      1022
weighted avg       0.94      0.82      0.87      1022



<h1>XGBoost

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import joblib

# Load the dataset (replace 'healthcare-dataset-stroke-data.csv' with your file path)
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Drop 'id' column if present
if 'id' in data.columns:
    data = data.drop('id', axis=1)

# Handle missing values
data['bmi'].fillna(data['bmi'].median(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Create derived features
data['age_glucose'] = data['age'] * data['avg_glucose_level']
data['comorbidity'] = data['hypertension'] | data['heart_disease']
data['age_group'] = pd.cut(data['age'], bins=[0, 40, 60, 120], labels=[0, 1, 2], include_lowest=True)
data['age_group'] = data['age_group'].astype(int)

# Define features and target
X = data[['age', 'gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type',
          'avg_glucose_level', 'bmi', 'smoking_status', 'age_glucose', 'comorbidity', 'age_group']]
y = data['stroke']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Calculate scale_pos_weight (ratio of negative to positive samples in the original training set)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Initialize and train XGBoost
xgb_model_smote = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
xgb_model_smote.fit(X_train_smote_scaled, y_train_smote)

# Predict on the test set (default threshold of 0.5)
xgb_pred_smote = xgb_model_smote.predict(X_test_scaled)

# Evaluate the model (default threshold)
print("\nXGBoost (SMOTE, Default Threshold) Test Set Performance:")
print(classification_report(y_test, xgb_pred_smote))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].median(), inplace=True)



XGBoost (SMOTE, Default Threshold) Test Set Performance:
              precision    recall  f1-score   support

           0       0.96      0.89      0.93       972
           1       0.14      0.34      0.20        50

    accuracy                           0.86      1022
   macro avg       0.55      0.62      0.56      1022
weighted avg       0.92      0.86      0.89      1022



In [9]:
# Get the predicted probabilities for the test set
xgb_prob_smote = xgb_model_smote.predict_proba(X_test_scaled)[:, 1]

# Adjust the decision threshold to 0.3
threshold = 0.3
xgb_pred_smote_adjusted = (xgb_prob_smote >= threshold).astype(int)

# Evaluate the model (adjusted threshold)
print("\nXGBoost (SMOTE, Threshold=0.3) Test Set Performance:")
print(classification_report(y_test, xgb_pred_smote_adjusted))


XGBoost (SMOTE, Threshold=0.3) Test Set Performance:
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       972
           1       0.11      0.34      0.17        50

    accuracy                           0.84      1022
   macro avg       0.54      0.60      0.54      1022
weighted avg       0.92      0.84      0.87      1022

