In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,  precision_score, recall_score, f1_score,confusion_matrix
import pickle

In [2]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
label_encoder = LabelEncoder()
df['smoking_status_encoded'] = label_encoder.fit_transform(df['smoking_status'])
df['gender_encoded'] = label_encoder.fit_transform(df['gender'])
print(df[['smoking_status', 'smoking_status_encoded', 'gender', 'gender_encoded']].head())


    smoking_status  smoking_status_encoded  gender  gender_encoded
0  formerly smoked                       1    Male               1
1     never smoked                       2  Female               0
2     never smoked                       2    Male               1
3           smokes                       3  Female               0
4     never smoked                       2  Female               0


In [3]:
df['smoking_status_encoded'].unique()

array([1, 2, 3, 0])

In [4]:
df['smoking_status_encoded'].value_counts()

smoking_status_encoded
2    1892
0    1544
1     885
3     789
Name: count, dtype: int64

In [5]:
df['smoking_status'].value_counts()

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

In [6]:
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop(['id'])
new_df_column = list(numerical_columns) + ['smoking_status_encoded','gender_encoded']
new_df = df[new_df_column]

In [7]:
new_df.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'smoking_status_encoded', 'gender_encoded'],
      dtype='object')

In [8]:
new_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,smoking_status_encoded,gender_encoded
0,67.0,0,1,228.69,36.6,1,1,1
1,61.0,0,0,202.21,,1,2,0
2,80.0,0,1,105.92,32.5,1,2,1
3,49.0,0,0,171.23,34.4,1,3,0
4,79.0,1,0,174.12,24.0,1,2,0


In [9]:
new_df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,smoking_status_encoded,gender_encoded
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0,5110.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728,1.376908,0.414286
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532,1.071534,0.493044
min,0.08,0.0,0.0,55.12,10.3,0.0,0.0,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0,0.0,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0,2.0,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0,2.0,1.0
max,82.0,1.0,1.0,271.74,97.6,1.0,3.0,2.0


In [10]:
new_df.shape

(5110, 8)

In [11]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     5110 non-null   float64
 1   hypertension            5110 non-null   int64  
 2   heart_disease           5110 non-null   int64  
 3   avg_glucose_level       5110 non-null   float64
 4   bmi                     4909 non-null   float64
 5   stroke                  5110 non-null   int64  
 6   smoking_status_encoded  5110 non-null   int32  
 7   gender_encoded          5110 non-null   int32  
dtypes: float64(3), int32(2), int64(3)
memory usage: 279.6 KB


## Split Data to train, test, validate 

In [12]:
X = new_df.drop('stroke', axis=1)
y = df['stroke']

imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=0)
X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("X_val shape:", X_validate.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("y_val shape:", y_validate.shape)

X_train shape: (3066, 7)
X_test shape: (1022, 7)
X_val shape: (1022, 7)
y_train shape: (3066,)
y_test shape: (1022,)
y_val shape: (1022,)


## Train Model

In [13]:

model = LogisticRegression(random_state=0)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report_output)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.9510763209393346

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022


Confusion Matrix:
 [[972   0]
 [ 50   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Test Model

In [14]:

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9510763209393346
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Confusion Matrix:
 [[972   0]
 [ 50   0]]


  _warn_prf(average, modifier, msg_start, len(result))


## Evaluate Model using evaluation matrices 

In [15]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9510763209393346
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Confusion Matrix:
 [[972   0]
 [ 50   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
df['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

## Hyper Parameter tuning to improve performance 

In [17]:

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'solver': ['lbfgs', 'liblinear', 'saga'], 
    'max_iter': [100, 200, 300]  
}


grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


y_pred = best_model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Best Hyperparameters:", best_params)
print("Accuracy with Best Model:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'solver': 'lbfgs'}
Accuracy with Best Model: 0.9510763209393346
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Confusion Matrix:
 [[972   0]
 [ 50   0]]


  _warn_prf(average, modifier, msg_start, len(result))


## Save Model to a pickle file 

In [18]:
with open("logistic_regression_model.pkl", "wb") as file:
    pickle.dump(model, file)

## Predict values for validate set using pickle file

In [19]:
with open("logistic_regression_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)
y_pred_validate = loaded_model.predict(X_validate)

accuracy = accuracy_score(y_validate, y_pred_validate)
precision = precision_score(y_validate, y_pred_validate)
recall = recall_score(y_validate, y_pred_validate)
f1 = f1_score(y_validate, y_pred_validate)
conf_matrix = confusion_matrix(y_validate, y_pred_validate)


print("Validation Accuracy :", accuracy)
print("Valdation Precision:", precision)
print("Validation Recall:", recall)
print("Validation F1 Score:", f1)
print("Validation Confusion Matrix:\n", conf_matrix)

Validation Accuracy : 0.9540117416829745
Valdation Precision: 0.0
Validation Recall: 0.0
Validation F1 Score: 0.0
Validation Confusion Matrix:
 [[975   0]
 [ 47   0]]


  _warn_prf(average, modifier, msg_start, len(result))


## Write validate set along with it's predicted values to csv file

In [20]:
validate_with_predictions = pd.DataFrame({'age': X_validate[:, 0],
                                          'hypertension': X_validate[:, 1],
                                          'heart_disease': X_validate[:, 2],
                                          'avg_glucose_level': X_validate[:, 3],
                                          'bmi': X_validate[:, 4],
                                          'smoking_status_encoded': X_validate[:, 5],
                                          'gender_encoded': X_validate[:, 6],
                                          'stroke': y_validate,
                                          'Predicted_Stroke': y_pred_validate})

In [21]:
validate_with_predictions.to_csv("validation_with_predictions.csv", index=False)