In [None]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [2]:
import pandas as pd  # Make sure pandas is imported

# Load your dataset
file_path = file_path = r"C:\Users\Srinjoy RayChaudhuri\Brain Stroke\brain_stroke.csv"
data = pd.read_csv('brain_stroke.csv')

# Display the first few rows of the dataset
data.head()


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Load dataset
data = pd.read_csv('brain_stroke.csv')

# Convert categorical features into dummy/one-hot encoded variables
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numeric_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

# Separate features and target variable
X = data.drop('stroke', axis=1)
y = data['stroke']

# Encode categorical variables into a format compatible with the model
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Create a pipeline that combines the preprocessor with Logistic Regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Define parameter grid for GridSearchCV to perform hyperparameter tuning with cross-validation
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs'],
}

# Use GridSearchCV with cross-validation to find the best parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

# Extract best estimator from grid search
best_model = grid_search.best_estimator_

# Display best parameters found by GridSearchCV
print("Best parameters found by GridSearchCV:", grid_search.best_params_)

# Evaluate model using cross-validation to confirm it generalizes well
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {cv_scores.mean() * 100:.2f}%")

# Train-test split for a final evaluation on the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the best model on the training data
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate final model performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Best parameters found by GridSearchCV: {'classifier__C': 0.1, 'classifier__solver': 'lbfgs'}
Cross-validation accuracy: 95.04%
Model Accuracy: 94.58%
Confusion Matrix:
 [[943   0]
 [ 54   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       943
           1       0.00      0.00      0.00        54

    accuracy                           0.95       997
   macro avg       0.47      0.50      0.49       997
weighted avg       0.89      0.95      0.92       997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:


import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Load the dataset (assuming the data is available as 'brain_stroke.csv')
df = pd.read_csv('brain_stroke.csv')

# Feature selection
X = df[['age', 'gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'avg_glucose_level', 'bmi']]
y = df['stroke']  # Assuming 'stroke' is the target column

# Define preprocessor with handle_unknown='ignore' for OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'avg_glucose_level', 'bmi']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
    ]
)

# Define and train the model
model = LogisticRegression(max_iter=1000)

# Create a pipeline that first processes the data and then classifies
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Train the pipeline with the dataset
pipeline.fit(X, y)

# Save the preprocessor and model
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(pipeline, 'brain_stroke_model.pkl')


['brain_stroke_model.pkl']