In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("stroke_data.csv")

# Drop non-numeric columns (like ID)
df.drop(columns=['id'], inplace=True)  

# Convert categorical columns to numeric
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0, 'Other': 2})
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 0})
df['work_type'] = df['work_type'].map({'children': 0, 'Govt_job': 1, 'Never_worked': 2, 'Private': 3, 'Self-employed': 4})
df['Residence_type'] = df['Residence_type'].map({'Urban': 1, 'Rural': 0})
df['smoking_status'] = df['smoking_status'].map({'formerly smoked': 0, 'never smoked': 1, 'smokes': 2, 'Unknown': 3})

# Ensure no NaN values exist
df.dropna(inplace=True)

# Split into features and target
X = df.drop(columns=['stroke'])
y = df['stroke']

# Apply SMOTE
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print(f"Original class distribution:\n{y.value_counts()}")
print(f"Resampled class distribution:\n{pd.Series(y_resampled).value_counts()}")



Original class distribution:
stroke
0    4700
1     209
Name: count, dtype: int64
Resampled class distribution:
stroke
1    4700
0    4700
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9574468085106383
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       922
           1       0.95      0.97      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



In [5]:
import pickle
with open('stroke_prediction_model.pkl', 'wb') as f:
    pickle.dump(model, f)

