In [8]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Replace each missing value with the median of its column.
# NOTE(review): X_encoded comes from a cell outside this view; median imputation
# assumes every column is numeric - confirm.
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X_encoded),
    columns=X_encoded.columns,
    # fit_transform returns a bare array; re-attach the original row labels so
    # X_imputed stays aligned with y even when the index is not 0..n-1.
    index=X_encoded.index,
)


In [9]:
# Replace each missing value with the most frequent value of its column.
# NOTE: this rebinds X_imputed, discarding whatever an earlier cell stored
# under that name.
imputer_cat = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(
    imputer_cat.fit_transform(X_encoded),
    columns=X_encoded.columns,
    # fit_transform returns a bare array; re-attach the original row labels so
    # X_imputed stays aligned with y even when the index is not 0..n-1.
    index=X_encoded.index,
)


In [10]:
# Discard every row that still contains a missing value, then keep only the
# target entries for the surviving rows.
X_encoded = X_encoded.dropna()
# .loc makes the lookup explicitly label-based; plain y[...] with integer keys
# leaves label-vs-position resolution up to pandas.
y = y.loc[X_encoded.index]


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import joblib
import os

# End-to-end training run: load -> one-hot encode -> split -> impute ->
# oversample the training set -> fit -> evaluate -> persist the model.
df = pd.read_csv(r"C:\Users\rhegd\OneDrive\Desktop\healthcare-dataset-stroke-data.csv")

# NOTE(review): every column except 'stroke' is used as a feature. If the CSV
# carries a row-identifier column it is fed to the model as well - confirm
# that this is intended.
X = df.drop('stroke', axis=1)
y = df['stroke']

categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split BEFORE imputing so the medians are learned from training rows only;
# fitting the imputer on the full frame would leak test-set statistics into
# the features the model is trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y)

imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,  # keep row labels aligned with y_train
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,  # keep row labels aligned with y_test
)

# Whole-dataset view under the original variable name, filled with the
# train-fitted medians, so any later cell that reads X_imputed keeps working.
X_imputed = pd.DataFrame(
    imputer.transform(X_encoded),
    columns=X_encoded.columns,
    index=X_encoded.index,
)

# Oversample the training partition only; the test set keeps its original
# class distribution so the reported metrics are not inflated.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_sm, y_train_sm)

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
# zero_division=0 reports 0 instead of warning when a class is never predicted.
print(classification_report(y_test, y_pred, zero_division=0))

# Persist the fitted model; the target directory is created if it is missing.
save_dir = r"C:\Users\rhegd\OneDrive\Desktop\Django\Predict\stroke_model"
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, "stroke_model.joblib")
joblib.dump(model, model_path)
print(f"Model saved to '{model_path}'")


Accuracy: 0.9501
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.40      0.04      0.07        50

    accuracy                           0.95      1022
   macro avg       0.68      0.52      0.52      1022
weighted avg       0.93      0.95      0.93      1022

Model saved to 'C:\Users\rhegd\OneDrive\Desktop\Django\Predict\stroke_model\stroke_model.joblib'


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# Load dataset
df = pd.read_csv(r"C:\Users\rhegd\OneDrive\Desktop\healthcare-dataset-stroke-data.csv")

# Drop rows with null BMI and keep only the required features. Built as one
# non-mutating chain ending in .copy() so the column assignments below write
# to an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df.dropna(subset=['bmi'])[
    ['age', 'gender', 'hypertension', 'heart_disease', 'work_type',
     'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']
].copy()

# Encode categorical features as integer codes, one encoder per column
le_gender = LabelEncoder()
le_work = LabelEncoder()
le_smoke = LabelEncoder()

df['gender'] = le_gender.fit_transform(df['gender'])
df['work_type'] = le_work.fit_transform(df['work_type'])
df['smoking_status'] = le_smoke.fit_transform(df['smoking_status'])

# Save the fitted encoders next to the model.
# NOTE(review): presumably whatever serves predictions must load these to turn
# raw category strings into the same integer codes - confirm against the caller.
joblib.dump(le_gender, 'le_gender.joblib')
joblib.dump(le_work, 'le_work.joblib')
joblib.dump(le_smoke, 'le_smoke.joblib')

# Features and labels
X = df.drop('stroke', axis=1)
y = df['stroke']

# Train-test split (stratified so both partitions keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Train model; random_state pins the forest's randomness so a re-run
# reproduces the same saved model (matches the seed used for the split).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Save model (left as the cell's last expression so the written path is shown)
joblib.dump(model, 'stroke_model.joblib')


['stroke_model.joblib']