In [1]:
# Extract and load the dataset

import pandas as pd

DATA_PATH = "/content/healthcare-dataset-stroke-data.csv"

# Read the CSV once. The raw frame is never mutated, so every step below
# derives a new frame from it and the cell can be re-run safely.
raw_df = pd.read_csv(DATA_PATH)
print(raw_df.head())

# Data cleaning

# Display initial shape and column names
print("Initial shape:", raw_df.shape)
print("Column names:", raw_df.columns.tolist())

# Drop the 'id' column — it's just a unique identifier.
# (Returns a new frame instead of dropping in place.)
df = raw_df.drop(columns=['id'])

# Check for missing values
print("\nMissing values before handling:\n", df.isnull().sum())

# Fill missing BMI values with the column mean and missing smoking_status
# values with 'Unknown'. The mean is computed before any filling happens.
df = df.fillna({'bmi': df['bmi'].mean(), 'smoking_status': 'Unknown'})

# Optional: Display cleaned unique values for categorical columns
print("\nUnique values in 'gender':", df['gender'].unique())
print("Unique values in 'smoking_status':", df['smoking_status'].unique())

# Confirm no more missing data
print("\nMissing values after handling:\n", df.isnull().sum())

# Show the first few cleaned rows
print("\nCleaned Data Sample:\n", df.head())


      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  
Initial shape: (5110, 12)
Column names: ['id', 'gender',

In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

# NOTE(review): this re-reads the raw CSV, so the table written below contains
# the 'id' column and the missing BMI values, not the cleaned frame built in
# the data-cleaning cell — confirm that this is intended.
df = pd.read_csv("/content/healthcare-dataset-stroke-data.csv")

# Connection settings come from the environment so that no credential is
# stored in the notebook. If MYSQL_PASSWORD is unset the user is prompted.
username = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")
host = os.environ.get("MYSQL_HOST", "localhost")
database = os.environ.get("MYSQL_DATABASE", "healthcare_db")
# Percent-encode the password so characters such as '@' do not break the URL.
encoded_password = quote_plus(password)

engine = create_engine(f"mysql+pymysql://{username}:{encoded_password}@{host}/{database}")
# if_exists='replace' drops any existing 'patients' table before inserting.
df.to_sql(name='patients', con=engine, if_exists='replace', index=False)

# Patients and stroke cases per gender.
query = """
SELECT gender, COUNT(*) AS total_patients, SUM(stroke) AS stroke_cases
FROM patients
GROUP BY gender;
"""
df_vis = pd.read_sql(query, con=engine)

# Bar chart of stroke cases per gender, drawn on an explicit Axes so the
# title and axis labels are attached to this figure.
sns.set(style="whitegrid")
fig, ax = plt.subplots()
sns.barplot(data=df_vis, x='gender', y='stroke_cases', hue='gender', palette='pastel', ax=ax)
ax.set_title("Strokes Cases By Gender")
ax.set_xlabel("Gender")
ax.set_ylabel("Stroke cases")
plt.show()

In [None]:
import os
import pickle
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

# Connect to DB and load data.
# Credentials are read from the environment (with an interactive prompt as a
# fallback for the password) so that no secret is stored in the notebook.
username = os.environ.get("MYSQL_USER", "root")
password = quote_plus(os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "))
host = os.environ.get("MYSQL_HOST", "localhost")
database = os.environ.get("MYSQL_DATABASE", "healthcare_db")

# NOTE(review): this cell uses the mysqlconnector driver while the upload cell
# uses pymysql — confirm both drivers are really required.
engine = create_engine(f"mysql+mysqlconnector://{username}:{password}@{host}/{database}")
df = pd.read_sql("SELECT * FROM patients", con=engine)
# One-hot encode the categorical columns, dropping the first level of each.
df_encoded = pd.get_dummies(df, drop_first=True)

# Separate the target and drop unwanted columns
x = df_encoded.drop('stroke', axis=1)
y = df_encoded['stroke']
x = x.drop(columns=['id', 'gender_Other', 'work_type_Never_worked'], errors='ignore')

# Drop rows with missing values and keep the matching target rows
x = x.dropna()
y = y.loc[x.index]

# Train-test split comes FIRST. The feature selector and the scaler below are
# fitted on the training rows only, so no information from the held-out rows
# leaks into the reported metrics. The split is stratified so both parts keep
# the same proportion of stroke cases.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Feature Selection: keep the top 10 most predictive features
# (or all of them when fewer than 10 are available).
selector = SelectKBest(score_func=f_classif, k=min(10, x_train_raw.shape[1]))
x_train_selected = selector.fit_transform(x_train_raw, y_train)
x_test_selected = selector.transform(x_test_raw)
selected_columns = x.columns[selector.get_support()]

# Scale features: statistics are learned from the training part and then
# applied unchanged to the test part.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_selected)
x_test = scaler.transform(x_test_selected)

# Train logistic regression with balanced class weights
model = LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=10000)
model.fit(x_train, y_train)

# Predict PROBABILITIES instead of hard 0/1
y_probs = model.predict_proba(x_test)[:, 1]

# Tune threshold (optional: adjust cutoff from 0.5 to boost recall or precision)
threshold = 0.4
y_pred = (y_probs > threshold).astype(int)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Show final feature importance (logistic-regression coefficients on the
# standardized features)
importances = pd.Series(model.coef_[0], index=selected_columns)
print("Feature Impact:\n", importances.sort_values(ascending=False))

# Save the model
with open("stroke_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save the scaler (fitted on the selected training features)
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save selected feature names (in the column order the scaler expects)
with open("selected_features.pkl", "wb") as f:
    pickle.dump(selected_columns, f)

In [None]:
import pickle

import numpy as np
import streamlit as st

# NOTE(review): Streamlit apps are normally started with
# `streamlit run <script>.py` rather than executed as a notebook cell —
# confirm how this cell is meant to be run.

# Probability cutoff above which a patient is reported as high risk. This is
# the same `threshold = 0.4` used when the model is evaluated in the training
# cell; calling `model.predict` would silently apply 0.5 instead.
RISK_THRESHOLD = 0.4

# Numeric inputs rendered as sliders: (keyword searched in the lower-cased
# feature name, widget label, min, max, default). Checked in this order.
SLIDER_SPECS = (
    ("age", "Age", 0, 100, 50),
    ("bmi", "BMI", 10.0, 50.0, 25.0),
    ("glucose", "Avg Glucose Level", 60.0, 250.0, 100.0),
)


def load_artifact(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    Only load pickle files you produced yourself: unpickling untrusted data
    can execute arbitrary code.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


def render_result(heading, probability, background, heading_color):
    """Render a coloured result card with ``heading`` and the probability (2 decimals)."""
    st.markdown(f"""
        <div style='background-color:{background}; padding:20px; border-radius:10px;'>
            <h3 style='color:{heading_color};'>{heading}</h3>
            <p>Estimated probability: <strong>{probability:.2f}</strong></p>
        </div>
    """, unsafe_allow_html=True)


# Page config
st.set_page_config(page_title="Stroke Risk Predictor", layout="centered")

# Load model artifacts
model = load_artifact("stroke_model.pkl")
scaler = load_artifact("scaler.pkl")
selected_features = load_artifact("selected_features.pkl")

# Header
st.markdown("""
    <h1 style='text-align: center; color: #4B8BBE;'>🧠 Stroke Risk Prediction</h1>
    <p style='text-align: center;'>Provide patient data below to evaluate the likelihood of a stroke.</p>
    <hr>
""", unsafe_allow_html=True)

# Input layout: one widget per selected feature, alternating between two
# columns. Values are collected in feature order, which is the order the
# scaler and the model expect.
st.subheader("📋 Patient Information")
cols = st.columns(2)
inputs = []

for idx, feature in enumerate(selected_features):
    column = cols[idx % 2]
    name = str(feature)
    spec = next((s for s in SLIDER_SPECS if s[0] in name.lower()), None)
    # An explicit key per feature keeps widgets distinct even when two
    # features end up with the same label.
    if spec is not None:
        _, label, low, high, default = spec
        value = column.slider(label, low, high, default, key=f"input_{name}")
    else:
        # Every other feature is treated as a 0/1 flag shown as No/Yes.
        value = column.selectbox(
            name,
            [0, 1],
            format_func=lambda flag: "Yes" if flag else "No",
            key=f"input_{name}",
        )
    inputs.append(value)

# Predict
if st.button("🔍 Predict Stroke Risk"):
    x_input = np.array([inputs]).astype(float)
    x_scaled = scaler.transform(x_input)
    # Probability of the positive class (second column of predict_proba).
    prob = model.predict_proba(x_scaled)[0][1]

    if prob > RISK_THRESHOLD:
        render_result("⚠️ High Stroke Risk", prob, "#ffe6e6", "#c0392b")
    else:
        render_result("✅ Low Stroke Risk", prob, "#e6ffe6", "#27ae60")

# Footer
st.markdown("""
<hr>
<p style='text-align: center; font-size: 12px;'>Built with ❤️ using Scikit-learn & Streamlit</p>
""", unsafe_allow_html=True)