In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier

# Data Exploration

In [None]:
# Load the raw stroke dataset; the file is semicolon-separated.
DATA_PATH = "stroke_prediction_dataset.csv"
df = pd.read_csv(DATA_PATH, sep=";")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts for the raw frame.
df.info()

In [None]:
# Summary statistics; with default arguments this covers the numeric columns.
df.describe()

In [None]:
# Exact column labels, as used for the lookups in the cells below.
df.columns

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Frequency table for each raw column, in the same order the columns were
# inspected originally. A single loop replaces the run of copy-pasted
# one-column cells, so adding or removing a column is a one-line change.
value_count_columns = [
    "Age",
    "Gender",
    "Hypertension",
    "Heart Disease",
    "Marital Status",
    "Work Type",
    "Residence Type",
    "Average Glucose Level",
    "Body Mass Index (BMI)",
    "Smoking Status",
    "Alcohol Intake",
    "Physical Activity",
    "Stroke History",
    "Family History of Stroke",
    "Dietary Habits",
    "Stress Levels",
    "Blood Pressure Levels",
    "Cholesterol Levels",
    "Symptoms",
    "Diagnosis",
]
for column in value_count_columns:
    print(f"--- {column} ---")
    print(df[column].value_counts(), end="\n\n")

# Class balance of the original target, as proportions instead of raw counts.
df["Diagnosis"].value_counts(normalize=True)

# Data Preprocessing

In [None]:
# "Blood Pressure Levels" is split on "/" into two parts, which become the
# integer columns "Systolic Pressure" and "Diastolic Pressure".
bp_parts = df["Blood Pressure Levels"].str.split("/", expand=True)
bp_parts.columns = ["Systolic Pressure", "Diastolic Pressure"]
for column in bp_parts.columns:
    df[column] = bp_parts[column].astype(int)

In [None]:
# Pull the digits that follow "HDL: " and "LDL: " out of the cholesterol text
# column. expand=False yields a Series per pattern; rows without a match
# become NaN, which is why the result is cast to float.
cholesterol_text = df["Cholesterol Levels"]
df["HDL"] = cholesterol_text.str.extract(r"HDL: (\d+)", expand=False).astype(float)
df["LDL"] = cholesterol_text.str.extract(r"LDL: (\d+)", expand=False).astype(float)

In [None]:
def _parse_symptoms(value):
    """Return one row's symptoms as a list of strings.

    Accepts the raw ", "-separated string, a missing value (-> empty list),
    or a list produced by an earlier run of this cell (returned unchanged),
    so the cell can be re-executed without raising.
    """
    if isinstance(value, list):
        return value
    if pd.isna(value) or value == "":
        return []
    return value.split(", ")


# After this line every entry of "Symptoms" is a list (possibly empty).
df["Symptoms"] = df["Symptoms"].apply(_parse_symptoms)

# Multi-hot encode the symptom lists: one 0/1 column per distinct symptom.
mlb = MultiLabelBinarizer()
symptoms_encoded = pd.DataFrame(
    mlb.fit_transform(df["Symptoms"]),
    columns=mlb.classes_,
    index=df.index,
)

# Drop indicator columns left behind by a previous run before attaching the
# fresh ones; otherwise a re-run would create duplicate column labels.
df = pd.concat(
    [df.drop(columns=list(mlb.classes_), errors="ignore"), symptoms_encoded],
    axis=1,
)

In [None]:
# Spot-check the parsed column: each entry is now a list of symptom strings.
df["Symptoms"].head()

In [None]:
# Rule-based target: a row is labelled 1 when at least four of the seven
# risk flags below are set.
serious_symptoms = [
    "Difficulty Speaking",
    "Loss of Balance",
    "Seizures",
    "Numbness",
    "Blurred Vision",
    "Confusion",
]
# True when any of the listed symptom indicator columns is non-zero in total.
df["Has_Serious_Symptom"] = df[serious_symptoms].sum(axis=1).gt(0)

high_blood_pressure = df["Systolic Pressure"].gt(140) | df["Diastolic Pressure"].gt(90)

# One 0/1 Series per risk factor, in a fixed order.
conditions = [
    flag.astype(int)
    for flag in (
        df["Age"].gt(40),
        df["Hypertension"].eq(1),
        df["Heart Disease"].eq(1),
        df["Average Glucose Level"].gt(140),
        high_blood_pressure,
        df["Has_Serious_Symptom"],
        df["Stroke History"].eq(1),
    )
]

# Row-wise count of satisfied conditions, thresholded at four.
risk_score = pd.concat(conditions, axis=1).sum(axis=1)
df["Diagnosis_New"] = risk_score.ge(4).astype(int)

In [None]:
# Flip the derived label on a seeded 3% sample of rows (label noise).
# NOTE: the seed is reset here, so re-running this cell on the same frame
# selects the same rows and flips them back to their previous values.
NOISE_FRACTION = 0.03
np.random.seed(42)
n_noisy_rows = int(NOISE_FRACTION * len(df))
noise_indices = np.random.choice(df.index, size=n_noisy_rows, replace=False)
flipped_labels = 1 - df.loc[noise_indices, "Diagnosis_New"]
df.loc[noise_indices, "Diagnosis_New"] = flipped_labels

# Feature groups consumed by the encoding and modeling cells.
numeric_cols = [
    "Average Glucose Level",
    "Body Mass Index (BMI)",
    "Stress Levels",
    "Systolic Pressure",
    "Diastolic Pressure",
    "HDL",
    "LDL",
]
categorical_cols = [
    "Gender",
    "Marital Status",
    "Work Type",
    "Residence Type",
    "Smoking Status",
    "Alcohol Intake",
    "Physical Activity",
    "Stroke History",
    "Family History of Stroke",
    "Dietary Habits",
]
# One indicator column per symptom class learned by the binarizer.
symptom_cols = mlb.classes_.tolist()

In [None]:
# Integer-encode every categorical feature with its OWN encoder.
# Refitting a single shared LabelEncoder per column would keep only the last
# column's class mapping; storing one fitted encoder per column lets the same
# mapping be applied (or inverted) later, e.g. when scoring new records.
# NOTE: run once per freshly loaded frame; a re-run refits on the already
# encoded integers and the original category labels are no longer recoverable.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Inspect the frame after the categorical columns were integer-encoded.
df.head()

In [None]:
# Remove the patient identifier columns before the frame is exported.
# DataFrame.drop returns a new frame, so the result has to be bound back to
# `df`; without the assignment the identifiers stay in the data.
# errors="ignore" keeps the cell re-runnable once the columns are gone.
df = df.drop(columns=["Patient ID", "Patient Name"], errors="ignore")
df.head()

In [None]:
# Write the current state of `df` (every column present at this point) to
# Final.csv without the index. The "Symptoms" column holds Python lists here,
# so it is written as their string representation.
df.to_csv("Final.csv", index=False)

# Modeling

In [None]:
# Feature matrix and target for the classifier.
# .copy() makes X an independent frame: it is modified during scaling, and
# writing into a bare column selection of `df` triggers pandas'
# SettingWithCopyWarning.
feature_cols = numeric_cols + categorical_cols + symptom_cols
X = df[feature_cols].copy()
y = df["Diagnosis_New"]

In [None]:
# Split BEFORE scaling so the scaler only ever sees training rows; fitting it
# on the full matrix would leak the test set's min/max into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True
)
# Independent copies so the column assignments below never write into X.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training rows, then apply the same transform to the test rows
# (test values may fall slightly outside [0, 1]). Whole-column assignment
# replaces the columns, so integer columns become float without a dtype clash.
# X itself is left unscaled.
scaler = MinMaxScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# Baseline model: fixed hyperparameters, no L1/L2 regularization.
baseline_params = {
    "eval_metric": "logloss",
    "random_state": 42,
    "max_depth": 7,
    "n_estimators": 200,
    "learning_rate": 0.1,
    "reg_alpha": 0.0,
    "reg_lambda": 0.0,
}
xgb = XGBClassifier(**baseline_params)
xgb.fit(X_train, y_train)

# Accuracy on the held-out split, reported as a percentage.
y_pred_xgb = xgb.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", round(baseline_accuracy * 100, 2), "%")

In [None]:
# Exhaustive search over 3*3*3*2*2*2*2 = 432 settings with 5-fold CV
# (2160 fits), scored by accuracy and parallelized across all cores.
param_grid = dict(
    n_estimators=[200, 300, 400],
    max_depth=[5, 7, 9],
    learning_rate=[0.05, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_alpha=[0.0, 0.5],
    reg_lambda=[0.0, 0.5],
)
base_estimator = XGBClassifier(eval_metric="logloss", random_state=42)
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Evaluate the best configuration found by the search on the held-out split.
best_xgb = grid_search.best_estimator_
y_pred_best = best_xgb.predict(X_test)
optimized_accuracy = accuracy_score(y_test, y_pred_best)
print("Optimized XGBoost Accuracy:", round(optimized_accuracy * 100, 2), "%")
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Persist the tuned classifier (joblib is imported with the other
# dependencies in the first cell).
# NOTE(review): only the model is saved; the fitted scaler, symptom binarizer
# and category encodings are presumably also needed to score new records.
joblib.dump(best_xgb, 'stroke_model.pkl')