In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier

# Step 1: Load Dataset
# Replace with actual dataset path
data = pd.read_csv("stroke_dataset.csv")

# Step 2: Handle Missing Values
# The filled column is assigned back instead of calling
# data[col].fillna(..., inplace=True): the chained in-place form operates on a
# temporary Series, which raises a FutureWarning on recent pandas and leaves
# `data` unchanged once Copy-on-Write is active.

# Impute missing BMI with the column median
data['bmi'] = data['bmi'].fillna(data['bmi'].median())

# Impute missing smoking status with the most frequent value (mode)
data['smoking_status'] = data['smoking_status'].fillna(data['smoking_status'].mode()[0])

# Step 3: Preprocessing
# Drop irrelevant columns (rebinding keeps the cell free of in-place mutation)
data = data.drop(columns=['Patient_ID', 'Residence_type'])

# Encode categorical variables; drop_first keeps k-1 dummies per k-level column
data = pd.get_dummies(data, drop_first=True)

# Step 4: Train-Test Split
# The split happens BEFORE scaling and resampling. Fitting the scaler or the
# sampler on all rows lets information from the held-out rows leak into
# training, and resampling before the split puts synthetic samples into the
# test set, which inflates every reported metric.
features = [col for col in data.columns if col != 'stroke']
X = data[features]
y = data['stroke']

# stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics come from the training rows only and are then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=features, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=features, index=X_test.index
)

# Step 5: Handle Class Imbalance
# Only the training partition is resampled; the test partition keeps the
# original rows and class distribution.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Step 6: Train XGBoost with Hyperparameter Tuning
# Base estimator; the search below clones it for every sampled configuration.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Candidate values the randomized search samples from.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# 10 sampled configurations, each scored by ROC AUC over 3 CV folds.
# NOTE(review): X_train has already been resampled upstream, so the CV score
# is measured on resampled data rather than on the original distribution.
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    random_state=42,
)
search.fit(X_train, y_train)

# Best configuration found by the search.
best_model = search.best_estimator_

# Step 7: Evaluate the Model
# Hard labels feed the confusion matrix and the classification report;
# column 1 of predict_proba is the score passed to ROC AUC.
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Each heading is followed by its metric on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nAUC-ROC Score:", roc_auc_score(y_test, y_prob), sep="\n")

# Optional: Save the model
# Serialises the tuned estimator to disk with joblib.
# NOTE(review): only the estimator is written; the fitted `scaler` that
# produced its training features is not persisted here.
import joblib
joblib.dump(value=best_model, filename="stroke_prediction_model.pkl")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks

# Step 1: Load Dataset
data = pd.read_csv("stroke_dataset.csv")

# Step 2: Handle Missing Values
# The filled column is assigned back instead of calling
# data[col].fillna(..., inplace=True): the chained in-place form operates on a
# temporary Series, which raises a FutureWarning on recent pandas and leaves
# `data` unchanged once Copy-on-Write is active.
data['bmi'] = data['bmi'].fillna(data['bmi'].median())
data['smoking_status'] = data['smoking_status'].fillna(data['smoking_status'].mode()[0])

# Drop irrelevant columns, then one-hot encode the categorical ones
# (drop_first keeps k-1 dummies per k-level column).
data = data.drop(columns=['Patient_ID', 'Residence_type'])
data = pd.get_dummies(data, drop_first=True)
# Hold out the test set BEFORE scaling or resampling. Fitting the scaler or a
# sampler on all rows leaks information from the held-out rows into training,
# and resampling before the split fills the test set with synthetic or cleaned
# samples, so the scores no longer reflect the real class distribution.
features = [col for col in data.columns if col != 'stroke']
X = data[features]
y = data['stroke']

# A single stratified split is shared by every sampler/model combination, so
# all of them are scored on the same untouched rows.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling statistics come from the training rows only.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=features, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=features, index=X_test_raw.index
)

# Step 3: Define Models
models = {
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True),  # probability=True enables predict_proba
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Step 4: Define Sampling Techniques
sampling_techniques = {
    "SMOTE": SMOTE(random_state=42),
    "Borderline SMOTE": BorderlineSMOTE(random_state=42),
    "ADASYN": ADASYN(random_state=42),
    "SMOTE-ENN": SMOTEENN(random_state=42),
    "Tomek Links": TomekLinks()
}

# Step 5: Evaluate Models with Each Sampling Technique
# Each sampler is applied to the training partition only; X_test / y_test are
# never resampled.
results = []
for sampling_name, sampler in sampling_techniques.items():
    print(f"\n=== Sampling Technique: {sampling_name} ===")
    if sampling_name == "Tomek Links":
        # Tomek Links is an under-sampling technique, so apply only after initial sampling
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
        X_resampled, y_resampled = sampler.fit_resample(X_resampled, y_resampled)
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train_raw)

    # Training data for this sampling technique
    X_train, y_train = X_resampled, y_resampled

    for model_name, model in models.items():
        print(f"\n--- Model: {model_name} ---")
        if model_name == "XGBoost":
            # Hyperparameter tuning for XGBoost
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [3, 5],
                'learning_rate': [0.01, 0.1]
            }
            search = RandomizedSearchCV(model, param_grid, cv=3, scoring='roc_auc', n_iter=5, random_state=42)
            search.fit(X_train, y_train)
            # With the default refit=True the best estimator has already been
            # refit on all of X_train, so no second fit is needed.
            model = search.best_estimator_
        else:
            # Train model (fit discards any state left from a previous sampler)
            model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

        # Evaluate on the untouched hold-out rows
        accuracy = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob) if y_prob is not None else "N/A"
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        results.append({
            "Sampling": sampling_name,
            "Model": model_name,
            "Accuracy": accuracy,
            "AUC": auc,
            "Sensitivity": report['1']['recall'],  # recall of class 1
            "Specificity": report['0']['recall'],  # recall of class 0
            "F1-Score": report['1']['f1-score'],
            "Confusion Matrix": cm
        })

# Step 6: Display Results
# One row per (sampling technique, model) pair collected by the loop above.
results_df = pd.DataFrame(data=results)
print("\n=== Results Summary ===", results_df, sep="\n")

# Optional: Save results
# The DataFrame index is left out of the file.
results_df.to_csv(path_or_buf="model_comparison_results.csv", index=False)
