In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer

In [2]:
# Load the dataset
data = pd.read_csv("healthcare-dataset-stroke-data.csv")


In [3]:
# Data Preprocessing
imputer = KNNImputer(n_neighbors=5, weights='uniform')
data['bmi'] = imputer.fit_transform(data[['bmi']])
data.drop(data[data.gender == 'Other'].index, inplace=True)
data.reset_index(drop=True, inplace=True)
data.gender = data.gender.map({'Male': 0, 'Female': 1})
data.ever_married = data.ever_married.map({'No': 0, 'Yes': 1})
data.work_type = data.work_type.map({'Private': 0, 'Self-employed': 1, 'children': 2, 'Govt_job': 3, 'Never_worked': 4})
data.Residence_type = data.Residence_type.map({'Urban': 0, 'Rural': 1})
data.smoking_status = data.smoking_status.map({'never smoked': 0, 'formerly smoked': 1, 'smokes': 2, 'Unknown': 3})
numeric_features = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])


In [4]:
# Dropped the 'id' column and separate features from target
data = data.drop('id', axis=1)
X = data.drop('stroke', axis=1)
y = data['stroke']

In [5]:
# Feature Scaling and Polynomial Features
numeric_features = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

In [6]:
# Generate polynomial features for the numeric columns
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(data[numeric_features])
poly_columns = poly.get_feature_names_out(numeric_features)
X_poly_df = pd.DataFrame(X_poly, columns=poly_columns)

# Combine polynomial features back into the dataset
data = data.drop(numeric_features, axis=1)
data = pd.concat([data, X_poly_df], axis=1)

In [7]:
# Oversample minority class using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)


In [8]:
# Use StratifiedKFold to maintain class proportions
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
# Initialized lists to store accuracy scores
knn_scores = []
rf_scores = []
gbc_scores = []
combined_scores = []

In [10]:
# Initialized models
knn = KNeighborsClassifier(n_neighbors=3)
rf = RandomForestClassifier(n_estimators=200,max_depth=10, min_samples_split=2, random_state=42)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)


In [11]:
# Performed cross-validation manually to average predictions for the combined model
for train_index, test_index in skf.split(X_resampled, y_resampled):
    X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]


    # Train both models
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    gbc.fit(X_train, y_train)


    # Get predictions
    y_pred_knn = knn.predict(X_test)
    y_pred_rf = rf.predict(X_test)
    y_pred_gbc = gbc.predict(X_test)


    # Combine predictions by averaging and rounding to the nearest integer
    combined_predictions = np.round((y_pred_knn + y_pred_rf) / 2).astype(int)

    # Calculate accuracy for each model
    knn_scores.append(accuracy_score(y_test, y_pred_knn))
    rf_scores.append(accuracy_score(y_test, y_pred_rf))
    gbc_scores.append(accuracy_score(y_test, y_pred_gbc))

    combined_scores.append(accuracy_score(y_test, combined_predictions))

In [12]:
# Display cross-validated accuracy scores
print(f"KNN average accuracy: {np.mean(knn_scores)*100:.4f}")
print(f"Random Forest average accuracy: {np.mean(rf_scores)*100:.4f}")
print(f"Gradient Boosting average accuracy: {np.mean(gbc_scores)*100:.4f}")
print(f"Combined Model (KNN + RF) average accuracy: {np.mean(combined_scores)*100:.4f}")

KNN average accuracy: 89.6296
Random Forest average accuracy: 89.2490
Gradient Boosting average accuracy: 84.5165
Combined Model (KNN + RF) average accuracy: 91.6975


In [13]:
# Calculate average accuracies for each model
avg_accuracy_combined = np.mean(combined_scores)
avg_accuracy_gbc = np.mean(gbc_scores)
avg_accuracy_knn = np.mean(knn_scores)
avg_accuracy_rf = np.mean(rf_scores)

# Calculate variances (differences) between the combined model and each individual model
variance_combined_vs_gbc = avg_accuracy_combined - avg_accuracy_gbc
variance_combined_vs_knn = avg_accuracy_combined - avg_accuracy_knn
variance_combined_vs_rf = avg_accuracy_combined - avg_accuracy_rf

# Print the average accuracy scores
print(f"Accuracy score using Combined Model (KNN + RF): {avg_accuracy_combined * 100:.4f}%")
print(f"Accuracy score using Gradient Boosting: {avg_accuracy_gbc * 100:.4f}%")
print(f"Accuracy score using KNN: {avg_accuracy_knn * 100:.4f}%")
print(f"Accuracy score using Random Forest: {avg_accuracy_rf * 100:.4f}%")

# Print the accuracy variances
print(f"Variance between Combined Model (KNN + RF) and KNN: {variance_combined_vs_knn * 100:.4f}%")
print(f"Variance between Combined Model (KNN + RF) and Random Forest: {variance_combined_vs_rf * 100:.4f}%")

Accuracy score using Combined Model (KNN + RF): 91.6975%
Accuracy score using Gradient Boosting: 84.5165%
Accuracy score using KNN: 89.6296%
Accuracy score using Random Forest: 89.2490%
Variance between Combined Model (KNN + RF) and KNN: 2.0679%
Variance between Combined Model (KNN + RF) and Random Forest: 2.4486%
