# Disease Prediction using SVM, Random Forest, and Gradient Boosting
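This notebook uses the Pima Indians Diabetes dataset to compare three supervised classifiers: a support vector machine, a random forest, and gradient boosting. Each model is tuned with 5-fold cross-validated grid search (scored by ROC AUC) and then evaluated on a held-out 20% test split.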
- Dataset Source: OpenML `diabetes`
- Target: the `class` column, i.e. tested positive (encoded as 1) or negative (encoded as 0) for diabetes


In [None]:
# Load libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Load dataset
data = fetch_openml(name='diabetes', version=1, as_frame=True)
df = data.frame
X = df.drop(columns='class')
# Encode the target as integers: 1 for 'tested_positive', 0 for anything else.
# A vectorised comparison avoids a per-row Python lambda and yields a plain
# integer Series.
y = (df['class'] == 'tested_positive').astype(int)
# Stratify so train and test keep the same class ratio; the fixed seed makes
# the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


## Hyperparameter Tuning & Model Training

In [None]:
# SVM GridSearch
# SVMs are sensitive to feature scale, so standardise the inputs. Keeping the
# scaler inside the pipeline means it is re-fitted on each CV training fold,
# so no validation-fold statistics leak into the fit.
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # random_state seeds the internal shuffling used for probability estimates.
    ('svc', SVC(probability=True, random_state=42)),
])
# One sub-grid per kernel: `gamma` has no effect on the linear kernel, so
# crossing it with 'linear' would only repeat identical fits.
param_grid_svm = [
    {'svc__kernel': ['linear'], 'svc__C': [0.1, 1, 10]},
    {'svc__kernel': ['rbf'], 'svc__C': [0.1, 1, 10], 'svc__gamma': ['scale', 'auto']},
]
grid_svm = GridSearchCV(svm_pipeline, param_grid_svm, cv=5, scoring='roc_auc')
grid_svm.fit(X_train, y_train)
# The best estimator is the whole refitted pipeline (scaler + SVC), so
# predict / predict_proba accept raw, unscaled features.
best_svm = grid_svm.best_estimator_

In [None]:
# Random Forest GridSearch
param_grid_rf = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}
# Seed the forest: without a fixed random_state the bootstrap samples and
# feature sub-sampling differ on every run, so the selected hyperparameters
# and the reported metrics would not be reproducible.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='roc_auc',
)
grid_rf.fit(X_train, y_train)
# Best configuration, refitted on the full training split.
best_rf = grid_rf.best_estimator_

In [None]:
# Gradient Boosting GridSearch
param_grid_gbm = {'n_estimators': [100, 200], 'learning_rate': [0.1, 0.05], 'max_depth': [3, 4]}
# Seed the booster so repeated runs select the same hyperparameters and
# report the same metrics.
grid_gbm = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid_gbm,
    cv=5,
    scoring='roc_auc',
)
grid_gbm.fit(X_train, y_train)
# Best configuration, refitted on the full training split.
best_gbm = grid_gbm.best_estimator_

## Model Evaluation

In [None]:
def evaluate_model(model, X=None, y=None):
    """Score a fitted binary classifier on a labelled dataset.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X : array-like, optional
        Features to score. Defaults to the notebook-level ``X_test``.
    y : array-like, optional
        True binary labels for ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    dict
        Metric name -> value for 'Accuracy', 'Precision', 'Recall',
        'F1 Score' and 'AUC-ROC'.

    Raises
    ------
    ValueError
        If only one of ``X`` / ``y`` is supplied; pairing caller-supplied
        features with the default labels (or vice versa) would be meaningless.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together or both omitted")
    if X is None:
        # Backward-compatible default: evaluate on the held-out test split.
        X, y = X_test, y_test

    y_pred = model.predict(X)
    # Column 1 of predict_proba is the probability of the positive class (label 1),
    # which is what ROC AUC needs instead of hard predictions.
    y_prob = model.predict_proba(X)[:, 1]
    return {
        'Accuracy': accuracy_score(y, y_pred),
        'Precision': precision_score(y, y_pred),
        'Recall': recall_score(y, y_pred),
        'F1 Score': f1_score(y, y_pred),
        'AUC-ROC': roc_auc_score(y, y_prob)
    }

# Evaluate every tuned model on the held-out split, keyed by display label.
results = {
    label: evaluate_model(estimator)
    for label, estimator in (
        ('SVM', best_svm),
        ('Random Forest', best_rf),
        ('Gradient Boosting', best_gbm),
    )
}
# Transpose so each row is a model and each column a metric.
results_df = pd.DataFrame(results).T
results_df

## Visualization of Results

In [None]:
# Grouped bar chart: one group per model, one bar per metric.
ax = results_df.plot.bar(figsize=(12, 6), title='Model Comparison', rot=0)
ax.set_ylabel('Score')
ax.figure.tight_layout()
plt.show()