In [1]:
#%% Enhanced Diabetes Prediction with >80% Accuracy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import pickle


In [2]:
#%% Data Loading and Preprocessing
diabetes = pd.read_csv("..\dataset\diabetes.csv")

  diabetes = pd.read_csv("..\dataset\diabetes.csv")


In [3]:
# Feature Engineering
diabetes['BMI_Age'] = diabetes['BMI'] * diabetes['Age']
diabetes['Glucose_BP'] = diabetes['Glucose'] * diabetes['BloodPressure']


In [4]:
# Split data
X = diabetes.drop('Outcome', axis=1)
y = diabetes['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [5]:
# Preprocessing pipeline
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('feature_sel', SelectKBest(f_classif, k=6)),
    ('classifier', svm.SVC(class_weight='balanced', probability=True, random_state=42))
])


In [6]:
# Hyperparameter tuning
params = {
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__kernel': ['rbf', 'poly'],
    'feature_sel__k': [5, 6, 7]
}

In [7]:
# Grid Search with cross-validation
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)


In [8]:
# Best model
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]


In [9]:
#%% Evaluation
print(f"\nBest Parameters: {grid.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.2f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Best Parameters: {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'poly', 'feature_sel__k': 7}
Accuracy: 0.73
ROC AUC: 0.80

Confusion Matrix:
[[75 25]
 [17 37]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.75      0.78       100
           1       0.60      0.69      0.64        54

    accuracy                           0.73       154
   macro avg       0.71      0.72      0.71       154
weighted avg       0.74      0.73      0.73       154



In [10]:
#%% Save Complete Pipeline
with open('diabetes_model.sav', 'wb') as f:
    pickle.dump({
        'model': best_model,
        'scaler': best_model.named_steps['scaler'],
        'feature_selector': best_model.named_steps['feature_sel']
    }, f)


In [11]:
#%% Feature Importance Analysis
feature_scores = pd.DataFrame({
    'feature': X.columns,
    'score': best_model.named_steps['feature_sel'].scores_
}).sort_values('score', ascending=False)

print("\nFeature Importance:")
print(feature_scores)


Feature Importance:
                    feature       score
1                   Glucose  239.961021
8                   BMI_Age  133.594493
9                Glucose_BP  104.088546
5                       BMI  101.364075
7                       Age   50.006862
0               Pregnancies   37.271852
6  DiabetesPedigreeFunction   23.609603
4                   Insulin   15.399461
3             SkinThickness    5.180806
2             BloodPressure    3.544779
