# Diabetes Model Training
## Training with Best Accuracy Optimization

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully!")

‚úÖ Libraries imported successfully!


## 1. Load and Explore Dataset

In [5]:
# Load the dataset
print("Loading diabetes dataset...")
df = pd.read_csv('../diabetes.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Loading diabetes dataset...
Dataset shape: (768, 9)

Columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Dataset information
print("Dataset Info:")
print(df.info())
print(f"\nMissing values:")
print(df.isnull().sum())
print(f"\nTarget distribution:")
print(df['Outcome'].value_counts())
print(f"\nDataset statistics:")
print(df.describe())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

Missing values:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction

## 2. Prepare Data

In [7]:
# Separate features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

print(f"Features shape: {X.shape}")
print(f"Features: {X.columns.tolist()}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n‚úÖ Data prepared:")
print(f"  Training set: {X_train.shape}")
print(f"  Test set: {X_test.shape}")

Features shape: (768, 8)
Features: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

‚úÖ Data prepared:
  Training set: (614, 8)
  Test set: (154, 8)


## 3. Train and Evaluate Base Models

In [8]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf', probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = model.predict(X_test_scaled)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()
    
    results[name] = {'accuracy': accuracy, 'cv_mean': cv_mean, 'cv_std': cv_std}
    
    print(f"\n{name} Results:")
    print(f"  Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  Cross-Val Accuracy: {cv_mean:.4f} (+/- {cv_std:.4f})")
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes']))


Training Logistic Regression...

Logistic Regression Results:
  Test Accuracy: 0.7143 (71.43%)
  Cross-Val Accuracy: 0.7785 (+/- 0.0126)

Classification Report:
              precision    recall  f1-score   support

 No Diabetes       0.76      0.82      0.79       100
    Diabetes       0.61      0.52      0.56        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154


Training Random Forest...

Random Forest Results:
  Test Accuracy: 0.7597 (75.97%)
  Cross-Val Accuracy: 0.7639 (+/- 0.0314)

Classification Report:
              precision    recall  f1-score   support

 No Diabetes       0.79      0.85      0.82       100
    Diabetes       0.68      0.59      0.63        54

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154


Training SVM (RBF Kernel)...

SVM

In [9]:
# Summary of base models
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('accuracy', ascending=False)
print("\nüìä Base Model Comparison:")
print(results_df)


üìä Base Model Comparison:
                     accuracy   cv_mean    cv_std
Random Forest        0.759740  0.763881  0.031437
SVM (RBF Kernel)     0.753247  0.754072  0.016562
Gradient Boosting    0.753247  0.749274  0.043044
Logistic Regression  0.714286  0.778529  0.012559


## 4. Hyperparameter Tuning

In [10]:
# Random Forest Tuning
print("\nüîß Tuning Random Forest...")
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

rf_grid.fit(X_train_scaled, y_train)
rf_best = rf_grid.best_estimator_
rf_accuracy = accuracy_score(y_test, rf_best.predict(X_test_scaled))

print(f"\n‚úÖ Random Forest Best Params: {rf_grid.best_params_}")
print(f"‚úÖ Random Forest Tuned Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")


üîß Tuning Random Forest...
Fitting 5 folds for each of 108 candidates, totalling 540 fits

‚úÖ Random Forest Best Params: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
‚úÖ Random Forest Tuned Accuracy: 0.7532 (75.32%)


In [11]:
# SVM Tuning
print("\nüîß Tuning SVM...")
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    'kernel': ['rbf', 'linear']
}

svm_grid = GridSearchCV(
    SVC(probability=True, random_state=42),
    svm_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

svm_grid.fit(X_train_scaled, y_train)
svm_best = svm_grid.best_estimator_
svm_accuracy = accuracy_score(y_test, svm_best.predict(X_test_scaled))

print(f"\n‚úÖ SVM Best Params: {svm_grid.best_params_}")
print(f"‚úÖ SVM Tuned Accuracy: {svm_accuracy:.4f} ({svm_accuracy*100:.2f}%)")


üîß Tuning SVM...
Fitting 5 folds for each of 40 candidates, totalling 200 fits

‚úÖ SVM Best Params: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
‚úÖ SVM Tuned Accuracy: 0.7143 (71.43%)


In [12]:
# Gradient Boosting Tuning
print("\nüîß Tuning Gradient Boosting...")
gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gb_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

gb_grid.fit(X_train_scaled, y_train)
gb_best = gb_grid.best_estimator_
gb_accuracy = accuracy_score(y_test, gb_best.predict(X_test_scaled))

print(f"\n‚úÖ Gradient Boosting Best Params: {gb_grid.best_params_}")
print(f"‚úÖ Gradient Boosting Tuned Accuracy: {gb_accuracy:.4f} ({gb_accuracy*100:.2f}%)")


üîß Tuning Gradient Boosting...
Fitting 5 folds for each of 72 candidates, totalling 360 fits

‚úÖ Gradient Boosting Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
‚úÖ Gradient Boosting Tuned Accuracy: 0.7468 (74.68%)


## 5. Select Best Model and Save

In [13]:
# Compare tuned models
tuned_models = {
    "Random Forest (Tuned)": (rf_best, rf_accuracy),
    "SVM (Tuned)": (svm_best, svm_accuracy),
    "Gradient Boosting (Tuned)": (gb_best, gb_accuracy)
}

final_best_name = max(tuned_models.items(), key=lambda x: x[1][1])[0]
final_best_model = tuned_models[final_best_name][0]
final_best_accuracy = tuned_models[final_best_name][1]

print("\n" + "="*60)
print("üèÜ FINAL BEST MODEL")
print("="*60)
print(f"\nBest Model: {final_best_name}")
print(f"Best Accuracy: {final_best_accuracy:.4f} ({final_best_accuracy*100:.2f}%)")

# Final evaluation
y_pred_final = final_best_model.predict(X_test_scaled)
print(f"\nüìã Final Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=['No Diabetes', 'Diabetes']))
print(f"\nüìä Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))


üèÜ FINAL BEST MODEL

Best Model: Random Forest (Tuned)
Best Accuracy: 0.7532 (75.32%)

üìã Final Classification Report:
              precision    recall  f1-score   support

 No Diabetes       0.79      0.84      0.82       100
    Diabetes       0.67      0.59      0.63        54

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.75      0.75      0.75       154


üìä Confusion Matrix:
[[84 16]
 [22 32]]


In [14]:
# Save the best model and scaler
model_filename = 'diabetes_model.sav'
scaler_filename = 'diabetes_scaler.sav'

pickle.dump(final_best_model, open(model_filename, 'wb'))
pickle.dump(scaler, open(scaler_filename, 'wb'))

print(f"\n‚úÖ Model saved to: {model_filename}")
print(f"‚úÖ Scaler saved to: {scaler_filename}")
print(f"\nModel Type: {type(final_best_model).__name__}")
print(f"Number of features: {X.shape[1]}")
print(f"Feature names: {X.columns.tolist()}")
print("\nüéâ Training complete!")


‚úÖ Model saved to: diabetes_model.sav
‚úÖ Scaler saved to: diabetes_scaler.sav

Model Type: RandomForestClassifier
Number of features: 8
Feature names: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

üéâ Training complete!
