In [None]:
"""
Standalone Python script to train the diabetes classification model
"""

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import joblib

print("="*70)
print("DIABETES CLASSIFICATION MODEL TRAINING")
print("="*70)

# Load dataset
# The CSV location defaults to the original Google Drive path but can be
# overridden through the DIABETES_DATA_PATH environment variable, so the
# notebook is not tied to one Colab session layout.
import os

DATA_PATH = os.environ.get(
    'DIABETES_DATA_PATH',
    '/content/drive/MyDrive/diabites/diabetes_classification.csv',
)

print("\n1. Loading dataset...")
if not os.path.isfile(DATA_PATH):
    # Fail early with an actionable message; the exception type is the same
    # one pd.read_csv would raise for a missing file.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Mount Google Drive or set the "
        "DIABETES_DATA_PATH environment variable to the CSV location."
    )
df = pd.read_csv(DATA_PATH)
print(f"✓ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# Remove the unwanted index column when it is present. Reassigning instead of
# dropping in place keeps this step safe to re-run.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


# Preprocessing
print("\n2. Preprocessing data...")
# Predictors are every column except 'Diagnosis'; 'Diagnosis' is the target.
X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

# Encode Gender
label_encoder = LabelEncoder()
X['Gender'] = label_encoder.fit_transform(X['Gender'])
# Report the mapping the encoder actually learned rather than asserting a
# fixed one: each class is encoded as its position in `classes_`, so any
# unexpected category in the data shows up here.
gender_mapping = ", ".join(
    f"{label}={code}" for code, label in enumerate(label_encoder.classes_)
)
print(f"   ✓ Gender encoded ({gender_mapping})")

# Split data
# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"   ✓ Data split: {len(X_train)} train, {len(X_test)} test")

# Scale features
# The scaler is fitted on the training split only and then applied to the
# test split, so test rows do not influence the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("   ✓ Features scaled")

# Train models
print("\n3. Training multiple models...")
# Candidate classifiers. SVC is created with probability=True because
# predict_proba is called on every model below.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42)
}

# Number of folds used for the model-selection score.
CV_FOLDS = 5

results = {}
best_accuracy = 0          # test accuracy of the currently selected model
best_cv_accuracy = -1.0    # selection criterion; below any real accuracy
best_model_name = None
best_model = None

for name, model in models.items():
    print(f"\n   Training {name}...")

    # Model selection uses cross-validated accuracy on the training split
    # only. Picking the winner by test-set accuracy would make the reported
    # test metrics of the "best" model optimistically biased.
    # cross_val_score fits clones, so `model` itself is still unfitted here.
    # NOTE: the folds share scaling statistics computed on the whole training
    # split, which is a mild approximation of a fully nested pipeline.
    cv_accuracy = cross_val_score(
        model, X_train_scaled, y_train, cv=CV_FOLDS, scoring='accuracy'
    ).mean()

    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Held-out test metrics: reported for every model, not used for selection.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'CV Accuracy': cv_accuracy
    }

    print(
        f"      Accuracy: {accuracy:.4f} | F1: {f1:.4f} | "
        f"ROC-AUC: {roc_auc:.4f} | CV Accuracy: {cv_accuracy:.4f}"
    )

    if cv_accuracy > best_cv_accuracy:
        best_cv_accuracy = cv_accuracy
        best_accuracy = accuracy
        best_model_name = name
        best_model = model

# Results
separator = "=" * 70

print("\n" + separator)
print("4. MODEL COMPARISON RESULTS")
print(separator)
# One row per model and one column per metric, ordered by descending accuracy.
metrics_by_model = pd.DataFrame(results).T
results_df = metrics_by_model.sort_values('Accuracy', ascending=False)
print(results_df.to_string())

print("\n" + separator)
print(f"5. BEST MODEL: {best_model_name}")
print(separator)
# List every recorded metric of the selected model to four decimals.
best_metrics = results[best_model_name]
for metric in best_metrics:
    print(f"   {metric}: {best_metrics[metric]:.4f}")

# Save models
print("\n6. Saving model and preprocessing objects...")
# Column names of the feature frame, persisted alongside the fitted objects.
feature_names = X.columns.tolist()

# (object, output file, confirmation line), written in this order.
artifacts = [
    (best_model, 'diabetes_model.pkl', "    ✓ Model saved: diabetes_model.pkl"),
    (scaler, 'scaler.pkl', "    ✓ Scaler saved: scaler.pkl"),
    (label_encoder, 'label_encoder.pkl', "    ✓ Label encoder saved: label_encoder.pkl"),
    (feature_names, 'feature_names.pkl', "    ✓ Feature names saved: feature_names.pkl"),
]

for artifact, filename, confirmation in artifacts:
    joblib.dump(artifact, filename)
    print(confirmation)

DIABETES CLASSIFICATION MODEL TRAINING

1. Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/diabites/diabetes_classification.csv'

In [None]:
# Test prediction
print("\n7. Testing prediction...")
# Hand-written record used as a smoke test. Gender is supplied as its integer
# code because the training column was label-encoded before scaling.
sample = pd.DataFrame([{
    'Age': 50, 'Gender': 0, 'BMI': 24, 'Chol': 4.2,
    'TG': 0.9, 'HDL': 2.4, 'LDL': 1.4, 'Cr': 46, 'BUN': 4.7
}])
# Put the columns in the exact order used for training. The models were fitted
# on scaled arrays, so features are matched by position; selecting by
# `feature_names` also raises a KeyError if a training feature is missing
# from the sample instead of silently mis-scaling it.
sample = sample[feature_names]
sample_scaled = scaler.transform(sample)
prediction = best_model.predict(sample_scaled)[0]
prediction_proba = best_model.predict_proba(sample_scaled)[0]

# Label 1 is reported as diabetes; index 1 of the probability vector is shown
# as the diabetes risk.
print(f"    Prediction: {'DIABETES' if prediction == 1 else 'NO DIABETES'}")
print(f"    Confidence: {prediction_proba[1]*100:.1f}% diabetes risk")

print("\n" + "="*70)
print("✓ TRAINING COMPLETE!")
print("="*70)
print("\nYou can now run: python app.py")
print("="*70)




7. Testing prediction...
    Prediction: NO DIABETES
    Confidence: 31.9% diabetes risk

✓ TRAINING COMPLETE!

You can now run: python app.py
