In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:

# Load the dataset
diabetes_dataset = pd.read_csv('newDiabetes.csv')

# Function for preprocessing data
def preprocess_data(df):
    # Create a copy to avoid modifying original data
    df = df.copy()
    
    # Drop ID and No_Pation columns
    df = df.drop(['ID', 'No_Pation'], axis=1)
    
    # Convert Gender to numeric
    df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})
        # Handle CLASS (target variable)
    df['CLASS'] = df['CLASS'].map({'N': 0, 'Y': 1, 'P': 2})
    
    # Check for any invalid values in numeric columns
    numeric_columns = ['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    
    # Remove rows with any invalid values
    df = df.dropna()
    
    return df

In [3]:
# Apply preprocessing
processed_df = preprocess_data(diabetes_dataset)
print("Processed Dataset Shape:", processed_df.shape)
print("\nClass Distribution:\n", processed_df['CLASS'].value_counts())

# Cell 3: Feature Engineering - Add polynomial features and interactions
def engineer_features(df):
    # Create a copy of dataframe
    df_new = df.copy()
    
    # Create HbA1c to BMI ratio (important diabetes indicator)
    df_new['HbA1c_BMI_ratio'] = df_new['HbA1c'] / df_new['BMI']
    
    # Create Cholesterol to HDL ratio (important cardiovascular risk factor)
    df_new['Chol_HDL_ratio'] = df_new['Chol'] / df_new['HDL']
    
    # Create TG to HDL ratio (insulin resistance marker)
    df_new['TG_HDL_ratio'] = df_new['TG'] / df_new['HDL']
    
    # Age group categorical feature
    df_new['Age_Group'] = pd.cut(df_new['AGE'], 
                                bins=[0, 30, 45, 60, 100], 
                                labels=[0, 1, 2, 3])
    
    # BMI category (underweight, normal, overweight, obese)
    df_new['BMI_Category'] = pd.cut(df_new['BMI'], 
                                   bins=[0, 18.5, 25, 30, 100], 
                                   labels=[0, 1, 2, 3])
    
    return df_new
    

Processed Dataset Shape: (994, 12)

Class Distribution:
 1.0    839
0.0    102
2.0     53
Name: CLASS, dtype: int64


In [4]:
# Apply feature engineering
enhanced_df = engineer_features(processed_df)
print("\nEnhanced Dataset Shape:", enhanced_df.shape)
print("\nNew Features:", [col for col in enhanced_df.columns if col not in processed_df.columns])

# Cell 4: Split features and target
X = enhanced_df.drop('CLASS', axis=1)
y = enhanced_df['CLASS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)


Enhanced Dataset Shape: (994, 17)

New Features: ['HbA1c_BMI_ratio', 'Chol_HDL_ratio', 'TG_HDL_ratio', 'Age_Group', 'BMI_Category']


In [6]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Cell 5: Feature Selection
# Use Random Forest for feature selection
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X_train_scaled, y_train)

# Select most important features
selector = SelectFromModel(rf_selector, threshold='mean')
selector.fit(X_train_scaled, y_train)

# Transform the data to include only selected features
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Get selected feature indices
selected_indices = selector.get_support()
selected_features = X.columns[selected_indices]
print("\nSelected Features:", selected_features.tolist())


Selected Features: ['AGE', 'HbA1c', 'BMI', 'HbA1c_BMI_ratio', 'BMI_Category']


In [None]:

# Define parameters for Gradient Boosting
gb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'subsample': [0.8, 1.0]
}

# Initialize Gradient Boosting classifier
gb = GradientBoostingClassifier(random_state=42)

# Setup StratifiedKFold for more robust cross-validation
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# Perform GridSearchCV
gb_grid_search = GridSearchCV(
    estimator=gb,
    param_grid=gb_param_grid,
    cv=stratified_cv,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy'
)

# Train the Gradient Boosting model
print("\nTraining Gradient Boosting model...")
gb_grid_search.fit(X_train_selected, y_train)

# Get best Gradient Boosting model
gb_best_model = gb_grid_search.best_estimator_

# Make predictions with Gradient Boosting
gb_y_pred = gb_best_model.predict(X_test_selected)


Training Gradient Boosting model...
Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:
# Print Gradient Boosting results
print("\nGradient Boosting Best Parameters:", gb_grid_search.best_params_)
print(f"\nGradient Boosting Accuracy Score: {accuracy_score(y_test, gb_y_pred):.4f}")
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, gb_y_pred))




Gradient Boosting Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 1.0}

Gradient Boosting Accuracy Score: 0.9698

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.80      0.86        20
         1.0       0.97      0.99      0.98       168
         2.0       1.00      0.91      0.95        11

    accuracy                           0.97       199
   macro avg       0.97      0.90      0.93       199
weighted avg       0.97      0.97      0.97       199



In [None]:

# Define parameters for Neural Network
nn_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [500]
}
# Initialize Neural Network classifier
nn = MLPClassifier(random_state=42, early_stopping=True)

# Perform GridSearchCV
nn_grid_search = GridSearchCV(
    estimator=nn,
    param_grid=nn_param_grid,
    cv=stratified_cv,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy'
)

# Train the Neural Network model
print("\nTraining Neural Network model...")
nn_grid_search.fit(X_train_scaled, y_train)


Training Neural Network model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [11]:
# Get best Neural Network model
nn_best_model = nn_grid_search.best_estimator_

# Make predictions with Neural Network
nn_y_pred = nn_best_model.predict(X_test_scaled)

# Print Neural Network results
print("\nNeural Network Best Parameters:", nn_grid_search.best_params_)
print(f"\nNeural Network Accuracy Score: {accuracy_score(y_test, nn_y_pred):.4f}")
print("\nNeural Network Classification Report:")
print(classification_report(y_test, nn_y_pred))


Neural Network Best Parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant', 'max_iter': 500}

Neural Network Accuracy Score: 0.8894

Neural Network Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.95      0.66        20
         1.0       0.98      0.94      0.96       168
         2.0       0.00      0.00      0.00        11

    accuracy                           0.89       199
   macro avg       0.49      0.63      0.54       199
weighted avg       0.88      0.89      0.88       199



In [None]:

# Create a voting classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_selector),
        ('gb', gb_best_model),
        ('nn', nn_best_model)
    ],
    voting='soft'  # Use predicted probabilities
)

# Train the voting classifier
print("\nTraining Voting Classifier...")
# Need to handle feature dimension differences
voting_clf.fit(X_train_scaled, y_train)  # Use scaled features for all

# Make predictions with Voting Classifier
voting_y_pred = voting_clf.predict(X_test_scaled)


Training Voting Classifier...


In [13]:
# Print Voting Classifier results
print(f"\nVoting Classifier Accuracy Score: {accuracy_score(y_test, voting_y_pred):.4f}")
print("\nVoting Classifier Classification Report:")
print(classification_report(y_test, voting_y_pred))

# Cell 9: Compare all models
print("\n==== MODEL COMPARISON ====")
models = {
    "Random Forest": rf_selector,
    "Gradient Boosting": gb_best_model,
    "Neural Network": nn_best_model,
    "Voting Classifier": voting_clf
}


Voting Classifier Accuracy Score: 0.9749

Voting Classifier Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.95      0.93        20
         1.0       0.98      1.00      0.99       168
         2.0       1.00      0.64      0.78        11

    accuracy                           0.97       199
   macro avg       0.96      0.86      0.90       199
weighted avg       0.98      0.97      0.97       199


==== MODEL COMPARISON ====


In [14]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test, model_name):
    if model_name == "Gradient Boosting":
        y_pred = model.predict(X_test_selected)
    else:
        y_pred = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Calculate weighted F1 score
    f1 = report['weighted avg']['f1-score']
    
    return accuracy, f1

In [15]:
# Evaluate all models
results = {}
for name, model in models.items():
    accuracy, f1 = evaluate_model(model, X_test_scaled, y_test, name)
    results[name] = {
        'Accuracy': accuracy,
        'F1 Score': f1
    }

# Display results as a table
results_df = pd.DataFrame.from_dict(results, orient='index')
print("\nModel Performance:")
print(results_df.sort_values('Accuracy', ascending=False))


Model Performance:
                   Accuracy  F1 Score
Voting Classifier  0.974874  0.972892
Random Forest      0.969849  0.964554
Gradient Boosting  0.969849  0.968888
Neural Network     0.889447  0.876709


In [None]:
# Identify the best model
best_model_name = results_df['Accuracy'].idxmax()
print(f"\nBest Model: {best_model_name} with accuracy {results_df.loc[best_model_name, 'Accuracy']:.4f}")




Best Model: Voting Classifier with accuracy 0.9749


In [None]:

# Get the best model
if best_model_name == "Gradient Boosting":
    best_model = gb_best_model
    # Save additional components needed for this model
    joblib.dump(selector, 'feature_selector.joblib')
    print("Feature selector saved as 'feature_selector.joblib'")
elif best_model_name == "Random Forest":
    best_model = rf_selector
elif best_model_name == "Neural Network":
    best_model = nn_best_model
else:
    best_model = voting_clf
# Save the best model
joblib.dump(best_model, 'best_diabetes_model.joblib')
joblib.dump(scaler, 'best_scaler.joblib')
print(f"\nBest model ({best_model_name}) saved as 'best_diabetes_model.joblib'")
print("Scaler saved as 'best_scaler.joblib'")

# Cell 11: Function for making predictions
def predict_diabetes(input_data=None):
    """
    Make predictions for new data using the best model
    
    Parameters:
    input_data (dict): Dictionary containing patient information
    
    Returns:
    dict: Prediction results
    """
    if input_data is None:
        # Sample data
        input_data = {
            'ID': 421,
            'No_Pation': 34227,
            'Gender': 'M',
            'AGE': 48,
            'Urea': 4.7,
            'Cr': 47,
            'HbA1c': 4,
            'Chol': 2.9,
            'TG': 0.8,
            'HDL': 0.9,
            'LDL': 1.6,
            'VLDL': 0.4,
            'BMI': 24
        }
    
    try:
        # Convert input to DataFrame
        input_df = pd.DataFrame([input_data])
        
        # Preprocess input
        input_df = preprocess_data(input_df)
        
        # Apply feature engineering
        input_df = engineer_features(input_df)
        
        # Scale features
        input_scaled = scaler.transform(input_df)
        
        # Apply feature selection if needed
        if best_model_name == "Gradient Boosting":
            input_scaled = selector.transform(input_scaled)
        # Make prediction
        prediction = best_model.predict(input_scaled)
        probabilities = best_model.predict_proba(input_scaled)
        
        # Map prediction to class
        class_map = {0: 'Non-Diabetic', 1: 'Diabetic', 2: 'Pre-Diabetic'}
        result = class_map[prediction[0]]
        
        # Create probability dictionary
        prob_dict = {}
        for i, label in enumerate(['Non-Diabetic', 'Diabetic', 'Pre-Diabetic']):
            if i < len(probabilities[0]):
                prob_dict[label] = probabilities[0][i]
            else:
                prob_dict[label] = 0.0
        return {
            'prediction': result,
            'probabilities': prob_dict
        }
    
    except Exception as e:
        return f"Error making prediction: {str(e)}"


Best model (Voting Classifier) saved as 'best_diabetes_model.joblib'
Scaler saved as 'best_scaler.joblib'


In [18]:
# Test the model with sample data
sample_result = predict_diabetes()
print("\nSample Prediction Result:")
if isinstance(sample_result, dict):
    print(f"Prediction: {sample_result['prediction']}")
    print("\nProbabilities:")
    for class_name, prob in sample_result['probabilities'].items():
        print(f"{class_name}: {prob:.4f}")
else:
    print(sample_result)


Sample Prediction Result:
Error making prediction: 'CLASS'
