In [14]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Function to preprocess datasets and save encoders/scalers
def preprocess_data(df, target_column, label_encoder_filename, scaler_filename, feature_filename):
    # Handle missing values
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].median())
    
    # Encode categorical features
    label_encoders = {}
    for col in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
    
    # Save label encoders
    joblib.dump(label_encoders, label_encoder_filename)
    
    # Define features and target
    
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Save feature names
    joblib.dump(list(X.columns), feature_filename)
    
    # Standardize numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Save scaler
    joblib.dump(scaler, scaler_filename)
    
    return X_scaled, y

# Function to train XGBoost model
def train_xgb_model(X, y, model_filename):
    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define model
    xgb_model = XGBClassifier(eval_metric='logloss', random_state=42, reg_lambda=1)
    
    # Hyperparameter tuning space
    param_dist = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
    
    # Adjust CV splits if data is too small
    n_splits = min(5, len(set(y)))
    
    # Hyperparameter tuning
    random_search = RandomizedSearchCV(
        xgb_model, param_distributions=param_dist, n_iter=min(8, len(param_dist['n_estimators'])),
        scoring='accuracy', cv=n_splits, verbose=1, random_state=42, n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    
    # Get best model
    best_model = random_search.best_estimator_
    
    # Evaluate
    y_pred = best_model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    
    # Save model
    joblib.dump(best_model, model_filename)

# Datasets
datasets = [
    ("diabetes.csv", "Outcome", "diabetes_xgb.pkl", "diabetes_labels.pkl", "diabetes_scaler.pkl", "diabetes_features.pkl"),
    ("heart.csv", "target", "heart_xgb.pkl", "heart_labels.pkl", "heart_scaler.pkl", "heart_features.pkl"),
    ("kidney_disease.csv", "classification", "kidney_xgb.pkl", "kidney_labels.pkl", "kidney_scaler.pkl", "kidney_features.pkl")
]

# Process each dataset
for file, target, model_file, encoder_file, scaler_file, feature_file in datasets:
    df = pd.read_csv(file)
    X, y = preprocess_data(df, target, encoder_file, scaler_file, feature_file)
    train_xgb_model(X, y, model_file)



Fitting 2 folds for each of 3 candidates, totalling 6 fits
Accuracy: 0.7467532467532467
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.84      0.81        99
           1       0.67      0.58      0.62        55

    accuracy                           0.75       154
   macro avg       0.72      0.71      0.72       154
weighted avg       0.74      0.75      0.74       154

Fitting 2 folds for each of 3 candidates, totalling 6 fits
Accuracy: 0.8688524590163934
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.83      0.86        29
           1       0.85      0.91      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

Fitting 3 folds for each of 3 candidates, totalling 9 fits




Accuracy: 0.9875
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        52
           2       0.97      1.00      0.98        28

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

