<a href="https://colab.research.google.com/github/PPancham/PhD/blob/main/XGBoost_Model_and_LightGBM_Model_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import numpy as np
import datetime
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from sklearn.impute import SimpleImputer

# Function to save the model
def SaveModel(model, prefix="Trained_Model_"):
    """Serialise ``model`` with joblib into a date-stamped pickle file.

    The file is written relative to the current working directory and is
    named ``<prefix><today's date>.pkl``.

    Args:
        model: Object to persist.
        prefix: Text placed in front of the date in the file name.

    Returns:
        A confirmation message that quotes the file name.
    """
    stamp = datetime.date.today()
    target_path = f"{prefix}{stamp}.pkl"
    joblib.dump(model, target_path)
    return "Model saved to '{}'".format(target_path)

# NOTE(review): `files` is not imported anywhere in this cell; presumably it is
# `google.colab.files` brought in elsewhere -- confirm, otherwise this line
# raises NameError.
uploadedSpreadsheet = files.upload()
# The first key of the upload result is used as the workbook path.
fileName = list(uploadedSpreadsheet.keys())[0]
data = pd.read_excel(fileName)

# Define columns to process
columns_to_process = ['IgG1 Average', 'IgG2 Average', 'IgG3 Average', 'IgG4 Average',
                      'IgA Average', 'IgE Average', 'IgM Average']

# Extract features and target
features = data[columns_to_process]
target = data['Group']

# Apply label encoding to convert string class labels to integers
print("\nEncoding target variable...")
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(target)
print(f"Original classes: {label_encoder.classes_}")
print(f"Encoded as: {np.unique(encoded_target)}")

# Data preprocessing
print("Data Preprocessing:")

# 1. Check for missing values (per-column count of nulls)
print("\nChecking for missing values...")
missing_values = features.isnull().sum()
print(missing_values)

# 2. Impute missing values if any
# The imputer is fitted on every row of `features`; `imputer` only exists as a
# global when at least one value was missing.
if missing_values.sum() > 0:
    print("Imputing missing values...")
    imputer = SimpleImputer(strategy='median')
    # Re-wrap the ndarray so column names and the row index are preserved.
    features = pd.DataFrame(imputer.fit_transform(features),
                           columns=features.columns,
                           index=features.index)

# 3. Check for outliers using IQR method
print("\nChecking for outliers using IQR method...")
def detect_outliers_iqr(df):
    """Find values lying outside the 1.5 * IQR fences of each column.

    For every column the first and third quartiles are computed and any value
    strictly below ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR`` is
    treated as an outlier.

    Args:
        df: DataFrame whose columns are examined one by one.

    Returns:
        A tuple ``(row_labels, values_by_column)`` where ``row_labels`` is a
        de-duplicated list of index labels that hold an outlier in at least
        one column, and ``values_by_column`` maps every column name to the
        list of its outlying values (empty when the column has none).
    """
    flagged_labels = []
    values_by_column = {}

    for name in df.columns:
        series = df[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile

        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        outside = (series < low_fence) | (series > high_fence)
        labels = df[outside].index.tolist()

        values_by_column[name] = df.loc[labels, name].tolist()
        flagged_labels += labels

    # A row may be flagged by several columns; report each label once.
    return list(set(flagged_labels)), values_by_column

outlier_indices, outlier_values = detect_outliers_iqr(features)
print(f"Found {len(outlier_indices)} rows with outliers")

# Optionally handle outliers - in this case, we'll keep them but log their presence
for column, values in outlier_values.items():
    # Columns without outliers map to an empty list and are skipped.
    if values:
        print(f"Column {column} has {len(values)} outliers")

# 4. Standardize the features for better model performance
# The scaler is fitted on all rows of `features`, i.e. before the train/test
# split performed below.
print("\nStandardizing features...")
scaler = StandardScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(features),
                              columns=features.columns,
                              index=features.index)

# Now we have preprocessed data ready for modeling
print("\nData preprocessing complete.")

# Split the data for initial evaluation
# 70/30 split, stratified on the encoded labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, encoded_target, test_size=0.3, random_state=42, stratify=encoded_target
)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Model Training and Evaluation Functions
def _train_and_evaluate(model_name, base_model, search_estimator, param_grid,
                        X_train, X_test, y_train, y_test):
    """Shared train / tune / report / save routine for one boosting library.

    Cross-validation and the grid search are run on ``X_train``/``y_train``
    only. The hold-out rows in ``X_test`` are therefore never seen while
    fitting or tuning, so the reported test accuracy is an out-of-sample
    figure.

    Reads the module-level ``label_encoder`` (to print reports with the
    original class names), ``columns_to_process`` (fallback feature names)
    and ``SaveModel`` (to pickle the tuned model).

    Args:
        model_name: Display name used in printed messages and as the prefix
            of the saved model file (``"<model_name>_Best_Model_"``).
        base_model: Unfitted estimator used for the untuned baseline.
        search_estimator: Unfitted estimator handed to ``GridSearchCV``.
        param_grid: Hyperparameter grid for the search.
        X_train, y_train: Training features and encoded labels.
        X_test, y_test: Hold-out features and encoded labels.

    Returns:
        Tuple ``(best_model, best_accuracy, feature_importance)``: the tuned
        estimator refitted on the training data, its accuracy on the hold-out
        split, and a DataFrame of feature importances sorted descending.
    """
    print(f"\n--- {model_name} Model Training and Evaluation ---")

    # Untuned baseline, scored once on the hold-out split.
    base_model.fit(X_train, y_train)
    y_pred = base_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Basic {model_name} Model Accuracy: {accuracy:.4f}")

    # Cross-validation (training rows only, so the hold-out set stays unseen)
    cv_scores = cross_val_score(base_model, X_train, y_train, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.4f}")

    # Hyperparameter tuning with GridSearchCV
    print(f"\nPerforming hyperparameter tuning for {model_name}...")
    grid_search = GridSearchCV(
        search_estimator,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    # Fitting on the training split only: fitting on the full dataset would
    # let the refitted best estimator memorise the rows it is scored on below.
    grid_search.fit(X_train, y_train)

    # Get best parameters and model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Print results
    print(f"\nBest parameters: {best_params}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

    # Evaluate best model on test set
    y_pred_best = best_model.predict(X_test)
    best_accuracy = accuracy_score(y_test, y_pred_best)
    print(f"Best {model_name} Model Test Accuracy: {best_accuracy:.4f}")

    # Print detailed classification report
    print(f"\nClassification Report (Best {model_name} Model):")
    print(classification_report(y_test, y_pred_best))

    # Print report with original class names for better readability
    print("\nClassification Report with original class names:")
    y_test_original = label_encoder.inverse_transform(y_test)
    y_pred_original = label_encoder.inverse_transform(y_pred_best)
    print(classification_report(y_test_original, y_pred_original))

    # Print confusion matrix
    print(f"\nConfusion Matrix (Best {model_name} Model):")
    print(confusion_matrix(y_test, y_pred_best))

    # Print confusion matrix with class labels
    # `labels=` pins the matrix to one row/column per known class so it always
    # lines up with the DataFrame index, even if a class is absent here.
    print("\nConfusion Matrix with class labels:")
    conf_matrix = confusion_matrix(y_test_original, y_pred_original,
                                   labels=label_encoder.classes_)
    conf_df = pd.DataFrame(conf_matrix,
                           index=label_encoder.classes_,
                           columns=label_encoder.classes_)
    print(conf_df)

    # Feature importance (prefer the column names of the data actually used)
    feature_names = list(getattr(X_train, "columns", columns_to_process))
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print(f"\nFeature Importance (Best {model_name} Model):")
    print(feature_importance)

    # Save the best model
    save_result = SaveModel(best_model, f"{model_name}_Best_Model_")
    print(save_result)

    return best_model, best_accuracy, feature_importance


def train_and_evaluate_xgboost(X_train, X_test, y_train, y_test):
    """Train and evaluate XGBoost model.

    Fits a default XGBoost classifier, cross-validates it, grid-searches the
    hyperparameters on the training split, reports metrics on the hold-out
    split and saves the tuned model with the ``"XGBoost_Best_Model_"`` prefix.

    Args:
        X_train, y_train: Training features and encoded labels.
        X_test, y_test: Hold-out features and encoded labels.

    Returns:
        Tuple ``(best_model, best_accuracy, feature_importance)``.
    """
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2],
        'min_child_weight': [1, 3, 5]
    }

    base_model = xgb.XGBClassifier(
        objective='multi:softprob',
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
        verbosity=0
    )
    # Same seed as the baseline so repeated searches are reproducible.
    search_estimator = xgb.XGBClassifier(
        objective='multi:softprob',
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    )

    return _train_and_evaluate(
        "XGBoost", base_model, search_estimator, param_grid,
        X_train, X_test, y_train, y_test
    )


def train_and_evaluate_lightgbm(X_train, X_test, y_train, y_test):
    """Train and evaluate LightGBM model.

    Fits a default LightGBM classifier, cross-validates it, grid-searches the
    hyperparameters on the training split, reports metrics on the hold-out
    split and saves the tuned model with the ``"LightGBM_Best_Model_"`` prefix.

    Args:
        X_train, y_train: Training features and encoded labels.
        X_test, y_test: Hold-out features and encoded labels.

    Returns:
        Tuple ``(best_model, best_accuracy, feature_importance)``.
    """
    # NOTE(review): LightGBM documents that row subsampling is only active
    # when `subsample_freq` (bagging_freq) is non-zero; as written the three
    # `subsample` values presumably train identical models -- confirm, then
    # either set `subsample_freq` or drop this axis to cut the search time.
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 70],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'min_child_samples': [5, 10, 20],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0, 0.1, 0.5]
    }

    base_model = lgb.LGBMClassifier(
        objective='multiclass',
        random_state=42,
        verbose=-1
    )
    # Same seed as the baseline so repeated searches are reproducible.
    search_estimator = lgb.LGBMClassifier(objective='multiclass', random_state=42)

    return _train_and_evaluate(
        "LightGBM", base_model, search_estimator, param_grid,
        X_train, X_test, y_train, y_test
    )

# Execute the model training and evaluation
print("\nStarting model training and evaluation...\n")
print("=" * 50)

# Train and evaluate XGBoost
xgb_best_model, xgb_accuracy, xgb_feature_importance = train_and_evaluate_xgboost(X_train, X_test, y_train, y_test)

print("\n" + "=" * 50)

# Train and evaluate LightGBM
lgb_best_model, lgb_accuracy, lgb_feature_importance = train_and_evaluate_lightgbm(X_train, X_test, y_train, y_test)

# Compare the models
print("\n" + "=" * 50)
print("\n--- Model Comparison ---")
print(f"XGBoost Best Accuracy: {xgb_accuracy:.4f}")
print(f"LightGBM Best Accuracy: {lgb_accuracy:.4f}")

# Select the best overall model (LightGBM wins ties)
if xgb_accuracy > lgb_accuracy:
    best_model = xgb_best_model
    best_model_name = "XGBoost"
    best_accuracy = xgb_accuracy
    best_feature_importance = xgb_feature_importance
else:
    best_model = lgb_best_model
    best_model_name = "LightGBM"
    best_accuracy = lgb_accuracy
    best_feature_importance = lgb_feature_importance

print(f"\nBest Overall Model: {best_model_name} with accuracy: {best_accuracy:.4f}")

# Save the best overall model
overall_save_result = SaveModel(best_model, f"Best_Overall_{best_model_name}_Model_")
print(overall_save_result)

# Plot feature importance for the best model
plt.figure(figsize=(10, 6))
plt.barh(best_feature_importance['Feature'], best_feature_importance['Importance'])
plt.title(f'Feature Importance ({best_model_name} Model)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig(f'feature_importance_{best_model_name.lower()}.png')
plt.show()

# Save encoder for future use
joblib.dump(label_encoder, f"label_encoder_{datetime.date.today()}.pkl")
print("Label encoder saved for future predictions")

# Save the fitted scaler too: the models were trained on standardized
# features, so new samples must go through this exact transform before
# being passed to a reloaded model.
scaler_filename = f"feature_scaler_{datetime.date.today()}.pkl"
joblib.dump(scaler, scaler_filename)
print(f"Feature scaler saved to '{scaler_filename}'")

print("\nScript execution completed.")

In [None]:
# Install necessary packages
!pip install xgboost lightgbm scikit-learn pandas numpy matplotlib

import pandas as pd
import numpy as np
import datetime
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb

# Load your data
# The workbook is read from a hard-coded path relative to the working directory.
file_name = "Biomarker_16032025_without_SWD_with_GNV.xlsx"
data = pd.read_excel(file_name)

# Define columns to process
columns_to_process = ['IgG1 Average', 'IgG2 Average', 'IgG3 Average', 'IgG4 Average',
                     'IgA Average', 'IgE Average', 'IgM Average']

# Extract features and target
base_features = data[columns_to_process]
target = data['Group']

# Apply label encoding (class labels in 'Group' -> integer codes)
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(target)

print("Enhanced Model Training and Evaluation")
print("=" * 50)

# ===== 1. FEATURE ENGINEERING =====
print("\n1. Creating engineered features...")

# Create feature engineering function with better handling of edge cases
def engineer_features(df):
    """Derive ratio, squared and log features from the immunoglobulin columns.

    The input frame is not modified. The result contains, in this order: the
    original columns, six ratio features, three squared features and one
    ``<column>_log`` feature per input column.

    Args:
        df: Numeric DataFrame that contains at least the columns
            'IgG1 Average', 'IgG2 Average', 'IgG3 Average', 'IgG4 Average',
            'IgA Average', 'IgE Average' and 'IgM Average'.

    Returns:
        A new DataFrame with the original and engineered columns, in which
        infinite values have been turned into NaN and every NaN has been
        replaced by the median of its column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    floor = 1e-6

    # Flooring at a small positive number handles zeros and negatives in one
    # step, so the divisions and logarithms below are always defined.
    df_safe = df.clip(lower=floor)

    engineered = df.copy()

    igg_total = (df_safe['IgG1 Average'] + df_safe['IgG2 Average'] +
                 df_safe['IgG3 Average'] + df_safe['IgG4 Average'])
    ratios = {
        # IgG class ratios
        'IgG1_IgG2_ratio': df_safe['IgG1 Average'] / df_safe['IgG2 Average'],
        'IgG1_IgG3_ratio': df_safe['IgG1 Average'] / df_safe['IgG3 Average'],
        'IgG1_IgG4_ratio': df_safe['IgG1 Average'] / df_safe['IgG4 Average'],
        # Class comparison ratios
        'IgG_IgA_ratio': igg_total / df_safe['IgA Average'],
        'IgA_IgM_ratio': df_safe['IgA Average'] / df_safe['IgM Average'],
        'IgE_IgM_ratio': df_safe['IgE Average'] / df_safe['IgM Average'],
    }
    for ratio_name, ratio_values in ratios.items():
        # Clip exactly the ratios created here to [0.01, 100]; matching on the
        # substring 'ratio' could also hit unrelated input columns.
        engineered[ratio_name] = ratio_values.clip(lower=0.01, upper=100)

    # Square terms are computed from the raw (unfloored) values.
    engineered['IgA_squared'] = df['IgA Average'] ** 2
    engineered['IgM_squared'] = df['IgM Average'] ** 2
    engineered['IgE_squared'] = df['IgE Average'] ** 2

    # Log transformations use the floored values and the frame's own columns,
    # so the function does not depend on a module-level column list.
    for col in df_safe.columns:
        engineered[f'{col}_log'] = np.log(df_safe[col])

    # Final safety net: no infinities or NaNs may reach the scaler/models.
    engineered = engineered.replace([np.inf, -np.inf], np.nan)
    engineered = engineered.fillna(engineered.median())

    return engineered

# Create engineered features
engineered_features = engineer_features(base_features)

print(f"Original feature count: {base_features.shape[1]}")
print(f"Engineered feature count: {engineered_features.shape[1]}")

# Check for any remaining problematic values
print("\nChecking for problematic values after engineering:")
print(f"Infinity values: {np.isinf(engineered_features.values).sum()}")
print(f"NaN values: {np.isnan(engineered_features.values).sum()}")

# ===== 2. OUTLIER HANDLING =====
print("\n2. Handling outliers...")

# Create robust scaled features
# The scaler is fitted on every row of the engineered data. The new frame gets
# a default integer index because no `index=` is passed.
robust_scaler = RobustScaler()
robust_features = pd.DataFrame(
    robust_scaler.fit_transform(engineered_features),
    columns=engineered_features.columns
)

# ===== 3. ENHANCED CROSS-VALIDATION =====
print("\n3. Setting up enhanced cross-validation...")

# Use repeated stratified k-fold for more stable estimates
# 5 folds x 3 repeats = 15 fits per evaluated configuration.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# ===== 4. FIND BEST MODELS =====
print("\n4. Training and evaluating base models...")

# Define parameter grids for fine-tuning - using more focused grids based on prior results
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 6, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [3, 5],
    'gamma': [0, 0.1],
    'reg_alpha': [0.1, 0.5],
    'reg_lambda': [0.1, 1.0]
}

lgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 6, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'num_leaves': [31, 50],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_samples': [5, 10],
    'reg_alpha': [0.1, 0.5],
    'reg_lambda': [0.1, 1.0]
}

# Create smaller grids for faster results
# Each list holds a single value, so these "grids" describe exactly one
# configuration per library.
xgb_quick_param_grid = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.1],
    'subsample': [0.9],
    'colsample_bytree': [0.9],
    'min_child_weight': [3],
    'gamma': [0.1],
    'reg_alpha': [0.5],
    'reg_lambda': [1.0]
}

lgb_quick_param_grid = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.1],
    'num_leaves': [31],
    'subsample': [0.9],
    'colsample_bytree': [0.9],
    'min_child_samples': [10],
    'reg_alpha': [0.5],
    'reg_lambda': [1.0]
}

# Choose whether to use quick grids (faster) or full grids (better results)
use_quick_grid = True  # Set to False for more thorough but slower search

xgb_grid = xgb_quick_param_grid if use_quick_grid else xgb_param_grid
lgb_grid = lgb_quick_param_grid if use_quick_grid else lgb_param_grid

# Function to find best model
def find_best_model(model_class, param_grid, features, target, cv, name):
    """Grid-search ``model_class`` and report the winning configuration.

    A default-constructed ``model_class()`` is tuned with ``GridSearchCV``
    using accuracy as the selection metric and all available CPU cores.

    Args:
        model_class: Estimator class, instantiated without arguments.
        param_grid: Hyperparameter grid passed to the search.
        features: Feature matrix the search is fitted on.
        target: Labels matching ``features``.
        cv: Cross-validation strategy forwarded to ``GridSearchCV``.
        name: Display name used in the printed messages.

    Returns:
        Tuple ``(best_estimator, best_params, best_score)`` read from the
        fitted search object.
    """
    print(f"\nFinding best {name} model...")

    search_options = {
        'param_grid': param_grid,
        'cv': cv,
        'scoring': 'accuracy',
        'n_jobs': -1,
        'verbose': 1,
    }
    searcher = GridSearchCV(model_class(), **search_options)
    searcher.fit(features, target)

    winner = searcher.best_estimator_
    winning_params = searcher.best_params_
    winning_score = searcher.best_score_

    print(f"Best {name} parameters: {winning_params}")
    print(f"Best {name} CV accuracy: {winning_score:.4f}")

    return winner, winning_params, winning_score

# Find best models using robust scaled engineered features
# Both searches are fitted on the complete dataset (all rows of robust_features).
xgb_model, xgb_best_params, xgb_cv_score = find_best_model(
    xgb.XGBClassifier,
    xgb_grid,
    robust_features,
    encoded_target,
    cv,
    "XGBoost"
)

lgb_model, lgb_best_params, lgb_cv_score = find_best_model(
    lgb.LGBMClassifier,
    lgb_grid,
    robust_features,
    encoded_target,
    cv,
    "LightGBM"
)

# ===== 5. CREATE ENSEMBLE MODEL =====
print("\n5. Creating ensemble model...")

# Create a voting classifier
# Fresh, unfitted estimators are built from the winning hyperparameters; the
# already-fitted xgb_model / lgb_model are not reused here.
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb.XGBClassifier(**xgb_best_params)),
        ('lgb', lgb.LGBMClassifier(**lgb_best_params))
    ],
    voting='soft'  # Use predicted probabilities
)

# Evaluate the voting classifier
cv_scores = cross_val_score(voting_clf, robust_features, encoded_target, cv=cv, scoring='accuracy')
print(f"Voting Classifier CV scores: {cv_scores}")
print(f"Voting Classifier mean CV accuracy: {cv_scores.mean():.4f}")

# ===== 6. FINAL EVALUATION =====
print("\n6. Final evaluation with hold-out test set...")

# Create a train/validation/test split
# 70% train, then the remaining 30% is halved into validation and test
# (15% / 15%), stratified at both steps.
X_train, X_temp, y_train, y_temp = train_test_split(
    robust_features, encoded_target, test_size=0.3, random_state=42, stratify=encoded_target
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Train the ensemble model (on the 70% training portion only)
voting_clf.fit(X_train, y_train)

# Evaluate on validation set
val_pred = voting_clf.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)
print(f"Validation accuracy: {val_acc:.4f}")

# Evaluate on test set
test_pred = voting_clf.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)
print(f"Test accuracy: {test_acc:.4f}")

# Print classification report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, test_pred))

# Print confusion matrix
print("\nConfusion Matrix (Test Set):")
conf_matrix = confusion_matrix(y_test, test_pred)
print(conf_matrix)

# Convert to original class names
y_test_names = label_encoder.inverse_transform(y_test)
pred_test_names = label_encoder.inverse_transform(test_pred)

print("\nConfusion Matrix with class names:")
conf_df = pd.DataFrame(
    confusion_matrix(y_test_names, pred_test_names),
    index=label_encoder.classes_,
    columns=label_encoder.classes_
)
print(conf_df)

# ===== 7. FEATURE IMPORTANCE =====
print("\n7. Feature importance...")

# Importances come from the grid-search winners (xgb_model / lgb_model), not
# from the estimators inside voting_clf.
# Get XGBoost feature importance
xgb_importance = pd.DataFrame({
    'Feature': robust_features.columns,
    'XGBoost Importance': xgb_model.feature_importances_
}).sort_values('XGBoost Importance', ascending=False)

# Get LightGBM feature importance
lgb_importance = pd.DataFrame({
    'Feature': robust_features.columns,
    'LightGBM Importance': lgb_model.feature_importances_
}).sort_values('LightGBM Importance', ascending=False)

# Print top 10 features for each model
print("\nTop 10 XGBoost features:")
print(xgb_importance.head(10))

print("\nTop 10 LightGBM features:")
print(lgb_importance.head(10))

# ===== 8. SAVE BEST MODEL =====
print("\n8. Saving best model...")

# Save the ensemble model (as fitted above on X_train)
best_model_filename = f"Best_Ensemble_Model_{datetime.date.today()}.pkl"
joblib.dump(voting_clf, best_model_filename)
print(f"Ensemble model saved to '{best_model_filename}'")

# Save feature engineering pipeline for future use
# Only the fitted RobustScaler is written here; engineer_features itself is
# not persisted.
robust_scaler_filename = f"robust_scaler_{datetime.date.today()}.pkl"
joblib.dump(robust_scaler, robust_scaler_filename)
print(f"Robust scaler saved to '{robust_scaler_filename}'")

# Save label encoder
label_encoder_filename = f"label_encoder_{datetime.date.today()}.pkl"
joblib.dump(label_encoder, label_encoder_filename)
print(f"Label encoder saved to '{label_encoder_filename}'")

print("\nEnhanced modeling complete!")

# Final result summary
print("\n" + "=" * 50)
print("FINAL RESULTS SUMMARY")
print("=" * 50)
print(f"XGBoost CV Score: {xgb_cv_score:.4f}")
print(f"LightGBM CV Score: {lgb_cv_score:.4f}")
print(f"Ensemble CV Score: {cv_scores.mean():.4f}")
print(f"Final Test Accuracy: {test_acc:.4f}")
print("=" * 50)

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import random

# Load your data
file_name = "Biomarker_16032025_without_SWD_with_GNV.xlsx"
data = pd.read_excel(file_name)

# Define columns to process
columns_to_process = ['IgG1 Average', 'IgG2 Average', 'IgG3 Average', 'IgG4 Average',
                      'IgA Average', 'IgE Average', 'IgM Average']

# Extract features and target
features = data[columns_to_process]
target = data['Group']

# Load your previously saved models
# The file names carry a hard-coded date and must match the files on disk.
# joblib.load unpickles arbitrary objects, so only load files you trust.
xgb_model = joblib.load("XGBoost_Best_Model_2025-03-20.pkl")
lgb_model = joblib.load("LightGBM_Best_Model_2025-03-20.pkl")
label_encoder = joblib.load("label_encoder_2025-03-20.pkl")

# Encode the target with the loaded (already fitted) encoder so the integer
# codes match the ones the saved models were trained on.
encoded_target = label_encoder.transform(target)

print("Model Validation Tests")
print("=" * 50)

# 1. Multiple Random State Validation
# The loaded models are NOT refitted in this loop; only the split and the
# scaler change from seed to seed.
# NOTE(review): the loaded models were presumably fitted with a different
# scaler, and possibly on some of these rows -- confirm before reading these
# numbers as out-of-sample accuracy.
print("\n1. Testing with multiple random states:")
for seed in [42, 100, 200, 300, 400]:
    # Split with different random states
    X_train, X_test, y_train, y_test = train_test_split(
        features, encoded_target, test_size=0.3,
        random_state=seed, stratify=encoded_target
    )

    # Standardize features (important: fit only on training data)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  # Only transform test data

    # Make predictions with both models
    xgb_pred = xgb_model.predict(X_test_scaled)
    lgb_pred = lgb_model.predict(X_test_scaled)

    # Calculate accuracies
    xgb_acc = accuracy_score(y_test, xgb_pred)
    lgb_acc = accuracy_score(y_test, lgb_pred)

    print(f"Random state {seed}:")
    print(f"  XGBoost accuracy: {xgb_acc:.4f}")
    print(f"  LightGBM accuracy: {lgb_acc:.4f}")

# 2. K-fold Cross-validation
print("\n2. K-fold cross-validation:")
# Imported here because this cell's import header only brings in KFold.
from sklearn.model_selection import StratifiedKFold

# Stratified folds keep the class proportions of `encoded_target` in every
# fold, matching the stratified splits used elsewhere in this cell and
# preventing a training fold from missing a class entirely.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_cv_scores = []
lgb_cv_scores = []

# The hyperparameters do not change between folds, so read them once.
xgb_params = xgb_model.get_params()
lgb_params = lgb_model.get_params()

for train_idx, test_idx in kf.split(features, encoded_target):
    # Split data
    X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = encoded_target[train_idx], encoded_target[test_idx]

    # Standardize (fit on the training fold only, then apply to the test fold)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train fresh models with same hyperparameters
    # For XGBoost
    xgb_fresh = xgb.XGBClassifier(**xgb_params)
    xgb_fresh.fit(X_train_scaled, y_train)
    xgb_pred = xgb_fresh.predict(X_test_scaled)
    xgb_cv_scores.append(accuracy_score(y_test, xgb_pred))

    # For LightGBM
    lgb_fresh = lgb.LGBMClassifier(**lgb_params)
    lgb_fresh.fit(X_train_scaled, y_train)
    lgb_pred = lgb_fresh.predict(X_test_scaled)
    lgb_cv_scores.append(accuracy_score(y_test, lgb_pred))

print(f"XGBoost CV scores: {xgb_cv_scores}")
print(f"XGBoost mean CV accuracy: {np.mean(xgb_cv_scores):.4f}")
print(f"LightGBM CV scores: {lgb_cv_scores}")
print(f"LightGBM mean CV accuracy: {np.mean(lgb_cv_scores):.4f}")

# 3. Feature permutation test
# Measures how much the accuracy of the loaded (already fitted) models drops
# when one feature column of the test split is shuffled.
print("\n3. Feature permutation test:")
# Create a baseline dataset
X_train, X_test, y_train, y_test = train_test_split(
    features, encoded_target, test_size=0.3, random_state=42, stratify=encoded_target
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Get baseline accuracy
xgb_baseline = xgb_model.predict(X_test_scaled)
lgb_baseline = lgb_model.predict(X_test_scaled)
xgb_baseline_acc = accuracy_score(y_test, xgb_baseline)
lgb_baseline_acc = accuracy_score(y_test, lgb_baseline)

# Test importance of each feature by permutation
# col_idx is the column's position in the scaled ndarray, which follows the
# order of columns_to_process.
for col_idx, column in enumerate(columns_to_process):
    # Create a copy of the test data
    X_test_permuted = X_test_scaled.copy()

    # Shuffle the column values
    # Re-seeding inside the loop resets NumPy's global RNG each iteration, so
    # every column is shuffled with the same row order.
    np.random.seed(42)
    X_test_permuted[:,col_idx] = np.random.permutation(X_test_permuted[:,col_idx])

    # Make predictions with permuted data
    xgb_perm_pred = xgb_model.predict(X_test_permuted)
    lgb_perm_pred = lgb_model.predict(X_test_permuted)

    # Calculate new accuracy
    xgb_perm_acc = accuracy_score(y_test, xgb_perm_pred)
    lgb_perm_acc = accuracy_score(y_test, lgb_perm_pred)

    # Calculate importance as drop in accuracy
    xgb_importance = xgb_baseline_acc - xgb_perm_acc
    lgb_importance = lgb_baseline_acc - lgb_perm_acc

    print(f"Feature: {column}")
    print(f"  XGBoost importance: {xgb_importance:.4f}")
    print(f"  LightGBM importance: {lgb_importance:.4f}")

# 4. Three-way split validation
print("\n4. Three-way split validation:")
# Split into train, validation, test
# 60% train, then the remaining 40% is halved into validation and test
# (20% / 20%), stratified at both steps.
X_train, X_temp, y_train, y_temp = train_test_split(
    features, encoded_target, test_size=0.4, random_state=42, stratify=encoded_target
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Standardize features (statistics come from the training portion only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train new models with same hyperparameters
xgb_params = xgb_model.get_params()
lgb_params = lgb_model.get_params()

new_xgb_model = xgb.XGBClassifier(**xgb_params)
new_lgb_model = lgb.LGBMClassifier(**lgb_params)

new_xgb_model.fit(X_train_scaled, y_train)
new_lgb_model.fit(X_train_scaled, y_train)

# Evaluate on validation set
xgb_val_pred = new_xgb_model.predict(X_val_scaled)
lgb_val_pred = new_lgb_model.predict(X_val_scaled)

xgb_val_acc = accuracy_score(y_val, xgb_val_pred)
lgb_val_acc = accuracy_score(y_val, lgb_val_pred)

print("Validation set results:")
print(f"  XGBoost accuracy: {xgb_val_acc:.4f}")
print(f"  LightGBM accuracy: {lgb_val_acc:.4f}")

# Evaluate on test set
xgb_test_pred = new_xgb_model.predict(X_test_scaled)
lgb_test_pred = new_lgb_model.predict(X_test_scaled)

xgb_test_acc = accuracy_score(y_test, xgb_test_pred)
lgb_test_acc = accuracy_score(y_test, lgb_test_pred)

print("Test set results:")
print(f"  XGBoost accuracy: {xgb_test_acc:.4f}")
print(f"  LightGBM accuracy: {lgb_test_acc:.4f}")

# 5. Learning curve
# Trains fresh models on growing random subsets of the training data and
# scores each on the subset itself and on one fixed hold-out set.
print("\n5. Learning curve analysis:")
train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
xgb_train_scores = []
xgb_test_scores = []
lgb_train_scores = []
lgb_test_scores = []

# Create a fixed test set
X_full_train, X_fixed_test, y_full_train, y_fixed_test = train_test_split(
    features, encoded_target, test_size=0.2, random_state=42, stratify=encoded_target
)
scaler = StandardScaler()
X_full_train_scaled = scaler.fit_transform(X_full_train)
X_fixed_test_scaled = scaler.transform(X_fixed_test)

# Dedicated, seeded generator: the subsets (and therefore the plotted curve)
# are identical on every run, and the global `random` state is left untouched.
subset_rng = random.Random(42)

for size in train_sizes:
    # Take a subset of training data
    subset_size = int(len(X_full_train) * size)
    indices = subset_rng.sample(range(len(X_full_train)), subset_size)

    X_subset = X_full_train_scaled[indices]
    # y_full_train may be a pandas object or an ndarray depending on how the
    # target was encoded; index it accordingly.
    y_subset = y_full_train.iloc[indices] if hasattr(y_full_train, 'iloc') else y_full_train[indices]

    # Train models
    subset_xgb = xgb.XGBClassifier(**xgb_params)
    subset_lgb = lgb.LGBMClassifier(**lgb_params)

    subset_xgb.fit(X_subset, y_subset)
    subset_lgb.fit(X_subset, y_subset)

    # Evaluate on training set
    xgb_train_pred = subset_xgb.predict(X_subset)
    lgb_train_pred = subset_lgb.predict(X_subset)

    xgb_train_acc = accuracy_score(y_subset, xgb_train_pred)
    lgb_train_acc = accuracy_score(y_subset, lgb_train_pred)

    # Evaluate on test set
    xgb_test_pred = subset_xgb.predict(X_fixed_test_scaled)
    lgb_test_pred = subset_lgb.predict(X_fixed_test_scaled)

    xgb_test_acc = accuracy_score(y_fixed_test, xgb_test_pred)
    lgb_test_acc = accuracy_score(y_fixed_test, lgb_test_pred)

    # Store results
    xgb_train_scores.append(xgb_train_acc)
    xgb_test_scores.append(xgb_test_acc)
    lgb_train_scores.append(lgb_train_acc)
    lgb_test_scores.append(lgb_test_acc)

    print(f"Training size: {size*100:.0f}%")
    print(f"  XGBoost - Train: {xgb_train_acc:.4f}, Test: {xgb_test_acc:.4f}")
    print(f"  LightGBM - Train: {lgb_train_acc:.4f}, Test: {lgb_test_acc:.4f}")

# Plot learning curves (one panel per library, side by side)
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(train_sizes, xgb_train_scores, 'o-', label='Training accuracy')
plt.plot(train_sizes, xgb_test_scores, 'o-', label='Test accuracy')
plt.title('XGBoost Learning Curve')
plt.xlabel('Training Set Size (proportion)')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(train_sizes, lgb_train_scores, 'o-', label='Training accuracy')
plt.plot(train_sizes, lgb_test_scores, 'o-', label='Test accuracy')
plt.title('LightGBM Learning Curve')
plt.xlabel('Training Set Size (proportion)')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.savefig('learning_curves.png')

print("\nValidation complete. Learning curves saved to 'learning_curves.png'")