
# Model Training and Validation

This notebook performs model training and validation using XGBoost. It includes:
1. Loading preprocessed data with labels.
2. Splitting the data into train and test sets.
3. Performing hyperparameter tuning with GridSearchCV (5-fold stratified cross-validation, selecting the best model by accuracy).
4. Evaluating the best model on the test and train sets.

---
### Inputs and Outputs:
- **Input**:
  - `Final_feature_selection_data_with_ylabel.csv`
- **Outputs**:
  - Performance metrics (accuracy, precision, recall, F1 score, ROC AUC) for the best model on both train and test sets.


In [None]:

# Step 1: Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve
import xgboost as xgb
import matplotlib.pyplot as plt

# Read the preprocessed feature table (one row per sample, label included)
input_data = pd.read_csv('Final_feature_selection_data_with_ylabel.csv')

# The 'y_label' column is the target; every remaining column is a feature.
# `drop` returns a new frame, so `input_data` itself is left untouched.
Y_label = input_data['y_label']
X_data = input_data.drop('y_label', axis=1)

# Quick sanity check of what was loaded
print(f"Loaded data shape: {X_data.shape}")
print(f"Label data shape: {Y_label.shape}")


In [None]:

# Step 2: Train-Test Split
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# The split is stratified on the label so that train and test keep the same
# class proportions as the full dataset. A purely random split can skew the
# class balance between the two partitions, which distorts precision/recall/F1
# on the held-out set and is inconsistent with the StratifiedKFold used for
# cross-validation during the grid search.
# NOTE: stratification requires at least two samples of every class.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_label,
    test_size=0.2,
    random_state=16,
    stratify=Y_label,
)
print("Train and Test data split completed.")
print(f"Train data shape: {X_train.shape}, Train labels: {Y_train.shape}")
print(f"Test data shape: {X_test.shape}, Test labels: {Y_test.shape}")


In [None]:

# Step 3: Define XGBoost Model and Grid Search Parameters

# Cross-validation scheme: 5 shuffled, stratified folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics recorded for every candidate; results appear in cv_results_ under
# the keys 'mean_test_<name>' / 'std_test_<name>' (and the train equivalents).
scoring = dict(
    accuracy='accuracy',
    f1=make_scorer(f1_score),
    roc_auc='roc_auc',
)

# Search space: only n_estimators and learning_rate vary (3 x 3 = 9 candidates);
# the remaining hyperparameters are pinned to a single value each.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3],
    reg_alpha=[0.1],
    reg_lambda=[10],
    subsample=[0.4],
)

# Base estimator: binary logistic objective with a fixed seed
xgb_classifier = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

# Multi-metric grid search; the final model is refit on the full training
# data using the candidate with the best mean CV accuracy.
grid_search = GridSearchCV(
    xgb_classifier,
    param_grid,
    scoring=scoring,
    refit='accuracy',
    cv=cv,
    verbose=2,
    return_train_score=True,
)


In [None]:

# Step 4: Perform Grid Search
# Fit every candidate across the CV folds, then refit the winner on X_train.
grid_search.fit(X_train, Y_train)

# Handles to the full CV table and the row of the selected candidate
cv_results = grid_search.cv_results_
best_index = grid_search.best_index_

# Headline result: best_score_ is the mean CV score of the refit metric (accuracy)
print("Best accuracy score : {0:.4f}".format(grid_search.best_score_))
print("Best parameters: ", grid_search.best_params_)

# Pull the mean / std of each metric for the selected candidate
acc_mean = cv_results['mean_test_accuracy'][best_index]
acc_std = cv_results['std_test_accuracy'][best_index]
f1_mean = cv_results['mean_test_f1'][best_index]
f1_std = cv_results['std_test_f1'][best_index]
auc_mean = cv_results['mean_test_roc_auc'][best_index]
auc_std = cv_results['std_test_roc_auc'][best_index]

print("Best Model Performance:")
print(f"  Params: {cv_results['params'][best_index]}")
print(f"  Mean accuracy: {acc_mean:.4f} (std: {acc_std:.4f})")
print(f"  Mean F1 score: {f1_mean:.4f} (std: {f1_std:.4f})")
print(f"  Mean ROC AUC: {auc_mean:.4f} (std: {auc_std:.4f})")


In [None]:

# Step 5: Test Set Evaluation
# Use the model that GridSearchCV refit with the best hyperparameters.
estimator = grid_search.best_estimator_

# Hard class predictions, plus the probability of the positive class
# (column 1 of predict_proba), which is what ROC AUC is computed from.
Y_prob = estimator.predict_proba(X_test)[:, 1]
Y_pred = estimator.predict(X_test)

# Label-based metrics all share the (y_true, y_pred) call shape
accuracy, precision, recall, f1 = (
    metric(Y_test, Y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(Y_test, Y_prob)

print("Test Set Evaluation:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1_score", f1),
    ("ROC_AUC", roc_auc),
):
    print(label, value)


In [None]:

# Step 6: Train Set Evaluation
# Score the same fitted model on the data it was trained on; comparing these
# numbers with the test-set metrics shows how large the train/test gap is.
Y_train_prob = estimator.predict_proba(X_train)[:, 1]
Y_train_pred = estimator.predict(X_train)

# Label-based metrics all share the (y_true, y_pred) call shape
train_accuracy, train_precision, train_recall, train_f1 = (
    metric(Y_train, Y_train_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
train_roc_auc = roc_auc_score(Y_train, Y_train_prob)

print("Train Set Evaluation:")
for label, value in (
    ("Accuracy:", train_accuracy),
    ("Precision", train_precision),
    ("Recall", train_recall),
    ("F1_score", train_f1),
    ("ROC_AUC", train_roc_auc),
):
    print(label, value)
