### XGBoost

The issue is that XGBoost expects class labels to start from 0 (e.g., [0, 1, 2]), but your dataset has labels [-1, 0, 1].

In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Load the datasets
X_train_smote = pd.read_csv("../Data/X_train_smote.csv")
y_train_smote = pd.read_csv("../Data/y_train_smote.csv")
X_test = pd.read_csv("../Data/X_test.csv")
y_test = pd.read_csv("../Data/y_test.csv")

# Convert labels to 0-based encoding
y_train_smote = y_train_smote.replace({-1: 0, 0: 1, 1: 2})
y_test = y_test.replace({-1: 0, 0: 1, 1: 2})

# Initialize XGBoost Classifier with default parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Train the model
xgb_model.fit(X_train_smote, y_train_smote)

# Predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"🔹 XGBoost Performance:\nAccuracy: {accuracy:.4f}\n")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



🔹 XGBoost Performance:
Accuracy: 0.8685

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.17      0.22        12
           1       0.44      0.75      0.56        16
           2       0.95      0.92      0.94       185

    accuracy                           0.87       213
   macro avg       0.58      0.61      0.57       213
weighted avg       0.88      0.87      0.87       213



#### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Define the model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Define the parameter grid to search
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [50, 100, 150],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'scale_pos_weight': [1, 2, 3]
}

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Fit grid search
grid_search.fit(X_train_smote, y_train_smote)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the results
print("Best Parameters:", best_params)
print("Best Cross-validation Accuracy:", best_score)

# Now fit the model using the best parameters found
best_model = grid_search.best_estimator_

# Predict with the best model
y_pred_xgb_best = best_model.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred_xgb_best)
print(f"Best XGBoost Accuracy: {accuracy}")
print(classification_report(y_test, y_pred_xgb_best))


Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
