# 1. Train and evaluate models

We're gonna use 3 type of Machine Learning (for the classification problem with multiple results) and use the one that has the best performance. They are:
- **Random Forest**
- **XGBoost**
- **CatBoost**

## 1.1 Import neccessary packages and dataset

In [1]:
import os

import duckdb
import joblib
import pandas as pd
from catboost import CatBoostClassifier

# Model and pre-processing imports
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier

# Metric imports
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from xgboost import XGBClassifier

# --- Data Loading (from your original notebook) ---

# Load TRAIN
con = duckdb.connect("../database/ML/X_train.duckdb")
X_train = con.execute("SELECT * FROM X_train").df()
con.close()

con = duckdb.connect("../database/ML/y_train.duckdb")
y_train = con.execute("SELECT * FROM y_train").df()
con.close()

# Load TEST
con = duckdb.connect("../database/ML/X_test.duckdb")
X_test = con.execute("SELECT * FROM X_test").df()
con.close()

con = duckdb.connect("../database/ML/y_test.duckdb")
y_test = con.execute("SELECT * FROM y_test").df()
con.close()

# Apply ADASYN to balance the training data
adasyn = ADASYN(random_state=42, n_neighbors=6)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

print(f"Original Training shape: {X_train.shape}")
print(f"Balanced Training shape: {X_train_balanced.shape}")

Original Training shape: (203820, 36)
Balanced Training shape: (445239, 36)


## 1.2 Define model and run comparisons

In [2]:
def train_and_evaluate_model(model, X_train_balanced, y_train_balanced, X_test, y_test, model_name):
    """
    Trains a model on balanced data and evaluates it on the test set.
    Returns a dictionary of key metrics and the trained model.
    """
    print(f"--- Training {model_name} ---")

    # Fit the model
    if model_name == "CatBoost":
        # CatBoost has its own fitting method and prints verbose output
        model.fit(X_train_balanced, y_train_balanced)
    else:
        # Standard fit for scikit-learn compatible models
        # .values.ravel() is needed to convert the DataFrame column to the expected shape
        model.fit(X_train_balanced, y_train_balanced.values.ravel())

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics (zero_division=0 handles cases where a class has no true or predicted samples)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    # Store results
    metrics = {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Confusion Matrix": confusion_matrix(y_test, y_pred),
        "Trained Model": model,
    }

    print(f"--- Finished {model_name}. Accuracy: {accuracy:.4f} ---")
    return metrics

In [3]:
# Define the models to compare

# 1. CatBoost
catboost_model = CatBoostClassifier(
    iterations=1200,
    learning_rate=0.005,
    depth=6,
    loss_function="MultiClass",
    verbose=100,
    random_seed=42,
)

# 2. Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=300, max_depth=22, random_state=42, n_jobs=-1, class_weight="balanced"
)

# 3. XGBoost Classifier
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="mlogloss",
)

models_to_compare = [
    (catboost_model, "CatBoost"),
    (rf_model, "RandomForest"),
    (xgb_model, "XGBoost"),
]

# Run the comparison loop
comparison_results = []
for model, name in models_to_compare:
    results = train_and_evaluate_model(
        model, X_train_balanced, y_train_balanced, X_test, y_test, name
    )
    comparison_results.append(results)

--- Training CatBoost ---
0:	learn: 1.0969137	total: 118ms	remaining: 2m 20s
100:	learn: 0.9852967	total: 5.63s	remaining: 1m 1s
200:	learn: 0.9368638	total: 11s	remaining: 54.5s
300:	learn: 0.9118837	total: 16.1s	remaining: 48.1s
400:	learn: 0.8974849	total: 21.1s	remaining: 42.1s
500:	learn: 0.8879095	total: 26.4s	remaining: 36.9s
600:	learn: 0.8808380	total: 31.5s	remaining: 31.3s
700:	learn: 0.8751713	total: 36.7s	remaining: 26.1s
800:	learn: 0.8702011	total: 42.2s	remaining: 21s
900:	learn: 0.8657788	total: 47.1s	remaining: 15.6s
1000:	learn: 0.8618907	total: 52s	remaining: 10.3s
1100:	learn: 0.8584528	total: 56.9s	remaining: 5.12s
1199:	learn: 0.8549856	total: 1m 2s	remaining: 0us
--- Finished CatBoost. Accuracy: 0.4890 ---
--- Training RandomForest ---
--- Finished RandomForest. Accuracy: 0.5456 ---
--- Training XGBoost ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Finished XGBoost. Accuracy: 0.5514 ---


## 1.3 Display Comparison Table

In [6]:
# Compile the results into a DataFrame
metrics_df = pd.DataFrame(comparison_results)

# Drop the Confusion Matrix and Trained Model for the summary table
summary_df = metrics_df.drop(columns=["Confusion Matrix", "Trained Model"])

# Format for display
summary_df["Accuracy"] = summary_df["Accuracy"].map("{:.4f}".format)
summary_df["Precision"] = summary_df["Precision"].map("{:.4f}".format)
summary_df["Recall"] = summary_df["Recall"].map("{:.4f}".format)
summary_df["F1 Score"] = summary_df["F1 Score"].map("{:.4f}".format)

print("## Model Comparison Summary")
print(summary_df)

print("\n---")
print("## Confusion Matrices")

# Print each confusion matrix for detailed analysis
for index, row in metrics_df.iterrows():
    print(f"\n--- {row['Model']} Confusion Matrix ---\n")
    print(row["Confusion Matrix"])

## Model Comparison Summary
          Model Accuracy Precision  Recall F1 Score
0      CatBoost   0.4890    0.7303  0.4890   0.5199
1  RandomForest   0.5456    0.7153  0.5456   0.5807
2       XGBoost   0.5514    0.7236  0.5514   0.5849

---
## Confusion Matrices

--- CatBoost Confusion Matrix ---

[[26440 19822 18457]
 [ 1588  7869   482]
 [ 1837  2447  8410]]

--- RandomForest Confusion Matrix ---

[[32977 17009 14733]
 [ 2253  7058   628]
 [ 3172  1901  7621]]

--- XGBoost Confusion Matrix ---

[[32881 15704 16134]
 [ 2280  7215   444]
 [ 2728  1894  8072]]


# 2. Save the XGBoost model

In [5]:
save_path = "../src/models/xgboost.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# compress=3 is a good balance between speed and size (range 0-9)
joblib.dump(xgb_model, save_path, compress=3)

print(f"Compressed model saved to {save_path}")

Compressed model saved to ../src/models/xgboost.pkl
