# CatBoost Model Development

This notebook demonstrates the development of a CatBoost model for predicting blend properties. CatBoost is a high-performance open-source library for gradient boosting on decision trees. It is particularly effective in handling categorical features and provides state-of-the-art accuracy.

Here's a breakdown of what we'll cover:

1.  **Data Loading:** Loading the processed training and validation datasets.
2.  **Hyperparameter Tuning & Model Training:** Using Optuna to find the optimal hyperparameters for the CatBoost model (scored on the validation data), then training and saving a final model for each target property on the training data with the best hyperparameters.

### 1. Load the Datasets

In [None]:
import pandas as pd
from pathlib import Path

# Folder that holds the preprocessed CSV splits
data_path = Path("../processed_data")

# Read the feature and target tables for the training and validation splits,
# in the same order as the names they are unpacked into
split_files = ("X_train.csv", "X_val.csv", "y_train.csv", "y_val.csv")
X_train, X_val, y_train, y_val = (
    pd.read_csv(data_path / file_name) for file_name in split_files
)

# Report the dimensions of the two feature tables
print(f"train shape: {X_train.shape}")
print(f"val shape: {X_val.shape}")

### 2. Hyperparameter Tuning & Model Training

In [None]:
import optuna
import warnings
import numpy as np
import catboost as cb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Define the objective function for Optuna
def objective(trial: optuna.Trial, X_train: pd.DataFrame, y_train: pd.Series, X_val: pd.DataFrame, y_val: pd.Series):
    """
    Objective function for Optuna to minimize.
    Fits a CatBoost regressor with the MAPE objective using hyperparameters
    sampled from the trial, then scores it on the validation split.

    Parameters:
      trial (optuna.Trial): Trial object used to sample the hyperparameters.
      X_train (pd.DataFrame): Features the model is fitted on.
      y_train (pd.Series): Target values for X_train.
      X_val (pd.DataFrame): Features used as the early-stopping eval set and for scoring.
      y_val (pd.Series): Target values for X_val.

    Returns:
      float: Mean absolute percentage error of the predictions on X_val.
    """
    # Values sampled by Optuna; these are what ends up in study.best_params
    tuned = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 3, 8),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
    }

    # Settings that stay fixed across trials: loss, seed and silent logging
    regressor = cb.CatBoostRegressor(objective="MAPE", random_state=42, verbose=0, **tuned)

    # Fit on the training split, monitoring the validation split for early stopping
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    # Score the fitted model on the validation split
    return mean_absolute_percentage_error(y_val, regressor.predict(X_val))

In [None]:
import joblib

# Define the directory for saving models and Optuna studies
model_dir = Path("../models/catboost")
model_dir.mkdir(parents=True, exist_ok=True)

optuna_dir = Path("../optuna_db")
optuna_dir.mkdir(parents=True, exist_ok=True)
# All studies are kept in one SQLite file; together with load_if_exists=True
# below, re-running this cell continues an existing study instead of failing
storage_name = f"sqlite:///{optuna_dir}/catboost_studies.db"

# Dictionary to store the best models, keyed by target column name
best_models = {}

# Iterate over each target property to tune and train a model
for target in y_train.columns:
    print(f"\n--- Tuning and Training for {target} ---\n")

    # Create an Optuna study to find the best hyperparameters
    study = optuna.create_study(direction='minimize',
                                study_name='catboost-tuning-' + target,
                                storage=storage_name,
                                load_if_exists=True)
    study.optimize(lambda trial: objective(trial, X_train, y_train[target], X_val, y_val[target]), n_trials=50)

    # Get the best hyperparameters
    best_params = study.best_params
    print(f"\nBEST MAPE FOR {target}: {study.best_value}")
    print(f"BEST HYPERPARAMETERS FOR {target}: {best_params}")

    # Train the final model with the best hyperparameters on the entire training set.
    # best_params only contains the values sampled through trial.suggest_*; the
    # fixed settings used in objective() are not part of it. The MAPE loss has
    # to be passed again here, otherwise the final model would be fitted with
    # CatBoost's default loss instead of the one the parameters were tuned for.
    final_model = cb.CatBoostRegressor(**best_params, objective='MAPE', random_state=42, verbose=0)
    final_model.fit(X_train, y_train[target])

    # Save the trained model to a file
    joblib.dump(final_model, f'{model_dir}/{target}_model.joblib')
    print(f"Saved best model for {target}")

    # Store the final model together with the best trial's validation MAPE.
    # NOTE: the score comes from the tuning trial (fitted with early stopping),
    # not from an evaluation of final_model itself.
    best_models[target] = (final_model, study.best_value)