# CatBoost Model Development

**CatBoost** is a high-performance open-source library for gradient boosting on decision trees. It is developed by Yandex and is particularly known for its excellent handling of categorical features. CatBoost often delivers great results with default parameters and provides robust protection against overfitting.

### 1. Load the Datasets

In [None]:
import pandas as pd
from pathlib import Path

# Location of the preprocessed CSV files
data_path = Path("../processed_data")

# Read the four splits in a fixed order:
# training features, validation features, training targets, validation targets
X_train, X_val, y_train, y_val = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ("X_train.csv", "X_val.csv", "y_train.csv", "y_val.csv")
)

# Stack the training and validation rows (with a fresh 0..n-1 index)
# so that K-Fold tuning can draw its folds from all of them
features = pd.concat([X_train, X_val], ignore_index=True)
targets = pd.concat([y_train, y_val], ignore_index=True)

# Report the size of the combined feature and target tables
print(f"features shape: {features.shape}")
print(f"targets shape: {targets.shape}")

### 2. Hyperparameter Tuning & Model Training

In [None]:
import optuna
import numpy as np
import catboost as cb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error

def objective(trial: optuna.Trial, X: pd.DataFrame, y: pd.Series):
    """
    Score one Optuna trial by 5-fold cross-validated MAPE.

    A CatBoost regressor is configured from the hyperparameters suggested by
    the trial, fitted once per fold, and evaluated on that fold's held-out rows.

    Parameters:
      trial (optuna.Trial): Trial object used to sample the hyperparameters.
      X (pd.DataFrame): Feature matrix that is split into folds.
      y (pd.Series): Target values aligned row-by-row with X.

    Returns:
      float: Mean of the per-fold mean absolute percentage errors.
    """
    # Sampled hyperparameters plus the fixed settings shared by every trial
    search_space = dict(
        objective='MAPE',
        iterations=trial.suggest_int('iterations', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        depth=trial.suggest_int('depth', 3, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        random_strength=trial.suggest_float('random_strength', 0, 10),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0, 1),
        random_state=42,
        verbose=0,
    )

    # Shuffled 5-fold split with a fixed seed so every trial sees the same folds
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for fit_idx, holdout_idx in splitter.split(X):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X.iloc[holdout_idx], y.iloc[holdout_idx]

        regressor = cb.CatBoostRegressor(**search_space)
        # The held-out fold doubles as the early-stopping evaluation set
        regressor.fit(
            X_fit,
            y_fit,
            eval_set=[(X_holdout, y_holdout)],
            early_stopping_rounds=10,
            verbose=0,
        )
        holdout_preds = regressor.predict(X_holdout)
        fold_scores.append(mean_absolute_percentage_error(y_holdout, holdout_preds))

    return np.mean(fold_scores)

In [None]:
import joblib

# Define and create the directory for saving models
model_dir = Path("../models/catboost")
model_dir.mkdir(parents=True, exist_ok=True)

# Fitted final model for each target column, keyed by column name
best_models = {}

# Tune and train one independent model per target column
for target in targets.columns:
    print(f"\n--- Tuning and Training for {target} ---\n")
    y = targets[target]

    # Search for the hyperparameters that minimize the cross-validated MAPE
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, features, y), n_trials=30)

    # Report the outcome of the search
    best_params = study.best_params
    print(f"Best MAPE for {target}: {study.best_value}")
    print(f"Best hyperparameters for {target}: {best_params}")

    # Refit on every available row with the tuned hyperparameters.
    # `study.best_params` only contains the values sampled through `trial`,
    # so the fixed settings used inside `objective` have to be repeated here.
    # In particular the MAPE loss must be passed explicitly; without it the
    # final model would be trained with a different loss than the one the
    # hyperparameters were tuned for.
    final_model = cb.CatBoostRegressor(
        **best_params,
        objective='MAPE',
        random_state=42,
        verbose=0,
    )
    final_model.fit(features, y)

    # Persist the fitted model to disk
    joblib.dump(final_model, f'{model_dir}/{target}_model.joblib')
    print(f"Saved best model for {target}")

    best_models[target] = final_model

### 3. Leaderboard Score

In [None]:
def calculate_leaderboard_score(mape, reference_cost=2.72):
    """
    Convert a MAPE value into a leaderboard score with a floor of 10.

    The score is 100 when the MAPE is zero and loses 90 points for every
    multiple of `reference_cost` contained in the MAPE; it never drops below 10.

    Parameters:
      mape (float): The Mean Absolute Percentage Error.
      reference_cost (float): The reference cost the MAPE is scaled by.

    Returns:
      float: The calculated leaderboard score.
    """
    penalty = 90 * mape / reference_cost
    raw_score = 100 - penalty
    # Only scores strictly above the floor are returned unchanged
    if raw_score > 10:
        return raw_score
    return 10

# Leaderboard score per target column, keyed by column name
leaderboard_scores = {}

# CAVEAT: `study` is whatever study object was created last, so the same
# `study.best_value` is used for every target below. This is a placeholder;
# a faithful leaderboard needs the MAPE of each model against its own target.
for target in targets.columns:
    # Placeholder MAPE: replace with the actual MAPE measured for `target`
    mape = study.best_value
    leaderboard_scores[target] = score = calculate_leaderboard_score(mape)
    print(f"Leaderboard score for {target}: {score:.2f}")

# Summarize with the unweighted mean over all targets
average_score = np.mean([*leaderboard_scores.values()])
print(f"\nAverage Leaderboard Score: {average_score:.2f}")

### 4. Predict the Blend Properties

In [None]:
# Read the raw test set from the processed-data directory
test_df = pd.read_csv(data_path / "X_test.csv")

# --- Preprocess Test Data ---
# WARNING: the test data needs the same feature engineering that was applied
# to the training data in '3_data_preprocessing.ipynb'.
# The reindex below is only a stand-in: it forces the test columns to match
# the training columns (any missing column is filled with 0), which keeps the
# notebook runnable but does not give accurate predictions on its own.
expected_columns = features.columns
X_test = test_df.reindex(columns=expected_columns, fill_value=0)

print("Test data loaded and preprocessed")
print(f"X_test shape: {X_test.shape}")

In [None]:
# --- Prediction ---
# Predicted values for each target column, keyed by column name
predictions = {}
for target in targets.columns:
    print(f"Predicting {target}...")
    # Use the model that was trained for this specific target
    model = best_models[target]
    target_predictions = model.predict(X_test)
    predictions[target] = target_predictions

### 5. Create Submission File

In [None]:
# --- Create Submission File ---
submission_dir = Path("../submissions")
submission_dir.mkdir(parents=True, exist_ok=True)

# Start from the ID column of the test data, then add one prediction column
# per target in the same order as the target columns
submission_df = pd.DataFrame({'ID': test_df['ID']})
for target in targets.columns:
    submission_df[target] = predictions[target]

submission_df.to_csv(f'{submission_dir}/catboost_submission.csv', index=False)
# The leading blank line is written as an explicit "\n" escape: a raw line
# break inside a single-quoted f-string literal is a syntax error.
print(f"\nSubmission file {submission_dir}/catboost_submission.csv created successfully!")