In [None]:
import catboost as cb
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

# Read the raw observations, then derive the engineered feature columns
df = pd.read_csv('../../data/data.csv')

# Parse both date columns as datetimes so they can be subtracted below
df["observation_date"] = pd.to_datetime(df["observation_date"])
df["date_of_introduction"] = pd.to_datetime(df["date_of_introduction"])

# Whole days elapsed between introduction and observation
df['days_since_introduction'] = df['observation_date'].sub(df['date_of_introduction']).dt.days

# Interaction term: initial female count multiplied by leaf area
df['initial_growth_potential'] = df['initial_female_count'].mul(df['leaf_area_cm2'])

# Column to predict and the predictor columns (including the two engineered ones)
target = 'cumulative_mite_count'
features = [
    'initial_female_count', 'leaf_area_cm2', 'population_density',
    'days_since_introduction', 'initial_growth_potential',
]

X, y = df[features], df[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible so different models are compared on the same rows
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Baseline CatBoost model (no hyperparameters set beyond seed and verbosity) ---
# verbose=0 silences the per-iteration training log
cat_model = cb.CatBoostRegressor(verbose=0, random_state=42)
cat_model.fit(X_train, y_train)

# --- Score the baseline on the held-out test split ---
y_pred_cat = cat_model.predict(X_test)
mae_cat = mean_absolute_error(y_test, y_pred_cat)
mse_cat = mean_squared_error(y_test, y_pred_cat)
rmse_cat = root_mean_squared_error(y_test, y_pred_cat)
r2_cat = r2_score(y_test, y_pred_cat)

# One print call; sep="\n" puts each metric on its own line
print(
    "--- CatBoost Regressor Baseline Performance ---",
    f"Mean Absolute Error (MAE): {mae_cat:.2f}",
    f"Mean Squared Error (MSE): {mse_cat:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_cat:.2f}",
    f"R-squared (R²): {r2_cat:.4f}",
    sep="\n",
)

--- CatBoost Regressor Baseline Performance ---
Mean Absolute Error (MAE): 33.56
Mean Squared Error (MSE): 3475.59
Root Mean Squared Error (RMSE): 58.95
R-squared (R²): 0.8248


In [3]:
# Candidate hyperparameter values sampled by the randomized search.
# Trailing notes give the scikit-learn / XGBoost-style name for each setting.
param_grid_cat = dict(
    iterations=[300, 500, 700, 1000],      # n_estimators
    learning_rate=[0.01, 0.05, 0.1],
    depth=[3, 5, 7, 9],                    # max_depth
    subsample=[0.6, 0.8, 1.0],
    colsample_bylevel=[0.6, 0.8, 1.0],     # colsample_bytree
)

# Randomized search over param_grid_cat: 100 sampled candidates, each scored
# by 5-fold cross-validated R² on the training split.
#
# thread_count=1 keeps every individual CatBoost fit single-threaded. With
# n_jobs=-1 scikit-learn already runs fits in parallel on all processors, and
# CatBoost's default of also using all cores inside each worker would
# oversubscribe the CPU. Per the CatBoost documentation, thread_count affects
# execution speed only, not the trained model.
cat_random = RandomizedSearchCV(
    estimator=cb.CatBoostRegressor(random_state=42, verbose=0, thread_count=1),
    param_distributions=param_grid_cat,
    n_iter=100,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Run the search; by default the best candidate is refit on all of X_train
cat_random.fit(X_train, y_train)

# Estimator refit by the search using the best-scoring parameter combination
best_cat_model = cat_random.best_estimator_

# Report the winning hyperparameters
print("Best parameters found for CatBoost:", cat_random.best_params_, sep="\n")

# Score the tuned model on the same held-out test split as the baseline
y_pred_tuned_cat = best_cat_model.predict(X_test)
mae_tuned_cat = mean_absolute_error(y_test, y_pred_tuned_cat)
mse_tuned_cat = mean_squared_error(y_test, y_pred_tuned_cat)
rmse_tuned_cat = root_mean_squared_error(y_test, y_pred_tuned_cat)
r2_tuned_cat = r2_score(y_test, y_pred_tuned_cat)

# One print call; sep="\n" puts each metric on its own line
print(
    "\n--- Tuned CatBoost Performance ---",
    f"Mean Absolute Error (MAE): {mae_tuned_cat:.2f}",
    f"Mean Squared Error (MSE): {mse_tuned_cat:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_tuned_cat:.2f}",
    f"R-squared (R²): {r2_tuned_cat:.4f}",
    sep="\n",
)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found for CatBoost:
{'subsample': 0.6, 'learning_rate': 0.01, 'iterations': 700, 'depth': 9, 'colsample_bylevel': 1.0}

--- Tuned CatBoost Performance ---
Mean Absolute Error (MAE): 33.55
Mean Squared Error (MSE): 3216.36
Root Mean Squared Error (RMSE): 56.71
R-squared (R²): 0.8379
