# Improving MAPE with CatBoost and Optuna Hyperparameter Tuning

In [15]:

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [1]:
! pip install optuna catboost


Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0


In [16]:
# Load the data
data = pd.read_csv("train.csv")

#
base_features = [col for col in data.columns if col not in [f"BlendProperty{i}" for i in range(1, 11)]]
targets = [f"BlendProperty{i}" for i in range(1, 11)]



In [17]:
# Outlier handling using IQR method
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)
    return df

for column in data.columns:
    data = remove_outliers(data, column)

In [18]:
# Creating  weighted columns
weighted_data = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_data[weighted_col] = data[prop_col] * data[frac_col]

# Combine
data_with_weighted = pd.concat([data[base_features], pd.DataFrame(weighted_data)], axis=1)
data_with_weighted = data_with_weighted.copy()  # De-fragmented copy

scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_with_weighted)
scaled_df_features = pd.DataFrame(scaled_features, columns=data_with_weighted.columns)


pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(pt.fit_transform(data[targets]), columns=targets, index=data.index)

In [19]:
# Initial feature selection using Random Forest feature importance
top_features = {}
for target in targets:
    X = scaled_df_features
    y = transformed_targets_df[target]
    rf_initial = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_initial.fit(X, y)
    feature_importance = pd.Series(rf_initial.feature_importances_, index=scaled_df_features.columns).sort_values(ascending=False)
    top_features[target] = feature_importance.head(25).index.tolist()  # Use top 25 features

x_base = scaled_df_features
y = transformed_targets_df

In [None]:
# Train and evaluate for each target using top 25 features
best_models = {}

for target in targets:
    print(f'\nOptimizing for {target}...')
    

    X_selected = x_base[top_features[target]]
    y_selected = y[target]
    
    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 500, 3000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
            # 'random_strength': trial.suggest_int('random_strength', 10, 50),
            # 'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100),
            # 'border_count': trial.suggest_int('border_count', 32, 255),
            # 'silent': True
        }
        
        X_train, X_valid, y_train, y_valid = train_test_split(X_selected, y_selected, test_size=0.2, random_state=42)
        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)
        preds = model.predict(X_valid)
        mape = mean_absolute_percentage_error(y_valid, preds)
        return mape
    
    # Optimize hyperparameters
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=10)
    
    best_params = study.best_trial.params
    print(f'Best MAPE for {target}: {study.best_value:.4f}')
    print(f'Best Parameters for {target}: {best_params}')
    
    # Train final model with best parameters
    final_model = CatBoostRegressor(**best_params)
    final_model.fit(X_selected, y_selected)
    best_models[target] = final_model
    
    # Save the model
    # final_model.save_model(f'catboost_model_{target}.cbm')

[I 2025-07-16 12:35:03,861] A new study created in memory with name: no-name-56c5562c-bf12-40ea-bd80-c3985b07e9c9



Optimizing for BlendProperty1...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:35:16,770] Trial 0 finished with value: 0.43472524689987607 and parameters: {'iterations': 1015, 'depth': 9, 'learning_rate': 0.05659048813346511}. Best is trial 0 with value: 0.43472524689987607.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:35:24,229] Trial 1 finished with value: 0.2915443977839816 and parameters: {'iterations': 1827, 'depth': 7, 'learning_rate': 0.05991087389315111}. Best is trial 1 with value: 0.2915443977839816.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:35:38,298] Trial 2 finished with value: 0.31023587976513184 and parameters: {'iterations': 2201, 'depth': 8, 'learning_rate': 0.07627097587402332}. Best is trial 1 with value: 0.2915443977839816.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:35:39,900] Trial 3 finished with value: 0.27258587925185

Best MAPE for BlendProperty1: 0.2215
Best Parameters for BlendProperty1: {'iterations': 2111, 'depth': 4, 'learning_rate': 0.03285037406834459}
0:	learn: 0.9822278	total: 3.82ms	remaining: 8.05s
1:	learn: 0.9651148	total: 7.45ms	remaining: 7.86s
2:	learn: 0.9482605	total: 12.9ms	remaining: 9.03s
3:	learn: 0.9320438	total: 16.1ms	remaining: 8.51s
4:	learn: 0.9175118	total: 18.3ms	remaining: 7.69s
5:	learn: 0.9028083	total: 20.8ms	remaining: 7.3s
6:	learn: 0.8894118	total: 23.8ms	remaining: 7.14s
7:	learn: 0.8750541	total: 26.2ms	remaining: 6.9s
8:	learn: 0.8612151	total: 28.1ms	remaining: 6.56s
9:	learn: 0.8489185	total: 29.7ms	remaining: 6.24s
10:	learn: 0.8367736	total: 31.3ms	remaining: 5.98s
11:	learn: 0.8258191	total: 33.2ms	remaining: 5.8s
12:	learn: 0.8135849	total: 35.2ms	remaining: 5.68s
13:	learn: 0.8018148	total: 37.7ms	remaining: 5.64s
14:	learn: 0.7903404	total: 39.4ms	remaining: 5.51s
15:	learn: 0.7788173	total: 41.2ms	remaining: 5.39s
16:	learn: 0.7683544	total: 43.1ms	re

[I 2025-07-16 12:37:07,332] A new study created in memory with name: no-name-9d1241b3-db9c-49e8-b78c-68376103e577


2099:	learn: 0.0257843	total: 3.6s	remaining: 18.9ms
2100:	learn: 0.0257700	total: 3.61s	remaining: 17.2ms
2101:	learn: 0.0257565	total: 3.61s	remaining: 15.4ms
2102:	learn: 0.0257467	total: 3.61s	remaining: 13.7ms
2103:	learn: 0.0257379	total: 3.61s	remaining: 12ms
2104:	learn: 0.0257263	total: 3.61s	remaining: 10.3ms
2105:	learn: 0.0257110	total: 3.62s	remaining: 8.58ms
2106:	learn: 0.0257001	total: 3.62s	remaining: 6.87ms
2107:	learn: 0.0256920	total: 3.62s	remaining: 5.15ms
2108:	learn: 0.0256769	total: 3.62s	remaining: 3.43ms
2109:	learn: 0.0256630	total: 3.62s	remaining: 1.72ms
2110:	learn: 0.0256499	total: 3.62s	remaining: 0us

Optimizing for BlendProperty2...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:37:37,868] Trial 0 finished with value: 0.4058552249446398 and parameters: {'iterations': 821, 'depth': 10, 'learning_rate': 0.03538706701196921}. Best is trial 0 with value: 0.4058552249446398.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:37:42,530] Trial 1 finished with value: 0.28158380048584586 and parameters: {'iterations': 1435, 'depth': 6, 'learning_rate': 0.04674424918670305}. Best is trial 1 with value: 0.28158380048584586.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:37:46,539] Trial 2 finished with value: 0.44468572085945257 and parameters: {'iterations': 615, 'depth': 7, 'learning_rate': 0.012390634016217952}. Best is trial 1 with value: 0.28158380048584586.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:37:49,454] Trial 3 finished with value: 0.3052288509828

Best MAPE for BlendProperty2: 0.2424
Best Parameters for BlendProperty2: {'iterations': 2069, 'depth': 5, 'learning_rate': 0.03596240890003016}
0:	learn: 0.9791091	total: 2.54ms	remaining: 5.25s
1:	learn: 0.9615471	total: 5.13ms	remaining: 5.3s
2:	learn: 0.9437204	total: 7.71ms	remaining: 5.31s
3:	learn: 0.9257708	total: 9.85ms	remaining: 5.08s
4:	learn: 0.9067054	total: 12.1ms	remaining: 4.99s
5:	learn: 0.8929349	total: 14.3ms	remaining: 4.91s
6:	learn: 0.8767396	total: 16.5ms	remaining: 4.87s
7:	learn: 0.8598064	total: 19ms	remaining: 4.9s
8:	learn: 0.8440097	total: 21.4ms	remaining: 4.89s
9:	learn: 0.8289959	total: 23.3ms	remaining: 4.8s
10:	learn: 0.8135933	total: 25.3ms	remaining: 4.74s
11:	learn: 0.7975278	total: 27.4ms	remaining: 4.69s
12:	learn: 0.7830413	total: 29.5ms	remaining: 4.67s
13:	learn: 0.7691341	total: 31.6ms	remaining: 4.64s
14:	learn: 0.7580208	total: 34.3ms	remaining: 4.69s
15:	learn: 0.7452041	total: 36.6ms	remaining: 4.7s
16:	learn: 0.7325269	total: 38.6ms	remai

[I 2025-07-16 12:38:34,849] A new study created in memory with name: no-name-f191d13e-f54b-44bf-ae4f-9126ae349048



Optimizing for BlendProperty3...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:38:38,761] Trial 0 finished with value: 0.46109430581511945 and parameters: {'iterations': 1242, 'depth': 6, 'learning_rate': 0.0828895241960452}. Best is trial 0 with value: 0.46109430581511945.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:39:43,867] Trial 1 finished with value: 0.6911641454356585 and parameters: {'iterations': 1872, 'depth': 10, 'learning_rate': 0.028187626655980888}. Best is trial 0 with value: 0.46109430581511945.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:39:47,098] Trial 2 finished with value: 0.43985988864259307 and parameters: {'iterations': 1631, 'depth': 4, 'learning_rate': 0.026508270785281143}. Best is trial 2 with value: 0.43985988864259307.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:40:24,137] Trial 3 finished with value: 0.6233064974

Best MAPE for BlendProperty3: 0.4023
Best Parameters for BlendProperty3: {'iterations': 2648, 'depth': 5, 'learning_rate': 0.08484535013750771}
0:	learn: 0.9443053	total: 2.7ms	remaining: 7.15s
1:	learn: 0.8975563	total: 5.01ms	remaining: 6.63s
2:	learn: 0.8570314	total: 7.54ms	remaining: 6.64s
3:	learn: 0.8173647	total: 10.7ms	remaining: 7.04s
4:	learn: 0.7750392	total: 13.3ms	remaining: 7.04s
5:	learn: 0.7424835	total: 16ms	remaining: 7.06s
6:	learn: 0.7134017	total: 19.1ms	remaining: 7.22s
7:	learn: 0.6818191	total: 22.4ms	remaining: 7.39s
8:	learn: 0.6514207	total: 26.1ms	remaining: 7.65s
9:	learn: 0.6262710	total: 28.6ms	remaining: 7.55s
10:	learn: 0.6048058	total: 31ms	remaining: 7.42s
11:	learn: 0.5812912	total: 33ms	remaining: 7.26s
12:	learn: 0.5584219	total: 35.1ms	remaining: 7.11s
13:	learn: 0.5375209	total: 37.2ms	remaining: 7s
14:	learn: 0.5204286	total: 39.3ms	remaining: 6.89s
15:	learn: 0.5042144	total: 42.8ms	remaining: 7.04s
16:	learn: 0.4885526	total: 45.4ms	remaining

[I 2025-07-16 12:41:36,774] A new study created in memory with name: no-name-155e6be5-cd5e-4e96-a3ef-3c8193f9a50b


2628:	learn: 0.0062636	total: 5.64s	remaining: 40.8ms
2629:	learn: 0.0062590	total: 5.64s	remaining: 38.6ms
2630:	learn: 0.0062549	total: 5.64s	remaining: 36.5ms
2631:	learn: 0.0062490	total: 5.65s	remaining: 34.3ms
2632:	learn: 0.0062452	total: 5.65s	remaining: 32.2ms
2633:	learn: 0.0062386	total: 5.65s	remaining: 30ms
2634:	learn: 0.0062337	total: 5.65s	remaining: 27.9ms
2635:	learn: 0.0062282	total: 5.65s	remaining: 25.7ms
2636:	learn: 0.0062252	total: 5.66s	remaining: 23.6ms
2637:	learn: 0.0062160	total: 5.66s	remaining: 21.4ms
2638:	learn: 0.0062117	total: 5.66s	remaining: 19.3ms
2639:	learn: 0.0062076	total: 5.67s	remaining: 17.2ms
2640:	learn: 0.0062021	total: 5.67s	remaining: 15ms
2641:	learn: 0.0061989	total: 5.67s	remaining: 12.9ms
2642:	learn: 0.0061953	total: 5.67s	remaining: 10.7ms
2643:	learn: 0.0061900	total: 5.68s	remaining: 8.59ms
2644:	learn: 0.0061814	total: 5.68s	remaining: 6.44ms
2645:	learn: 0.0061776	total: 5.68s	remaining: 4.29ms
2646:	learn: 0.0061711	total: 5.

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:41:51,551] Trial 0 finished with value: 0.34733848287943814 and parameters: {'iterations': 2031, 'depth': 9, 'learning_rate': 0.08083015706334613}. Best is trial 0 with value: 0.34733848287943814.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:41:59,312] Trial 1 finished with value: 0.19411052983603377 and parameters: {'iterations': 2482, 'depth': 6, 'learning_rate': 0.02358175520526238}. Best is trial 1 with value: 0.19411052983603377.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:42:04,624] Trial 2 finished with value: 0.19879771922425937 and parameters: {'iterations': 1573, 'depth': 6, 'learning_rate': 0.04448330888899712}. Best is trial 1 with value: 0.19411052983603377.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:42:11,124] Trial 3 finished with value: 0.24690073928

Best MAPE for BlendProperty4: 0.1941
Best Parameters for BlendProperty4: {'iterations': 2482, 'depth': 6, 'learning_rate': 0.02358175520526238}
0:	learn: 0.9873162	total: 4.43ms	remaining: 11s
1:	learn: 0.9750789	total: 8.63ms	remaining: 10.7s
2:	learn: 0.9622195	total: 13.5ms	remaining: 11.2s
3:	learn: 0.9498239	total: 17.5ms	remaining: 10.8s
4:	learn: 0.9372320	total: 20.8ms	remaining: 10.3s
5:	learn: 0.9263305	total: 24.1ms	remaining: 9.94s
6:	learn: 0.9150382	total: 28.1ms	remaining: 9.94s
7:	learn: 0.9036297	total: 31.7ms	remaining: 9.8s
8:	learn: 0.8924902	total: 34.8ms	remaining: 9.57s
9:	learn: 0.8824325	total: 37.9ms	remaining: 9.36s
10:	learn: 0.8730776	total: 41ms	remaining: 9.21s
11:	learn: 0.8616822	total: 45.1ms	remaining: 9.28s
12:	learn: 0.8515439	total: 48.4ms	remaining: 9.19s
13:	learn: 0.8423836	total: 51.5ms	remaining: 9.07s
14:	learn: 0.8321805	total: 54.5ms	remaining: 8.96s
15:	learn: 0.8222835	total: 57.5ms	remaining: 8.85s
16:	learn: 0.8113652	total: 61.2ms	rema

[I 2025-07-16 12:43:29,622] A new study created in memory with name: no-name-ec088b50-4a11-4e96-84d1-589c6263c9ca


2467:	learn: 0.0175162	total: 7.7s	remaining: 43.7ms
2468:	learn: 0.0175060	total: 7.71s	remaining: 40.6ms
2469:	learn: 0.0174980	total: 7.71s	remaining: 37.5ms
2470:	learn: 0.0174889	total: 7.71s	remaining: 34.3ms
2471:	learn: 0.0174762	total: 7.71s	remaining: 31.2ms
2472:	learn: 0.0174685	total: 7.72s	remaining: 28.1ms
2473:	learn: 0.0174621	total: 7.72s	remaining: 25ms
2474:	learn: 0.0174513	total: 7.72s	remaining: 21.9ms
2475:	learn: 0.0174379	total: 7.73s	remaining: 18.7ms
2476:	learn: 0.0174279	total: 7.73s	remaining: 15.6ms
2477:	learn: 0.0174145	total: 7.73s	remaining: 12.5ms
2478:	learn: 0.0174039	total: 7.74s	remaining: 9.36ms
2479:	learn: 0.0173969	total: 7.74s	remaining: 6.24ms
2480:	learn: 0.0173880	total: 7.74s	remaining: 3.12ms
2481:	learn: 0.0173758	total: 7.75s	remaining: 0us

Optimizing for BlendProperty5...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:43:35,093] Trial 0 finished with value: 0.09544941713165865 and parameters: {'iterations': 2359, 'depth': 6, 'learning_rate': 0.034372941072310356}. Best is trial 0 with value: 0.09544941713165865.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:43:36,723] Trial 1 finished with value: 0.10934200228493308 and parameters: {'iterations': 760, 'depth': 4, 'learning_rate': 0.05577333971954893}. Best is trial 0 with value: 0.09544941713165865.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:43:45,593] Trial 2 finished with value: 0.12416041846859326 and parameters: {'iterations': 2191, 'depth': 8, 'learning_rate': 0.045274265322293755}. Best is trial 0 with value: 0.09544941713165865.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)
[I 2025-07-16 12:44:04,575] Trial 3 finished with value: 0.1028676404

Best MAPE for BlendProperty5: 0.0849
Best Parameters for BlendProperty5: {'iterations': 2893, 'depth': 5, 'learning_rate': 0.04012904871960454}
0:	learn: 0.9652999	total: 3.96ms	remaining: 11.5s
1:	learn: 0.9329746	total: 6.83ms	remaining: 9.88s
2:	learn: 0.9048779	total: 9.45ms	remaining: 9.1s
3:	learn: 0.8772218	total: 15ms	remaining: 10.8s
4:	learn: 0.8475844	total: 17.8ms	remaining: 10.3s
5:	learn: 0.8192846	total: 20.5ms	remaining: 9.86s
6:	learn: 0.7946924	total: 23ms	remaining: 9.48s
7:	learn: 0.7699060	total: 25.7ms	remaining: 9.26s
8:	learn: 0.7452494	total: 29.9ms	remaining: 9.58s
9:	learn: 0.7204715	total: 32.7ms	remaining: 9.44s
10:	learn: 0.6983564	total: 35.4ms	remaining: 9.28s
11:	learn: 0.6758050	total: 37.9ms	remaining: 9.11s
12:	learn: 0.6562473	total: 40.7ms	remaining: 9.03s
13:	learn: 0.6356198	total: 45.4ms	remaining: 9.33s
14:	learn: 0.6144530	total: 47.8ms	remaining: 9.17s
15:	learn: 0.5960957	total: 50ms	remaining: 8.99s
16:	learn: 0.5785928	total: 52.5ms	remain

[I 2025-07-16 12:46:42,475] A new study created in memory with name: no-name-4f0fcd7f-7935-4632-bab9-7863ab0733b1



Optimizing for BlendProperty6...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)


In [None]:
# computing ovrall model  overall MAPE
overall_mape = {}

for target in targets:
    X_selected = x_base[top_features[target]]
    y_selected = y[target]
    
   
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=0.2, random_state=42)
    
    
    y_pred = best_models[target].predict(X_test)
    
    # Inverse transform to original scale
    y_test_full = pd.DataFrame(np.column_stack([y_test] + [np.zeros(len(y_test))]*(len(targets)-1)), columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.column_stack([y_pred] + [np.zeros(len(y_pred))]*(len(targets)-1)), columns=targets, index=X_test.index)
    y_test_inv = pt.inverse_transform(y_test_full)[:, targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[:, targets.index(target)]
    
    # Calculate MAPE on original scale
    mask = y_test_inv != 0
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask]) * 100) if mask.any() else 0.0
    overall_mape[target] = test_mape
    
    print(f'Test MAPE for {target} (original scale): {test_mape:.2f}%')

print(f'\nAverage MAPE across all targets: {np.mean(list(overall_mape.values())):.2f}%')