# Improving MAPE with CatBoost and Optuna Hyperparameter Tuning

In [1]:

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [1]:
! pip install optuna catboost


Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0


In [5]:
# Load the data
data = pd.read_csv("/Volumes/Extreme SSD/ShellAi/train.csv")

#
base_features = [col for col in data.columns if col not in [f"BlendProperty{i}" for i in range(1, 11)]]
targets = [f"BlendProperty{i}" for i in range(1, 11)]



In [7]:
# Outlier handling using IQR method
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)
    return df

for column in data.columns:
    data = remove_outliers(data, column)

In [9]:
# Creating  weighted columns
weighted_data = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_data[weighted_col] = data[prop_col] * data[frac_col]

# Combine
data_with_weighted = pd.concat([data[base_features], pd.DataFrame(weighted_data)], axis=1)
data_with_weighted = data_with_weighted.copy()  # De-fragmented copy

scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_with_weighted)
scaled_df_features = pd.DataFrame(scaled_features, columns=data_with_weighted.columns)


pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(pt.fit_transform(data[targets]), columns=targets, index=data.index)

In [11]:
# Initial feature selection using Random Forest feature importance
top_features = {}
for target in targets:
    X = scaled_df_features
    y = transformed_targets_df[target]
    rf_initial = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_initial.fit(X, y)
    feature_importance = pd.Series(rf_initial.feature_importances_, index=scaled_df_features.columns).sort_values(ascending=False)
    top_features[target] = feature_importance.head(25).index.tolist()  # Use top 25 features

x_base = scaled_df_features
y = transformed_targets_df

In [33]:
# Train and evaluate for each target using top 25 features
best_models = {}

for target in targets:
    print(f'\nOptimizing for {target}...')
    

    X_selected = x_base[top_features[target]]
    y_selected = y[target]
    
    def objective(trial):
        params = {
        'iterations': trial.suggest_int('iterations', 500, 3000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0), # <--- ADD THIS LINE FOR REGULARIZATION
        'task_type': 'CPU'
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X_selected, y_selected, test_size=0.2, random_state=42)
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)
    preds = model.predict(X_valid)
    
    # Since you must use MAPE, we keep it as the objective to minimize.
    # Note: MAPE is sensitive, so regularization may or may not have a large impact.
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape


    
    # Optimize hyperparameters
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=25)
    
    best_params = study.best_trial.params
    print(f'Best MAPE for {target}: {study.best_value:.4f}')
    print(f'Best Parameters for {target}: {best_params}')
    
    # Train final model with best parameters
    final_model = CatBoostRegressor(**best_params)
    final_model.fit(X_selected, y_selected)
    best_models[target] = final_model
    
    # Save the model
    # final_model.save_model(f'catboost_model_{target}.cbm')

SyntaxError: 'return' outside function (4255690502.py, line 28)

In [35]:
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor

# 1. Define the objective function ONCE, outside the loop.
# We add X and y as arguments to pass the data in.
def objective(trial, X, y):
    """Optuna objective function to tune CatBoost hyperparameters."""
    
    # Define the hyperparameter search space
    params = {
        'iterations': trial.suggest_int('iterations', 500, 3000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
        'task_type': 'CPU',
        'verbose': False  # Keep CatBoost quiet during tuning
    }
    
    # Split data for validation
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)
    
    # Make predictions and calculate MAPE for the trial
    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    
    return mape

# 2. Main training loop
best_models = {}

for target in targets:
    print(f'\nOptimizing for {target}...')

    # Select data for the current target
    X_selected = x_base[top_features[target]]
    y_selected = y[target]
    
    # Create the study
    study = optuna.create_study(direction='minimize')
    
    # Optimize the objective function, passing the current data using a lambda
    study.optimize(lambda trial: objective(trial, X_selected, y_selected), n_trials=25)
    
    # Get the best parameters from the study
    best_params = study.best_trial.params
    print(f'Best Validation MAPE for {target}: {study.best_value:.4f}')
    print(f'Best Parameters for {target}: {best_params}')
    
    # Train the final model for the target using the best parameters on the full dataset
    print(f'Training final model for {target}...')
    final_model = CatBoostRegressor(**best_params, verbose=False) # Added verbose=False for cleaner output
    final_model.fit(X_selected, y_selected)
    
    # Store the final model
    best_models[target] = final_model
    
    # Optional: Save the model to a file
    # final_model.save_model(f'catboost_model_{target}.cbm')

print("\nAll models have been trained successfully.")


[I 2025-07-16 21:55:47,963] A new study created in memory with name: no-name-0c80a120-7f98-4069-bba5-0b90b664995e



Optimizing for BlendProperty1...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:55:49,503] Trial 0 finished with value: 0.24577929720137548 and parameters: {'iterations': 2440, 'depth': 5, 'learning_rate': 0.07683411482424313, 'l2_leaf_reg': 1.844373913102075}. Best is trial 0 with value: 0.24577929720137548.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:55:50,274] Trial 1 finished with value: 0.31201044346693557 and parameters: {'iterations': 1510, 'depth': 4, 'learning_rate': 0.013795469181238362, 'l2_leaf_reg': 1.9212106140658447}. Best is trial 0 with value: 0.24577929720137548.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:55:54,925] Trial 2 finished with value: 0.3855275366068455 

Best Validation MAPE for BlendProperty1: 0.2281
Best Parameters for BlendProperty1: {'iterations': 1498, 'depth': 4, 'learning_rate': 0.028603677385191217, 'l2_leaf_reg': 1.0252163083229229}
Training final model for BlendProperty1...


[I 2025-07-16 21:56:53,052] A new study created in memory with name: no-name-5e1934b9-934c-4d36-bd05-250e9127e250



Optimizing for BlendProperty2...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:56:53,934] Trial 0 finished with value: 0.27353948246062465 and parameters: {'iterations': 1754, 'depth': 4, 'learning_rate': 0.07553715290927085, 'l2_leaf_reg': 3.0125204668582883}. Best is trial 0 with value: 0.27353948246062465.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:56:55,305] Trial 1 finished with value: 0.25459220145775624 and parameters: {'iterations': 2832, 'depth': 4, 'learning_rate': 0.014012533087993074, 'l2_leaf_reg': 1.0701595783861924}. Best is trial 1 with value: 0.25459220145775624.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:56:56,490] Trial 2 finished with value: 0.2706855029843904

Best Validation MAPE for BlendProperty2: 0.2080
Best Parameters for BlendProperty2: {'iterations': 2235, 'depth': 4, 'learning_rate': 0.058558199517942125, 'l2_leaf_reg': 4.578916099911351}
Training final model for BlendProperty2...


[I 2025-07-16 21:58:15,137] A new study created in memory with name: no-name-6e804022-0f09-4b23-a321-0e390abb3f0f



Optimizing for BlendProperty3...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:58:15,680] Trial 0 finished with value: 0.6222074017023804 and parameters: {'iterations': 723, 'depth': 5, 'learning_rate': 0.021021070494680685, 'l2_leaf_reg': 2.0405697332269312}. Best is trial 0 with value: 0.6222074017023804.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:58:16,238] Trial 1 finished with value: 0.4831515815523249 and parameters: {'iterations': 1070, 'depth': 4, 'learning_rate': 0.09410096277176021, 'l2_leaf_reg': 3.471457739858996}. Best is trial 1 with value: 0.4831515815523249.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:58:17,659] Trial 2 finished with value: 0.4438083461436436 and p

Best Validation MAPE for BlendProperty3: 0.4089
Best Parameters for BlendProperty3: {'iterations': 2927, 'depth': 5, 'learning_rate': 0.04145568528529696, 'l2_leaf_reg': 1.6151199251912989}
Training final model for BlendProperty3...


[I 2025-07-16 21:59:26,774] A new study created in memory with name: no-name-309fd78b-f5b2-43e8-ad26-062c1eb0cd54



Optimizing for BlendProperty4...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:59:39,570] Trial 0 finished with value: 0.3759469536017135 and parameters: {'iterations': 2281, 'depth': 9, 'learning_rate': 0.0737055953873005, 'l2_leaf_reg': 9.869104512099891}. Best is trial 0 with value: 0.3759469536017135.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:59:41,822] Trial 1 finished with value: 0.2065294237719668 and parameters: {'iterations': 2000, 'depth': 6, 'learning_rate': 0.015061503165420225, 'l2_leaf_reg': 1.219423834914911}. Best is trial 1 with value: 0.2065294237719668.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 21:59:45,152] Trial 2 finished with value: 0.19263466629750134 and p

Best Validation MAPE for BlendProperty4: 0.1703
Best Parameters for BlendProperty4: {'iterations': 1505, 'depth': 4, 'learning_rate': 0.035366674458589015, 'l2_leaf_reg': 1.908375213567634}
Training final model for BlendProperty4...


[I 2025-07-16 22:00:52,726] A new study created in memory with name: no-name-ffcbc09f-619b-4a55-849f-dfbfb7dea52f



Optimizing for BlendProperty5...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:00:59,939] Trial 0 finished with value: 0.16308413371083283 and parameters: {'iterations': 689, 'depth': 10, 'learning_rate': 0.0979602344208541, 'l2_leaf_reg': 2.8497562014908677}. Best is trial 0 with value: 0.16308413371083283.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:01:07,915] Trial 1 finished with value: 0.09553547507608297 and parameters: {'iterations': 2575, 'depth': 8, 'learning_rate': 0.028791233825036327, 'l2_leaf_reg': 5.255892903680692}. Best is trial 1 with value: 0.09553547507608297.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:01:08,971] Trial 2 finished with value: 0.14041994065548052 

Best Validation MAPE for BlendProperty5: 0.0615
Best Parameters for BlendProperty5: {'iterations': 2896, 'depth': 6, 'learning_rate': 0.010870587159797222, 'l2_leaf_reg': 1.0270532634630507}
Training final model for BlendProperty5...


[I 2025-07-16 22:02:34,040] A new study created in memory with name: no-name-b673a844-aa2e-4847-9280-7df0d6e23352



Optimizing for BlendProperty6...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:02:37,433] Trial 0 finished with value: 1.85130473965965 and parameters: {'iterations': 577, 'depth': 9, 'learning_rate': 0.010506888401152603, 'l2_leaf_reg': 7.045651178510261}. Best is trial 0 with value: 1.85130473965965.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:02:40,368] Trial 1 finished with value: 0.6993822105295635 and parameters: {'iterations': 2591, 'depth': 6, 'learning_rate': 0.015471289627536163, 'l2_leaf_reg': 3.646413904381303}. Best is trial 1 with value: 0.6993822105295635.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:03:04,924] Trial 2 finished with value: 1.5448382115578518 and param

Best Validation MAPE for BlendProperty6: 0.3836
Best Parameters for BlendProperty6: {'iterations': 983, 'depth': 4, 'learning_rate': 0.04044952452988433, 'l2_leaf_reg': 2.506960901362049}
Training final model for BlendProperty6...


[I 2025-07-16 22:04:01,324] A new study created in memory with name: no-name-494ab026-a455-4c00-b550-76a1f3e78a9b



Optimizing for BlendProperty7...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:04:02,616] Trial 0 finished with value: 0.7230996916283147 and parameters: {'iterations': 2526, 'depth': 5, 'learning_rate': 0.07253568656254392, 'l2_leaf_reg': 1.1645606049263157}. Best is trial 0 with value: 0.7230996916283147.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:04:04,035] Trial 1 finished with value: 0.7476120687967744 and parameters: {'iterations': 2337, 'depth': 6, 'learning_rate': 0.07618589029539936, 'l2_leaf_reg': 5.303387746052746}. Best is trial 0 with value: 0.7230996916283147.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:04:09,481] Trial 2 finished with value: 0.9865259574911789 and p

Best Validation MAPE for BlendProperty7: 0.4968
Best Parameters for BlendProperty7: {'iterations': 1285, 'depth': 7, 'learning_rate': 0.044582632969784826, 'l2_leaf_reg': 2.76197839308037}
Training final model for BlendProperty7...


[I 2025-07-16 22:05:52,877] A new study created in memory with name: no-name-bb50105b-cdba-4e4a-8582-58b9841d85a9



Optimizing for BlendProperty8...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:05:57,083] Trial 0 finished with value: 1.5790119094945336 and parameters: {'iterations': 724, 'depth': 9, 'learning_rate': 0.07741172135427335, 'l2_leaf_reg': 4.913396878276839}. Best is trial 0 with value: 1.5790119094945336.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:05:57,690] Trial 1 finished with value: 0.9883542954597138 and parameters: {'iterations': 2254, 'depth': 4, 'learning_rate': 0.060088978270234725, 'l2_leaf_reg': 5.149014703908128}. Best is trial 1 with value: 0.9883542954597138.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:05:58,376] Trial 2 finished with value: 1.5855888241802256 and pa

Best Validation MAPE for BlendProperty8: 0.5916
Best Parameters for BlendProperty8: {'iterations': 2228, 'depth': 4, 'learning_rate': 0.025726938137650427, 'l2_leaf_reg': 3.390000192230687}
Training final model for BlendProperty8...


[I 2025-07-16 22:07:00,177] A new study created in memory with name: no-name-d718024f-3a1c-4896-bbd5-ba613f51ecaa



Optimizing for BlendProperty9...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:07:08,347] Trial 0 finished with value: 2.4243642297830137 and parameters: {'iterations': 717, 'depth': 10, 'learning_rate': 0.01520094439397782, 'l2_leaf_reg': 4.617624443979101}. Best is trial 0 with value: 2.4243642297830137.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:07:09,899] Trial 1 finished with value: 2.725720265317173 and parameters: {'iterations': 2811, 'depth': 4, 'learning_rate': 0.035482846015342934, 'l2_leaf_reg': 3.2510170029447094}. Best is trial 0 with value: 2.4243642297830137.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:07:16,805] Trial 2 finished with value: 2.8452190569044102 and p

Best Validation MAPE for BlendProperty9: 1.7673
Best Parameters for BlendProperty9: {'iterations': 787, 'depth': 9, 'learning_rate': 0.01025880865324625, 'l2_leaf_reg': 2.2950732596043455}
Training final model for BlendProperty9...


[I 2025-07-16 22:09:37,885] A new study created in memory with name: no-name-b9bbd551-5196-45ac-8c9c-953b838979ae



Optimizing for BlendProperty10...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:09:40,260] Trial 0 finished with value: 0.23169145702860525 and parameters: {'iterations': 2643, 'depth': 5, 'learning_rate': 0.03363767901925521, 'l2_leaf_reg': 4.5926570952873895}. Best is trial 0 with value: 0.23169145702860525.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:09:41,731] Trial 1 finished with value: 0.24399450468330655 and parameters: {'iterations': 2597, 'depth': 4, 'learning_rate': 0.017734386486254083, 'l2_leaf_reg': 1.3544523018484347}. Best is trial 0 with value: 0.23169145702860525.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-16 22:09:45,474] Trial 2 finished with value: 0.1967738385254828

Best Validation MAPE for BlendProperty10: 0.1968
Best Parameters for BlendProperty10: {'iterations': 2965, 'depth': 6, 'learning_rate': 0.018224036840569523, 'l2_leaf_reg': 1.1265967149037723}
Training final model for BlendProperty10...

All models have been trained successfully.


In [39]:
# computing ovrall model  overall MAPE
overall_mape = {}

for target in targets:
    X_selected = x_base[top_features[target]]
    y_selected = y[target]
    
   
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=0.2, random_state=42)
    
    
    y_pred = best_models[target].predict(X_test)
    
    # Inverse transform to original scale
    y_test_full = pd.DataFrame(np.column_stack([y_test] + [np.zeros(len(y_test))]*(len(targets)-1)), columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.column_stack([y_pred] + [np.zeros(len(y_pred))]*(len(targets)-1)), columns=targets, index=X_test.index)
    y_test_inv = pt.inverse_transform(y_test_full)[:, targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[:, targets.index(target)]
    
    # Calculate MAPE on original scale
    mask = y_test_inv != 0
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask]) * 100) if mask.any() else 0.0
    overall_mape[target] = test_mape
    
    print(f'Test MAPE for {target} (original scale): {test_mape:.2f}%')

print(f'\nAverage MAPE across all targets: {np.mean(list(overall_mape.values())):.2f}%')

Test MAPE for BlendProperty1 (original scale): 251.12%
Test MAPE for BlendProperty2 (original scale): 0.00%
Test MAPE for BlendProperty3 (original scale): 0.00%
Test MAPE for BlendProperty4 (original scale): 0.00%
Test MAPE for BlendProperty5 (original scale): 0.00%
Test MAPE for BlendProperty6 (original scale): 0.00%
Test MAPE for BlendProperty7 (original scale): 0.00%
Test MAPE for BlendProperty8 (original scale): 0.00%
Test MAPE for BlendProperty9 (original scale): 0.00%
Test MAPE for BlendProperty10 (original scale): 0.00%

Average MAPE across all targets: 25.11%


In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Initialize a dictionary to store the final MAPE for each target
overall_mape = {}

# Loop through each target variable to evaluate its model
for target in targets:
    # Select the top features and the target column for the current loop iteration
    X_selected = x_base[top_features[target]]
    y_selected = y[target]
    
    # Split the data into training and testing sets to evaluate the model
    # Note: The model was already trained, we only need X_test and y_test here
    _, X_test, _, y_test = train_test_split(X_selected, y_selected, test_size=0.2, random_state=42)
    
    # Predict on the test set using the best model saved for this target
    y_pred = best_models[target].predict(X_test)
    
    # --- Corrected Inverse Transform Section ---
    
    # 1. Create placeholder DataFrames full of zeros
    # These must have the same shape and columns as the original data used to fit the scaler 'pt'
    y_test_full = pd.DataFrame(np.zeros((len(y_test), len(targets))), columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.zeros((len(y_pred), len(targets))), columns=targets, index=X_test.index)

    # 2. Place the actual test/prediction data into the correct column
    y_test_full[target] = y_test
    y_pred_full[target] = y_pred

    # 3. Apply the inverse transform; it will now work on the correctly structured data
    y_test_inv = pt.inverse_transform(y_test_full)[:, targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[:, targets.index(target)]
    
    # --- MAPE Calculation ---
    
    # Create a mask to avoid division by zero where the true value is 0
    mask = y_test_inv != 0
    
    # Calculate Mean Absolute Percentage Error on the original scale
    # Use the mask to only include non-zero true values in the calculation
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask])) * 100 if mask.any() else 0.0
    
    # Store the result in our dictionary
    overall_mape[target] = test_mape

# --- Final Reporting (after the loop has finished) ---

print("--- Final MAPE Results for Each Property ---")
# Print the completed dictionary to see all results clearly
print(overall_mape)

print("\n" + "="*40)
# Print the overall average MAPE across all models
print(f"Average MAPE across all targets: {np.mean(list(overall_mape.values())):.2f}%")


--- Final MAPE Results for Each Property ---
{'BlendProperty1': 251.1180134101612, 'BlendProperty2': 7.079375431703442, 'BlendProperty3': 5.999596399800396, 'BlendProperty4': 14.41290508691333, 'BlendProperty5': 4.0718418782963965, 'BlendProperty6': 65.55713603370799, 'BlendProperty7': 8.861048244820722, 'BlendProperty8': 62.150844442873264, 'BlendProperty9': 90.86546552635582, 'BlendProperty10': 4.533251712097046}

Average MAPE across all targets: 51.46%


In [48]:
from sklearn.preprocessing import RobustScaler

In [50]:
# --- This block replaces your initial feature engineering ---

# 1. Original Weighted Features
weighted_data = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        if prop_col in data.columns and frac_col in data.columns:
            weighted_col = f"Weighted_Component{i}_Property{j}"
            weighted_data[weighted_col] = data[prop_col] * data[frac_col]

weighted_df = pd.DataFrame(weighted_data)

# 2. 🌟 New Statistical Features 
# For each blend (row), calculate stats across its component properties.
# This helps the model understand the blend's overall chemical profile.
component_prop_cols = [f"Component{i}_Property{j}" for i in range(1, 6) for j in range(1, 11)]
# Filter for columns that actually exist in the dataframe
existing_prop_cols = [col for col in component_prop_cols if col in data.columns]

statistical_features = pd.DataFrame(index=data.index)
statistical_features['mean_property'] = data[existing_prop_cols].mean(axis=1)
statistical_features['std_dev_property'] = data[existing_prop_cols].std(axis=1)
statistical_features['min_property'] = data[existing_prop_cols].min(axis=1)
statistical_features['max_property'] = data[existing_prop_cols].max(axis=1)

# 3. Combine all features
data_with_all_features = pd.concat([
    data[base_features], 
    weighted_df,
    statistical_features
], axis=1)

# De-fragment and handle any potential missing values from std dev calculation
data_with_all_features = data_with_all_features.copy()
data_with_all_features.fillna(0, inplace=True)

# 4. Scale the combined feature set
scaler = RobustScaler()
scaled_features = scaler.fit_transform(data_with_all_features)
scaled_df_features = pd.DataFrame(scaled_features, columns=data_with_all_features.columns)

# Target transformation remains the same
pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(pt.fit_transform(data[targets]), columns=targets, index=data.index)

# Feature selection and variable assignment remain the same
# (Your Random Forest feature selection code would go here)
# ...
x_base = scaled_df_features
y = transformed_targets_df


In [52]:
# --- This block replaces your combined training AND evaluation loops ---

from sklearn.model_selection import KFold
import optuna
import gc # Garbage Collector

# Re-define the objective function once
def objective(trial, X, y):
    params = {
        'objective':'MAE',
        'iterations': trial.suggest_int('iterations', 500, 3000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
        'task_type': 'CPU',
        'verbose': False
    }
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50)
    preds = model.predict(X_valid)
    return mean_absolute_percentage_error(y_valid, preds)

# --- Main Loop ---
best_models = {}
overall_cv_scores = {}

for target in targets:
    print(f"\n{'='*20}\nProcessing Target: {target}\n{'='*20}")
    
    X_selected = x_base[top_features[target]]
    y_selected = y[target]
    
    # 1. 🎯 Tune Hyperparameters with Optuna (once per target)
    print(f"Phase 1: Tuning hyperparameters for {target}...")
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, X_selected, y_selected), n_trials=25)
    best_params = study.best_trial.params
    print(f"Best Tuned Parameters Found for {target}.")

    # 2.  ক্রস Get a Robust Score with K-Fold Cross-Validation
    print(f"Phase 2: Evaluating with 5-Fold Cross-Validation for {target}...")
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_mape_scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(X_selected, y_selected)):
        X_train, X_val = X_selected.iloc[train_index], X_selected.iloc[val_index]
        y_train, y_val = y_selected.iloc[train_index], y_selected.iloc[val_index]
        
        # Train model with the best params found earlier
        model = CatBoostRegressor(**best_params, verbose=False)
        model.fit(X_train, y_train)
        
        # Predict on the validation set for this fold
        y_pred = model.predict(X_val)
        
        # Correctly inverse transform for MAPE calculation
        y_val_full = pd.DataFrame(np.zeros((len(y_val), len(targets))), columns=targets, index=X_val.index)
        y_pred_full = pd.DataFrame(np.zeros((len(y_pred), len(targets))), columns=targets, index=X_val.index)
        y_val_full[target] = y_val
        y_pred_full[target] = y_pred
        y_val_inv = pt.inverse_transform(y_val_full)[:, targets.index(target)]
        y_pred_inv = pt.inverse_transform(y_pred_full)[:, targets.index(target)]
        
        # Calculate and store MAPE for the fold
        mask = y_val_inv != 0
        mape = np.mean(np.abs((y_val_inv[mask] - y_pred_inv[mask]) / y_val_inv[mask])) * 100 if mask.any() else 0.0
        fold_mape_scores.append(mape)
        print(f"  Fold {fold+1} MAPE: {mape:.2f}%")

    # Report the robust CV score
    mean_mape = np.mean(fold_mape_scores)
    std_mape = np.std(fold_mape_scores)
    overall_cv_scores[target] = {'mean_mape': mean_mape, 'std_mape': std_mape}
    print(f"\nRobust CV Score for {target}: {mean_mape:.2f}% (+/- {std_mape:.2f}%)")

    # 3. 🚀 Train Final Model on ALL Data for submission
    print(f"Phase 3: Training final model on all data for {target}...")
    final_model = CatBoostRegressor(**best_params, verbose=False)
    final_model.fit(X_selected, y_selected)
    best_models[target] = final_model
    
    # Clean up memory before next target
    del study, model, final_model
    gc.collect()

print("\n\n--- All models trained. Final CV Scores ---")
print(pd.DataFrame(overall_cv_scores).T)


[I 2025-07-17 00:08:54,774] A new study created in memory with name: no-name-2cbcab82-6982-478b-a3d0-e4c1f41f75c5



Processing Target: BlendProperty1
Phase 1: Tuning hyperparameters for BlendProperty1...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:09:01,353] Trial 0 finished with value: 0.4472251865552374 and parameters: {'iterations': 2865, 'depth': 9, 'learning_rate': 0.03574818796831661, 'l2_leaf_reg': 7.4616208637644705}. Best is trial 0 with value: 0.4472251865552374.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:09:03,048] Trial 1 finished with value: 0.25789089261321746 and parameters: {'iterations': 2322, 'depth': 5, 'learning_rate': 0.038194725542866036, 'l2_leaf_reg': 3.0703448705619483}. Best is trial 1 with value: 0.25789089261321746.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:09:04,296] Trial 2 finished with value: 0.32476555578860006 

Best Tuned Parameters Found for BlendProperty1.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty1...
  Fold 1 MAPE: 83.83%
  Fold 2 MAPE: 44.69%
  Fold 3 MAPE: 30.66%
  Fold 4 MAPE: 32.53%
  Fold 5 MAPE: 41.88%

Robust CV Score for BlendProperty1: 46.72% (+/- 19.31%)
Phase 3: Training final model on all data for BlendProperty1...


[I 2025-07-17 00:10:54,366] A new study created in memory with name: no-name-d4243f1c-4088-4a91-a637-802bdadca963



Processing Target: BlendProperty2
Phase 1: Tuning hyperparameters for BlendProperty2...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:10:55,663] Trial 0 finished with value: 0.32163270951559936 and parameters: {'iterations': 2212, 'depth': 4, 'learning_rate': 0.02335845879649332, 'l2_leaf_reg': 7.33665154254136}. Best is trial 0 with value: 0.32163270951559936.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:10:59,924] Trial 1 finished with value: 0.6626265663211516 and parameters: {'iterations': 2079, 'depth': 9, 'learning_rate': 0.08562695626230804, 'l2_leaf_reg': 1.513344287012587}. Best is trial 0 with value: 0.32163270951559936.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:11:02,407] Trial 2 finished with value: 0.33684384899099085 and

Best Tuned Parameters Found for BlendProperty2.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty2...
  Fold 1 MAPE: 50.63%
  Fold 2 MAPE: 43.48%
  Fold 3 MAPE: 42.58%
  Fold 4 MAPE: 29.85%
  Fold 5 MAPE: 21.30%

Robust CV Score for BlendProperty2: 37.57% (+/- 10.54%)
Phase 3: Training final model on all data for BlendProperty2...


[I 2025-07-17 00:12:18,034] A new study created in memory with name: no-name-a68351f1-c75a-4eff-82af-3918db6e109c



Processing Target: BlendProperty3
Phase 1: Tuning hyperparameters for BlendProperty3...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:12:34,027] Trial 0 finished with value: 0.6090550917872963 and parameters: {'iterations': 1709, 'depth': 10, 'learning_rate': 0.015848049756904122, 'l2_leaf_reg': 1.1076346593905428}. Best is trial 0 with value: 0.6090550917872963.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:12:36,779] Trial 1 finished with value: 0.5667562114164544 and parameters: {'iterations': 2126, 'depth': 7, 'learning_rate': 0.055494043403330376, 'l2_leaf_reg': 1.0372909301396702}. Best is trial 1 with value: 0.5667562114164544.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:12:38,469] Trial 2 finished with value: 0.6917449779891871 a

Best Tuned Parameters Found for BlendProperty3.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty3...
  Fold 1 MAPE: 70.75%
  Fold 2 MAPE: 48.22%
  Fold 3 MAPE: 152.22%
  Fold 4 MAPE: 116.81%
  Fold 5 MAPE: 57.00%

Robust CV Score for BlendProperty3: 89.00% (+/- 39.47%)
Phase 3: Training final model on all data for BlendProperty3...


[I 2025-07-17 00:15:14,536] A new study created in memory with name: no-name-86ce9084-c76b-415a-a2fd-3d4bc9bdef4f



Processing Target: BlendProperty4
Phase 1: Tuning hyperparameters for BlendProperty4...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:15:15,953] Trial 0 finished with value: 0.29874110523054126 and parameters: {'iterations': 2289, 'depth': 6, 'learning_rate': 0.08968601074948258, 'l2_leaf_reg': 1.7287595418212856}. Best is trial 0 with value: 0.29874110523054126.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:15:22,486] Trial 1 finished with value: 0.5800501208538086 and parameters: {'iterations': 2072, 'depth': 10, 'learning_rate': 0.06856151795230506, 'l2_leaf_reg': 5.67342689810466}. Best is trial 0 with value: 0.29874110523054126.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:15:24,461] Trial 2 finished with value: 0.3001903923423796 an

Best Tuned Parameters Found for BlendProperty4.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty4...
  Fold 1 MAPE: 28.82%
  Fold 2 MAPE: 34.72%
  Fold 3 MAPE: 49.14%
  Fold 4 MAPE: 76.18%
  Fold 5 MAPE: 26.04%

Robust CV Score for BlendProperty4: 42.98% (+/- 18.42%)
Phase 3: Training final model on all data for BlendProperty4...


[I 2025-07-17 00:16:18,420] A new study created in memory with name: no-name-cfe94578-c3f9-48bd-af8d-f8a3d2444ba0



Processing Target: BlendProperty5
Phase 1: Tuning hyperparameters for BlendProperty5...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:16:19,654] Trial 0 finished with value: 0.13329920367996015 and parameters: {'iterations': 2153, 'depth': 7, 'learning_rate': 0.04151570867264242, 'l2_leaf_reg': 3.1008773636245266}. Best is trial 0 with value: 0.13329920367996015.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:16:20,550] Trial 1 finished with value: 0.1353895828226553 and parameters: {'iterations': 859, 'depth': 7, 'learning_rate': 0.04413977449987113, 'l2_leaf_reg': 9.81410269346635}. Best is trial 0 with value: 0.13329920367996015.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:16:20,864] Trial 2 finished with value: 0.1330255731762243 and 

Best Tuned Parameters Found for BlendProperty5.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty5...
  Fold 1 MAPE: 23.11%
  Fold 2 MAPE: 9.96%
  Fold 3 MAPE: 8.45%
  Fold 4 MAPE: 16.88%
  Fold 5 MAPE: 17.72%

Robust CV Score for BlendProperty5: 15.23% (+/- 5.38%)
Phase 3: Training final model on all data for BlendProperty5...


[I 2025-07-17 00:17:13,092] A new study created in memory with name: no-name-09d36c39-78a8-42c3-884a-7696cdfefbad



Processing Target: BlendProperty6
Phase 1: Tuning hyperparameters for BlendProperty6...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:17:14,535] Trial 0 finished with value: 0.6368781840436779 and parameters: {'iterations': 2340, 'depth': 4, 'learning_rate': 0.04965577846366219, 'l2_leaf_reg': 5.2752231741608195}. Best is trial 0 with value: 0.6368781840436779.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:17:23,467] Trial 1 finished with value: 1.0793061535925954 and parameters: {'iterations': 1433, 'depth': 9, 'learning_rate': 0.019941734372476736, 'l2_leaf_reg': 1.042787648068554}. Best is trial 0 with value: 0.6368781840436779.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:17:24,953] Trial 2 finished with value: 0.5737907491005916 and 

Best Tuned Parameters Found for BlendProperty6.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty6...
  Fold 1 MAPE: 79.30%
  Fold 2 MAPE: 21.11%
  Fold 3 MAPE: 40.19%
  Fold 4 MAPE: 27.19%
  Fold 5 MAPE: 19.83%

Robust CV Score for BlendProperty6: 37.52% (+/- 22.10%)
Phase 3: Training final model on all data for BlendProperty6...


[I 2025-07-17 00:18:32,695] A new study created in memory with name: no-name-2a416811-6f78-4d0f-825e-f556be0d8dba



Processing Target: BlendProperty7
Phase 1: Tuning hyperparameters for BlendProperty7...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:18:33,690] Trial 0 finished with value: 0.9309385709901341 and parameters: {'iterations': 1689, 'depth': 4, 'learning_rate': 0.031073329913370414, 'l2_leaf_reg': 8.872126610531968}. Best is trial 0 with value: 0.9309385709901341.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:18:50,144] Trial 1 finished with value: 1.3293929382838205 and parameters: {'iterations': 1569, 'depth': 10, 'learning_rate': 0.02267176250382469, 'l2_leaf_reg': 1.1993207442745093}. Best is trial 0 with value: 0.9309385709901341.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:18:50,901] Trial 2 finished with value: 0.8578252285123297 and

Best Tuned Parameters Found for BlendProperty7.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty7...
  Fold 1 MAPE: 50.66%
  Fold 2 MAPE: 94.25%
  Fold 3 MAPE: 220.63%
  Fold 4 MAPE: 50.35%
  Fold 5 MAPE: 59.85%

Robust CV Score for BlendProperty7: 95.15% (+/- 64.77%)
Phase 3: Training final model on all data for BlendProperty7...


[I 2025-07-17 00:20:12,276] A new study created in memory with name: no-name-f88228b5-9ac3-47f7-b8cf-575174bd18c4



Processing Target: BlendProperty8
Phase 1: Tuning hyperparameters for BlendProperty8...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:20:16,862] Trial 0 finished with value: 1.191581962364081 and parameters: {'iterations': 682, 'depth': 9, 'learning_rate': 0.05121428296504929, 'l2_leaf_reg': 7.92581610399652}. Best is trial 0 with value: 1.191581962364081.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:20:17,503] Trial 1 finished with value: 0.7059989783685512 and parameters: {'iterations': 682, 'depth': 5, 'learning_rate': 0.02318703651379302, 'l2_leaf_reg': 1.0800656438601846}. Best is trial 1 with value: 0.7059989783685512.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:20:19,177] Trial 2 finished with value: 1.0632052525140276 and parame

Best Tuned Parameters Found for BlendProperty8.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty8...
  Fold 1 MAPE: 124.80%
  Fold 2 MAPE: 37.55%
  Fold 3 MAPE: 69.61%
  Fold 4 MAPE: 46.39%
  Fold 5 MAPE: 40.99%

Robust CV Score for BlendProperty8: 63.87% (+/- 32.46%)
Phase 3: Training final model on all data for BlendProperty8...


[I 2025-07-17 00:21:28,862] A new study created in memory with name: no-name-95c26bf5-32d2-4468-9c8c-b97e231ecf60



Processing Target: BlendProperty9
Phase 1: Tuning hyperparameters for BlendProperty9...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:21:35,811] Trial 0 finished with value: 2.707751667802723 and parameters: {'iterations': 2413, 'depth': 10, 'learning_rate': 0.07734324886761394, 'l2_leaf_reg': 2.962454867764752}. Best is trial 0 with value: 2.707751667802723.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:21:41,061] Trial 1 finished with value: 1.706655018832003 and parameters: {'iterations': 721, 'depth': 9, 'learning_rate': 0.0145747585266011, 'l2_leaf_reg': 2.971901726718451}. Best is trial 1 with value: 1.706655018832003.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:21:51,867] Trial 2 finished with value: 1.4106141872485956 and paramet

Best Tuned Parameters Found for BlendProperty9.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty9...
  Fold 1 MAPE: 166.08%
  Fold 2 MAPE: 62.89%
  Fold 3 MAPE: 50.39%
  Fold 4 MAPE: 336.08%
  Fold 5 MAPE: 96.53%

Robust CV Score for BlendProperty9: 142.40% (+/- 104.85%)
Phase 3: Training final model on all data for BlendProperty9...


[I 2025-07-17 00:23:31,916] A new study created in memory with name: no-name-30032349-73fe-4078-87e0-aaf2f263cba4



Processing Target: BlendProperty10
Phase 1: Tuning hyperparameters for BlendProperty10...


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:23:32,459] Trial 0 finished with value: 0.5256265185228898 and parameters: {'iterations': 648, 'depth': 5, 'learning_rate': 0.014402030940869445, 'l2_leaf_reg': 8.168659842967397}. Best is trial 0 with value: 0.5256265185228898.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:23:36,049] Trial 1 finished with value: 0.47196851778026255 and parameters: {'iterations': 1007, 'depth': 8, 'learning_rate': 0.02588446787590826, 'l2_leaf_reg': 2.9344882293176315}. Best is trial 1 with value: 0.47196851778026255.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1.0, 10.0),
[I 2025-07-17 00:23:37,075] Trial 2 finished with value: 0.3327859009468421 and

Best Tuned Parameters Found for BlendProperty10.
Phase 2: Evaluating with 5-Fold Cross-Validation for BlendProperty10...
  Fold 1 MAPE: 26.94%
  Fold 2 MAPE: 47.26%
  Fold 3 MAPE: 36.63%
  Fold 4 MAPE: 25.66%
  Fold 5 MAPE: 60.05%

Robust CV Score for BlendProperty10: 39.31% (+/- 12.96%)
Phase 3: Training final model on all data for BlendProperty10...


--- All models trained. Final CV Scores ---
                  mean_mape    std_mape
BlendProperty1    46.719776   19.308304
BlendProperty2    37.566734   10.535834
BlendProperty3    89.000769   39.474215
BlendProperty4    42.979928   18.416754
BlendProperty5    15.226076    5.382736
BlendProperty6    37.524390   22.099206
BlendProperty7    95.147646   64.773017
BlendProperty8    63.867631   32.458283
BlendProperty9   142.395657  104.847614
BlendProperty10   39.309656   12.959300


In [54]:
import lightgbm as lgb
from sklearn.model_selection import KFold
import gc

# Store the final, production-ready models
final_models_for_target = {}
# Store the overall CV scores for each model type
overall_cv_scores = {}

for target in targets:
    print(f"\n{'='*20}\nProcessing Target: {target}\n{'='*20}")
    
    X_selected = x_base[top_features[target]]
    y_selected = y[target]

    # 1. Define the models you want to ensemble
    models_to_train = {
        'catboost': CatBoostRegressor(
            # Using pre-tuned or robust parameters
            objective='MAE',
            iterations=2000,
            learning_rate=0.05,
            depth=6,
            l2_leaf_reg=5.0,
            verbose=False
        ),
        'lightgbm': lgb.LGBMRegressor(
            # Using common robust parameters
            objective='mae',
            n_estimators=2000,
            learning_rate=0.05,
            num_leaves=31,
            reg_lambda=5.0,
            random_state=42,
            n_jobs=-1
        )
    }

    final_models_for_target[target] = {}
    
    # 2. Loop through each model type, train it with CV
    for model_name, model_instance in models_to_train.items():
        print(f"\n--- Training {model_name} for {target} ---")
        
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_mape_scores = []

        for fold, (train_index, val_index) in enumerate(kf.split(X_selected, y_selected)):
            X_train, X_val = X_selected.iloc[train_index], X_selected.iloc[val_index]
            y_train, y_val = y_selected.iloc[train_index], y_selected.iloc[val_index]
            
            # For LightGBM, we need a callback for early stopping
            if 'lightgbm' in model_name:
                model_instance.fit(X_train, y_train,
                                   eval_set=[(X_val, y_val)],
                                   callbacks=[lgb.early_stopping(50, verbose=False)])
            else: # For CatBoost
                model_instance.fit(X_train, y_train,
                                   eval_set=[(X_val, y_val)],
                                   early_stopping_rounds=50, verbose=False)

            y_pred = model_instance.predict(X_val)
            
            # Correctly inverse transform for MAPE calculation
            y_val_full = pd.DataFrame(np.zeros((len(y_val), len(targets))), columns=targets, index=X_val.index)
            y_pred_full = pd.DataFrame(np.zeros((len(y_pred), len(targets))), columns=targets, index=X_val.index)
            y_val_full[target] = y_val
            y_pred_full[target] = y_pred
            y_val_inv = pt.inverse_transform(y_val_full)[:, targets.index(target)]
            y_pred_inv = pt.inverse_transform(y_pred_full)[:, targets.index(target)]
            
            mask = y_val_inv != 0
            mape = np.mean(np.abs((y_val_inv[mask] - y_pred_inv[mask]) / y_val_inv[mask])) * 100 if mask.any() else 0.0
            fold_mape_scores.append(mape)

        # Report the robust CV score for this specific model
        mean_mape = np.mean(fold_mape_scores)
        std_mape = np.std(fold_mape_scores)
        if model_name not in overall_cv_scores: overall_cv_scores[model_name] = {}
        overall_cv_scores[model_name][target] = mean_mape
        print(f"Robust CV Score for {model_name}: {mean_mape:.2f}% (+/- {std_mape:.2f}%)")
        
        # 3. Train final model on ALL data and store it
        print(f"Training final {model_name} on all data...")
        final_model = model_instance.fit(X_selected, y_selected)
        final_models_for_target[target][model_name] = final_model
        
        gc.collect()

print("\n\n--- All models trained. Final CV Scores by Model Type ---")
print(pd.DataFrame(overall_cv_scores))



Processing Target: BlendProperty1

--- Training catboost for BlendProperty1 ---
Robust CV Score for catboost: 336.53% (+/- 544.21%)
Training final catboost on all data...

--- Training lightgbm for BlendProperty1 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5329
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 25
[LightGBM] [Info] Start training from score 0.016151
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 25
[LightGBM] [Info] Start training from score 0.035604
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000395 secon

In [2]:
import optuna
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import gc

# --- Optuna Objective Function for CatBoost (defined once) ---
def objective_catboost(trial, X, y):
    """Objective function to find the best CatBoost parameters."""
    params = {
        'objective': 'MAE',
        'iterations': trial.suggest_int('iterations', 1000, 4000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.05),
        'depth': trial.suggest_int('depth', 5, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.5, 30.0),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
        'verbose': False
    }
    
    # Using a single validation split for speed during tuning
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    model = cb.CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50)
    
    y_pred = model.predict(X_valid)
    mask = y_valid != 0
    mape = np.mean(np.abs((y_valid[mask] - y_pred[mask]) / y_valid[mask])) * 100 if mask.any() else 0.0
    return mape

# --- Main Training and Evaluation Loop ---
final_models_for_target = {}
overall_cv_scores = {}

for target in targets:
    print(f"\n{'='*20}\nProcessing Target: {target}\n{'='*20}")
    
    # 🌟 CRITICAL: Use ALL features and the ORIGINAL target variable
    X_selected = x_base 
    y_selected = data[target]

    final_models_for_target[target] = {}

    # --- 1. Tune and Train CatBoost (The Winning Model) ---
    print(f"\n--- Tuning CatBoost with Optuna for {target} ---")
    study_catboost = optuna.create_study(direction='minimize')
    study_catboost.optimize(lambda trial: objective_catboost(trial, X_selected, y_selected), n_trials=50)
    best_params_catboost = study_catboost.best_trial.params
    print(f"Best parameters found for CatBoost.")
    
    # Now get a robust score for the tuned CatBoost model using K-Fold CV
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    catboost_fold_scores = []
    for fold, (train_index, val_index) in enumerate(kf.split(X_selected, y_selected)):
        X_train, X_val = X_selected.iloc[train_index], X_selected.iloc[val_index]
        y_train, y_val = y_selected.iloc[train_index], y_selected.iloc[val_index]

        model = cb.CatBoostRegressor(objective='MAE', **best_params_catboost, verbose=False)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50)
        y_pred = model.predict(X_val)
        mask = y_val != 0
        mape = np.mean(np.abs((y_val[mask] - y_pred[mask]) / y_val[mask])) * 100 if mask.any() else 0.0
        catboost_fold_scores.append(mape)

    mean_mape_cb = np.mean(catboost_fold_scores)
    std_mape_cb = np.std(catboost_fold_scores)
    if 'catboost' not in overall_cv_scores: overall_cv_scores['catboost'] = {}
    overall_cv_scores['catboost'][target] = mean_mape_cb
    print(f"✅ Robust CV Score for CatBoost: {mean_mape_cb:.2f}% (+/- {std_mape_cb:.2f}%)")
    
    # Train final CatBoost model on all data
    final_catboost_model = cb.CatBoostRegressor(objective='MAE', **best_params_catboost, verbose=False).fit(X_selected, y_selected)
    final_models_for_target[target]['catboost'] = final_catboost_model

    # --- 2. Train LightGBM (with Robust Parameters) ---
    print(f"\n--- Training LightGBM with robust parameters for {target} ---")
    robust_params_lightgbm = {
        'objective': 'mae', 'n_estimators': 2000, 'learning_rate': 0.05,
        'num_leaves': 20, 'reg_lambda': 2.0, 'reg_alpha': 2.0,
        'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 1,
        'random_state': 42, 'n_jobs': -1
    }
    
    lightgbm_fold_scores = []
    for fold, (train_index, val_index) in enumerate(kf.split(X_selected, y_selected)):
        X_train, X_val = X_selected.iloc[train_index], X_selected.iloc[val_index]
        y_train, y_val = y_selected.iloc[train_index], y_selected.iloc[val_index]
        
        model = lgb.LGBMRegressor(**robust_params_lightgbm)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
        y_pred = model.predict(X_val)
        mask = y_val != 0
        mape = np.mean(np.abs((y_val[mask] - y_pred[mask]) / y_val[mask])) * 100 if mask.any() else 0.0
        lightgbm_fold_scores.append(mape)

    mean_mape_lgb = np.mean(lightgbm_fold_scores)
    std_mape_lgb = np.std(lightgbm_fold_scores)
    if 'lightgbm' not in overall_cv_scores: overall_cv_scores['lightgbm'] = {}
    overall_cv_scores['lightgbm'][target] = mean_mape_lgb
    print(f"✅ Robust CV Score for LightGBM: {mean_mape_lgb:.2f}% (+/- {std_mape_lgb:.2f}%)")
    
    # Train final LightGBM model on all data
    final_lightgbm_model = lgb.LGBMRegressor(**robust_params_lightgbm).fit(X_selected, y_selected)
    final_models_for_target[target]['lightgbm'] = final_lightgbm_model
    
    gc.collect()

print("\n\n--- All models trained. Final CV Scores by Model Type ---")
print(pd.DataFrame(overall_cv_scores))



  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'targets' is not defined