In [1]:
import warnings
from optuna.exceptions import ExperimentalWarning

# Suppress warnings of category optuna.exceptions.ExperimentalWarning for the whole session.
warnings.filterwarnings("ignore", category=ExperimentalWarning)

In [2]:
import optuna
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import os
from optuna.exceptions import TrialPruned
import json
import time

import optuna
from catboost import CatBoostRegressor, Pool, cv
from optuna.integration import CatBoostPruningCallback
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
def compute_volume_weighted_component_features(X):
    """
    Build the 50 interaction features ``W{j}P{k}`` for components j = 1..5 and
    properties k = 1..10.

    Each feature is the element-wise product
    ``Component{j}_fraction * Component{j}_Property{k}``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``Component{j}_fraction`` and ``Component{j}_Property{k}``
        for every j and k listed above.

    Returns
    -------
    pandas.DataFrame
        One column per feature, ordered W1P1..W1P10, W2P1.., ..W5P10.
    """
    component_ids = range(1, 6)
    property_ids = range(1, 11)
    weighted = {
        f'W{j}P{k}': X[f'Component{j}_fraction'] * X[f'Component{j}_Property{k}']
        for j in component_ids
        for k in property_ids
    }
    return pd.DataFrame(weighted)

In [4]:
def get_data(target, threshold=0.1, get_test=False):
    """
    Load the train/validation splits for one target, append the
    volume-weighted features, and keep only the columns selected by the
    target's feature-importance file.

    Parameters
    ----------
    target : str
        Target name used to build the file names ``{target}_X.csv``,
        ``{target}_y.csv`` (under ``BASE_PATH/train`` and ``BASE_PATH/val``)
        and the feature-importance file ``{target}.csv``.
    threshold : float, default 0.1
        A feature is kept only when its ``importance`` value is strictly
        greater than this threshold.
    get_test : bool, default False
        When True the test set is also read, transformed the same way, and
        returned as a fifth element. When False the test file is not read
        at all.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)`` or, when ``get_test`` is True,
        ``(X_train, y_train, X_val, y_val, X_test)``.
    """
    def _with_weighted_features(frame):
        # Append the W{j}P{k} columns to the raw columns of `frame`.
        return pd.concat([frame, compute_volume_weighted_component_features(frame)], axis=1)

    X_train = _with_weighted_features(pd.read_csv(f"{BASE_PATH}/train/{target}_X.csv"))
    y_train = pd.read_csv(f"{BASE_PATH}/train/{target}_y.csv")
    X_val = _with_weighted_features(pd.read_csv(f"{BASE_PATH}/val/{target}_X.csv"))
    y_val = pd.read_csv(f"{BASE_PATH}/val/{target}_y.csv")

    importance = pd.read_csv(os.path.join("/pscratch/sd/r/ritesh11/temp_dir/feature_importance", f"{target}.csv"))
    # Feature names are taken from the first column of the importance file.
    cols = importance[importance["importance"] > threshold].iloc[:, 0].tolist()

    X_train = X_train[cols]
    X_val = X_val[cols]
    if not get_test:
        # objective() calls this once per trial and never uses the test set,
        # so skip reading and transforming test.csv in that case.
        return X_train, y_train, X_val, y_val

    X_test = _with_weighted_features(pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/test.csv"))
    return X_train, y_train, X_val, y_val, X_test[cols]

In [5]:
# Names of the ten regression targets: BlendProperty1 .. BlendProperty10.
TARGETS = ["BlendProperty" + str(idx) for idx in range(1, 11)]
# Root directory holding the train/ and val/ CSV splits read by get_data().
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset/updated"
# Directory where the best_params_{target}.json files are stored.
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/CB_models"
# Trial budget per target; referenced as n_trials by the (commented-out) study.optimize call.
N_TRIALS = 200

In [6]:
# CatBoost settings that are not tuned; objective() merges them with the
# parameters sampled for each trial.
fixed_params = dict(
    loss_function="MAPE",
    eval_metric="MAPE",
    verbose=False,
    random_seed=42,
    early_stopping_rounds=50,
    iterations=2000,
)

In [7]:
from sklearn.model_selection import KFold

def objective(trial, target):
    """
    Optuna objective: mean 5-fold cross-validated MAPE of a CatBoostRegressor
    for ``target``.

    The feature-importance threshold is sampled together with the CatBoost
    hyperparameters. The train and validation splits returned by
    ``get_data`` are concatenated and re-split by a shuffled KFold with a
    fixed random_state.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample parameters and to store the ``best_iter`` user attribute.
    target : str
        Target name forwarded to ``get_data``.

    Returns
    -------
    float
        Mean of the per-fold MAPE scores.

    Notes
    -----
    Each fold's held-out part is used both as CatBoost's ``eval_set`` (with
    ``use_best_model=True``) and for the reported score.
    """
    threshold = trial.suggest_categorical("threshold", [0.1, 0.001, 0.0001, 0.0])

    train_X, train_y, holdout_X, holdout_y = get_data(target, threshold)
    all_X = pd.concat([train_X, holdout_X], axis=0).reset_index(drop=True)
    all_y = pd.concat([train_y, holdout_y], axis=0).reset_index(drop=True)

    # Sample the search space one parameter at a time, in a fixed order.
    sampled = {}
    sampled["border_count"] = trial.suggest_int("border_count", 48, 255)
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-3, 0.2, log=True)
    sampled["depth"] = trial.suggest_int("depth", 4, 10)
    sampled["l2_leaf_reg"] = trial.suggest_float("l2_leaf_reg", 1.0, 100.0, log=True)
    sampled["random_strength"] = trial.suggest_float("random_strength", 0.1, 10.0, log=True)
    sampled["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.0, 1.0)
    sampled["sampling_frequency"] = trial.suggest_categorical(
        "sampling_frequency", ["PerTree", "PerTreeLevel"]
    )
    sampled["rsm"] = trial.suggest_float("rsm", 0.6, 1.0)
    sampled["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 2, 100)
    sampled["grow_policy"] = trial.suggest_categorical(
        "grow_policy", ["SymmetricTree", "Depthwise"]
    )

    # One estimator object, re-fitted on every fold.
    model = CatBoostRegressor(**sampled, **fixed_params)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_mapes = []
    fold_best_iters = []

    for fit_rows, eval_rows in splitter.split(all_X):
        fold_X_fit = all_X.iloc[fit_rows]
        fold_X_eval = all_X.iloc[eval_rows]
        fold_y_fit = all_y.iloc[fit_rows]
        fold_y_eval = all_y.iloc[eval_rows]

        model.fit(fold_X_fit, fold_y_fit, eval_set=(fold_X_eval, fold_y_eval), use_best_model=True)
        fold_preds = model.predict(fold_X_eval)
        fold_mapes.append(mean_absolute_percentage_error(fold_y_eval, fold_preds))
        fold_best_iters.append(model.get_best_iteration())

    # Integer-truncated mean of the per-fold best iterations.
    # NOTE(review): get_best_iteration() is presumably a zero-based index;
    # confirm before reusing this value as an `iterations` count.
    trial.set_user_attr("best_iter", int(np.mean(fold_best_iters)))
    return np.mean(fold_mapes)

In [8]:
def log_callback(study, trial):
    """
    Optuna callback that prints one progress line for every 50th trial.

    Nothing is printed when ``trial.number`` is not a multiple of 50 or when
    the trial has no value. The line shows the trial's value and the study's
    current best value, both formatted to four decimals.
    """
    if trial.number % 50 != 0 or trial.value is None:
        return
    print(f"Trial {trial.number}: MAPE = {trial.value:.4f}: BEST: {study.best_trial.value:.4f}")

In [9]:
# Raise Optuna's log threshold to WARNING; progress is reported through log_callback instead.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [10]:
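# NOTE: hyperparameter search, currently commented out. The cells below read the
# best_params_{target}.json files that this loop writes to model_dir, so those files
# must already exist for every target (TARGETS[4:] covers only BlendProperty5-10).
# The pruner created below is never passed to create_study, so it has no effect.
# for target in TARGETS[4:]: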

#     pruner = optuna.pruners.MedianPruner(n_startup_trials=50, n_warmup_steps=10)
#     study = optuna.create_study(direction="minimize")
#     study.optimize(lambda trial: objective(trial,target), n_trials=N_TRIALS, show_progress_bar=True, n_jobs=12)

#     print(f"\nBest MAPE for {target}: {study.best_value:.4f}")
#     print(f"Best params for {target}:\n{study.best_params}\n")
    
#     best_iter = study.best_trial.user_attrs["best_iter"]

#     # Save full config
#     complete_params = {**study.best_params, **fixed_params}
#     complete_params["iterations"] = best_iter 
    
#     with open(os.path.join(model_dir, f"best_params_{target}.json"), "w") as f:
#         json.dump(complete_params, f, indent=2)
        
#     with open(os.path.join(model_dir, f"best_mape_{target}.txt"), "w") as f:
#         f.write(f"{study.best_value:.6f}\n")

In [11]:
from tqdm import tqdm

In [42]:
# Maps each target name to its test-set predictions; filled by the final-fit loop.
res = {}

In [48]:
# Final fit per target: load the tuned parameters, train on train + validation
# rows together for a fixed 5000 iterations, and store the test predictions in `res`.
for target in tqdm(TARGETS):
    with open(os.path.join(model_dir, f"best_params_{target}.json"), "r") as f:
        params = json.load(f)
    # 'threshold' selects the feature subset in get_data; it is removed before
    # the remaining entries are passed to CatBoostRegressor.
    threshold = params.pop('threshold')
    X_train, y_train, X_val, y_val, X_test = get_data(target,threshold,get_test=True)
    X = pd.concat([X_train,X_val], axis=0)
    Y = pd.concat([y_train,y_val], axis=0)
    params['iterations'] = 5000
    # No eval_set is passed to fit(), so early stopping is dropped. The default
    # keeps this from raising KeyError when the saved file lacks the key.
    params.pop('early_stopping_rounds', None)
    model = CatBoostRegressor(**params)
    train_pool = Pool(X, Y)
    model.fit(train_pool)

    # X_val is part of the data the model was just trained on, so this score
    # is in-sample; it is a sanity check, not a validation estimate.
    preds = model.predict(X_val)
    print(f"{target} in-sample MAPE: {mean_absolute_percentage_error(y_val, preds)}")
    preds = model.predict(X_test)
    res[target] = preds

 10%|█         | 1/10 [00:31<04:47, 31.99s/it]

0.13117428185430433


 20%|██        | 2/10 [01:04<04:18, 32.27s/it]

0.05391009442619021


 30%|███       | 3/10 [01:43<04:06, 35.18s/it]

0.04648343134969676


 40%|████      | 4/10 [02:27<03:52, 38.78s/it]

0.14469604220171295


 50%|█████     | 5/10 [03:26<03:50, 46.20s/it]

0.04942053582906186


 60%|██████    | 6/10 [03:59<02:47, 41.75s/it]

0.05288787144973862


 70%|███████   | 7/10 [04:56<02:20, 46.77s/it]

0.0700422803656802


 80%|████████  | 8/10 [05:31<01:25, 42.81s/it]

0.12734515020880255


 90%|█████████ | 9/10 [06:17<00:43, 43.96s/it]

0.148469942979477


100%|██████████| 10/10 [08:25<00:00, 50.57s/it]

0.07099013188124545





In [31]:
# Comparison run with 3000 iterations. Predictions go into their own dict so
# that running the notebook top to bottom does not overwrite `res`, which the
# submission cells below read.
res_3000 = {}
for target in tqdm(TARGETS):
    with open(os.path.join(model_dir, f"best_params_{target}.json"), "r") as f:
        params = json.load(f)
    # 'threshold' selects the feature subset in get_data; it is removed before
    # the remaining entries are passed to CatBoostRegressor.
    threshold = params.pop('threshold')
    X_train, y_train, X_val, y_val, X_test = get_data(target,threshold,get_test=True)
    X = pd.concat([X_train,X_val], axis=0)
    Y = pd.concat([y_train,y_val], axis=0)
    params['iterations'] = 3000
    # No eval_set is passed to fit(), so early stopping is dropped. The default
    # keeps this from raising KeyError when the saved file lacks the key.
    params.pop('early_stopping_rounds', None)
    model = CatBoostRegressor(**params)
    train_pool = Pool(X, Y)
    model.fit(train_pool)

    # X_val is part of the data the model was just trained on, so this score
    # is in-sample; it is a sanity check, not a validation estimate.
    preds = model.predict(X_val)
    print(f"{target} in-sample MAPE (3000 iterations): {mean_absolute_percentage_error(y_val, preds)}")
    preds = model.predict(X_test)
    res_3000[target] = preds

 10%|█         | 1/10 [00:10<01:31, 10.12s/it]

0.21695650573432648


 20%|██        | 2/10 [00:21<01:25, 10.67s/it]

0.09014555536723307


 30%|███       | 3/10 [00:34<01:22, 11.81s/it]

0.11897244315791008


 40%|████      | 4/10 [00:47<01:13, 12.19s/it]

0.2789384607380301


 50%|█████     | 5/10 [00:59<01:01, 12.21s/it]

0.07655426675538678


 60%|██████    | 6/10 [01:07<00:43, 10.90s/it]

0.0858472082384661


 70%|███████   | 7/10 [01:22<00:36, 12.22s/it]

0.13559309037393555


 80%|████████  | 8/10 [01:31<00:22, 11.06s/it]

0.24921039049636892


 90%|█████████ | 9/10 [01:43<00:11, 11.54s/it]

0.32156191606925083


100%|██████████| 10/10 [02:16<00:00, 13.69s/it]

0.1424974491088939





In [49]:
# Re-read the raw test file; only its row index is used below, to build the ID column.
test_df = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/test.csv")

In [50]:
# Submission table: one column per target from `res`, preceded by an ID column
# equal to the test frame's index plus one.
df = pd.DataFrame(data=res)
df.insert(loc=0, column='ID', value=test_df.index + 1)

In [51]:
# Write the submission table to Catboost.csv (relative path), without the DataFrame index.
df.to_csv("Catboost.csv", index=False)

In [52]:
# Show the submission table as the cell output.
df

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,0.114163,0.193414,0.638629,0.566209,0.346522,0.690940,0.632408,0.385268,-0.260213,0.290650
1,2,-0.765918,-0.625692,-1.090108,0.138452,-0.730278,-0.085409,-0.989583,-1.073813,-0.685128,0.046468
2,3,1.703392,1.128573,1.150032,0.946316,2.213660,1.890347,1.099417,2.113906,0.407142,2.143231
3,4,-0.402619,0.365530,0.676963,-0.232582,1.910798,-0.427672,0.692457,0.971075,0.796019,-0.994192
4,5,0.154717,-1.124593,1.025409,0.412171,2.177352,0.253143,1.037057,0.037106,-0.493084,0.949361
...,...,...,...,...,...,...,...,...,...,...,...
495,496,0.267798,-0.937580,1.090756,-0.213249,-0.238756,-0.673350,1.164467,-0.408846,-1.278945,-0.309958
496,497,-2.146254,-1.230730,-0.931392,-2.124117,-0.627643,-2.341220,-1.036523,-1.984258,-1.240633,-1.246106
497,498,1.954243,2.195457,0.186951,1.159278,0.231401,0.553068,0.209754,0.886467,0.333569,0.419324
498,499,-0.075072,0.737974,1.449392,-0.933701,-0.885327,0.048710,1.599327,0.525086,0.164130,1.215143
