In [None]:
 from google.colab import drive
# Mount Google Drive at /content/drive; the data-loading cell below reads its
# CSV files from /content/drive/MyDrive/Colab_notebooks.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install packages that may be missing from the runtime before importing them
!pip install lightgbm

# Data preprocessing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold, RandomizedSearchCV

# Evaluation metrics used by the tuning objectives and the out-of-fold loops
from sklearn.metrics import r2_score, root_mean_squared_error

# Algorithms for model training & testing
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb



In [None]:
# Import data:
# Both CSVs are read from the mounted Drive folder. `test_data_n` keeps the raw
# test frame, including its `id` column, which the submission cell reads later.
train_data = pd.read_csv("/content/drive/MyDrive/Colab_notebooks/train.csv")
test_data_n = pd.read_csv("/content/drive/MyDrive/Colab_notebooks/test.csv")

## **Data Preprocessing**

In [None]:
# Drop the `id` column so it is not used as a model feature.
# `train_data` is overwritten here, so on a second execution it no longer has
# an `id` column; errors="ignore" keeps the cell re-runnable instead of raising
# KeyError. The raw test frame stays available as `test_data_n`.
train_data = train_data.drop(columns=["id"], errors="ignore")
test_data = test_data_n.drop(columns=["id"])

In [None]:
# Show the first rows of the train and test frames, one after the other
display(train_data.head(), test_data.head())

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         188533 non-null  object
 1   model         188533 non-null  object
 2   model_year    188533 non-null  int64 
 3   milage        188533 non-null  int64 
 4   fuel_type     183450 non-null  object
 5   engine        188533 non-null  object
 6   transmission  188533 non-null  object
 7   ext_col       188533 non-null  object
 8   int_col       188533 non-null  object
 9   accident      186081 non-null  object
 10  clean_title   167114 non-null  object
 11  price         188533 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 17.3+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         125690 non-null  object
 1   model         125690 non-null  object
 2   model_year    125690 non-null  int64 
 3   milage        125690 non-null  int64 
 4   fuel_type     122307 non-null  object
 5   engine        125690 non-null  object
 6   transmission  125690 non-null  object
 7   ext_col       125690 non-null  object
 8   int_col       125690 non-null  object
 9   accident      124058 non-null  object
 10  clean_title   111451 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.5+ MB


None

In [None]:
# Count missing values per column in the train and test data
null_train = train_data.isna().sum()
null_test = test_data.isna().sum()

# Only list the columns that actually contain missing values
display(null_train[null_train.gt(0)], null_test[null_test.gt(0)])

Unnamed: 0,0
fuel_type,5083
accident,2452
clean_title,21419


Unnamed: 0,0
fuel_type,3383
accident,1632
clean_title,14239


In [None]:
# Inspect the distinct values (train first, then test) of every column that has
# missing entries, to decide how each one should be filled
for col in ('fuel_type', 'accident', 'clean_title'):
    display(train_data[col].unique())
    display(test_data[col].unique())

array(['Gasoline', 'E85 Flex Fuel', nan, 'Hybrid', 'Diesel',
       'Plug-In Hybrid', '–', 'not supported'], dtype=object)

array(['Gasoline', 'Hybrid', 'Diesel', 'E85 Flex Fuel', nan,
       'Plug-In Hybrid', '–', 'not supported'], dtype=object)

array(['None reported', 'At least 1 accident or damage reported', nan],
      dtype=object)

array(['None reported', 'At least 1 accident or damage reported', nan],
      dtype=object)

array(['Yes', nan], dtype=object)

array(['Yes', nan], dtype=object)

In [None]:
# Replacement used for missing values in each column:
#   fuel_type / accident -> 'Unknown'
#   clean_title          -> 'No' (the only observed non-null value is 'Yes')
fill_values = {'fuel_type': 'Unknown', 'accident': 'Unknown', 'clean_title': 'No'}

# Apply the same fills to both frames; other placeholder values already present
# in the data (e.g. '–', 'not supported') are left untouched.
for frame in (train_data, test_data):
    for column, value in fill_values.items():
        frame[column] = frame[column].fillna(value)

Helper functions that reduce the number of unique values in specific columns.

In [None]:
# engine: 'Gasoline', 'E85 Flex Fuel', 'Hybrid', 'Diesel','Plug-In Hybrid'
def engine_fuel(engine):
    """Map a free-text engine description to a coarse fuel category.

    Args:
        engine: Engine description, e.g.
            '172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel'.

    Returns:
        'Gasoline', 'E85 Flex Fuel', 'Plug-In Hybrid', 'Hybrid' or 'Diesel',
        or the string 'None' when no fuel keyword is present or the input is
        not a string (e.g. NaN).
    """
    if not isinstance(engine, str):
        return 'None'

    # 'Gasoline' keeps priority, as before.
    if 'Gasoline' in engine:
        return 'Gasoline'
    # Engine text such as '... Flex Fuel Capability' omits the 'E85' prefix,
    # so match the shorter phrase (which also covers 'E85 Flex Fuel').
    if 'Flex Fuel' in engine:
        return 'E85 Flex Fuel'
    # Must be tested before plain 'Hybrid': 'Hybrid' is a substring of
    # 'Plug-In Hybrid', so the other order makes this branch unreachable.
    if 'Plug-In Hybrid' in engine:
        return 'Plug-In Hybrid'
    if 'Hybrid' in engine:
        return 'Hybrid'
    if 'Diesel' in engine:
        return 'Diesel'
    return 'None'

# transmission: Manual, Automatic, Continuously Variable, Dual-Clutch,...
def transmission_type(transmission):
    """Collapse a free-text transmission description into a coarse category.

    Args:
        transmission: Transmission description, e.g. '7-Speed A/T'.

    Returns:
        One of 'CVT', 'DCT', 'Automatic', 'Manual', 'Fixed Gear', 'Variable',
        'Single-speed' or 'Other'.
    """
    import re  # local import keeps the helper self-contained

    # str() lets non-string input (e.g. NaN) fall through to 'Other'.
    t = str(transmission).lower()

    # Specific technologies first: descriptions such as 'Automatic CVT' or
    # '7-Speed DCT Automatic' would otherwise be swallowed by 'Automatic'.
    if 'cvt' in t:
        return 'CVT'
    if 'dct' in t:
        return 'DCT'

    # The abbreviations 'at' / 'mt' are matched as whole words only; a bare
    # substring test would also hit unrelated words (e.g. 'ratio').
    if 'a/t' in t or 'automatic' in t or re.search(r'\bat\b', t):
        return 'Automatic'
    if 'm/t' in t or 'manual' in t or re.search(r'\bmt\b', t):
        return 'Manual'

    if 'fixed gear' in t:
        return 'Fixed Gear'
    if 'variable' in t:
        return 'Variable'
    if 'single-speed' in t or '1-speed' in t:
        return 'Single-speed'
    return 'Other'

In [None]:
# Run both helpers on the train and test frames:
#   - `engine_new` is a new column derived from the raw `engine` text
#   - `transmission` is replaced in place by its coarse category
for frame in (train_data, test_data):
    frame['engine_new'] = frame['engine'].apply(engine_fuel)
    frame['transmission'] = frame['transmission'].apply(transmission_type)

LightGBM will be trained on ordinal-encoded data, while CatBoost will be given the raw (unencoded) categorical columns.

In [None]:
# Target vector
y = train_data['price']

# Two independent feature frames (everything except the target): one will be
# ordinal-encoded for LightGBM, the other stays raw for CatBoost
X_lgbm = train_data.drop(columns='price')
X_cat = train_data.drop(columns='price')

# Matching independent copies of the test features
test_data_lgbm, test_data_cat = test_data.copy(), test_data.copy()

In [None]:
# Preview the feature frame (includes the derived `engine_new` column; not yet encoded)
X_lgbm

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,engine_new
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,Automatic,Yellow,Gray,None reported,Yes,Gasoline
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,Automatic,Silver,Beige,At least 1 accident or damage reported,Yes,Gasoline
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,Automatic,Blue,Gray,None reported,Yes,
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Other,Black,Black,None reported,Yes,Gasoline
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Automatic,Black,Beige,None reported,Yes,Gasoline
...,...,...,...,...,...,...,...,...,...,...,...,...
188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,Other,White,Beige,None reported,Yes,Gasoline
188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,Automatic,White,Black,At least 1 accident or damage reported,Yes,Gasoline
188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,469.0HP 4.0L 8 Cylinder Engine Gasoline Fuel,Automatic,White,Black,None reported,Yes,Gasoline
188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,3.0L,Automatic,Daytona Gray Pearl Effect,Black,None reported,No,


In [None]:
# Split the column names of the train data by dtype:
# int64/float64 -> numerical, object -> categorical
num_cols, cat_cols = [], []
for name, dtype in train_data.dtypes.items():
    if dtype in ['int64', 'float64']:
        num_cols.append(name)
    elif dtype in ['object']:
        cat_cols.append(name)

print("Num Cols: ", num_cols)
print("Cat Cols: ", cat_cols)

Num Cols:  ['model_year', 'milage', 'price']
Cat Cols:  ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title', 'engine_new']


In [None]:
# Ordinal-encode the categorical features for the LightGBM frames.
# The encoder is fitted on the training data only; categories that appear only
# in the test data are encoded as -1.
oe = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
oe.fit(X_lgbm[cat_cols])

cat_enc_tr = oe.transform(X_lgbm[cat_cols])
cat_enc_te = oe.transform(test_data_lgbm[cat_cols])

In [None]:
# Wrap the encoded arrays in DataFrames labelled with the categorical column names ...
cat_enc_tr = pd.DataFrame(cat_enc_tr, columns=cat_cols)
cat_enc_te = pd.DataFrame(cat_enc_te, columns=cat_cols)

# ... and overwrite the string columns of the LightGBM frames with their codes
X_lgbm[cat_cols] = cat_enc_tr
test_data_lgbm[cat_cols] = cat_enc_te

In [None]:
# Check the encoded train and test frames
display(X_lgbm.head(), test_data_lgbm.head())

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,engine_new
0,31.0,495.0,2007,213000,2.0,116.0,0.0,312.0,71.0,1.0,1.0,1.0
1,28.0,930.0,2002,143250,2.0,366.0,0.0,263.0,10.0,0.0,1.0,1.0
2,9.0,1575.0,2002,136731,1.0,640.0,0.0,38.0,71.0,1.0,1.0,3.0
3,16.0,758.0,2017,19500,2.0,863.0,4.0,29.0,14.0,1.0,1.0,1.0
4,36.0,1077.0,2021,7388,2.0,259.0,0.0,29.0,10.0,1.0,1.0,1.0


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,engine_new
0,26.0,1390.0,2015,98000,2.0,325.0,0.0,304.0,10.0,1.0,1.0,1.0
1,26.0,1377.0,2020,9142,3.0,787.0,0.0,263.0,14.0,1.0,1.0,1.0
2,14.0,636.0,2022,28121,2.0,540.0,0.0,304.0,57.0,1.0,0.0,3.0
3,3.0,182.0,2016,61258,2.0,192.0,0.0,261.0,14.0,1.0,0.0,3.0
4,3.0,181.0,2018,59000,2.0,364.0,0.0,128.0,14.0,1.0,1.0,1.0


## **Model Training & Evaluation**

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [None]:
import optuna

# Define Optuna objective functions
def objective_lgbm(trial):
    """Optuna objective: mean 5-fold validation RMSE of a LightGBM regressor.

    Reads the module-level `X_lgbm` (ordinal-encoded features) and `y` (price).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean RMSE over the 5 validation folds (lower is better).

    Raises:
        optuna.TrialPruned: when the study's pruner stops the trial early,
            based on the running mean RMSE reported after each fold.
    """
    lgbm_params = {
        "objective": "regression",
        "metric": "rmse",
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 31, 511),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # LightGBM only applies `subsample` (bagging_fraction) when the bagging
        # frequency is non-zero; with the default of 0 the value tuned above
        # has no effect. Sampled as a single-choice categorical (like
        # `boosting_type` below) so it is also part of `study.best_params`.
        "subsample_freq": trial.suggest_categorical("subsample_freq", [1]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 1.0, log=True),
        "random_state": 42,
        "early_stopping_rounds": 200,
        "n_jobs": -1,
        "verbosity": -1,
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt"]),
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    last_fold = cv.get_n_splits() - 1
    rmse_scores = []

    for fold, (tr_idx, val_idx) in enumerate(cv.split(X_lgbm, y)):
        X_tr, X_val = X_lgbm.iloc[tr_idx], X_lgbm.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        model = LGBMRegressor(**lgbm_params)
        model.fit(X_tr, y_tr,
                  eval_set=[(X_val, y_val)],
                  eval_metric="rmse")

        preds = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, preds)
        rmse_scores.append(rmse)

        # Without intermediate reports the study's pruner never has anything to
        # act on. The folds are identical for every trial (fixed random_state),
        # so the running mean is comparable across trials at each step.
        trial.report(float(np.mean(rmse_scores)), step=fold)
        # Do not prune after the last fold: the full score is already paid for.
        if fold < last_fold and trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(rmse_scores)


# LGBM Optuna study
print("Running Optuna study for LightGBM...")
pruner = optuna.pruners.SuccessiveHalvingPruner()
# Seed the (default) TPE sampler so the sequence of suggested hyperparameters
# does not change from one kernel restart to the next.
sampler = optuna.samplers.TPESampler(seed=42)
study_lgbm = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
# Stop after 150 trials or 90 minutes, whichever comes first
study_lgbm.optimize(objective_lgbm, n_trials=150, timeout=60*90)

print("\nBest parameters for LightGBM:", study_lgbm.best_params)
print("Best RMSE for LightGBM:", study_lgbm.best_value)

# `study.best_params` holds only the values sampled through `trial.suggest_*`.
# Re-attach the fixed settings every trial was scored with, so the final model
# is trained under the configuration that produced the best score.
# `early_stopping_rounds` is left out on purpose: the final training loop
# passes its own early-stopping callback.
best_lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": -1,
    **study_lgbm.best_params,
}

[I 2025-09-12 08:25:37,244] A new study created in memory with name: no-name-6b40a011-2d85-44b0-82f0-ed63829ee24c


Running Optuna study for LightGBM...


[I 2025-09-12 08:25:57,536] Trial 0 finished with value: 72748.21729833036 and parameters: {'n_estimators': 658, 'learning_rate': 0.13174083274815573, 'num_leaves': 44, 'max_depth': 12, 'min_child_samples': 26, 'subsample': 0.7140739459616219, 'colsample_bytree': 0.6080906796572331, 'reg_alpha': 0.0036180769256207073, 'reg_lambda': 0.0001340215661298201, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 72748.21729833036.
[I 2025-09-12 08:26:22,129] Trial 1 finished with value: 72709.68501749609 and parameters: {'n_estimators': 888, 'learning_rate': 0.1840440941253236, 'num_leaves': 280, 'max_depth': 10, 'min_child_samples': 182, 'subsample': 0.5798270113431705, 'colsample_bytree': 0.8443343698302974, 'reg_alpha': 0.037164339434207855, 'reg_lambda': 0.02689412433395196, 'boosting_type': 'gbdt'}. Best is trial 1 with value: 72709.68501749609.
[I 2025-09-12 08:26:52,218] Trial 2 finished with value: 72935.02510930951 and parameters: {'n_estimators': 678, 'learning_rate': 0.0755219606


Best parameters for LightGBM: {'n_estimators': 1995, 'learning_rate': 0.05101421798433064, 'num_leaves': 98, 'max_depth': 9, 'min_child_samples': 155, 'subsample': 0.795214556350197, 'colsample_bytree': 0.437798275994333, 'reg_alpha': 7.733789387891466e-06, 'reg_lambda': 1.2064000395683555e-06, 'boosting_type': 'gbdt'}
Best RMSE for LightGBM: 72527.33246460892


In [None]:
n_splits = 10
fold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Accumulators:
#   oof_preds    - out-of-fold prediction for every training row
#   y_pred_folds - one test-set prediction vector per fold
#   rmse         - validation RMSE of each fold
oof_preds = np.zeros(len(X_lgbm))
y_pred_folds = []
rmse = []

for i, (train_index, val_index) in enumerate(fold.split(X_lgbm, y), start=1):
    print(f"Fold {i}/{n_splits}:")

    X_tr, y_tr = X_lgbm.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X_lgbm.iloc[val_index], y.iloc[val_index]

    # Train on the fold; stop once the validation score has not improved for
    # 100 consecutive rounds
    m = LGBMRegressor(**best_lgb_params)
    m.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    # Validation predictions, explicitly limited to the best iteration, fill
    # this fold's slice of the OOF array
    preds = m.predict(X_val, num_iteration=m.best_iteration_)
    oof_preds[val_index] = preds

    r_error = root_mean_squared_error(y_val, preds)
    rmse.append(r_error)
    print(f"Fold {i} — RMSE: {r_error:.6f}  R2: {r2_score(y_val, preds):.6f}")

    # Test-set predictions of this fold's model, also at its best iteration
    pr = m.predict(test_data_lgbm, num_iteration=m.best_iteration_)
    y_pred_folds.append(pr)

# OOF estimate over the whole training set, plus the per-fold spread
overall_oof_rmse = root_mean_squared_error(y, oof_preds)
print(f"\nOOF RMSE: {overall_oof_rmse:.6f}")
print(f"Per-fold RMSE mean ± std: {np.mean(rmse):.6f} ± {np.std(rmse):.6f}")

# Final LightGBM test prediction: mean of the per-fold predictions
# (alternative: np.median(y_pred_folds, axis=0))
final_test_pred_mean_lgbm = np.mean(y_pred_folds, axis=0)

Fold 1/10:
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[233]	valid_0's rmse: 69049.9	valid_0's l2: 4.76788e+09
Fold 1 — RMSE: 69049.859968  R2: 0.167384
Fold 2/10:
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[125]	valid_0's rmse: 66514.1	valid_0's l2: 4.42412e+09
Fold 2 — RMSE: 66514.095519  R2: 0.180050
Fold 3/10:
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[120]	valid_0's rmse: 62379.8	valid_0's l2: 3.89123e+09
Fold 3 — RMSE: 62379.754863  R2: 0.175577
Fold 4/10:
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[199]	valid_0's rmse: 74181.7	valid_0's l2: 5.50292e+09
Fold 4 — RMSE: 74181.677495  R2: 0.152250
Fold 5/10:
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[135]	valid_0's rmse: 84989.6	valid_0's l2: 7.22323e+09
Fold 5 — RMS

In [None]:
# Inspect the LightGBM out-of-fold predictions (one value per training row)
oof_preds

array([ 9292.37299504, 11350.55065667, 12311.68556948, ...,
       83042.0037103 , 68272.08410518, 31629.79230968])

In [None]:
!pip install catBoost

from catboost import CatBoostRegressor, Pool

Collecting catBoost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catBoost
Successfully installed catBoost-1.2.8


In [None]:
import optuna

# Define Optuna objective functions for CatBoost
def objective_catboost(trial):
    """Optuna objective: mean 5-fold validation RMSE of a CatBoost regressor.

    Reads the module-level `X_cat` (raw, unencoded features), `y` (price) and
    `cat_cols` (names of the categorical columns).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean RMSE over the 5 validation folds (lower is better).

    Raises:
        optuna.TrialPruned: when the study's pruner stops the trial early,
            based on the running mean RMSE reported after each fold.
    """
    catboost_params = {
        # Tuned hyperparameters
        "iterations": trial.suggest_int("iterations", 500, 5000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.2, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-6, 10.0, log=True),
        # Fixed settings (not tuned)
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "random_state": 42,
        "verbose": 0,
        "early_stopping_rounds": 200,
        "cat_features": cat_cols # Use non-encoded categorical features
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    last_fold = cv.get_n_splits() - 1
    rmse_scores = []

    for fold, (tr_idx, val_idx) in enumerate(cv.split(X_cat, y)):
        X_tr, X_val = X_cat.iloc[tr_idx], X_cat.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        train_pool = Pool(data=X_tr, label=y_tr, cat_features=cat_cols)
        val_pool = Pool(data=X_val, label=y_val, cat_features=cat_cols)

        model = CatBoostRegressor(**catboost_params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=catboost_params["early_stopping_rounds"], verbose=False)

        preds = model.predict(val_pool)
        rmse = root_mean_squared_error(y_val, preds)
        rmse_scores.append(rmse)

        # Without intermediate reports the study's pruner never has anything to
        # act on. The folds are identical for every trial (fixed random_state),
        # so the running mean is comparable across trials at each step.
        trial.report(float(np.mean(rmse_scores)), step=fold)
        # Do not prune after the last fold: the full score is already paid for.
        if fold < last_fold and trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(rmse_scores)


# Run Optuna studies for CatBoost
print("Running Optuna study for CatBoost...")
pruner = optuna.pruners.SuccessiveHalvingPruner()
# Seed the (default) TPE sampler so the sequence of suggested hyperparameters
# does not change from one kernel restart to the next.
sampler = optuna.samplers.TPESampler(seed=42)
study_catboost = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
# Stop after 150 trials or 90 minutes, whichever comes first
study_catboost.optimize(objective_catboost, n_trials=150, timeout=60*90)

print("\nBest parameters for CatBoost:", study_catboost.best_params)
print("Best RMSE for CatBoost:", study_catboost.best_value)

# `study.best_params` holds only the values sampled through `trial.suggest_*`.
# Re-attach the fixed settings every trial was scored with, so the final model
# is trained under the configuration that produced the best score.
# `cat_features`, `verbose` and early stopping are left out on purpose: the
# final training loop supplies them through its Pool objects and fit() call.
best_catboost_params = {
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_state": 42,
    **study_catboost.best_params,
}

[I 2025-09-12 09:58:00,975] A new study created in memory with name: no-name-470487bd-e81b-4981-ace7-6cb6cc8ec31e


Running Optuna study for CatBoost...


[I 2025-09-12 10:16:55,647] Trial 0 finished with value: 72968.22192255367 and parameters: {'iterations': 1891, 'learning_rate': 0.1009904690143163, 'depth': 10, 'l2_leaf_reg': 0.00019131566317391068}. Best is trial 0 with value: 72968.22192255367.
[I 2025-09-12 11:11:31,019] Trial 1 finished with value: 73058.80580065653 and parameters: {'iterations': 888, 'learning_rate': 0.0029728415475214587, 'depth': 10, 'l2_leaf_reg': 2.1918527074988196e-06}. Best is trial 0 with value: 72968.22192255367.
[I 2025-09-12 11:52:47,702] Trial 2 finished with value: 72817.30406403168 and parameters: {'iterations': 2790, 'learning_rate': 0.012560903908017925, 'depth': 5, 'l2_leaf_reg': 0.0010769890121875914}. Best is trial 2 with value: 72817.30406403168.



Best parameters for CatBoost: {'iterations': 2790, 'learning_rate': 0.012560903908017925, 'depth': 5, 'l2_leaf_reg': 0.0010769890121875914}
Best RMSE for CatBoost: 72817.30406403168


In [None]:
n_splits = 10
fold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds_catboost = np.zeros(len(X_cat))          # store OOF predictions for whole train set
y_pred_folds_catboost = []                     # store per-fold test predictions
rmse_catboost = []

# Positional indices of the categorical features, as passed to the CatBoost Pools
cat_features_indices = [X_cat.columns.get_loc(col) for col in cat_cols]

# The test pool does not depend on the fold, so it is built once up front
test_pool = Pool(data=test_data_cat, cat_features=cat_features_indices)

for i, (train_index, val_index) in enumerate(fold.split(X_cat, y), start=1):
    print(f"Fold {i}/{n_splits}:")

    X_tr, X_val = X_cat.iloc[train_index], X_cat.iloc[val_index]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    # Create the per-fold CatBoost Pool objects
    train_pool = Pool(data=X_tr, label=y_tr, cat_features=cat_features_indices)
    val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features_indices)

    m_catboost = CatBoostRegressor(**best_catboost_params)

    # fit with early stopping on the validation pool
    m_catboost.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=100,
        verbose=False
    )

    # Predict the validation fold. No iteration limit is passed here.
    # NOTE(review): with an eval_set CatBoost presumably keeps the best
    # iteration by default (use_best_model) - confirm against the docs.
    preds_catboost = m_catboost.predict(val_pool)
    oof_preds_catboost[val_index] = preds_catboost

    r_error_catboost = root_mean_squared_error(y_val, preds_catboost)
    rmse_catboost.append(r_error_catboost)

    print(f"Fold {i} — RMSE: {r_error_catboost:.6f}  R2: {r2_score(y_val, preds_catboost):.6f}")

    # predict the test set with this fold's model and store the predictions
    pr_catboost = m_catboost.predict(test_pool)
    y_pred_folds_catboost.append(pr_catboost)

# OOF Estimate
overall_oof_rmse_catboost = root_mean_squared_error(y, oof_preds_catboost)
print(f"\nOOF RMSE (CatBoost): {overall_oof_rmse_catboost:.6f}")
print(f"Per-fold RMSE mean ± std (CatBoost): {np.mean(rmse_catboost):.6f} ± {np.std(rmse_catboost):.6f}")

# aggregate test predictions across folds (mean)
# (alternative: np.median(y_pred_folds_catboost, axis=0))
final_test_pred_mean_catboost = np.mean(y_pred_folds_catboost, axis=0)

Fold 1/10:
Fold 1 — RMSE: 69156.714498  R2: 0.164806
Fold 2/10:
Fold 2 — RMSE: 66903.684105  R2: 0.170416
Fold 3/10:
Fold 3 — RMSE: 62568.945070  R2: 0.170568
Fold 4/10:
Fold 4 — RMSE: 74514.685207  R2: 0.144621
Fold 5/10:
Fold 5 — RMSE: 85173.568318  R2: 0.104946
Fold 6/10:
Fold 6 — RMSE: 60728.935246  R2: 0.193498
Fold 7/10:
Fold 7 — RMSE: 74659.438280  R2: 0.145681
Fold 8/10:
Fold 8 — RMSE: 78559.904242  R2: 0.127312
Fold 9/10:
Fold 9 — RMSE: 71699.400261  R2: 0.145580
Fold 10/10:
Fold 10 — RMSE: 81022.469507  R2: 0.124672

OOF RMSE (CatBoost): 72883.190070
Per-fold RMSE mean ± std (CatBoost): 72498.774473 ± 7476.747033


In [None]:
# Blend the two models with equal weights (simple average of their test predictions)
final_ensembled_predictions = (final_test_pred_mean_lgbm + final_test_pred_mean_catboost) / 2

# Build the submission frame: the original test ids plus the blended price
submission_df = test_data_n[['id']].assign(price=final_ensembled_predictions)

# Show the first few rows of the submission
display(submission_df.head())

Unnamed: 0,id,price
0,188533,17473.559805
1,188534,79126.343638
2,188535,53299.491181
3,188536,24993.471052
4,188537,30990.401407


In [None]:
# Write the submission (columns: id, price) to a CSV file, without the DataFrame index
submission_df.to_csv("optuna2_oof_lgbm+cat_sub.csv", index=False)