# Model Training

**Algorithm:** CatBoost Regressor

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

Load Data

In [3]:
df = pd.read_csv("../data/processed/cars_cleaned.csv")

df.head()

Unnamed: 0,price,brand,model,year,condition,transmission,body_type,fuel_type,engine_capacity,mileage,car_age
0,5975000.0,Toyota,Yaris,2009,Used,Automatic,Saloon,Petrol,1300,135000,17
1,46800000.0,Toyota,Land Cruiser Sahara,2011,Used,Automatic,SUV / 4x4,Diesel,4600,95000,15
2,17950000.0,Honda,Vezel,2026,Brand New,Tiptronic,SUV / 4x4,Hybrid,1500,0,0
3,12400000.0,Kia,Other Model,2026,Brand New,Automatic,Hatchback,Petrol,990,0,0
4,56000000.0,Toyota,Land Cruiser Prado,2024,Used,Automatic,SUV / 4x4,Petrol,2777,3200,2


Define Features and Target

In [4]:
X = df.drop("price", axis=1)
y = df["price"]

Identify Categorical Features

In [5]:
categorical_cols = X.select_dtypes(include="object").columns
categorical_cols

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = X.select_dtypes(include="object").columns


Index(['brand', 'model', 'condition', 'transmission', 'body_type',
       'fuel_type'],
      dtype='str')

In [6]:
cat_features = [X.columns.get_loc(col) for col in categorical_cols]
cat_features

[0, 1, 3, 4, 5, 6]

Train/Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Train CatBoost Regressor

In [8]:
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=200
)

model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    early_stopping_rounds=100
)


0:	learn: 15964786.8169258	test: 15392383.1784294	best: 15392383.1784294 (0)	total: 97.5ms	remaining: 1m 37s
200:	learn: 4066709.0194298	test: 4268782.7312762	best: 4268782.7312762 (200)	total: 5.19s	remaining: 20.6s
400:	learn: 3409714.0044528	test: 3901164.5062396	best: 3900267.5486281 (399)	total: 9.78s	remaining: 14.6s
600:	learn: 3044521.3997291	test: 3740157.8354264	best: 3740157.8354264 (600)	total: 14.3s	remaining: 9.5s
800:	learn: 2799251.3622523	test: 3644622.5497418	best: 3644565.8170799 (799)	total: 19s	remaining: 4.71s
999:	learn: 2596573.0389597	test: 3569578.0247103	best: 3569430.4134869 (997)	total: 23.7s	remaining: 0us

bestTest = 3569430.413
bestIteration = 997

Shrink model to first 998 iterations.


CatBoostRegressor(depth=6, iterations=1000, learning_rate=0.05, loss_function='RMSE', random_seed=42, verbose=200)

Save the trained model

In [9]:
joblib.dump(model, "../models/catboost_model.pkl")

['../models/catboost_model.pkl']