In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# I. Load Dataset

In [2]:
file_path = "/kaggle/input/vucar-handle-missing-value/handled_missing.csv"
df = pd.read_csv(file_path)

df.head(), df.columns

(          id    list_id      list_time  manufacture_date   brand  model  \
 0  148468232  108616925  1693378633111              1980    Jeep     A2   
 1  149864917  109805135  1694308247000              2021   Honda   City   
 2  149580046  109560282  1693462201000              2016     Kia    Rio   
 3  148601679  108727914  1693493126176              2020  Toyota   Vios   
 4  149530234  109517456  1693313503000              2001    Fiat  Siena   
 
      origin              type  seats gearbox    fuel  color  mileage_v2  \
 0        Mỹ  SUV / Cross over    4.0      MT  petrol  green       40000   
 1  Nhật Bản             Sedan    5.0      AT  petrol  white       23000   
 2  Hàn Quốc             Sedan    5.0      AT  petrol  white       78545   
 3  Việt Nam             Sedan    5.0      MT  petrol  white       99999   
 4  Việt Nam             Sedan    5.0      MT  petrol  white      200000   
 
          price condition  
 0  380000000.0      used  
 1  455000000.0      used  


## 1. Filter brands that make up 90% of the total cars

In [3]:
brand_counts = df['brand'].value_counts()

total_cars = brand_counts.sum()
cumulative_percentage = (brand_counts.cumsum() / total_cars) * 100

In [4]:
brands_90_percent = cumulative_percentage[cumulative_percentage <= 90].index.tolist()

num_selected_brands = len(brands_90_percent)
top_brands_summary = brand_counts[brands_90_percent]

num_selected_brands, top_brands_summary

(12,
 brand
 Toyota           17149
 Ford              9978
 Kia               9654
 Hyundai           9470
 Mercedes Benz     4950
 Mitsubishi        4926
 Mazda             4652
 Honda             4199
 Chevrolet         2970
 Vinfast           2363
 Suzuki            1677
 Daewoo            1144
 Name: count, dtype: int64)

In [5]:
df = df[df['brand'].isin(brands_90_percent)]

df.head()

Unnamed: 0,id,list_id,list_time,manufacture_date,brand,model,origin,type,seats,gearbox,fuel,color,mileage_v2,price,condition
1,149864917,109805135,1694308247000,2021,Honda,City,Nhật Bản,Sedan,5.0,AT,petrol,white,23000,455000000.0,used
2,149580046,109560282,1693462201000,2016,Kia,Rio,Hàn Quốc,Sedan,5.0,AT,petrol,white,78545,295000000.0,used
3,148601679,108727914,1693493126176,2020,Toyota,Vios,Việt Nam,Sedan,5.0,MT,petrol,white,99999,368000000.0,used
5,148583720,108712558,1694932494416,2022,Hyundai,Elantra,Việt Nam,Sedan,5.0,AT,petrol,white,45000,455000000.0,used
6,150061598,109974620,1694955795249,2018,Toyota,Innova,Việt Nam,SUV / Cross over,8.0,MT,petrol,white,60000,485000000.0,used


## 2. Filter only used car

In [6]:
used_cars_df = df[df['condition'] == 'used']

output_path = "/kaggle/working/used_car.csv"
used_cars_df.to_csv(output_path, index=False)

In [7]:
print(used_cars_df.shape)
used_cars_df.head()

(64786, 15)


Unnamed: 0,id,list_id,list_time,manufacture_date,brand,model,origin,type,seats,gearbox,fuel,color,mileage_v2,price,condition
1,149864917,109805135,1694308247000,2021,Honda,City,Nhật Bản,Sedan,5.0,AT,petrol,white,23000,455000000.0,used
2,149580046,109560282,1693462201000,2016,Kia,Rio,Hàn Quốc,Sedan,5.0,AT,petrol,white,78545,295000000.0,used
3,148601679,108727914,1693493126176,2020,Toyota,Vios,Việt Nam,Sedan,5.0,MT,petrol,white,99999,368000000.0,used
5,148583720,108712558,1694932494416,2022,Hyundai,Elantra,Việt Nam,Sedan,5.0,AT,petrol,white,45000,455000000.0,used
6,150061598,109974620,1694955795249,2018,Toyota,Innova,Việt Nam,SUV / Cross over,8.0,MT,petrol,white,60000,485000000.0,used


In [8]:
X = used_cars_df.drop(columns=['price', "id", 'list_id', "condition"])  
y = used_cars_df['price']

In [9]:
X.head()

Unnamed: 0,list_time,manufacture_date,brand,model,origin,type,seats,gearbox,fuel,color,mileage_v2
1,1694308247000,2021,Honda,City,Nhật Bản,Sedan,5.0,AT,petrol,white,23000
2,1693462201000,2016,Kia,Rio,Hàn Quốc,Sedan,5.0,AT,petrol,white,78545
3,1693493126176,2020,Toyota,Vios,Việt Nam,Sedan,5.0,MT,petrol,white,99999
5,1694932494416,2022,Hyundai,Elantra,Việt Nam,Sedan,5.0,AT,petrol,white,45000
6,1694955795249,2018,Toyota,Innova,Việt Nam,SUV / Cross over,8.0,MT,petrol,white,60000


# II. Feature Engineering

In [10]:
brand_counts = X['brand'].value_counts()
brand_counts

brand
Toyota           15728
Kia               8776
Ford              8566
Hyundai           8051
Mercedes Benz     4609
Mazda             4170
Mitsubishi        3894
Honda             3725
Chevrolet         2970
Vinfast           1854
Suzuki            1299
Daewoo            1144
Name: count, dtype: int64

## 1. Create new feature

In [11]:
def extract_age_features(df):
    current_year = 2024

    df['Vehicle_Age'] = current_year - df['manufacture_date']
    
    df['Mileage_per_Year'] = df['mileage_v2'] / df['Vehicle_Age']
    df['milage_with_age'] =  df.groupby('Vehicle_Age')['mileage_v2'].transform('mean')
    
    df['Mileage_per_Year_with_age'] =  df.groupby('Vehicle_Age')['Mileage_per_Year'].transform('mean')
    
    return df

def extract_other_features(df):
    
    luxury_brands =  ['Mercedes Benz', 'BMW', 'Audi', 'Porsche', 'LandRover', 
                    'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini', 
                    'Rolls Royce', 'Ferrari', 'Aston Martin', 'Maybach']
    df['Is_Luxury_Brand'] = df['brand'].apply(lambda x: 1 if x in luxury_brands else 0)
    return df

In [12]:
X = extract_age_features(X)
X = extract_other_features(X)

In [13]:
import sklearn
print(sklearn.__version__)

1.2.2


# III. Training models

In [14]:
import pandas as pd
import numpy as np
import gc
import joblib
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

## 1. Handle Numerical & Categorical features

In [15]:
# Identify numerical and categorical features
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Preprocessing pipelines
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    # ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

## 2. Train Test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

preprocessor.fit(X_train)

preprocessor_file = "/kaggle/working/preprocessor.pkl"
joblib.dump(preprocessor, preprocessor_file)
print(f"Preprocessor saved to {preprocessor_file}")

Preprocessor saved to /kaggle/working/preprocessor.pkl


## 3. Transform Data

In [17]:
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [18]:
X_train = pd.DataFrame(X_train, columns=num_features + cat_features)
X_test = pd.DataFrame(X_test, columns=num_features + cat_features)

In [19]:
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

In [20]:
X_train

Unnamed: 0,list_time,manufacture_date,seats,mileage_v2,Vehicle_Age,Mileage_per_Year,milage_with_age,Mileage_per_Year_with_age,Is_Luxury_Brand,brand,model,origin,type,gearbox,fuel,color
0,1.698198e+12,2016.0,5.0,54000.0,8.0,6750.000000,83965.389875,10495.673734,0.0,10.0,277.0,4.0,1.0,1.0,3.0,11.0
1,1.696514e+12,2011.0,8.0,99000.0,13.0,7615.384615,101120.942637,7778.534049,0.0,10.0,145.0,6.0,3.0,2.0,3.0,10.0
2,1.694862e+12,2021.0,5.0,40000.0,3.0,13333.333333,35552.505246,11850.835082,0.0,8.0,27.0,4.0,7.0,2.0,3.0,11.0
3,1.694311e+12,2021.0,5.0,38000.0,3.0,12666.666667,35552.505246,11850.835082,0.0,10.0,264.0,6.0,7.0,1.0,3.0,11.0
4,1.696665e+12,2017.0,5.0,68000.0,7.0,9714.285714,82683.072855,11811.867551,0.0,5.0,182.0,6.0,1.0,1.0,3.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61541,1.695252e+12,2019.0,5.0,144.0,5.0,28.800000,64202.392049,12840.478410,0.0,8.0,275.0,3.0,3.0,2.0,3.0,11.0
61542,1.696250e+12,2020.0,5.0,50000.0,4.0,12500.000000,46501.469873,11625.367468,1.0,7.0,121.0,6.0,6.0,1.0,3.0,11.0
61543,1.695307e+12,2022.0,-1.0,26000.0,2.0,13000.000000,20009.169533,10004.584766,0.0,5.0,182.0,0.0,1.0,1.0,3.0,3.0
61544,1.696411e+12,2007.0,7.0,122000.0,17.0,7176.470588,119700.255267,7041.191486,0.0,5.0,60.0,0.0,6.0,1.0,2.0,10.0


## 4. LGBM & CatBoost 5-folds CV

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from catboost import CatBoostRegressor
import gc

cv = KFold(n_splits=5, shuffle=True, random_state=42)

lgb_params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "mae",
    "max_depth": 16,
    "max_bin": 255,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8,
    "verbose": -1,
    "random_state": 42,
    "device": "gpu",  # Change to 'cpu' if GPU is not available
}

catboost_params = {
    "iterations": 1000,
    "depth": 16,
    "learning_rate": 0.05,
    "loss_function": "MAE",
    "random_seed": 42,
    "verbose": 100,
    "task_type": "GPU",  # Change to 'CPU' if GPU is not available
}

lgb_oof_pred = np.zeros(X_train.shape[0])
catboost_oof_pred = np.zeros(X_train.shape[0])

lgb_fitted_models = []
catboost_fitted_models = []

for fold, (idx_train, idx_valid) in enumerate(cv.split(X_train, y_train)):
    print(f"Fold {fold + 1}")
    
    X_tr, y_tr = X_train.iloc[idx_train], y_train.iloc[idx_train]
    X_val, y_val = X_train.iloc[idx_valid], y_train.iloc[idx_valid]

    # LightGBM model
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="mae",
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=300)
        ]
    )
    lgb_fitted_models.append(lgb_model)
    lgb_oof_pred[idx_valid] = lgb_model.predict(X_val)

    # CatBoost model
    catboost_model = CatBoostRegressor(**catboost_params)
    catboost_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
    )
    catboost_fitted_models.append(catboost_model)
    catboost_oof_pred[idx_valid] = catboost_model.predict(X_val)

    gc.collect()

lgb_mae = mean_absolute_error(y_train, lgb_oof_pred)
catboost_mae = mean_absolute_error(y_train, catboost_oof_pred)

print(f"LightGBM OOF MAE: {lgb_mae:.4f}")
print(f"CatBoost OOF MAE: {catboost_mae:.4f}")

Fold 1
Training until validation scores don't improve for 100 rounds
[300]	valid_0's l1: 5.71729e+07
[600]	valid_0's l1: 5.29596e+07
[900]	valid_0's l1: 5.0448e+07
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 5.00107e+07


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 247331128.2996182	test: 247188823.2188465	best: 247188823.2188465 (0)	total: 14.7s	remaining: 4h 4m 47s
100:	learn: 247331107.0026810	test: 247188823.2188465	best: 247188780.6284322 (34)	total: 4m 42s	remaining: 41m 51s
bestTest = 247188780.6
bestIteration = 34
Shrink model to first 35 iterations.
Fold 2
Training until validation scores don't improve for 100 rounds
[300]	valid_0's l1: 6.00658e+07
[600]	valid_0's l1: 5.65365e+07
[900]	valid_0's l1: 5.47579e+07
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 5.43376e+07


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 247656477.6986413	test: 245863343.3877650	best: 245863343.3877650 (0)	total: 3.4s	remaining: 56m 34s
100:	learn: 247656477.6986413	test: 245863364.6847022	best: 245863322.0908279 (6)	total: 5m 1s	remaining: 44m 43s
bestTest = 245863322.1
bestIteration = 6
Shrink model to first 7 iterations.
Fold 3
Training until validation scores don't improve for 100 rounds
[300]	valid_0's l1: 5.96389e+07
[600]	valid_0's l1: 5.61822e+07
[900]	valid_0's l1: 5.47905e+07
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 5.45788e+07


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 245743433.9814367	test: 253516057.0197417	best: 253516057.0197417 (0)	total: 1.08s	remaining: 18m 3s
100:	learn: 245743391.3884274	test: 253516078.3166789	best: 253516035.7228045 (13)	total: 4m 26s	remaining: 39m 32s
bestTest = 253516035.7
bestIteration = 13
Shrink model to first 14 iterations.
Fold 4
Training until validation scores don't improve for 100 rounds
[300]	valid_0's l1: 5.74126e+07
[600]	valid_0's l1: 5.40331e+07
[900]	valid_0's l1: 5.22465e+07
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 5.19215e+07


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 247820418.1915226	test: 245207738.4729872	best: 245207738.4729872 (0)	total: 3.45s	remaining: 57m 24s
100:	learn: 247820418.1915226	test: 245207759.7699244	best: 245207738.4729872 (0)	total: 4m 57s	remaining: 44m 11s
bestTest = 245207738.5
bestIteration = 0
Shrink model to first 1 iterations.
Fold 5
Training until validation scores don't improve for 100 rounds
[300]	valid_0's l1: 5.91431e+07
[600]	valid_0's l1: 5.53968e+07
[900]	valid_0's l1: 5.35888e+07
Did not meet early stopping. Best iteration is:
[989]	valid_0's l1: 5.31491e+07


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 247936335.0663932	test: 244744146.7440085	best: 244744146.7440085 (0)	total: 3.36s	remaining: 55m 55s
100:	learn: 247936249.8803745	test: 244744125.4470713	best: 244744125.4470713 (15)	total: 4m 46s	remaining: 42m 32s
bestTest = 244744125.4
bestIteration = 15
Shrink model to first 16 iterations.
LightGBM OOF MAE: 52799511.7409
CatBoost OOF MAE: 247304023.7166


## 6. Save pretrained models

In [22]:
import joblib

for i, model in enumerate(lgb_fitted_models):
    joblib.dump(model, f"lgb_model_fold_{i + 1}.joblib")
for i, model in enumerate(catboost_fitted_models):
    joblib.dump(model, f"catboost_model_fold_{i + 1}.joblib")