# I. Load the Preprocessor and the Test Set

In [1]:
import joblib
import os
import pandas as pd

# Serialized preprocessing object stored alongside the test CSV files.
preprocessor_file = "/kaggle/input/vucar-used-car-price/preprocessor.pkl"

# joblib.load unpickles the file, so only point it at artifacts you trust.
preprocessor = joblib.load(filename=preprocessor_file)
print("Preprocessor loaded successfully.")

Preprocessor loaded successfully.


In [2]:
# Test-set features as stored on disk; the preview below shows the first five rows.
X_test = pd.read_csv(filepath_or_buffer="/kaggle/input/vucar-used-car-price/X_test.csv")
X_test.head(n=5)

Unnamed: 0,list_time,manufacture_date,seats,mileage_v2,Vehicle_Age,Mileage_per_Year,milage_with_age,Mileage_per_Year_with_age,Is_Luxury_Brand,brand,model,origin,type,gearbox,fuel,color
0,1695907000000.0,2014.0,5.0,109000.0,10.0,10900.0,108892.452089,10889.245209,0.0,5.0,150.0,0.0,7.0,1.0,3.0,9.0
1,1698111000000.0,2009.0,5.0,150000.0,15.0,10000.0,111450.74875,7430.049917,0.0,5.0,113.0,0.0,7.0,2.0,3.0,10.0
2,1697098000000.0,2021.0,7.0,19500.0,3.0,6500.0,35552.505246,11850.835082,0.0,8.0,192.0,4.0,6.0,1.0,2.0,11.0
3,1696326000000.0,2016.0,5.0,250000.0,8.0,31250.0,83965.389875,10495.673734,0.0,4.0,134.0,9.0,1.0,2.0,3.0,11.0
4,1696736000000.0,2017.0,5.0,87000.0,7.0,12428.571429,82683.072855,11811.867551,0.0,2.0,206.0,4.0,5.0,1.0,2.0,11.0


In [3]:
# Test-set targets; the preview below shows a `price` column.
y_test = pd.read_csv(filepath_or_buffer="/kaggle/input/vucar-used-car-price/y_test.csv")
y_test.head(n=5)

Unnamed: 0,price
0,358000000.0
1,190000000.0
2,1130000000.0
3,200000000.0
4,655000000.0


# II. Load Pretrained Models

In [4]:
# Directory that holds the serialized CatBoost / LightGBM models.
model_dir = "/kaggle/input/vucar-used-car-price"

catboost_models = []
lgb_models = []

# os.listdir() returns names in arbitrary order; sorting makes the order in
# which models are loaded (and later averaged) identical on every run.
for file_name in sorted(os.listdir(model_dir)):
    if not file_name.endswith(".joblib"):
        continue

    file_path = os.path.join(model_dir, file_name)

    # joblib.load unpickles the file: only load model artifacts you trust.
    if file_name.startswith("catboost_model"):
        catboost_models.append(joblib.load(file_path))
    elif file_name.startswith("lgb_model"):
        lgb_models.append(joblib.load(file_path))

# Fail here with a clear message instead of building an empty ensemble that
# would only break later, at prediction/scoring time.
if not (catboost_models or lgb_models):
    raise FileNotFoundError(
        f"No 'catboost_model*.joblib' or 'lgb_model*.joblib' files found in {model_dir!r}."
    )

print(f"Loaded {len(catboost_models)} CatBoost models.")
print(f"Loaded {len(lgb_models)} LightGBM models.")

Loaded 5 CatBoost models.
Loaded 5 LightGBM models.


# III. Voting Ensemble

In [5]:
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_absolute_error

class VotingModel(BaseEstimator, RegressorMixin):
    """
    Averaging ensemble over a collection of regressors.

    The estimators are used exactly as passed in: this class only calls their
    ``predict`` method and never fits them.

    Parameters
    ----------
    estimators : sequence
        Objects exposing ``predict(X)``.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        print(f"Loaded {len(estimators)} estimators for ensemble.")

    def predict(self, dataframe):
        """
        Predict using the average of predictions from all estimators.

        Parameters
        ----------
        dataframe : array-like
            Feature matrix forwarded unchanged to every estimator's ``predict``.

        Returns
        -------
        numpy.ndarray
            Element-wise mean of the individual predictions.

        Raises
        ------
        ValueError
            If the ensemble holds no estimators (the mean would otherwise be
            NaN with only a runtime warning).
        """
        if len(self.estimators) == 0:
            raise ValueError("VotingModel has no estimators to predict with.")
        y_preds = [estimator.predict(dataframe) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_chunked(self, dataframe, chunk_size=1000):
        """
        Predict in chunks to handle large datasets.

        Parameters
        ----------
        dataframe : array-like
            Feature matrix; rows are processed ``chunk_size`` at a time.
        chunk_size : int, default 1000
            Number of rows per chunk. Must be positive.

        Returns
        -------
        numpy.ndarray
            Averaged predictions for all rows, in input order. An empty input
            yields an empty array.

        Raises
        ------
        ValueError
            If ``chunk_size`` is not positive, or if the ensemble is empty.
        """
        if chunk_size <= 0:
            # A negative step would make range() empty and silently return no
            # predictions at all.
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}.")

        # Prefer shape[0]: len() is undefined for some inputs a preprocessor
        # may return, such as SciPy sparse matrices.
        n = dataframe.shape[0] if hasattr(dataframe, "shape") else len(dataframe)

        # Use positional row slicing explicitly when the input is a pandas object.
        rows = dataframe.iloc if hasattr(dataframe, "iloc") else dataframe

        chunk_preds = [
            self.predict(rows[start:start + chunk_size])
            for start in range(0, n, chunk_size)
        ]
        if not chunk_preds:
            return np.array([])
        return np.concatenate(chunk_preds)

In [6]:
# Pool all loaded CatBoost and LightGBM models into one averaging ensemble.
fitted_models = catboost_models + lgb_models

model = VotingModel(fitted_models)

# NOTE: this rebinds X_test to the transformed features. Re-running this cell
# without first re-running the cell that reads X_test.csv would apply the
# preprocessor a second time.
X_test = preprocessor.transform(X_test)

Loaded 10 estimators for ensemble.


# IV. Inference

In [7]:
final_predictions = model.predict(X_test)

# Score against the `price` column only, so any extra column read from
# y_test.csv (e.g. a saved index) cannot break or skew the metric.
# NOTE(review): the recorded MAE is of the same order of magnitude as the
# prices previewed in y_test.head(). Confirm that X_test.csv is not already
# preprocessed before preprocessor.transform is applied, and that the models
# predict on the raw price scale.
mae_score = mean_absolute_error(y_test["price"], final_predictions)
print(f"Mean Absolute Error (MAE) of the ensemble: {mae_score:.4f}")

Mean Absolute Error (MAE) of the ensemble: 253377168.5942
