In [1]:
import time
import warnings

import catboost as cb
import numpy as np
import pandas as pd
import yaml
from data_engineering import prepare_data, logarithmize_y, expose_y
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import (
    ElasticNetCV,
    LassoCV,
    LinearRegression,
    RidgeCV,
)
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    RepeatedKFold,
    train_test_split,
)

# Seed passed to the train/test split and to every estimator that accepts one.
SEED = 2718281
# Fraction of rows kept for training; the remainder becomes the test set.
TRAIN_TEST_SPLIT = 0.80


# Project configuration; this notebook reads the dataset path from cfg["dataset"].
# The encoding is spelled out so parsing does not depend on the platform's
# locale default (NOTE(review): assumes config.yaml is saved as UTF-8).
with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

In [2]:
# Load the raw table and treat "-" cells as missing values.
df_cars_init = pd.read_csv(cfg["dataset"]).replace("-", np.nan)

# Coerce the target to a nullable integer: anything unparseable becomes <NA>,
# and rows without a usable price are dropped.
df_cars_init["Price"] = pd.to_numeric(df_cars_init["Price"], errors="coerce").astype("Int64")
df_cars_init = df_cars_init.dropna(subset=["Price"], axis=0)

# Separate the target from the features; from here on df_cars_init holds features only.
y_cars = df_cars_init["Price"]
df_cars_init = df_cars_init.drop(columns="Price")

df_cars_train, df_cars_test, y_cars_train, y_cars_test = train_test_split(
    df_cars_init, y_cars, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED
)

# Feature engineering lives in data_engineering.prepare_data.
df_cars_train, df_cars_test, y_cars_train, y_cars_test = prepare_data(
    df_train=df_cars_train,
    df_test=df_cars_test,
    y_train=y_cars_train,
    y_test=y_cars_test,
)

# Only the training target goes through logarithmize_y; y_cars_test is not
# transformed in this cell, and predictions are mapped back with expose_y later.
y_cars_train = logarithmize_y(y_cars_train)

# Original (un-engineered) feature frames for CatBoost, restricted to the rows
# that survived prepare_data.
# NOTE(review): relies on prepare_data preserving the original index labels — confirm.
# .copy() makes these independent frames, so the column assignment below is not
# a write into a slice of df_cars_init (avoids SettingWithCopyWarning).
df_cars_boosting_train = df_cars_init.loc[df_cars_train.index].copy()
df_cars_boosting_test = df_cars_init.loc[df_cars_test.index].copy()
cat_cols = list(df_cars_boosting_train.select_dtypes(include=["object"]).columns)

# Missing categories become the literal string "nan" so every categorical cell
# holds a string (presumably required by CatBoost's cat_features — verify).
df_cars_boosting_train[cat_cols] = df_cars_boosting_train[cat_cols].fillna("nan")
df_cars_boosting_test[cat_cols] = df_cars_boosting_test[cat_cols].fillna("nan")

<class 'pandas.core.frame.DataFrame'>
Index: 13344 entries, 10812 to 16516
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Brand            13344 non-null  object 
 1   Year             13344 non-null  int32  
 2   Model            13344 non-null  object 
 3   CarSuv           13323 non-null  object 
 4   Title            13344 non-null  object 
 5   UsedOrNew        13344 non-null  object 
 6   Transmission     13151 non-null  object 
 7   EngineVol        12029 non-null  float64
 8   DriveType        13344 non-null  object 
 9   FuelType         12835 non-null  object 
 10  FuelConsumption  12017 non-null  float64
 11  Kilometres       12883 non-null  Int64  
 12  Location         12989 non-null  object 
 13  BodyType         13121 non-null  object 
 14  Doors            12084 non-null  Int64  
 15  Seats            12011 non-null  Int64  
 16  ColorExt         7148 non-null   object 
 17  ColorInt     

In [3]:
# --- Linear baselines -------------------------------------------------------
# One seeded splitter (5 folds x 3 repeats) shared by all *CV estimators.
# Without random_state, RepeatedKFold shuffles differently on every run, so the
# selected regularisation strength and the reported metrics would not be
# reproducible even though every other component is seeded.
cv_splitter = RepeatedKFold(n_splits=5, n_repeats=3, random_state=SEED)

linreg = LinearRegression()
lasso_cv = LassoCV(cv=cv_splitter, random_state=SEED)
ridge_cv = RidgeCV(cv=cv_splitter)
# NOTE(review): l1_ratio=1.0 is a pure L1 penalty, so this is a Lasso searched
# over three fixed alphas rather than a true elastic net — confirm this is intended.
elastic_net_cv = ElasticNetCV(
    cv=cv_splitter,
    random_state=SEED,
    max_iter=1000,
    l1_ratio=1.0,
    alphas=[0.01, 0.1, 1.0],
)

# --- Tree ensembles (same number of trees / iterations for all of them) ------
n_estimators = 250
random_forest = RandomForestRegressor(
    n_estimators=n_estimators, random_state=SEED, n_jobs=4
)
sk_gradient_boosting = GradientBoostingRegressor(
    n_estimators=n_estimators, random_state=SEED
)
hist_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=n_estimators, random_state=SEED
)
cb_gradient_boosting = cb.CatBoostRegressor(
    n_estimators=n_estimators, random_state=SEED, verbose=0
)

# --- Stacking: smaller copies of the tree models feed a boosted meta-model ---
stacking_random_forest = RandomForestRegressor(
    n_estimators=100, random_state=SEED, n_jobs=4
)
stacking_sk_gradient_boosting = GradientBoostingRegressor(
    n_estimators=100, random_state=SEED
)
stacking_hist_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=100, random_state=SEED
)
stacking_meta_regressor = GradientBoostingRegressor(n_estimators=50, random_state=SEED)
stacking_regressor = StackingRegressor(
    estimators=[
        ("random_forest", stacking_random_forest),
        ("sk_gradient_boosting", stacking_sk_gradient_boosting),
        ("hist_gradient_boosting", stacking_hist_gradient_boosting),
    ],
    final_estimator=stacking_meta_regressor,
    cv=5,
    n_jobs=4,
)

# Registry of every model to fit and evaluate. A dict literal keeps each name
# next to its estimator, so the two cannot drift out of step the way two
# parallel lists can. Insertion order is the order of fitting and reporting.
models = {
    "linreg": linreg,
    "ridge_cv": ridge_cv,
    "lasso_cv": lasso_cv,
    "elastic_net_cv": elastic_net_cv,
    "random_forest": random_forest,
    "sk_gradient_boosting": sk_gradient_boosting,
    "hist_gradient_boosting": hist_gradient_boosting,
    "cb_gradient_boosting": cb_gradient_boosting,
    "stacking": stacking_regressor,
}

# Fit every registered model and print how long each fit took.
# NOTE(review): every warning raised during fitting (convergence warnings
# included) is silenced inside this block.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for model_name, model in models.items():
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted and is not meant for durations.
        start = time.perf_counter()
        if model_name == "cb_gradient_boosting":
            # CatBoost is trained on the un-engineered frame, with the object
            # columns declared as categorical features.
            train_pool = cb.Pool(
                data=df_cars_boosting_train,
                label=y_cars_train,
                cat_features=cat_cols,
            )
            model.fit(train_pool)
        else:
            model.fit(df_cars_train, y_cars_train)
        elapsed = time.perf_counter() - start

        print(f"{model_name} fit in {elapsed:.4f} sec")

linreg fit in 0.0099 sec
ridge_cv fit in 0.9181 sec
lasso_cv fit in 1.0490 sec
elastic_net_cv fit in 0.2100 sec
random_forest fit in 13.8460 sec
sk_gradient_boosting fit in 8.0268 sec
hist_gradient_boosting fit in 1.6800 sec
cb_gradient_boosting fit in 20.1699 sec
stacking fit in 47.0643 sec


In [4]:
# Test-set predictions per model, passed through expose_y before scoring.
# CatBoost predicts from the un-engineered test frame; every other model
# predicts from the engineered one.
y_cars_preds = {
    model_name: expose_y(
        model.predict(
            df_cars_boosting_test
            if model_name == "cb_gradient_boosting"
            else df_cars_test
        )
    )
    for model_name, model in models.items()
}

# One row per model: name, RMSE, MAE, R2, MAPE against the held-out target.
data = [
    [
        model_name,
        np.sqrt(mean_squared_error(y_true=y_cars_test, y_pred=y_pred)),
        mean_absolute_error(y_true=y_cars_test, y_pred=y_pred),
        r2_score(y_true=y_cars_test, y_pred=y_pred),
        mean_absolute_percentage_error(y_true=y_cars_test, y_pred=y_pred),
    ]
    for model_name, y_pred in y_cars_preds.items()
]

result_columns = ["model_name", "RMSE", "MAE", "R2", "MAPE"]
df_res = pd.DataFrame(data, columns=result_columns)

In [5]:
# Leaderboard: models ordered from highest to lowest R2 (shown as the cell output).
df_res.sort_values("R2", ascending=False)

Unnamed: 0,model_name,RMSE,MAE,R2,MAPE
7,cb_gradient_boosting,26431.970733,6531.395892,0.61599,0.162731
4,random_forest,29876.472283,7274.012735,0.509383,0.178242
6,hist_gradient_boosting,30179.651779,7469.21046,0.499376,0.177795
8,stacking,30405.196229,7432.618361,0.491865,0.176688
5,sk_gradient_boosting,31103.795604,8761.713264,0.468246,0.208901
3,elastic_net_cv,37031.154407,10833.349426,0.246266,0.253273
2,lasso_cv,37270.220082,11064.091026,0.236503,0.255196
1,ridge_cv,37288.917525,11076.011761,0.235736,0.255302
0,linreg,37292.254825,11078.325799,0.2356,0.255327
