## imports

In [2]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

from lib.model_related import *

# Apply seaborn's default plot theme to any figures drawn in this notebook.
sns.set()
# Suppress every Python warning for the rest of the session. Note that this also hides
# deprecation warnings raised by pandas / scikit-learn / LightGBM / Optuna.
filterwarnings("ignore")

## reading data

In [5]:
# Load the pre-model train and test tables from their parquet snapshots.
train_raw = pd.read_parquet("data/2022-04-08_train_pre-model.parquet")
test_raw = pd.read_parquet("data/2022-04-08_test_pre-model.parquet")

# Cell output: (rows, columns) of each frame.
train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

In [6]:
# Lookup table keyed by ISO date string ("YYYY-MM-DD") with one float value per date.
# NOTE(review): the name and magnitudes suggest currency exchange rates — confirm the
# currency pair and the source. The table is not referenced in any other visible cell.
currency_dict = {
    "2020-10-20": 77.9241,
    "2020-10-19": 77.9644,
    "2020-10-21": 77.7780,
    "2020-10-25": 76.4667,
    "2020-10-24": 76.4667,
    "2020-10-26": 76.4667,
    "2020-09-09": 75.9645,
    "2021-09-27": 73.0081,
    "2021-09-30": 72.7608,
    "2021-09-26": 73.0081,
    "2021-09-28": 72.6613,
    "2021-09-29": 72.5083,
    "2021-10-01": 72.6642,
}


def submit(hold_out: pd.DataFrame, model, name="submission"):
    """Write a submission CSV with ``model``'s predictions for ``hold_out``.

    The file ``data/sample_submission.csv`` is used as the template: its
    ``price`` column is set to the predictions and the result is saved as
    ``<name>.csv`` (without the index) in the current working directory.

    Args:
        hold_out: feature frame passed to ``model.predict``; the number of
            predictions is expected to match the number of template rows.
        model: any object exposing ``predict(hold_out)``.
        name: output file name without the ``.csv`` extension.

    Returns:
        None. The only effect is the written CSV file.
    """
    predicted_prices = model.predict(hold_out)
    template = pd.read_csv("data/sample_submission.csv")
    template["price"] = predicted_prices
    output_path = f"{name}.csv"
    template.to_csv(output_path, index=False)

In [7]:
# Sanity check: total of the raw training prices, for comparison with the total after preprocessing.
train_raw["price"].sum()

158409758714.0

## encoding

In [8]:
# Multiplier applied to the target of rows whose "sample" is "jane".
# NOTE(review): presumably aligns that source's price level with the other one — confirm the rationale.
JANE_PRICE_FACTOR = 0.86

# Tag every row with its origin so both frames can be encoded together and separated again.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# DataFrame.append was removed in pandas 2.0; pd.concat performs the same row-wise stacking.
data = pd.concat([train_raw, test_raw])
# Re-assign instead of a chained in-place fillna, which can silently act on a temporary copy.
data = data.fillna({"ptc": "Оригинал"})

# Cast all object columns to plain strings (missing values turn into strings such as "nan").
object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Robust-scale every non-object column except the target, one independent scaler per column.
# NOTE(review): each scaler is fitted on train and test rows together.
numeric_cols = [
    col for col in data.select_dtypes(exclude="object").columns if col != "price"
]
for col in numeric_cols:
    # fit_transform needs a 2-D input; ravel() turns the (n, 1) result back into a 1-D column.
    data[col] = (
        RobustScaler().fit_transform(data[col].to_numpy().reshape(-1, 1)).ravel()
    )

# Integer-encode the model name; the remaining categoricals are one-hot encoded below.
data["model_name"] = LabelEncoder().fit_transform(data["model_name"].astype("str"))

data = pd.get_dummies(
    data,
    columns=[
        "vehicle_transmission",
        "vendor",
        "brand",
        "fuel_type",
        "body_type",
        "color",
        "ptc",
        "drive",
        "wheel",
        "age_cat",
    ],
)

train = data.loc[data["train/test"] == "train"]

# .copy() makes the price adjustment write to an independent frame rather than a slice of `data`.
train_jane = train.loc[train["sample"] == "jane"].copy()
train_sokolov = train.loc[train["sample"] == "sokolov"]
train_jane["price"] = train_jane["price"] * JANE_PRICE_FACTOR

# Same row order as before: jane rows first, then sokolov rows. Training rows with any other
# "sample" value are not carried over. Helper columns are dropped without mutating in place.
train = pd.concat([train_jane, train_sokolov]).drop(
    columns=["sample", "description", "train/test"]
)
test = data.loc[data["train/test"] == "test"].drop(
    columns=["sample", "description", "train/test", "price"]
)


In [9]:
# Sanity check: total training price after the "jane" rows were rescaled in the encoding cell.
train["price"].sum()

151033359642.84

## modelling

In [10]:
# Hold out a validation part of the training data (default test_size) with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns="price"),
    train["price"],
    shuffle=True,
    random_state=42,
)
# Cell output: shapes of the four parts, in the order X_train, y_train, X_valid, y_valid.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((86525, 112), (86525,), (28842, 112), (28842,))

In [11]:
# Fit five untuned baseline regressors on the same training split.
lr = LinearRegression().fit(X_train, y_train)
knn = KNeighborsRegressor().fit(X_train, y_train)
# `silent` was deprecated and later removed from LightGBM's scikit-learn API;
# `verbose=-1` is the supported way to suppress its training log.
lightgbm = LGBMRegressor(random_state=42, verbose=-1).fit(X_train, y_train)
catboost = CatBoostRegressor(random_state=42, silent=True).fit(X_train, y_train)
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

In [12]:
# Validation MAPE of every baseline model, printed one per line as "<name> <score>".
for model_label, fitted_model in (
    ("lr", lr),
    ("knn", knn),
    ("lightgbm", lightgbm),
    ("catboost", catboost),
    ("rf", rf),
):
    print(model_label, mean_absolute_percentage_error(y_valid, fitted_model.predict(X_valid)))

lr 0.7946185682561607
knn 0.16951330279094057
lightgbm 0.19102466397466208
catboost 0.1584574973807607
rf 0.13804863667174702


## dumb model submission

In [13]:
# Write one submission file per baseline model, named after the model.
for model_label, fitted_model in (
    ("lr", lr),
    ("knn", knn),
    ("lightgbm", lightgbm),
    ("catboost", catboost),
    ("rf", rf),
):
    submit(test, fitted_model, model_label)

## model tuning

### lightgbm

In [17]:
def objective(trial):
    """Optuna objective: mean cross-validated negative MAPE of a LightGBM regressor.

    Draws LightGBM hyper-parameters from ``trial``, scores the resulting
    ``LGBMRegressor`` with 8-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train`` and returns the mean fold score.

    Args:
        trial: the Optuna trial used to sample hyper-parameter values.

    Returns:
        Mean of the per-fold ``neg_mean_absolute_percentage_error`` scores.
        Values are <= 0 and closer to zero is better, so the study has to
        maximize this objective.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # parameter names, bounds and linear/log scaling are unchanged.
    param = {
        "objective": "regression",
        "metric": "mape",
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "verbosity": -1,
    }

    # "verbosity": -1 already silences LightGBM, so the removed `silent` flag is not passed.
    gbm = LGBMRegressor(**param)
    cv_neg_mape = cross_val_score(
        gbm,
        X_train,
        y_train,
        cv=8,
        scoring="neg_mean_absolute_percentage_error",
        n_jobs=-1,
    )

    return np.mean(cv_neg_mape)


# Create the study, or resume it when one with this name already exists in the SQLite
# storage (load_if_exists=True). The objective is scored with negated MAPE, hence "maximize".
# NOTE(review): the study and database are called "LGBMClassifier" although a regressor is
# tuned; the names identify the persisted study, so renaming them would start a new one.
study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///LGBMClassifier.db",
    study_name="LGBMClassifier",
    load_if_exists=True,
)
# Run up to 10 trials in this call, stopping earlier once 600 seconds have elapsed.
study.optimize(objective, timeout=600, n_trials=10)

[32m[I 2022-04-12 22:35:33,204][0m Using an existing study with name 'LGBMClassifier' instead of creating a new one.[0m





[32m[I 2022-04-12 22:35:47,095][0m Trial 5 finished with value: -0.20253670583254396 and parameters: {'learning_rate': 0.07992724445501892, 'lambda_l1': 1.8270468476213303e-05, 'lambda_l2': 0.012791850966944924, 'num_leaves': 28, 'feature_fraction': 0.7698606143353843, 'bagging_fraction': 0.8695690383435876, 'bagging_freq': 2, 'min_child_samples': 48}. Best is trial 5 with value: -0.20253670583254396.[0m




[32m[I 2022-04-12 22:35:59,590][0m Trial 6 finished with value: -0.257565168367874 and parameters: {'learning_rate': 0.744263685304623, 'lambda_l1': 7.552664420954056e-07, 'lambda_l2': 1.9963207632140944e-05, 'num_leaves': 201, 'feature_fraction': 0.7340966128567723, 'bagging_fraction': 0.6079090066839168, 'bagging_freq': 1, 'min_child_samples': 52}. Best is trial 5 with value: -0.20253670583254396.[0m




[32m[I 2022-04-12 22:36:08,280][0m Trial 7 finished with value: -0.1745456552518454 and parameters: {'learning_rate': 0.4216546930140595, 'lambda_l1': 5.529044942764546e-08, 'lambda_l2': 0.49107163604631077, 'num_leaves': 142, 'feature_fraction': 0.5426396065593337, 'bagging_fraction': 0.6354145080468188, 'bagging_freq': 3, 'min_child_samples': 12}. Best is trial 7 with value: -0.1745456552518454.[0m




[32m[I 2022-04-12 22:36:18,834][0m Trial 8 finished with value: -0.2432717613049581 and parameters: {'learning_rate': 0.7172956964597691, 'lambda_l1': 5.858478690290897e-06, 'lambda_l2': 3.9449322532268776e-08, 'num_leaves': 154, 'feature_fraction': 0.679286165875125, 'bagging_fraction': 0.6779233529792716, 'bagging_freq': 4, 'min_child_samples': 70}. Best is trial 7 with value: -0.1745456552518454.[0m




[32m[I 2022-04-12 22:36:28,581][0m Trial 9 finished with value: -0.1567960005144171 and parameters: {'learning_rate': 0.24273738931459424, 'lambda_l1': 0.0007127314011370048, 'lambda_l2': 1.4991431139899208e-08, 'num_leaves': 129, 'feature_fraction': 0.716472706585253, 'bagging_fraction': 0.9079273070338828, 'bagging_freq': 4, 'min_child_samples': 27}. Best is trial 9 with value: -0.1567960005144171.[0m




[32m[I 2022-04-12 22:36:38,610][0m Trial 10 finished with value: -0.3571169344261295 and parameters: {'learning_rate': 0.9494423379774705, 'lambda_l1': 8.976874193475037e-05, 'lambda_l2': 0.00024184845376432643, 'num_leaves': 160, 'feature_fraction': 0.6342281343794924, 'bagging_fraction': 0.52462564650187, 'bagging_freq': 5, 'min_child_samples': 74}. Best is trial 9 with value: -0.1567960005144171.[0m




[32m[I 2022-04-12 22:36:50,387][0m Trial 11 finished with value: -0.22698765690534695 and parameters: {'learning_rate': 0.773164422222387, 'lambda_l1': 1.8437884236649082, 'lambda_l2': 5.646369546324295e-07, 'num_leaves': 225, 'feature_fraction': 0.6137958863887523, 'bagging_fraction': 0.7728914806928449, 'bagging_freq': 6, 'min_child_samples': 22}. Best is trial 9 with value: -0.1567960005144171.[0m





[32m[I 2022-04-12 22:37:03,343][0m Trial 12 finished with value: -0.25967795465332394 and parameters: {'learning_rate': 0.02716689002528746, 'lambda_l1': 0.655717669029221, 'lambda_l2': 3.8521960471096605e-07, 'num_leaves': 225, 'feature_fraction': 0.7442760753734905, 'bagging_fraction': 0.4972661166651129, 'bagging_freq': 2, 'min_child_samples': 78}. Best is trial 9 with value: -0.1567960005144171.[0m





[32m[I 2022-04-12 22:37:13,613][0m Trial 13 finished with value: -0.15895263105959045 and parameters: {'learning_rate': 0.25542847231451854, 'lambda_l1': 5.5278340137009336e-08, 'lambda_l2': 5.616679588596495e-07, 'num_leaves': 112, 'feature_fraction': 0.7415789372960532, 'bagging_fraction': 0.9710703037180636, 'bagging_freq': 4, 'min_child_samples': 42}. Best is trial 9 with value: -0.1567960005144171.[0m








[32m[I 2022-04-12 22:37:24,776][0m Trial 14 finished with value: -0.25642212193797126 and parameters: {'learning_rate': 0.9889916200650344, 'lambda_l1': 1.31641269168062e-07, 'lambda_l2': 0.001308364765353021, 'num_leaves': 181, 'feature_fraction': 0.814929774720756, 'bagging_fraction': 0.8175936493895224, 'bagging_freq': 6, 'min_child_samples': 21}. Best is trial 9 with value: -0.1567960005144171.[0m


In [18]:
# Hyper-parameters of the best trial recorded in the study so far.
study.best_params

{'bagging_fraction': 0.9079273070338828,
 'bagging_freq': 4,
 'feature_fraction': 0.716472706585253,
 'lambda_l1': 0.0007127314011370048,
 'lambda_l2': 1.4991431139899208e-08,
 'learning_rate': 0.24273738931459424,
 'min_child_samples': 27,
 'num_leaves': 129}

In [19]:
# Refit LightGBM with the best hyper-parameters reported by the Optuna study.
# The values are pinned as literals, so this cell rebuilds the same model even if the
# persisted study later records a better trial.
lightgbm_optuned = LGBMRegressor(
    bagging_fraction=0.9079273070338828,
    bagging_freq=4,
    feature_fraction=0.716472706585253,
    lambda_l1=0.0007127314011370048,
    lambda_l2=1.4991431139899208e-08,
    learning_rate=0.24273738931459424,
    min_child_samples=27,
    num_leaves=129,
    random_state=42,
    # `silent` was removed from LightGBM's scikit-learn API; `verbose=-1` suppresses the log instead.
    verbose=-1,
).fit(X_train, y_train)

print("lightgbm_optuned", mean_absolute_percentage_error(y_valid, lightgbm_optuned.predict(X_valid)))
submit(test, lightgbm_optuned, "lightgbm_optuned")

lightgbm_optuned 0.1562352982059385


## Ensemble models

In [20]:
# Base learners of the stack. StackingRegressor documents `estimators` as a *list* of
# (name, estimator) pairs, so a list is passed rather than a tuple.
estimators = [
    ("lr", lr),
    ("knn", knn),
    ("lightgbm", lightgbm),
    ("catboost", catboost),
    ("rf", rf),
]

# silent=True stops the CatBoost meta-learner from printing one log line per boosting
# iteration, matching how the baseline CatBoost model in this notebook is configured.
meta = StackingRegressor(
    estimators=estimators,
    final_estimator=CatBoostRegressor(silent=True),
    n_jobs=-1,
)
meta.fit(X_train, y_train)

print("meta", mean_absolute_percentage_error(y_valid, meta.predict(X_valid)))
submit(test, meta, "meta")