## imports

In [None]:
# !pip install pycaret catboost lightgbm

In [2]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from pycaret.regression import *

sns.set()
filterwarnings("ignore")

  defaults = yaml.load(f)


## pre-saved data loading

In [3]:
train = pd.read_parquet("https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_train_pre-model.parquet")
test = pd.read_parquet("https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_test_pre-model.parquet")

train.shape, test.shape

((115367, 30), (34686, 28))

## encoding features

In [4]:
train["train/test"] = "train"
test["train/test"] = "test"

data = train.append(test)
data["ptc"].fillna("Оригинал", inplace=True)

for col in data.select_dtypes(exclude=("object")).columns:
    data[col] = RobustScaler().fit_transform(data[col].values.reshape(-1, 1)).reshape(-1, 1)

data = pd.get_dummies(data, columns=["vehicle_transmission", "vendor", "brand", "fuel_type", "body_type", "color", "ptc", "drive", "wheel", "age_cat"])

for col in ["model_name"]:
    data[col] = LabelEncoder().fit_transform(data[col].astype("str"))

## pycaret setup

In [5]:
s = setup(
    data.loc[data["train/test"] == "train"].drop(columns=["sample", "description", "train/test"]),
    target="price",
    date_features=["parsed_date"],
    normalize=True,
    # preprocess=False,
    # categorical_imputation="mode",
    # imputation_type="iterative",
    # numeric_iterative_imputer="catboost",
    # categorical_iterative_imputer="catboost",
    transformation=True,
    feature_selection=True,
    feature_selection_threshold=0.5,
)

Unnamed: 0,Description,Value
0,session_id,2568
1,Target,price
2,Original Data,"(115367, 113)"
3,Missing Values,False
4,Numeric Features,107
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(80756, 77)"


## comparing models

In [6]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [7]:
best = compare_models(n_select=7)
#     include=[
#         "lr",
#         "huber",
#         "en",
#         "catboost",
#         "lightgbm",
#         "rf",
#         "knn",
#         "omp",
#         "dt",
#         "llar",
#         "ada",
#         "br",
#         "kr",
#         "gbr",
#         "svm",
#         "mlp",
#         "et",
#         "lasso",
#         "mlp",
#     ]
# )

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.132,0.2222,0.4352,0.9121,0.0967,7.4941,19.611
et,Extra Trees Regressor,0.1202,0.2267,0.4468,0.9093,0.0914,12.166,82.666
rf,Random Forest Regressor,0.1217,0.2469,0.466,0.9017,0.0939,8.7059,83.992
lightgbm,Light Gradient Boosting Machine,0.1551,0.2497,0.4662,0.9011,0.1098,10.0593,1.108
gbr,Gradient Boosting Regressor,0.2072,0.3548,0.5728,0.8569,0.1371,13.4861,18.262
dt,Decision Tree Regressor,0.1602,0.3572,0.5625,0.8545,0.124,5.8795,1.306
knn,K Neighbors Regressor,0.1788,0.3588,0.5832,0.8528,0.1258,15.6708,24.07
br,Bayesian Ridge,0.4919,0.9779,0.9777,0.5968,0.2859,41.6697,0.828
ridge,Ridge Regression,0.4923,0.9779,0.9778,0.5967,0.2861,41.697,0.094
lr,Linear Regression,0.5119,1.0071,0.995,0.5816,0.2955,43.9789,0.514


In [11]:
print(best)

[<catboost.core.CatBoostRegressor object at 0x7f1f65e124d0>, ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=2568, verbose=0, warm_start=False), RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_st

In [12]:
# evaluate_model(best)

In [9]:
# predictions = predict_model(best, data = test)

In [10]:
save_model(best, "2022-04-09_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='price',
                                       time_features=['parsed_date'])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                                         max_depth=None, max_features=None,
                                         max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         