## imports

In [1]:
from pathlib import Path
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
# Star import is kept last so the names it brings in shadow earlier ones exactly as before.
from pycaret.regression import *

# Notebook-wide configuration: apply seaborn's default plot styling, then
# silence every Python warning for the remainder of the session.
sns.set()
filterwarnings(action="ignore")

## pre-saved data loading

In [2]:
# Pre-model snapshots of the train and test sets, read straight from the
# project's GitHub repository (network access is required).
TRAIN_URL = "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_train_pre-model.parquet"
TEST_URL = "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_test_pre-model.parquet"

train_raw, test_raw = (pd.read_parquet(url) for url in (TRAIN_URL, TEST_URL))

# Last expression of the cell: show (rows, columns) of both frames.
train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

## encoding features

In [3]:
# Tag every row with its origin so both frames can be stacked, cleaned with
# the same rules, and split apart again at the end of the cell.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0. Row labels are kept as they are, exactly as append did.
data = pd.concat([train_raw, test_raw])

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form may act on a temporary object
# and leave `data` untouched (pandas copy-on-write).
data["ptc"] = data["ptc"].fillna("Оригинал")

# Cast every object-typed column to str so each one holds plain strings only.
object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Split back by origin and drop the columns that are not passed to the model.
# "price" is the target: it stays in train and is removed from test
# (presumably test only carries it as a result of the concat - confirm).
helper_cols = ["sample", "description", "train/test"]
train = data.loc[data["train/test"] == "train"].drop(columns=helper_cols)
test = data.loc[data["train/test"] == "test"].drop(columns=helper_cols + ["price"])

## pycaret setup

In [4]:
# Initialise the PyCaret regression experiment on the training frame.
# Options that were tried and are currently disabled: preprocess=False,
# categorical_imputation="mode", iterative imputation with catboost imputers,
# and feature_selection with a 0.5 threshold.
s = setup(
    data=train,
    target="price",
    # Fixed seed so a fresh Restart & Run All reproduces the same experiment;
    # 3213 is the session_id reported by the recorded run of this cell.
    session_id=3213,
    date_features=["parsed_date"],
    high_cardinality_features=["model_name"],
    normalize=True,
    transformation=True,
    remove_outliers=True,
    handle_unknown_categorical=True,
    remove_multicollinearity=True,
    combine_rare_levels=True,
)

Unnamed: 0,Description,Value
0,session_id,3213
1,Target,price
2,Original Data,"(115367, 28)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,17
6,Ordinal Features,False
7,High Cardinality Features,True
8,High Cardinality Method,frequency
9,Transformed Train Set,"(76718, 89)"


In [6]:
# Build the encoded training frame as a NEW object. get_config("X") may hand
# back the experiment's own feature frame rather than a copy, and writing the
# target column into it would leak "price" into the features PyCaret keeps.
# assign() returns a new frame and aligns the target on the index the same way
# the previous column assignment did.
train_encoded = get_config("X").assign(price=get_config("y"))

# Reuse the preprocessing pipeline fitted during setup() to encode the test set.
prep_pipe = get_config("prep_pipe")
test_encoded = prep_pipe.transform(test)

# Last expression of the cell: show (rows, columns) of both encoded frames.
train_encoded.shape, test_encoded.shape

((115367, 90), (34686, 89))

In [7]:
# Persist the encoded frames for later notebooks. The target folder is created
# first because to_parquet does not create missing directories, and a fresh
# checkout or runtime may not have "data/" yet.
Path("data").mkdir(parents=True, exist_ok=True)
train_encoded.to_parquet("data/2022-04-11_train_encoded_full.parquet")
test_encoded.to_parquet("data/2022-04-11_test_encoded_full.parquet")

In [None]:
# Cross-validate the available regressors, skipping the "dummy" and "ada"
# estimators, and keep the seven best. Because n_select is greater than 1,
# `best` is a list of models rather than a single estimator.
best = compare_models(
    n_select=7,
    exclude=["dummy", "ada"],
)

IntProgress(value=0, description='Processing: ', max=94)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)


In [None]:
# compare_models(..., n_select=7) returns a list of models, while
# predict_model scores with a single estimator, so the top-ranked model
# (first in the list) is used. The isinstance guard keeps this cell working
# if `best` is ever a single model instead.
best_model = best[0] if isinstance(best, list) else best
predictions = predict_model(best_model, data=test)

In [None]:
# Persist the experiment configuration and the winning model. The folder is
# created first so the cell also works where "models/" does not exist yet.
Path("models").mkdir(parents=True, exist_ok=True)
save_config("models/2022-04-11_model")
# `best` is a list when compare_models is called with n_select > 1; save the
# top-ranked estimator rather than the list itself.
# NOTE(review): both calls share one base path - presumably save_model adds
# its own file extension so the two files do not collide; confirm.
save_model(best[0] if isinstance(best, list) else best, "models/2022-04-11_model")