- [previous file - EDA](2022-03-31_train-test_EDA.ipynb)
- [previous file - sklearn experiments](2022-04-12_experiments-sklearn.ipynb)
- [next file - ensembles](2022-04-15_ensemble.ipynb)

## imports

In [1]:
# !pip install catboost optuna joblib pyarrow

In [2]:
from warnings import filterwarnings
import os
import pickle

import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

sns.set()
filterwarnings("ignore")

## preprocessing

### reading data

In [3]:
path_to_data = "data"
# path_to_data = "https://github.com/XelorR/sf_project_6/raw/master/data"

train_raw = pd.read_parquet(f"{path_to_data}/2022-04-08_train_pre-model.parquet")
test_raw = pd.read_parquet(f"{path_to_data}/2022-04-08_test_pre-model.parquet")

train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

### functions

In [4]:
def train_or_load(clf, X, y, filepath: str = "model.joblib", complevel: int = 9):
    """
    Fits and serialize model as .pkl, .pickle or .joblib file.
    Loads serialized model if filepath exists.

    clf - model to fit
    X - dateset
    y - lables
    filepath - where to save
    complevel - compression level (0-9) for joblib, ignored for pickle
    """
    if filepath.endswith(".joblib"):
        if os.path.exists(filepath):
            with open(filepath, "rb") as f:
                clf = joblib.load(f)
        else:
            clf.fit(X, y)
            with open(filepath, "wb") as f:
                joblib.dump(clf, f, compress=complevel)
        return clf
    elif filepath.endswith(".pkl") or filepath.endswith(".pickle"):
        if os.path.exists(filepath):
            with open(filepath, "rb") as f:
                clf = pickle.load(f)
        else:
            clf.fit(X, y)
            with open(filepath, "wb") as f:
                pickle.dump(clf, f)
        return clf


def submit(hold_out: pd.DataFrame, model, name="submission"):
    preds = model.predict(hold_out)
    submission = pd.read_csv(f"{path_to_data}/sample_submission.csv")
    submission["price"] = preds
    submission.to_csv(f"{name}.csv", index=False)


def submit_log(hold_out: pd.DataFrame, model, name="submission"):
    preds = model.predict(hold_out)
    submission = pd.read_csv(f"{path_to_data}/sample_submission.csv")
    submission["price"] = np.exp(preds)
    submission.to_csv(f"{name}.csv", index=False)

### encoding

In [5]:
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

data = train_raw.append(test_raw)
data["ptc"].fillna("Оригинал", inplace=True)

data[data.select_dtypes("object").columns.tolist()] = data[
    data.select_dtypes("object").columns.tolist()
].astype(str)

for col in set(data.select_dtypes(exclude=("object")).columns) - {"price"}:
    data[col] = (
        RobustScaler().fit_transform(data[col].values.reshape(-1, 1)).reshape(-1, 1)
    )

for col in ["model_name"]:
    data[col] = LabelEncoder().fit_transform(data[col].astype("str"))

data = pd.get_dummies(
    data,
    columns=[
        "vehicle_transmission",
        "vendor",
        "brand",
        "fuel_type",
        "body_type",
        "color",
        "ptc",
        "drive",
        "wheel",
        "age_cat",
    ],
)

train = data.loc[data["train/test"] == "train"]

train_jane = train.loc[train["sample"] == "jane"]
train_sokolov = train.loc[train["sample"] == "sokolov"]
train_jane["price"] = train_jane["price"] * 0.86
train = train_jane.append(train_sokolov)

train.drop(columns=["sample", "description", "train/test"], inplace=True)
test = data.loc[data["train/test"] == "test"].drop(
    columns=["sample", "description", "train/test", "price"]
)

## preparing but not encoding

In [9]:
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

data_no_enc = train_raw.append(test_raw)
data_no_enc["ptc"].fillna("Оригинал", inplace=True)

data_no_enc[data_no_enc.select_dtypes("object").columns.tolist()] = data_no_enc[
    data_no_enc.select_dtypes("object").columns.tolist()
].astype(str)

train_no_enc = data_no_enc.loc[data_no_enc["train/test"] == "train"]

train_no_enc_jane = train_no_enc.loc[train_no_enc["sample"] == "jane"]
train_no_enc_sokolov = train_no_enc.loc[train_no_enc["sample"] == "sokolov"]
train_no_enc_jane["price"] = train_no_enc_jane["price"] * 0.86
train_no_enc = train_no_enc_jane.append(train_no_enc_sokolov)

train_no_enc.drop(columns=["sample", "description", "train/test"], inplace=True)
test_no_enc = data_no_enc.loc[data_no_enc["train/test"] == "test"].drop(
    columns=["sample", "description", "train/test", "price"]
)

In [10]:
train_no_enc.sample(5, random_state=42).T.reset_index()

Unnamed: 0,index,84022,64128,64226,43487,2657
0,body_type,внедорожник,седан,седан,хэтчбек,хэтчбек
1,brand,PORSCHE,KIA,KIA,CHEVROLET,SKODA
2,color,белый,белый,белый,синий,красный
3,engine_displacement,3.0,1.4,1.4,1.6,1.2
4,engine_power,245.0,107.0,107.0,109.0,60.0
5,fuel_type,дизель,бензин,бензин,бензин,бензин
6,km_age,120676.0,53600.0,68793.0,181000.0,286000.0
7,model_year,2010.0,2015.0,2015.0,2004.0,2007.0
8,model_name,CAYENNE,RIO,RIO,LACETTI,FABIA
9,number_of_doors,5,4,4,5,5


In [11]:
cat_features = [0, 1, 2, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 21, 27]

### preparing to train models

In [12]:
if os.path.exists("models"):
    pass
else:
    os.mkdir("models")

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns="price"), train["price"], random_state=42, shuffle=True
)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((86525, 112), (86525,), (28842, 112), (28842,))

In [14]:
X_train_no_enc, X_valid_no_enc, y_train_no_enc, y_valid_no_enc = train_test_split(
    train_no_enc.drop(columns="price"), train_no_enc["price"], random_state=42, shuffle=True
)
X_train_no_enc.shape, y_train_no_enc.shape, X_valid_no_enc.shape, y_valid_no_enc.shape

((86525, 27), (86525,), (28842, 27), (28842,))

## tuning

### is it faster on GPU?

![](img/catboost_on_gpu.png)

it is **faster on GPU** even using laptop's intergrated Nvidia GTX 970 M.

### 

### is it better to do manual encoding or keep raw?

In [None]:
model = CatBoostRegressor(task_type="GPU", devices="0", random_state=42).fit(
    X_train_no_enc, np.log(y_train_no_enc), cat_features=cat_features
)

print(
    "no preproc",
    mean_absolute_percentage_error(y_valid, model.predict(X_valid_no_enc)),
)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=20]="old": Cannot convert 'b'old'' to float

In [None]:
model = CatBoostRegressor(task_type="GPU", devices="0", random_state=42).fit(
    X_train, np.log(y_train)
)

print(
    "no preproc",
    mean_absolute_percentage_error(y_valid, model.predict(X_valid)),
)
