[previous file - EDA](2022-03-31_train-test_EDA.ipynb)

## imports

In [None]:
# Optional one-off setup: uncomment to install the third-party libraries imported below.
# !pip install catboost lightgbm xgboost optuna

In [1]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

# Apply seaborn's default styling to the plots in this notebook.
sns.set()
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so upcoming API removals go unnoticed — consider narrowing the filter.
filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


## reading data

In [2]:
# Pre-model train / test tables hosted in the project repository.
TRAIN_URL = "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_train_pre-model.parquet"
TEST_URL = "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_test_pre-model.parquet"

train_raw, test_raw = (pd.read_parquet(url) for url in (TRAIN_URL, TEST_URL))

# Cell output: (rows, columns) of both frames.
(train_raw.shape, test_raw.shape)

((115367, 30), (34686, 28))

In [3]:
def train_or_load(clf, X, y, filepath: str, complevel=9):
    """Return a fitted estimator, reusing a cached copy from disk when one exists.

    If ``filepath`` already exists, the object stored there is loaded and
    returned; ``clf``, ``X`` and ``y`` are not used in that case. Otherwise
    ``clf`` is fitted on ``(X, y)``, dumped to ``filepath`` and returned.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)``.
    X, y : training inputs, passed to ``clf.fit`` unchanged.
    filepath : location of the cached model file.
    complevel : value forwarded to ``joblib.dump`` as ``compress``.

    Returns
    -------
    The loaded object, or ``clf`` after fitting.
    """
    # Imported locally: the notebook's import cell provides neither module, so
    # the helper previously raised NameError on its first call.
    import os

    import joblib

    if os.path.exists(filepath):
        # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
        # Only point this at model files you produced yourself.
        with open(filepath, "rb") as f:
            return joblib.load(f)

    clf.fit(X, y)
    with open(filepath, "wb") as f:
        joblib.dump(clf, f, compress=complevel)
    return clf


# Default template for submission files (same location the helpers always used).
SAMPLE_SUBMISSION_URL = "https://github.com/XelorR/sf_project_6/raw/master/data/sample_submission.csv"


def _write_submission(preds, name, sample_path):
    """Put ``preds`` into the template's "price" column and save it as ``<name>.csv``."""
    submission = pd.read_csv(sample_path)
    submission["price"] = preds
    submission.to_csv(f"{name}.csv", index=False)


def submit(hold_out: pd.DataFrame, model, name="submission", sample_path=SAMPLE_SUBMISSION_URL):
    """Write a submission file from a model's predictions.

    Parameters
    ----------
    hold_out : feature frame passed to ``model.predict``.
    model : object exposing ``predict``.
    name : output path without extension; the file is written to ``<name>.csv``.
    sample_path : path or URL of the sample submission CSV used as the template.
        Defaults to the project's hosted sample file.

    Returns
    -------
    None. The only effect is the CSV written to disk.
    """
    _write_submission(model.predict(hold_out), name, sample_path)


def submit_log(hold_out: pd.DataFrame, model, name="submission", sample_path=SAMPLE_SUBMISSION_URL):
    """Same as ``submit``, but ``np.exp`` is applied to the predictions first.

    Intended for models fitted on ``np.log`` of the target, so the written
    prices are back on the original scale. Parameters and return value match
    ``submit``.
    """
    _write_submission(np.exp(model.predict(hold_out)), name, sample_path)

## encoding

In [4]:
# Tag each frame with its origin so the two parts can be separated again after
# the joint encoding below.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# pd.concat replaces DataFrame.append (removed in pandas 2.0). Row order and
# index labels are the same as with append: train rows first, then test rows.
data = pd.concat([train_raw, test_raw])
# Assign the filled column back; calling fillna(..., inplace=True) on a column
# selection is chained assignment and is not guaranteed to modify `data`.
data["ptc"] = data["ptc"].fillna("Оригинал")

# Cast every object-dtype column to str.
object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Robust-scale every non-object column except the target.
# NOTE(review): each scaler is fitted on train and test rows together — confirm
# this is intended.
for col in set(data.select_dtypes(exclude=("object")).columns) - {"price"}:
    # ravel() hands pandas a 1-D array for the single column being replaced.
    data[col] = RobustScaler().fit_transform(data[col].values.reshape(-1, 1)).ravel()

# Integer-encode the model name.
for col in ["model_name"]:
    data[col] = LabelEncoder().fit_transform(data[col].astype("str"))

# One-hot encode the remaining categorical features.
data = pd.get_dummies(
    data,
    columns=[
        "vehicle_transmission",
        "vendor",
        "brand",
        "fuel_type",
        "body_type",
        "color",
        "ptc",
        "drive",
        "wheel",
        "age_cat",
    ],
)

train = data.loc[data["train/test"] == "train"]

# .copy() makes train_jane an independent frame, so the price adjustment below
# does not write into a slice of `data` (SettingWithCopyWarning).
train_jane = train.loc[train["sample"] == "jane"].copy()
train_sokolov = train.loc[train["sample"] == "sokolov"]
# Prices of the "jane" sample are multiplied by 0.86.
# NOTE(review): the rationale for this factor is not recorded here — document it.
train_jane["price"] = train_jane["price"] * 0.86
# Same row order as before: all "jane" rows, then all "sokolov" rows.
train = pd.concat([train_jane, train_sokolov])

# Drop the bookkeeping / free-text columns that are not model features.
train = train.drop(columns=["sample", "description", "train/test"])
test = data.loc[data["train/test"] == "test"].drop(
    columns=["sample", "description", "train/test", "price"]
)

## modelling

In [5]:
# Shuffled, seeded hold-out split of the training rows; the split size is left
# at train_test_split's default.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns="price"),
    train["price"],
    shuffle=True,
    random_state=42,
)
(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

((86525, 112), (86525,), (28842, 112), (28842,))

## base models

### lightgbm

#### Version 1

In [6]:
# LightGBM, version 1: fitted on the raw (untransformed) price.
# `verbose=-1` replaces the deprecated `silent=True` argument, which newer
# LightGBM releases no longer accept as a constructor parameter.
lightgbm_optuned = LGBMRegressor(
    bagging_fraction=0.9079273070338828,
    bagging_freq=4,
    feature_fraction=0.716472706585253,
    lambda_l1=0.0007127314011370048,
    lambda_l2=1.4991431139899208e-08,
    learning_rate=0.24273738931459424,
    min_child_samples=27,
    num_leaves=129,
    random_state=42,
    verbose=-1,
).fit(X_train, y_train)

lightgbm_optuned 0.1562352982059385


In [11]:
# Validation MAPE of the raw-price LightGBM model, then its submission file.
print(
    "lightgbm_optuned",
    mean_absolute_percentage_error(y_true=y_valid, y_pred=lightgbm_optuned.predict(X_valid)),
)
submit(hold_out=test, model=lightgbm_optuned, name="lightgbm_optuned")

lightgbm_optuned 0.1562352982059385


#### Version 2

In [7]:
# LightGBM, version 2: fitted on log(price); predictions must be passed through
# np.exp before scoring or submitting.
# `verbose=-1` replaces the deprecated `silent=True` argument, which newer
# LightGBM releases no longer accept as a constructor parameter.
lightgbm_optuned_1899 = LGBMRegressor(
    learning_rate=0.2200394016092361,
    lambda_l1=3.6405456215002115e-08,
    lambda_l2=3.9256724979441087,
    num_leaves=251,
    feature_fraction=0.7849386830734889,
    bagging_fraction=0.999471799816821,
    bagging_freq=7,
    min_child_samples=5,
    random_state=42,
    verbose=-1,
).fit(X_train, np.log(y_train))

lightgbm_optuned_1899_log 0.1266083430966481


In [12]:
# The model predicts log(price): exponentiate before computing MAPE on the
# original price scale, then write the submission the same way.
print(
    "lightgbm_optuned_1899_log",
    mean_absolute_percentage_error(
        y_true=y_valid, y_pred=np.exp(lightgbm_optuned_1899.predict(X_valid))
    ),
)
submit_log(hold_out=test, model=lightgbm_optuned_1899, name="lightgbm_optuned_log_1899")

lightgbm_optuned_1899_log 0.1266083430966481


#### Version 3

In [8]:
# LightGBM, version 3: fitted on log(price). Unlike versions 1 and 2, no
# random_state is passed here.
lightgbm_optuned_1258 = LGBMRegressor(
    learning_rate=0.2034225924278744,
    lambda_l1=1.6905457446408715e-07,
    lambda_l2=3.410817513919556,
    num_leaves=237,
    feature_fraction=0.8139002011435048,
    bagging_fraction=0.9996914517711281,
    bagging_freq=2,
    min_child_samples=5,
).fit(X_train, np.log(y_train))

lightgbm_optuned_1258_log 0.12681490786747857


In [13]:
# Score on the original price scale (the model predicts log(price)), then submit.
print(
    "lightgbm_optuned_1258_log",
    mean_absolute_percentage_error(
        y_true=y_valid, y_pred=np.exp(lightgbm_optuned_1258.predict(X_valid))
    ),
)
submit_log(hold_out=test, model=lightgbm_optuned_1258, name="lightgbm_optuned_log_1258")

lightgbm_optuned_1258_log 0.12681490786747857


### xgboost

#### Version 1 (manual)

In [9]:
# XGBoost with hand-picked settings, fitted on log(price).
xbgr_custom = xgb.XGBRegressor(
    # objective and ensemble size
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.05,
    # tree shape, column sampling and regularisation
    max_depth=12,
    colsample_bytree=0.5,
    alpha=1,
    # reproducibility / parallelism
    random_state=42,
    n_jobs=-1,
).fit(X_train, np.log(y_train))

In [10]:
# Score on the original price scale (the model predicts log(price)).
print("xbgr_custom_log", mean_absolute_percentage_error(y_valid, np.exp(xbgr_custom.predict(X_valid))))
# Fixed: the submission is now written from the XGBoost model. It was previously
# written from lightgbm_optuned_1258, so "xbgr_custom_log.csv" held LightGBM predictions.
submit_log(test, xbgr_custom, "xbgr_custom_log")

xbgr_custom_log 0.1196754335163977


### extra trees

#### Version 1 (tuned by hand)

In [14]:
# Extra-trees with hand-tuned settings, fitted on log(price).
etr_cust = ExtraTreesRegressor(
    n_estimators=800,
    min_samples_split=2,
    min_samples_leaf=1,
    # 1.0 = consider all features at every split. This is what 'auto' meant for
    # regressors; the 'auto' string is no longer accepted by current scikit-learn.
    max_features=1.0,
    max_depth=15,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
    verbose=0,
).fit(X_train, np.log(y_train))

In [15]:
# Score on the original price scale (the model predicts log(price)), then submit.
print(
    "etr_cust_log",
    mean_absolute_percentage_error(y_true=y_valid, y_pred=np.exp(etr_cust.predict(X_valid))),
)
submit_log(hold_out=test, model=etr_cust, name="etr_cust_log")
# Score noted from an earlier run: etr_cust_log 0.1376278076519815

etr_cust_log 0.13762780765198146


#### Version 2 (optuna)

In [16]:
# Extra-trees, version 2, fitted on log(price).
etr_tuned_78 = ExtraTreesRegressor(
    n_estimators=936,
    min_samples_split=3,
    min_samples_leaf=1,
    max_samples=0.9894458395539251,
    # 1.0 = consider all features at every split. This is what 'auto' meant for
    # regressors; the 'auto' string is no longer accepted by current scikit-learn.
    max_features=1.0,
    max_depth=None,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
    verbose=0,
).fit(X_train, np.log(y_train))

In [17]:
# Score on the original price scale (the model predicts log(price)), then submit.
print(
    "etr_tuned_78_log",
    mean_absolute_percentage_error(y_true=y_valid, y_pred=np.exp(etr_tuned_78.predict(X_valid))),
)
submit_log(hold_out=test, model=etr_tuned_78, name="etr_tuned_78_log")

etr_tuned_78_log 0.1243004166689037


#### Version 3 (default)

In [18]:
# Baseline: library-default hyper-parameters, fitted on log(price).
# The seed is pinned (42, as in the other models here) so a re-run reproduces
# the same score; without it every execution trained a different forest.
etr_default = ExtraTreesRegressor(random_state=42).fit(X_train, np.log(y_train))

In [19]:
# Score on the original price scale (the model predicts log(price)), then submit.
print(
    "etr_default_log",
    mean_absolute_percentage_error(y_true=y_valid, y_pred=np.exp(etr_default.predict(X_valid))),
)
submit_log(hold_out=test, model=etr_default, name="etr_default_log")

etr_default_log 0.12949125299183772


### random forest

#### Version 1 (tuned by hand)

In [20]:
# Random forest with hand-tuned settings, fitted on log(price).
rf_tuned = RandomForestRegressor(
    # ensemble size and sampling
    n_estimators=800,
    bootstrap=True,
    max_features="log2",
    # tree growth limits
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    # reproducibility
    random_state=42,
).fit(X_train, np.log(y_train))

In [21]:
# Score on the original price scale (the model predicts log(price)), then submit.
print(
    "rf_tuned_log",
    mean_absolute_percentage_error(y_true=y_valid, y_pred=np.exp(rf_tuned.predict(X_valid))),
)
submit_log(hold_out=test, model=rf_tuned, name="rf_tuned_log")
# Score noted from an earlier run: rf_tuned_log 0.1315354887566232

rf_tuned_log 0.1315218319297354


#### Version 2 (optuna)

In [22]:
# Random forest, version 2, fitted on log(price).
rf_optuned_174 = RandomForestRegressor(
    n_estimators=450,
    min_samples_split=4,
    min_samples_leaf=2,
    max_samples=0.9899165552020569,
    # 1.0 = consider all features at every split. This is what 'auto' meant for
    # regressors; the 'auto' string is no longer accepted by current scikit-learn.
    max_features=1.0,
    random_state=42,
    max_depth=None,
    bootstrap=True,
).fit(X_train, np.log(y_train))

In [23]:
# Score on the original price scale (the model predicts log(price)), then submit.
print(
    "rf_optuned_174_log",
    mean_absolute_percentage_error(y_true=y_valid, y_pred=np.exp(rf_optuned_174.predict(X_valid))),
)
submit_log(hold_out=test, model=rf_optuned_174, name="rf_optuned_log_174")

rf_optuned_174_log 0.12683767196612786


#### Version 3 (default)

In [24]:
# Baseline: library-default hyper-parameters, fitted on log(price).
# The seed is pinned (42, as in the other models here) so a re-run reproduces
# the same score; without it every execution trained a different forest.
rf_default = RandomForestRegressor(random_state=42).fit(X_train, np.log(y_train))

In [25]:
# Score on the original price scale (the model predicts log(price)), then submit.
print(
    "rf_default_log",
    mean_absolute_percentage_error(y_true=y_valid, y_pred=np.exp(rf_default.predict(X_valid))),
)
submit_log(hold_out=test, model=rf_default, name="rf_default_log")

rf_default_log 0.1273281116849625
