In [None]:
!pip install -qq catboost

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.dummy import DummyRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import random
random_state=42
random.seed(random_state)
np.random.seed(random_state)

import warnings
warnings.filterwarnings('ignore')
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
import timeit
from functools import lru_cache

# 1. Data preparation

</div><div class="paragraph"><strong>Features</strong></div><ul><li><em>DateCrawled</em> — date profile was downloaded from the database</li><li><em>VehicleType</em> — vehicle body type</li><li><em>RegistrationYear</em> — vehicle registration year</li><li><em>Gearbox</em> — gearbox type</li><li><em>Power</em> — power (hp)</li><li><em>Model</em> — vehicle model</li><li>Mileage — mileage (measured in km due to dataset's regional specifics)</li><li><em>RegistrationMonth</em> — vehicle registration month</li><li><em>FuelType</em> — fuel type</li><li><em>Brand</em> — vehicle brand</li><li><em>NotRepaired</em> — vehicle repaired or not</li><li><em>DateCreated</em> — date of profile creation</li><li><em>NumberOfPictures</em> — number of vehicle pictures</li><li><em>PostalCode</em> —  postal code of profile owner (user)</li><li><em>LastSeen</em> — date of the last activity of the user</li></ul><div class="paragraph"><strong>Target</strong></div><div class="paragraph"><em>Price</em> — price (Euro)</div></div>

#### Helper functions:

In [3]:
#missing value ratio
def missing_values(df):
    df_nulls=pd.concat([df.dtypes, df.isna().sum(), df.isna().sum()/len(df)], axis=1)
    df_nulls.columns = ["type","count","missing_ratio"]
    df_nulls=df_nulls[df_nulls["count"]>0]
    df_nulls.sort_values(by="missing_ratio", ascending=False)
    return df_nulls

#outliers by 3 sigma rule
def outlier(data):
    data_mean, data_std = np.mean(data), np.std(data)
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    outliers = [x for x in data if x < lower or x > upper]
    outliers_removed = [x for x in data if x >= lower and x <= upper]
    return len(outliers)

# full description statistics 
def describe_full(df, target_name=""):
    data_describe = df.describe().T
    df_numeric = df._get_numeric_data()
    if target_name in df.columns:
        corr_with_target=df_numeric.drop(target_name, axis=1).apply(lambda x: x.corr(df_numeric[target_name]))
        data_describe['corr_with_target']=corr_with_target
    dtype_df = df_numeric.dtypes
    data_describe['dtypes'] = dtype_df
    data_null = df_numeric.isnull().sum()/len(df) * 100
    data_describe['Missing %'] = data_null
    Cardinality = df_numeric.apply(pd.Series.nunique)
    data_describe['Cardinality'] = Cardinality
    df_skew = df_numeric.skew(axis=0, skipna=True)
    data_describe['Skew'] = df_skew
    data_describe['outliers %']=[outlier(df_numeric[col])/len(df) * 100 for col in df_numeric.columns]
    data_describe['kurtosis']=df_numeric.kurtosis()
    return data_describe

def display_classification_report(y_true, y_pred):
    display(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T)


def plot_roc(y_test, preds, ax=None, label='model'):
    with plt.style.context('seaborn-whitegrid'):
        if not ax: fig, ax = plt.subplots(1, 1)
        fpr, tpr, thresholds = roc_curve(y_test, preds)
        ax.plot([0, 1], [0, 1],'r--')
        ax.plot(fpr, tpr, lw=2, label=label)
        ax.legend(loc='lower right')
        ax.set_title(
             'ROC curve\n'
            f""" AP: {average_precision_score(
                y_test, preds, pos_label=1
            ):.2} | """
            f'AUC: {auc(fpr, tpr):.2}')
        ax.set_xlabel('False Positive Rate (FPR)')
        ax.set_ylabel('True Positive Rate (TPR)')
        ax.annotate(f'AUC: {auc(fpr, tpr):.2}', xy=(.43, .025))
        ax.legend()
        ax.grid()
        return ax
    

def plot_pr(y_test, preds, ax=None, label='model'):
    with plt.style.context('seaborn-whitegrid'):
        precision, recall, thresholds = precision_recall_curve(y_test, preds)
        if not ax: fig, ax = plt.subplots()
        ax.plot([0, 1], [1, 0],'r--')    
        ax.plot(recall, precision, lw=2, label=label)
        ax.legend()
        ax.set_title(
            'Precision-recall curve\n'
            f""" AP: {average_precision_score(
                y_test, preds, pos_label=1
            ):.2} | """
            f'AUC: {auc(recall, precision):.2}'
        )
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_xlim(-0.05, 1.05)
        ax.set_ylim(-0.05, 1.05)
        ax.legend()
        ax.grid()
        return ax

def show_feature_importances(df, features, target):
  X, y = df[features].values,df[target].values
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
  rfc = DecisionTreeRegressor().fit(X_train, y_train)
  y_pred = rfc.predict(X_test)
  df_feature_importances = pd.DataFrame(((zip(features, rfc.feature_importances_)))).rename(columns={0:"feature",1:"coeff"}).sort_values(by="coeff", ascending = False )
  sns.barplot(data=df_feature_importances, x=df_feature_importances["coeff"], y=df_feature_importances["feature"])
  return df_feature_importances

In [4]:
df =  pd.read_csv("autos.csv")

In [5]:
target = "Price"
features = list(set(df.columns)-set(target))

#### type conversion

In [6]:
df["DateCrawled"] = pd.to_datetime(df["DateCrawled"])
df["DateCreated"] = pd.to_datetime(df["DateCreated"])
df["LastSeen"] = pd.to_datetime(df["LastSeen"])

#### imputing

In [7]:
df["NotRepaired"] = df["NotRepaired"].fillna('yes')
df["NotRepaired"] = (df["NotRepaired"] == 'yes').astype('int')

In [8]:
df["Gearbox"] = (df["Gearbox"] == "auto").astype("int")

let's impute others categorical features by model depends on others values and without target (for avoiding leakage in the reaulting model)

In [9]:
df.isna().sum()

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox                  0
Power                    0
Model                19705
Kilometer                0
RegistrationMonth        0
FuelType             32895
Brand                    0
NotRepaired              0
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64

In [10]:
def impute_value(in_df, features, target):
  encoders=dict()
  df = in_df.copy()
  for col in df[features].select_dtypes('object').columns:
    df.loc[df[col].isna(), col] = "None"
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le  
  for col in df[features].select_dtypes('datetime64').columns:
    df[f"{col}_hour"] = df[col].dt.hour
    df[f"{col}_month"] = df[col].dt.month
    df[f"{col}_day"] = df[col].dt.day  
    del df[col]
  features = list(set(df.columns)-set([target]))
  train_df = df[~df[target].isna()]
  test_df = df[df[target].isna()]
  let = LabelEncoder()
  y_train = let.fit_transform(train_df[target])
  y_train = train_df[target].values
  X_train, X_test = train_df[features].values, test_df[features].values
  if len(X_test)==0:
    return in_df
  model = DecisionTreeClassifier().fit(X_train, y_train)
  y_pred = model.predict(X_test)
  df.loc[df[target].isna(), target] = y_pred
  in_df[target] = df[target]
  return in_df

In [11]:
for col in ["FuelType", "VehicleType", "Model"]:
  df = impute_value(df, features=list(set(df.columns)-set([col])-set([target])), target=col)

In [12]:
df.isna().sum()

DateCrawled          0
Price                0
VehicleType          0
RegistrationYear     0
Gearbox              0
Power                0
Model                0
Kilometer            0
RegistrationMonth    0
FuelType             0
Brand                0
NotRepaired          0
DateCreated          0
NumberOfPictures     0
PostalCode           0
LastSeen             0
dtype: int64

In [13]:
describe_full(df, target_name=target)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,corr_with_target,dtypes,Missing %,Cardinality,Skew,outliers %,kurtosis
Price,354369.0,4416.656776,4514.158514,0.0,1050.0,2700.0,6400.0,20000.0,,int64,0.0,3731,1.430609,1.547539,1.410638
RegistrationYear,354369.0,2004.234448,90.227958,1000.0,1999.0,2003.0,2008.0,9999.0,0.026916,int64,0.0,151,74.621005,0.045715,6083.101952
Gearbox,354369.0,0.187051,0.389953,0.0,0.0,0.0,0.0,1.0,0.257725,int32,0.0,2,1.605071,0.0,0.576256
Power,354369.0,110.094337,189.850405,0.0,69.0,105.0,143.0,20000.0,0.158872,int64,0.0,712,60.157196,0.101307,4624.103219
Kilometer,354369.0,128211.172535,37905.34153,5000.0,125000.0,150000.0,150000.0,150000.0,-0.333199,int64,0.0,13,-1.724119,2.150019,1.925124
RegistrationMonth,354369.0,5.714645,3.726421,0.0,3.0,6.0,9.0,12.0,0.110581,int64,0.0,13,0.081207,0.0,-1.148239
NotRepaired,354369.0,0.898259,0.302308,0.0,1.0,1.0,1.0,1.0,0.186431,int32,0.0,2,-2.634799,0.0,4.942192
NumberOfPictures,354369.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,int64,0.0,1,0.0,0.0,0.0
PostalCode,354369.0,50508.689087,25783.096248,1067.0,30165.0,49413.0,71083.0,99998.0,0.076055,int64,0.0,8143,0.077054,0.0,-0.965296


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   DateCrawled        354369 non-null  datetime64[ns]
 1   Price              354369 non-null  int64         
 2   VehicleType        354369 non-null  object        
 3   RegistrationYear   354369 non-null  int64         
 4   Gearbox            354369 non-null  int32         
 5   Power              354369 non-null  int64         
 6   Model              354369 non-null  object        
 7   Kilometer          354369 non-null  int64         
 8   RegistrationMonth  354369 non-null  int64         
 9   FuelType           354369 non-null  object        
 10  Brand              354369 non-null  object        
 11  NotRepaired        354369 non-null  int32         
 12  DateCreated        354369 non-null  datetime64[ns]
 13  NumberOfPictures   354369 non-null  int64   

# 2. Model training

let's use CV technics and 10% for a final test set. 
we gonna see 3 models - CatBoost, XGBoost, and LGBMBoost

In [15]:
def get_data(df, transform_data=True, apply_encoding=False):
  in_df = df.copy()
  target = "Price"
  features = list(set(in_df.columns)-set([target]))
  if transform_data:
    for col in in_df[features].select_dtypes('datetime64').columns:
      in_df[f"{col}_hour"] = in_df[col].dt.hour
      in_df[f"{col}_month"] = in_df[col].dt.month
      in_df[f"{col}_day"] = in_df[col].dt.day  
      del in_df[col]
  features = list(set(in_df.columns)-set([target]))
  encoders = dict()
  if apply_encoding:
    for col in in_df[features].select_dtypes('object').columns:
      lbl = LabelEncoder().fit(in_df[col].values)
      in_df[col] = lbl.transform(in_df[col].values)
      encoders[col] = lbl
  features = list(set(in_df.columns)-set([target]))
  cat_features = list(in_df[features].select_dtypes('object').columns)
  return in_df[features].values, in_df[target].values, features, target, encoders, cat_features

In [16]:
rmse_func = lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False)
rmsle  = make_scorer(rmse_func, greater_is_better=False)

In [17]:
def eval_model(model, X_train, X_test, y_train, y_test):
  model.fit(X_train, y_train, eval_metric='rmse', verbose = False, eval_set = [(X_test, y_test)])
  y_pred = model.predict(X_test)
  print("RMSE", rmse_func(y_test, y_pred))

In [18]:
log_metrics = {"models": ["catboost", "xgboost", "LGBM"], "rmse_init": [0.0]*3, "rmse_cv": [0.0]*3}
m_idx = {"catboost":0, "xgboost":1, "LGBM":2}

CatBoost

In [None]:
X, y, features, target, _, cat_features = get_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)
train_ds = Pool(data=X_train, label=y_train, cat_features=cat_features, feature_names=features)
test_ds = Pool(data=X_test, label=y_test, cat_features=cat_features, feature_names=features)
full_ds = Pool(data=X, label=y, cat_features=cat_features, feature_names=features)

In [None]:
model = CatBoostRegressor(iterations=20, task_type="GPU", devices='0:1', random_seed=random_state, loss_function='RMSE', has_time=True)
model.fit(train_ds, verbose = 0)
y_pred = model.predict(test_ds)
print("RMSE", rmse_func(y_test, y_pred))
print("time {}".format(timeit.timeit()))

In [None]:
log_metrics["rmse_init"][m_idx["catboost"]] = rmse_func(y_test, y_pred)

CatBoost, GridSearch

In [None]:
param_grid = {
        'learning_rate': [0.03, 0.1],
        'depth': [6, 10],
        'l2_leaf_reg': [3, 5, 7, 9],
        'has_time': [True]        
}

model = CatBoostRegressor(iterations=20, loss_function='RMSE', task_type="GPU", devices='0:1', random_seed=random_state)
grid_search_result = model.grid_search(param_grid, 
                                       full_ds,
                                       verbose=0,
                                       partition_random_seed=random_state,
                                       search_by_train_test_split=True,
                                       train_size=0.9,
                                       plot=False)
print("time {}".format(timeit.timeit()))

In [None]:
cv_data = pd.DataFrame(grid_search_result["cv_results"])
best_value = cv_data['test-RMSE-mean'].min()
best_iter = cv_data['test-RMSE-mean'].values.argmin()

print('Best validation RMSE score : {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-RMSE-std'][best_iter],
    best_iter)
)
print("time {}".format(timeit.timeit()))

In [None]:
model = CatBoostRegressor(iterations=20, loss_function='RMSE', task_type="GPU", devices='0:1', random_seed=random_state, **grid_search_result["params"])
model.fit(train_ds, verbose = 1, eval_set = [(X_test, y_test)], use_best_model=True)
y_pred = model.predict(test_ds)
print("RMSE", rmse_func(y_test, y_pred))
print("time {}".format(timeit.timeit()))

In [None]:
log_metrics["rmse_cv"][m_idx["catboost"]] = rmse_func(y_test, y_pred)

XGBoost

In [None]:
X, y, features, target, encoders, _ = get_data(df, apply_encoding=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

In [None]:
model = XGBRegressor(tree_method='gpu_hist', gpu_id=0, random_state=random_state, objective='reg:squarederror')
eval_model(model, X_train, X_test, y_train, y_test)
print("time {}".format(timeit.timeit()))

In [None]:
y_pred = model.predict(X_test)
log_metrics["rmse_init"][m_idx["xgboost"]] = rmse_func(y_test, y_pred)

XGBoost, CV

In [None]:
param_grid = {
        'learning_rate': [0.03, 0.1],
        'max_depth': [4, 6, 10],
        'objective':['reg:squarederror']
        
}
model = XGBRegressor(tree_method='gpu_hist', gpu_id=0, random_state=random_state)

grid = GridSearchCV(model,
                        param_grid,
                        cv = 5,
                        n_jobs = 5,
                        verbose=False,
                        scoring=rmsle)

grid.fit(X, y)

print(grid.best_params_)
print("time {}".format(timeit.timeit()))

In [None]:
model = XGBRegressor(tree_method='gpu_hist', gpu_id=0, random_state=random_state, **grid.best_params_)
eval_model(model, X_train, X_test, y_train, y_test)
print("time {}".format(timeit.timeit()))

In [None]:
y_pred = model.predict(X_test)
log_metrics["rmse_cv"][m_idx["xgboost"]] = rmse_func(y_test, y_pred)
print("time {}".format(timeit.timeit()))

LGBMRegressor

In [19]:
X, y, features, target, encoders, cat_features = get_data(df, apply_encoding=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

In [20]:
model = LGBMRegressor(objective="RMSE", random_state=random_state, verbose=1)
eval_model(model, X_train, X_test, y_train, y_test)
print("time {}".format(timeit.timeit()))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1102
[LightGBM] [Info] Number of data points in the train set: 318932, number of used features: 19
[LightGBM] [Info] Start training from score 4414.279718
RMSE 1857.9140379681626
time 0.035925899999710964


In [21]:
y_pred = model.predict(X_test)
log_metrics["rmse_init"][m_idx["LGBM"]] = rmse_func(y_test, y_pred)
print("time {}".format(timeit.timeit()))

time 0.03680789999998524


In [22]:
param_grid = {
        'learning_rate': [0.03, 0.1],
        'max_depth': [4, 6, 10]
}

model = LGBMRegressor(objective="RMSE", random_state=random_state, verbose=0)

grid = GridSearchCV(model,
                        param_grid,
                        cv = 5,
                        n_jobs = 5,
                        verbose=False)

grid.fit(X, y)

print(grid.best_params_)
print("time {}".format(timeit.timeit()))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
{'learning_rate': 0.1, 'max_depth': 10}
time 0.03153349999956845


In [23]:
model = LGBMRegressor(objective="RMSE", random_state=random_state, verbose=0, **grid.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
log_metrics["rmse_cv"][m_idx["LGBM"]] = rmse_func(y_test, y_pred)
print("{}".format(timeit.timeit()))

You can set `force_col_wise=true` to remove the overhead.
0.029403300000012678


In [23]:
import joblib
joblib.dump(model, 'model.pkl')
joblib.dump(encoders,'encoders.pkl')
joblib.dump(features,'features.pkl')

['features.pkl']

In [24]:
ans_predict = model.predict(X_test)
print(ans_predict)
print(y_test)

[14648.15616072 15019.97772019  2380.51897701 ...  2058.70506602
  2417.22514819  1902.65088539]
[19999 14250  2850 ...  1990   950  1500]


In [25]:
i=1
print(X_test[i])
ans_predict_ = model.predict(X_test[i].reshape(1, -1))
print(ans_predict_)
print(y_test[i])

[    3    25     1     6    28 45770     3     0  2011    25     3     2
    29     0   140     7     1 70000    12     0    11]
[15019.97772019]
14250


In [26]:
encoders

{'Model': LabelEncoder(),
 'FuelType': LabelEncoder(),
 'VehicleType': LabelEncoder(),
 'Brand': LabelEncoder()}

In [27]:
features

['DateCreated_month',
 'DateCrawled_day',
 'NotRepaired',
 'RegistrationMonth',
 'Model',
 'PostalCode',
 'DateCrawled_month',
 'DateCreated_hour',
 'RegistrationYear',
 'DateCreated_day',
 'LastSeen_month',
 'FuelType',
 'LastSeen_day',
 'NumberOfPictures',
 'Power',
 'VehicleType',
 'Brand',
 'Kilometer',
 'DateCrawled_hour',
 'Gearbox',
 'LastSeen_hour']

In [30]:
# Assuming 'encoders' is the dictionary you posted
brand_encoder = encoders['Brand'] # get the LabelEncoder for 'Brand' attribute
brand_value = 'audi' # categorical value for 'Brand'
brand_encoded_value = brand_encoder.transform([brand_value]) # convert to encoded numerical value
print(brand_encoded_value) #

[1]


# 3. Model analysis

In [None]:
pd.DataFrame(log_metrics)

Best result is on the XGBoost, and LGBM has weighted result in both cases