In [31]:
!pip install seaborn -q
!pip install catboost -q

import numpy as np
import pandas as pd

In [32]:
# Load the training table used by the CatBoost experiment.
catboost_data = pd.read_csv(filepath_or_buffer="./train.csv")

In [33]:
# Column roles for the CatBoost experiment.
# Columns handed to CatBoost as categorical features.
cat_features = [
    "model",
    "car_type",
    "fuel_type",
]
# Label columns; they are excluded when the feature list is built.
targets = [
    "target_class",
    "target_reg",
]
# Columns removed from the feature list in addition to the targets.
drop_feat = ["car_id"]

In [34]:
# Model inputs: every column that is neither a target nor a dropped column.
filtered_features = [
    col for col in catboost_data.columns if col not in targets + drop_feat
]
# Whatever is left after removing the categorical columns is treated as numeric.
num_features = [col for col in filtered_features if col not in cat_features]

# Quick summary of the column split (only the count is shown for numerics).
print("cat_features", cat_features)
print("num_features", len(num_features))
print("targets", targets)

cat_features ['model', 'car_type', 'fuel_type']
num_features 11
targets ['target_class', 'target_reg']


In [35]:
# Cast all categorical columns to str in one step.
catboost_data[cat_features] = catboost_data[cat_features].astype(str)

In [36]:
from sklearn.model_selection import train_test_split

# Feature matrix: the filtered columns; dropping the targets is a safety net
# (errors="ignore" makes it a no-op when they are already absent).
X = catboost_data[filtered_features].drop(columns=targets, errors="ignore")
# Regression label.
y = catboost_data["target_reg"]

In [37]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

# Baseline CatBoost regressor with library defaults, tracked with RMSE.
model = CatBoostRegressor(eval_metric="RMSE", cat_features=cat_features)

# No eval_set is passed here, so best_score_ only reports the "learn" metric.
model.fit(X_train, y_train, verbose=500, plot=False)

print(model.best_score_)

Learning rate set to 0.045195
0:	learn: 17.1862834	total: 18.7ms	remaining: 18.7s
500:	learn: 7.8920293	total: 9.13s	remaining: 9.09s
999:	learn: 6.0539782	total: 18s	remaining: 0us
{'learn': {'RMSE': 6.053978188487934}}


In [None]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

# Hand-tuned CatBoost regressor.
cbr = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.09,
    depth=4,
    l2_leaf_reg=5,
    max_bin=190,
    subsample=0.5,
    colsample_bylevel=0.99,
    cat_features=cat_features,
)

# The hold-out split is passed as eval_set, so progress lines and best_score_
# report the metric on it as well as on the training data.
cbr.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    verbose=500,
    plot=False,
)

print(cbr.best_score_)

0:	learn: 16.8709193	test: 17.5903213	best: 17.5903213 (0)	total: 10.2ms	remaining: 10.2s
500:	learn: 8.6450532	test: 12.2367538	best: 11.9853255 (111)	total: 6.04s	remaining: 6.02s


In [None]:
# Score the competition test file with the tuned CatBoost model.
test = pd.read_csv('test.csv')

# Same preprocessing as for training: categorical columns become strings.
test[cat_features] = test[cat_features].astype(str)

# Select the training feature columns (targets dropped if they are present).
x_test = test[filtered_features].drop(columns=targets, errors="ignore")

y_pred = cbr.predict(x_test)

# Write car_id / prediction pairs to the submission file.
catboost_res = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
catboost_res.to_csv('catboost_res.csv', index=False)

In [None]:
# LGBM


!pip install lightgbm -q

import lightgbm as lgb

# Fresh copy of the training data for the LightGBM experiment.
df = pd.read_csv("train.csv")

# Categorical inputs, and the columns that must not be used as features.
cat_cols = ["car_type", "fuel_type", "model"]
drop_cols = ["car_id", "target_reg", "target_class"]

# Features are everything except the id and the two targets.
X = df.drop(columns=drop_cols)
y = df["target_reg"]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column with its own LabelEncoder.
# A single shared encoder is overwritten on every fit, so only the mapping of
# the last column would survive; keeping one fitted encoder per column
# preserves every value -> code mapping (e.g. for inverse_transform or for
# encoding new data with the same codes).
label_encoders = {}
for col in cat_cols:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder
# `label_encoder` stays bound (to the encoder of the last column) for any
# cell that still refers to that name.

In [None]:
# Hold out 20% of the encoded rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from lightgbm import Dataset

# Native LightGBM datasets for the train and hold-out splits.
# free_raw_data=False keeps the raw data attached to each Dataset.
# NOTE(review): no later cell visible here uses train_data / val_data.
train_data = Dataset(
    X_train,
    label=y_train,
    categorical_feature=cat_cols,
    free_raw_data=False,
)

val_data = Dataset(
    X_test,
    label=y_test,
    categorical_feature=cat_cols,
    free_raw_data=False,
)

In [None]:
from lightgbm import train ,LGBMRegressor


# Baseline LightGBM regressor: defaults everywhere, RMSE as the metric.
reg = LGBMRegressor(metric="RMSE")

# The hold-out split is the only evaluation set.
reg.fit(X_train, y_train, eval_set=[(X_test, y_test)])

print(reg.best_score_)

In [None]:
# Hand-tuned LightGBM regressor.
reg = LGBMRegressor(
    n_estimators=40,
    learning_rate=0.09,
    num_leaves=8,
    metric="RMSE",
)

# Categorical columns are declared by name through fit()'s documented
# `categorical_feature` argument. Hard-coded positions ([0, 1, 2]) passed via a
# constructor alias silently point at the wrong columns if the column order of
# the training frame ever changes.
reg.fit(
    X_train,
    y_train,
    eval_set=[
        (X_test, y_test),
    ],
    categorical_feature=cat_cols,
)

print(reg.best_score_)

In [None]:
# Score the competition test file with the LightGBM model.
test = pd.read_csv("./test.csv")

drop_cols = ["car_id", "target_class"]
x_test = test.drop(drop_cols, axis=1)

# Encode the categorical columns with the mapping learned from the TRAINING
# data. Re-fitting an encoder on the test values would assign codes from the
# test set's own sorted labels, which need not match the codes the model was
# trained on. LabelEncoder sorts its classes, so fitting on the raw training
# column `df[col]` reproduces exactly the codes used for X.
# transform() raises ValueError on a label that never occurred in training,
# which is preferable to silently mis-encoding it.
for col in cat_cols:
    train_encoder = LabelEncoder().fit(df[col])
    x_test[col] = train_encoder.transform(x_test[col])

y_pred = reg.predict(x_test)

# Write car_id / prediction pairs to the submission file.
LGBMres = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
LGBMres.to_csv('LGBM_res.csv', index=False)

In [None]:
# XGBoost

!pip install xgboost -q
import xgboost as xgb
import warnings; warnings.filterwarnings("ignore")

In [None]:
# Fresh copy of the training data for the XGBoost experiment.
xgb_data = pd.read_csv("./train.csv")
drop_cols = ['car_id', 'target_reg', 'target_class']
cat_cols = ['car_type', 'fuel_type', 'model']

# Features: everything except id/targets, with the categorical columns
# converted to pandas 'category' dtype in a single astype call.
X = xgb_data.drop(columns=drop_cols).astype({name: 'category' for name in cat_cols})
y = xgb_data['target_reg']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import mean_poisson_deviance, mean_squared_error

# Baseline XGBoost regressor with native categorical support.
# `early_stopping_rounds` is given to the constructor: passing it to fit() is
# deprecated since XGBoost 1.6 and rejected by 2.0+, and this cell already
# relies on the 1.6+ style of passing a callable `eval_metric` here.
# NOTE(review): early stopping monitors Poisson deviance while the number
# reported below is RMSE - confirm that this mismatch is intended.
reg = xgb.XGBRegressor(
    tree_method="hist",
    eval_metric=mean_poisson_deviance,
    enable_categorical=True,
    n_estimators=30,
    n_jobs=-1,
    early_stopping_rounds=10,
)
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=3,
)

# Predict with the trees up to and including the best early-stopping iteration.
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
# Hold-out RMSE (displayed as the cell output).
mean_squared_error(y_test, y_pred) ** 0.5

In [None]:
# Hand-tuned XGBoost regressor with native categorical support.
# `early_stopping_rounds` is given to the constructor: passing it to fit() is
# deprecated since XGBoost 1.6 and rejected by 2.0+, and this cell already
# relies on the 1.6+ style of passing a callable `eval_metric` here.
reg = xgb.XGBRegressor(
    tree_method="hist",
    eval_metric=mean_poisson_deviance,
    enable_categorical=True,
    n_estimators=80,
    n_jobs=-1,
    min_child_weight=16,
    max_bin=128,
    reg_alpha=275,
    reg_lambda=275,
    early_stopping_rounds=10,
)

reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=3,
)

# Predict with the trees up to and including the best early-stopping iteration.
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
# Hold-out RMSE (displayed as the cell output).
mean_squared_error(y_test, y_pred) ** 0.5

In [None]:
# Score the competition test file with the XGBoost model.
test = pd.read_csv('./test.csv')

drop_cols = ['car_id', 'target_class']
x_test = test.drop(drop_cols, axis=1)

# Reuse the categorical dtype of the training frame for every categorical
# column. A plain astype('category') derives the category list from the test
# values alone, so the integer codes behind each label can differ from the ones
# the model was trained on. Casting to the training dtype keeps the codes
# identical; labels unseen in training become NaN (missing).
for col in cat_cols:
    x_test[col] = x_test[col].astype(X[col].dtype)

y_pred = reg.predict(x_test)

# Write car_id / prediction pairs to the submission file.
XGBoostres = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
XGBoostres.to_csv('XGBoost_res.csv', index=False)