In [1]:
# This is file for train, prediction

import pandas as pd

dirpath = 'C:/Users/rihot/Desktop/Deep_learning/DACON_used_car_price/'

train = pd.read_csv('data/preprocessed_train.csv')
test = pd.read_csv('data/preprocessed_test.csv')

In [2]:
Y = train[ ['target'] ].values
X = train[ ['title', 'odometer', 'location', 'isimported', 'engine', 'transmission', 'fuel', 'paint', 'year', 'brand' ] ].values

X_test = test[ ['title', 'odometer', 'location', 'isimported', 'engine', 'transmission', 'fuel', 'paint', 'year', 'brand' ] ].values

In [3]:
from sklearn.preprocessing import MinMaxScaler

scalerX = MinMaxScaler()
scalerX.fit(X)
X = scalerX.transform(X)
X_test = scalerX.transform(X_test)

scalerY = MinMaxScaler()
scalerY.fit(Y)
Y = scalerY.transform(Y)

In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna
from optuna import Trial
import xgboost as xgb

def objectiveXGB(trial : Trial, X, Y, test):
    param = {
        "n_estimators" : trial.suggest_int('n_estimators', 200, 2000),  # 생성할 의사 결정 나무 개수 보통 다다익선
        'max_depth':trial.suggest_int('max_depth', 8, 16),  # 트리의 최대 깊이, 커질수록 모델은 복잡해지고 과적합 가능성
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 100),   # sum of instance weight (hessian)의 최솟값을 정의한다. 만약 leaf node의 sum of instance weight 가 이보다 작다면, 분기를 멈춘다. 클수록 보수적인 알고리즘
        'gamma':trial.suggest_int('gamma', 1, 5),   # leaf node의 다음 분기를 만들기 위한 최소 loss reduction. 클수록 보수적
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01),
        'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.5, 1, 0.1),  # ratio of the training instance. 과적합 방지
        'nthread' : -1, # thread 수, -1은 core 전부 활성화
        'tree_method': 'hist',
        'predictor': 'cpu_predictor',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),   # L2 regularization
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0), # L1 regulization
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0] ),
        'random_state': 42
    }
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

    y_train = y_train.reshape(-1, 1)
    y_test  = y_test.reshape(-1, 1)

    model = xgb.XGBRegressor(**param)
    xgb_model = model.fit(X_train, y_train, verbose=False, eval_set=[(X_test, y_test)])
    score = mean_squared_error(xgb_model.predict(X_test), y_test, squared=False)

    return score

In [7]:
from optuna.samplers import TPESampler

study = optuna.create_study(direction='minimize',sampler=TPESampler())
study.optimize(lambda trial : objectiveXGB(trial, X,  Y, X_test), n_trials=50)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

[32m[I 2022-06-02 17:17:47,570][0m A new study created in memory with name: no-name-9e68e1e7-19b7-43e3-bf17-2e5422e2e38a[0m
[32m[I 2022-06-02 17:17:48,773][0m Trial 0 finished with value: 0.06908933392982373 and parameters: {'n_estimators': 584, 'max_depth': 12, 'min_child_weight': 77, 'gamma': 1, 'learning_rate': 0.00913793118775631, 'colsample_bytree': 0.5, 'lambda': 2.6611472648665178, 'alpha': 0.0021738101767669903, 'subsample': 0.8}. Best is trial 0 with value: 0.06908933392982373.[0m
[32m[I 2022-06-02 17:17:49,601][0m Trial 1 finished with value: 0.2165073751821988 and parameters: {'n_estimators': 445, 'max_depth': 13, 'min_child_weight': 20, 'gamma': 4, 'learning_rate': 0.0017468228372165655, 'colsample_bytree': 0.9, 'lambda': 0.041786583136904386, 'alpha': 0.3357441273827223, 'subsample': 1.0}. Best is trial 0 with value: 0.06908933392982373.[0m
[32m[I 2022-06-02 17:17:51,758][0m Trial 2 finished with value: 0.10148546579991967 and parameters: {'n_estimators': 1168, 

Best trial: score 0.05602744204773561,
params {'n_estimators': 649, 'max_depth': 11, 'min_child_weight': 51, 'gamma': 4, 'learning_rate': 0.009230311901897815, 'colsample_bytree': 1.0, 'lambda': 0.19513942663206896, 'alpha': 1.0303265037384561, 'subsample': 0.7}
