Importing GridSearch CV

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import pandas as pd
import joblib


Loading the training data

In [21]:
x_train = pd.read_csv('../data/processed/x_train.csv')
y_log_train = pd.read_csv('../data/processed/y_log_train.csv').squeeze()

Defining the Parameters

In [22]:
param_grid = {
    'max_depth': [3, 5, 6, 7],
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.7, 0.8, 0.9, 1],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0.1, 1, 5, 10]
}

Tuning the Parameters with RandomizedSearch CV

In [23]:
random_search = RandomizedSearchCV(XGBRegressor(), param_grid, cv=10, scoring = 'neg_root_mean_squared_error')
random_search.fit(x_train, y_log_train)
print(random_search.best_params_)
print(-random_search.best_score_)

{'subsample': 0.8, 'reg_lambda': 10, 'reg_alpha': 0, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 1}
44617.122265625


Saving the best model

In [25]:
joblib.dump(random_search.best_estimator_, '../models/xgb_best_model.pkl')

['../models/xgb_best_model.pkl']