This notebook fits an XGBRegressor and dumps its predictions (as second-level features) to files for later ensembling.

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

import time
import sys
import gc
import pickle
# Display the version of the Python interpreter running this kernel.
sys.version_info

sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)

## Read data from file, select features.

In [2]:
# Columns kept for modelling. The order is significant: it becomes the feature
# order seen by the model. Group labels below are inferred from column names.
selected_columns = [
    # keys and the target (item_cnt_month)
    'date_block_num',
    'shop_id',
    'item_id',
    'item_cnt_month',
    # descriptor codes
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
    # lagged target
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
    # rolling mean / std of sales
    'item_avg_sale_last_6',
    'item_std_sale_last_6',
    'item_avg_sale_last_12',
    'item_std_sale_last_12',
    'shop_avg_sale_last_6',
    'shop_std_sale_last_6',
    'shop_avg_sale_last_12',
    'shop_std_sale_last_12',
    'category_avg_sale_last_12',
    'category_std_sale_last_12',
    'city_avg_sale_last_12',
    'city_std_sale_last_12',
    'type_avg_sale_last_12',
    'type_std_sale_last_12',
    'subtype_avg_sale_last_12',
    'subtype_std_sale_last_12',
    # lagged per-month averages
    'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    # price trend
    'delta_price_lag',
    # calendar
    'month',
    'year',
    # first / last sale features
    'item_shop_last_sale',
    'item_last_sale',
    'item_shop_first_sale',
    'item_first_sale',
]

# Load the prepared frame and keep only the columns listed above.
# NOTE: unpickling can execute arbitrary code; only load files this project produced.
data = pd.read_pickle('../output/data.pkl').loc[:, selected_columns]

## Set up validation strategy

The validation strategy holds out month 34 as the test set and month 33 as the validation set, and trains on months 13–32.

In [4]:
# Row masks for the time-based split on date_block_num:
# blocks before 33 -> train, block 33 -> validation, block 34 -> test.
train_rows = data.date_block_num < 33
valid_rows = data.date_block_num == 33
test_rows = data.date_block_num == 34

# Name of the target column; it is removed from every feature matrix.
target_col = 'item_cnt_month'

X_train = data.loc[train_rows].drop(columns=[target_col])
Y_train = data.loc[train_rows, target_col]

X_valid = data.loc[valid_rows].drop(columns=[target_col])
Y_valid = data.loc[valid_rows, target_col]

# No Y_test is built here: only the features of block 34 are kept.
X_test = data.loc[test_rows].drop(columns=[target_col])

# Release the full frame (and the temporaries above) before training.
del data, train_rows, valid_rows, test_rows, target_col
gc.collect();

## XGBoost

In [5]:
# Wall-clock start; the elapsed seconds are displayed as the cell's output.
ts = time.time()

# Hyper-parameters for the gradient-boosted regressor.
xgb_params = dict(
    max_depth=7,
    n_estimators=1000,      # upper bound; early stopping may end training sooner
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    gamma=0.005,
    eta=0.1,
    seed=42,
)
model = XGBRegressor(**xgb_params)

# RMSE is reported on both sets every 10 rounds. The validation set is listed
# last, and the training log shows that its metric drives early stopping.
eval_pairs = [(X_train, Y_train), (X_valid, Y_valid)]
model.fit(
    X_train,
    Y_train,
    eval_set=eval_pairs,
    eval_metric="rmse",
    early_stopping_rounds=40,
    verbose=10,
)

time.time() - ts

[0]	validation_0-rmse:1.15343	validation_1-rmse:1.12026
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[10]	validation_0-rmse:0.903358	validation_1-rmse:0.944456
[20]	validation_0-rmse:0.849589	validation_1-rmse:0.912442
[30]	validation_0-rmse:0.832662	validation_1-rmse:0.906915
[40]	validation_0-rmse:0.824349	validation_1-rmse:0.905784
[50]	validation_0-rmse:0.81851	validation_1-rmse:0.907106
[60]	validation_0-rmse:0.814083	validation_1-rmse:0.907069
[70]	validation_0-rmse:0.810025	validation_1-rmse:0.906016
[80]	validation_0-rmse:0.806955	validation_1-rmse:0.905895
[90]	validation_0-rmse:0.804255	validation_1-rmse:0.9049
[100]	validation_0-rmse:0.801693	validation_1-rmse:0.9047
[110]	validation_0-rmse:0.79933	validation_1-rmse:0.904087
[120]	validation_0-rmse:0.797564	validation_1-rmse:0.90373
[130]	validation_0-rmse:0.795422	validation_1-rmse:0.902287
[140]	validation_0-rm

4407.859105825424

In [6]:
# Predict on the validation and test features, clamping to the range [0, 20].
Y_pred = np.clip(model.predict(X_valid), 0, 20)
Y_test = np.clip(model.predict(X_test), 0, 20)

# Validation predictions, saved as second-level training features.
# IDs are positional row numbers (0..n-1).
valid_ids = np.arange(len(Y_pred))
X_train_level2 = pd.DataFrame({"ID": valid_ids, "item_cnt_month": Y_pred})
X_train_level2.to_csv('../output/xgb_valid.csv', index=False)

# Test predictions in submission layout.
# NOTE(review): positional IDs assume X_test rows are in submission ID order — confirm.
test_ids = np.arange(len(Y_test))
submission = pd.DataFrame({"ID": test_ids, "item_cnt_month": Y_test})
submission.to_csv('../output/xgb_submission.csv', index=False)

public score: 0.916