### **Regression**
Regression with various models (currently XGBoost), followed by conversion of the point forecasts into an uncertainty submission.

In [55]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000

from myConfig import *
import gc
from myUtils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
# Name of the preprocessed dataset; the file on disk is "data/<data_name>.pickle".
data_name = "data_v1"
# Build the path from data_name so that switching dataset versions only needs
# the single assignment above (the file name used to be hard-coded separately).
data_path = os.path.join('data', data_name + ".pickle")
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project. The original comment mentioned combining split files on read;
# this call reads one pickle file as-is.
data = pd.read_pickle(data_path)

In [59]:
# Column the model is trained to predict.
target = "item_cnt"

# Key selecting which feature list to use.
# NOTE(review): feature_config is not defined in this notebook -- presumably it
# comes from `from myConfig import *`; confirm there.
feature_ver = "features_v1"
features = feature_config[feature_ver]

# Echo the configuration so the run is self-describing.
print("*** target ***", f"{target} \n", sep="\n")
print("*** features ***")
display(features)

*** target ***
item_cnt 

*** features ***


['item_id',
 'dept_id',
 'cat_id',
 'store_id',
 'state_id',
 'wday',
 'month',
 'year',
 'event_name_1',
 'event_type_1',
 'event_name_2',
 'event_type_2',
 'snap_CA',
 'snap_TX',
 'snap_WI',
 'price-median',
 'price-mean',
 'price-max',
 'price-min']

### **XGBoost Regression**
https://rdrr.io/cran/xgboost/man/xgb.cv.html

In [60]:
# Identifier reused for the saved model file, the submission CSVs and the
# Kaggle submission message.
model_name = f"XGBoost_v1_with_{feature_ver}"

#### [ training ]

In [68]:
import xgboost as xgb

# --- training matrices -------------------------------------------------------
# Boolean mask of the rows flagged as training data. Kept as a notebook-level
# name because the prediction cell reads it too.
train_part = data["data_part"] == "train"
trainX = data.loc[train_part, features]
trainY = data.loc[train_part, target]

# --- booster configuration ---------------------------------------------------
# NOTE(review): newer XGBoost releases deprecate 'gpu_hist' in favour of
# tree_method='hist' with device='cuda' -- confirm against the installed version.
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="gpu_hist",
)

# --- fit ---------------------------------------------------------------------
dtrain = xgb.DMatrix(trainX, trainY)
model = xgb.train(params, dtrain, num_boost_round=100)

# Cross-validation alternative (disabled). Note that xgb.cv returns the
# evaluation history rather than a trained booster, so its result cannot be
# used by the prediction cell below.
#
# model = xgb.cv(params,
#                xgb.DMatrix(trainX,trainY),
#                early_stopping_rounds=10,
#                nfold=4)

# --- persist -----------------------------------------------------------------
model_path = os.path.join("models", model_name + ".mdl")
with open(model_path, "wb") as fout:
    pickle.dump(model, fout)

# Release the large training objects before the next cell runs.
del trainX, trainY, dtrain
gc.collect()

199

#### [ prediction ]

In [53]:
# Load the trained model back from disk so this cell can run without retraining.
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# were written by this notebook.
# Requires `xgb` (xgboost) to be imported already; the import lives in the
# training cell.
model_path = os.path.join("models", model_name + ".mdl")
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Predict for every row that is not training data (the later cells split these
# rows into "validation" and "evaluation" via the data_part column).
# The mask is computed here instead of reusing `train_part` from the training
# cell, so the cell does not depend on that cell having been executed.
is_test = ~(data["data_part"] == "train")

# .loc + .copy() yields an independent frame; assigning the prediction column
# into a chained-indexing slice (data[mask][cols]) raises SettingWithCopyWarning.
test = data.loc[is_test, ["id", "d", "data_part"] + features].copy()
test[target] = model.predict(xgb.DMatrix(test[features]))

### **Uncertainty prediction by Logit transform**

In [54]:
def _to_wide_forecast(long_df, part, value_col, horizon=28):
    """Pivot the long-format predictions of one data part into submission layout.

    Parameters
    ----------
    long_df : DataFrame with at least the columns "id", "d", "data_part" and
        `value_col`.
    part : value of "data_part" to select (e.g. "validation" or "evaluation").
    value_col : name of the column holding the predicted values.
    horizon : number of forecast days; the result gets columns F1..F<horizon>.

    Returns
    -------
    DataFrame with one row per id and the columns ["id", "F1", ..., "F<horizon>"].

    Raises
    ------
    ValueError (from pandas) if the selected part does not contain exactly
    `horizon` distinct "d" values, or if an (id, d) pair is duplicated.
    """
    wide = (
        long_df[long_df["data_part"] == part]
        .pivot(index="id", columns="d", values=value_col)
        .reset_index()
    )
    # Columns are renamed by position.
    # NOTE(review): assumes the pivoted "d" labels come out in chronological
    # order -- confirm for the label format used in `data`.
    wide.columns = ["id"] + [f"F{day}" for day in range(1, horizon + 1)]
    return wide


valid_data = _to_wide_forecast(test, "validation", target)
display(valid_data)

eval_data = _to_wide_forecast(test, "evaluation", target)
# The pattern is a regular expression (note the "$" anchor), so regex=True must
# be passed explicitly: from pandas 2.0 the default is regex=False, which would
# treat "_validation$" literally and leave every id unchanged.
eval_data["id"] = eval_data["id"].str.replace("_validation$", "_evaluation", regex=True)
display(eval_data)

sub = pd.concat([valid_data, eval_data])

# save point prediction
sub_path = os.path.join("submission_point", model_name + ".csv")
sub.to_csv(sub_path, index=False)

del test, valid_data, eval_data
gc.collect()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.442428,0.438844,0.438844,0.438844,0.397512,1.164466,1.160786,0.615066,0.652956,...,0.406303,1.187602,1.092621,0.442428,0.438844,0.438844,0.438844,0.397512,1.164466,1.060694
1,FOODS_1_001_CA_2_validation,0.984606,0.981023,0.981023,0.981023,1.009583,1.621427,1.563851,1.028209,1.130039,...,1.018374,1.644564,1.624722,0.984606,0.981023,0.981023,0.981023,1.009583,1.621427,1.592794
2,FOODS_1_001_CA_3_validation,1.654810,1.651227,1.651227,1.651227,1.853045,2.616794,2.661069,1.722167,1.722725,...,1.861556,2.561553,2.541432,1.654810,1.651227,1.651227,1.651227,1.853045,2.616794,2.588161
3,FOODS_1_001_CA_4_validation,0.088337,0.084753,0.084753,0.084753,0.092517,0.232465,0.215737,0.233282,0.300818,...,0.092165,0.306952,0.277968,0.088337,0.084753,0.084753,0.084753,0.092517,0.232465,0.203832
4,FOODS_1_001_TX_1_validation,-0.790279,-0.793862,-0.793862,-0.793862,-0.516204,-0.210501,-0.269441,-0.775526,-0.730070,...,-0.538636,-0.184450,-0.173891,-0.790279,-0.793862,-0.793862,-0.793862,-0.516204,-0.210501,-0.177509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2_validation,0.438779,0.402187,0.402187,0.402187,0.440337,0.489097,0.522049,0.449514,0.393478,...,0.460599,0.477949,0.489019,0.438779,0.402187,0.402187,0.402187,0.440337,0.489097,0.479905
30486,HOUSEHOLD_2_516_TX_3_validation,0.091139,0.066820,0.066820,0.066820,0.145216,0.208677,0.172912,0.072013,0.011354,...,0.096062,0.197529,0.169743,0.091139,0.066820,0.066820,0.066820,0.145216,0.208677,0.230044
30487,HOUSEHOLD_2_516_WI_1_validation,0.232729,0.247541,0.247541,0.247541,0.338210,0.373703,0.305404,0.213604,0.241624,...,0.338605,0.362555,0.302235,0.232729,0.247541,0.247541,0.247541,0.338210,0.373703,0.312987
30488,HOUSEHOLD_2_516_WI_2_validation,0.395666,0.381392,0.381392,0.381392,0.472061,0.381065,0.341271,0.427622,0.426555,...,0.472456,0.369918,0.309597,0.395666,0.381392,0.381392,0.381392,0.472061,0.381065,0.320349


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_evaluation,0.442428,0.438844,0.438844,0.438844,0.397512,1.164466,1.060694,0.544125,0.438844,...,0.582900,1.244503,1.140730,0.577988,0.610844,0.654029,0.531219,0.489887,1.212575,1.108803
1,FOODS_1_001_CA_2_evaluation,0.984606,0.981023,0.981023,0.981023,1.009583,1.621427,1.592794,1.086303,0.981023,...,1.065935,1.701464,1.672831,1.184107,1.153023,1.260149,1.073398,1.101958,1.669536,1.640904
2,FOODS_1_001_CA_3_evaluation,1.654810,1.651227,1.651227,1.651227,1.853045,2.616794,2.588161,1.767761,1.651227,...,2.011530,2.618173,2.589540,1.753039,1.744849,1.750703,1.743601,1.945420,2.664903,2.636271
3,FOODS_1_001_CA_4_evaluation,0.088337,0.084753,0.084753,0.084753,0.092517,0.232465,0.203832,0.149614,0.084753,...,0.198860,0.354710,0.326078,0.253544,0.308104,0.380936,0.177128,0.184892,0.280574,0.251942
4,FOODS_1_001_TX_1_evaluation,-0.790279,-0.793862,-0.793862,-0.793862,-0.516204,-0.210501,-0.177509,-0.717747,-0.793862,...,-0.448460,-0.158773,-0.125781,-0.603998,-0.575792,-0.525041,-0.658332,-0.380673,-0.162392,-0.129400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2_evaluation,0.438779,0.402187,0.402187,0.402187,0.440337,0.489097,0.479905,0.446147,0.402187,...,0.455803,0.513315,0.504124,0.412917,0.384622,0.365178,0.395769,0.433920,0.504201,0.495009
30486,HOUSEHOLD_2_516_TX_3_evaluation,0.091139,0.066820,0.066820,0.066820,0.145216,0.208677,0.230044,0.125808,0.066820,...,0.173977,0.163480,0.184847,0.091537,0.092412,0.056071,0.103559,0.181955,0.223781,0.245148
30487,HOUSEHOLD_2_516_WI_1_evaluation,0.232729,0.247541,0.247541,0.247541,0.338210,0.373703,0.312987,0.240097,0.247541,...,0.323815,0.378055,0.317339,0.239520,0.229976,0.243185,0.241123,0.331793,0.388807,0.328091
30488,HOUSEHOLD_2_516_WI_2_evaluation,0.395666,0.381392,0.381392,0.381392,0.472061,0.381065,0.320349,0.403034,0.381392,...,0.381347,0.385417,0.324701,0.297634,0.281580,0.294788,0.270151,0.360820,0.396169,0.335454


1854

In [48]:
# Attach the hierarchy columns (item / dept / cat / store / state) to every
# forecast row, plus a constant "_all_" column holding "Total".
sales = pd.read_csv(os.path.join("rawdata", "sales_train_validation.csv"))

hierarchy_cols = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
# The join is an inner join (the pandas default), so any row of `sub` whose id
# does not occur in the sales file is dropped.
# NOTE(review): if that file only contains "*_validation" ids, the
# "*_evaluation" rows built in the previous cell are discarded here -- confirm
# that this is intended.
sub = sub.merge(sales[hierarchy_cols], on="id", how="inner")
sub["_all_"] = "Total"

# The raw sales table is no longer needed; free it.
del sales
gc.collect()

0

In [49]:
# Turn the point-forecast frame into the frame that is written out as the
# uncertainty submission in the next cell.
# NOTE(review): point2unc is not defined in this notebook -- presumably it comes
# from `from myUtils import *`; confirm there what input columns it expects and
# what it returns.
# `sub` is rebound to the result, so this cell is not idempotent: re-running it
# without first re-running the preceding cells feeds the already-converted
# frame back into point2unc.
sub = point2unc(sub)
sub

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_0.005_validation,-1.522239,-1.509909,-1.509909,-1.509909,-1.367699,-4.006517,-3.993855,-2.116226,-2.246591,...,-1.397947,-4.086121,-3.759326,-1.522239,-1.509909,-1.509909,-1.509909,-1.367699,-4.006517,-3.649474
1,FOODS_1_001_CA_2_0.005_validation,-3.387684,-3.375354,-3.375354,-3.375354,-3.473619,-5.578758,-5.380661,-3.537705,-3.888067,...,-3.503866,-5.658365,-5.590096,-3.387684,-3.375354,-3.375354,-3.375354,-3.473619,-5.578758,-5.480245
2,FOODS_1_001_CA_3_0.005_validation,-5.693618,-5.681289,-5.681289,-5.681289,-6.375677,-9.003468,-9.155803,-5.925369,-5.927291,...,-6.404960,-8.813403,-8.744172,-5.693618,-5.681289,-5.681289,-5.681289,-6.375677,-9.003468,-8.904953
3,FOODS_1_001_CA_4_0.005_validation,-0.303936,-0.291607,-0.291607,-0.291607,-0.318317,-0.799830,-0.742276,-0.802640,-1.035010,...,-0.317108,-1.056115,-0.956390,-0.303936,-0.291607,-0.291607,-0.291607,-0.318317,-0.799830,-0.701315
4,FOODS_1_001_TX_1_0.005_validation,2.719073,2.731401,2.731401,2.731401,1.776075,0.724260,0.927052,2.668312,2.511916,...,1.853258,0.634626,0.598296,2.719073,2.731401,2.731401,2.731401,1.776075,0.724260,0.610746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771115,WI_2_HOBBIES_0.995_evaluation,1420.231573,1401.429074,1401.429074,1401.429074,1598.935276,1675.175641,1594.192415,1502.465668,1565.032586,...,1712.909476,1636.709430,1550.856397,1420.231573,1401.429074,1401.429074,1401.429074,1598.935276,1675.175641,1475.348514
771116,WI_2_HOUSEHOLD_0.995_evaluation,3479.220607,3341.950003,3341.950003,3341.950003,3789.503481,3952.352566,3856.857360,3767.892750,3702.070194,...,3814.598120,3989.120713,3752.178413,3478.683426,3341.412821,3341.412821,3341.412821,3788.966300,3951.815385,3689.241684
771117,WI_3_FOODS_0.995_evaluation,7846.067114,7451.298100,7451.298100,7460.041681,8406.029238,10344.161621,11134.145689,9621.266831,10094.787711,...,9132.641036,11484.586118,11977.802524,7847.468237,7452.699223,7452.699223,7461.442804,8407.430361,10345.562744,10117.128404
771118,WI_3_HOBBIES_0.995_evaluation,846.317594,832.588601,832.588601,832.588601,1031.514879,1456.196993,1374.089424,929.837889,1001.137467,...,1151.007892,1425.566846,1340.684015,850.519861,836.790869,836.790869,836.790869,1035.717094,1460.399208,1260.225685


In [51]:
import subprocess

# Write the uncertainty submission to disk.
sub_path = os.path.join("submission_uncertainty", model_name + ".csv")
sub.to_csv(sub_path, index=False)

# Upload the file with the Kaggle command-line client. The arguments are passed
# as a list (no shell), and check_output raises CalledProcessError when the
# client exits with a non-zero status.
# NOTE: running this cell performs a real submission every time.
command = ["kaggle", "competitions", "submit"]
command += ["-c", "m5-forecasting-uncertainty"]
command += ["-f", sub_path]
command += ["-m", model_name]
subprocess.check_output(command)

b'Successfully submitted to M5 Forecasting - Uncertainty'