## **LightGBM Quantile Regression**

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000
import gc

from tqdm import tqdm
from myUtils import *
from feature_generator import feature_v1, feature_v2, feature_v3, feature_v4
import xgboost as xgb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### [make features]

In [3]:
###################################################################
# Build the feature set, or load it from the on-disk cache
###################################################################
# The feature generator version decides both the features and the cache file
# name (feature_v1.FeaturesMaker_v1 is the older alternative).
feature_maker = feature_v4.FeaturesMaker_v4(target_col="item_cnt")

base_data = "sales_train_eval_365"
data_path = os.path.join("mydata", "_".join([base_data, feature_maker.name]) + ".pickle")

# `data` is indexed later as data["train"], data["validation"] and
# data["evaluation"], each holding features at [0] and targets at [1].
if not os.path.exists(data_path):
    # Cache miss: generate the features from the base pickle and store them.
    raw_path = os.path.join("mydata", base_data + ".pickle")
    data = feature_maker.make_feature(pd.read_pickle(raw_path))

    with open(data_path, "wb") as cache_file:
        pickle.dump(data, cache_file)
else:
    # Cache hit: reuse the previously generated features.
    with open(data_path, "rb") as cache_file:
        data = pickle.load(cache_file)
    print("data loaded")

-- column check completed --
  columns are satisfied
--  features_ver4  --
dim: 36
N: 11982570
-----------------


#### [training]
Train one LightGBM quantile-regression model for each target quantile.

In [5]:
import lightgbm as lgb

# Directory that holds one saved booster per quantile. The prediction cell
# reads `model_dir` as well, so the name must stay.
model_dir = os.path.join("models","RightGBM_quantileReg")
os.makedirs(model_dir, exist_ok=True)

# Quantile levels to fit; one independent model is trained per level.
qlist = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])
for q in tqdm(qlist):
    model_path = os.path.join(model_dir,"RightGBM_"+f"{q:.3f}"+"_"+feature_maker.name+".mdl")

    # Models are cached on disk: skip quantiles that were already trained.
    if os.path.exists(model_path):
        print("model already exists...")
        continue

    print("start training RightGBM")
    # `min_child_samples` is the LightGBM name for the minimum number of
    # samples per leaf. The original passed the scikit-learn style names
    # `min_samples_leaf` / `min_samples_split`; the latter is not a LightGBM
    # parameter, so it is dropped here.
    model = lgb.LGBMRegressor(objective='quantile',
                              alpha=q,              # quantile level targeted by the pinball loss
                              n_estimators=1000,
                              learning_rate=.1,
                              min_child_samples=9)

    # The training set is included in eval_set only so that its loss is logged
    # next to the validation loss.
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments are
    # only accepted by LightGBM < 4.0; newer releases expect callbacks.
    # Confirm the pinned LightGBM version before upgrading.
    model.fit(X=data["train"][0], y=data["train"][1],
              eval_set=[data["train"],data["validation"]],
              early_stopping_rounds=100,
              verbose=True,
              )
    model.booster_.save_model(model_path)

print("  -- completed\n")

  0%|                                                                                            | 0/9 [00:00<?, ?it/s]

start training RightGBM
[1]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[3]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[4]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[5]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[6]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[7]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[8]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[9]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[10]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[11]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[12]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[13]	training's quantile: 0.00649824	valid_1's quantile: 0.0072141
[14]	training's quantile: 0.00649824	valid_1's quant

 11%|█████████▎                                                                          | 1/9 [01:00<08:05, 60.74s/it]

start training RightGBM
[1]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[3]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[4]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[5]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[6]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[7]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[8]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[9]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[10]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[11]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[12]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[13]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705
[14]	training's quantile: 0.0324912	valid_1's quantile: 0.0360705

 22%|██████████████████▋                                                                 | 2/9 [02:01<07:04, 60.68s/it]

start training RightGBM
[1]	training's quantile: 0.214092	valid_1's quantile: 0.237551
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.213786	valid_1's quantile: 0.23709
[3]	training's quantile: 0.213517	valid_1's quantile: 0.236678
[4]	training's quantile: 0.213289	valid_1's quantile: 0.236312
[5]	training's quantile: 0.213087	valid_1's quantile: 0.235984
[6]	training's quantile: 0.212918	valid_1's quantile: 0.2357
[7]	training's quantile: 0.212772	valid_1's quantile: 0.235449
[8]	training's quantile: 0.21264	valid_1's quantile: 0.235223
[9]	training's quantile: 0.212542	valid_1's quantile: 0.235033
[10]	training's quantile: 0.212453	valid_1's quantile: 0.234862
[11]	training's quantile: 0.212374	valid_1's quantile: 0.234708
[12]	training's quantile: 0.212305	valid_1's quantile: 0.234571
[13]	training's quantile: 0.212233	valid_1's quantile: 0.234398
[14]	training's quantile: 0.212169	valid_1's quantile: 0.234242
[15]	training's quantile: 0.212

 33%|████████████████████████████                                                        | 3/9 [03:09<06:17, 62.87s/it]

start training RightGBM
[1]	training's quantile: 0.323549	valid_1's quantile: 0.358
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.322344	valid_1's quantile: 0.355571
[3]	training's quantile: 0.321285	valid_1's quantile: 0.353395
[4]	training's quantile: 0.320372	valid_1's quantile: 0.351468
[5]	training's quantile: 0.319479	valid_1's quantile: 0.349473
[6]	training's quantile: 0.318905	valid_1's quantile: 0.347973
[7]	training's quantile: 0.31822	valid_1's quantile: 0.346147
[8]	training's quantile: 0.317825	valid_1's quantile: 0.344788
[9]	training's quantile: 0.317505	valid_1's quantile: 0.343584
[10]	training's quantile: 0.317216	valid_1's quantile: 0.342517
[11]	training's quantile: 0.316899	valid_1's quantile: 0.341532
[12]	training's quantile: 0.316661	valid_1's quantile: 0.340696
[13]	training's quantile: 0.316163	valid_1's quantile: 0.33988
[14]	training's quantile: 0.315994	valid_1's quantile: 0.339172
[15]	training's quantile: 0.3158

 44%|████████████████████████████████████▉                                              | 4/9 [06:44<09:03, 108.70s/it]

start training RightGBM
[1]	training's quantile: 0.632561	valid_1's quantile: 0.695669
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.617223	valid_1's quantile: 0.672618
[3]	training's quantile: 0.604037	valid_1's quantile: 0.652036
[4]	training's quantile: 0.591995	valid_1's quantile: 0.63326
[5]	training's quantile: 0.581759	valid_1's quantile: 0.616962
[6]	training's quantile: 0.573951	valid_1's quantile: 0.60408
[7]	training's quantile: 0.567332	valid_1's quantile: 0.592184
[8]	training's quantile: 0.561147	valid_1's quantile: 0.580494
[9]	training's quantile: 0.555928	valid_1's quantile: 0.570725
[10]	training's quantile: 0.550946	valid_1's quantile: 0.561228
[11]	training's quantile: 0.546879	valid_1's quantile: 0.553268
[12]	training's quantile: 0.542708	valid_1's quantile: 0.545263
[13]	training's quantile: 0.539725	valid_1's quantile: 0.538649
[14]	training's quantile: 0.537016	valid_1's quantile: 0.533506
[15]	training's quantile: 0.5

 56%|██████████████████████████████████████████████                                     | 5/9 [08:11<06:48, 102.20s/it]

start training RightGBM
[1]	training's quantile: 0.772826	valid_1's quantile: 0.815244
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.732394	valid_1's quantile: 0.762586
[3]	training's quantile: 0.698019	valid_1's quantile: 0.716573
[4]	training's quantile: 0.668417	valid_1's quantile: 0.675791
[5]	training's quantile: 0.643907	valid_1's quantile: 0.641349
[6]	training's quantile: 0.624332	valid_1's quantile: 0.613433
[7]	training's quantile: 0.607676	valid_1's quantile: 0.589219
[8]	training's quantile: 0.594411	valid_1's quantile: 0.569428
[9]	training's quantile: 0.583234	valid_1's quantile: 0.552547
[10]	training's quantile: 0.573834	valid_1's quantile: 0.537714
[11]	training's quantile: 0.566212	valid_1's quantile: 0.525674
[12]	training's quantile: 0.559549	valid_1's quantile: 0.515593
[13]	training's quantile: 0.554602	valid_1's quantile: 0.5078
[14]	training's quantile: 0.550495	valid_1's quantile: 0.501081
[15]	training's quantile: 0.5

 67%|████████████████████████████████████████████████████████                            | 6/9 [09:35<04:49, 96.48s/it]

start training RightGBM
[1]	training's quantile: 0.727216	valid_1's quantile: 0.75264
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.677056	valid_1's quantile: 0.689829
[3]	training's quantile: 0.635196	valid_1's quantile: 0.635788
[4]	training's quantile: 0.601789	valid_1's quantile: 0.592328
[5]	training's quantile: 0.574208	valid_1's quantile: 0.555647
[6]	training's quantile: 0.551454	valid_1's quantile: 0.524884
[7]	training's quantile: 0.533102	valid_1's quantile: 0.500506
[8]	training's quantile: 0.519071	valid_1's quantile: 0.481685
[9]	training's quantile: 0.507561	valid_1's quantile: 0.465661
[10]	training's quantile: 0.498392	valid_1's quantile: 0.453103
[11]	training's quantile: 0.490954	valid_1's quantile: 0.442215
[12]	training's quantile: 0.48478	valid_1's quantile: 0.433327
[13]	training's quantile: 0.479946	valid_1's quantile: 0.426168
[14]	training's quantile: 0.475769	valid_1's quantile: 0.42072
[15]	training's quantile: 0.47

 78%|████████████████████████████████████████████████████████████████▌                  | 7/9 [11:29<03:23, 101.97s/it]

start training RightGBM
[1]	training's quantile: 0.348643	valid_1's quantile: 0.344537
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.308589	valid_1's quantile: 0.296459
[3]	training's quantile: 0.278795	valid_1's quantile: 0.261573
[4]	training's quantile: 0.255045	valid_1's quantile: 0.234847
[5]	training's quantile: 0.2372	valid_1's quantile: 0.214844
[6]	training's quantile: 0.223408	valid_1's quantile: 0.200005
[7]	training's quantile: 0.212364	valid_1's quantile: 0.188702
[8]	training's quantile: 0.203495	valid_1's quantile: 0.179721
[9]	training's quantile: 0.195744	valid_1's quantile: 0.172068
[10]	training's quantile: 0.189467	valid_1's quantile: 0.165969
[11]	training's quantile: 0.18355	valid_1's quantile: 0.160031
[12]	training's quantile: 0.178694	valid_1's quantile: 0.155249
[13]	training's quantile: 0.174541	valid_1's quantile: 0.151094
[14]	training's quantile: 0.170725	valid_1's quantile: 0.147568
[15]	training's quantile: 0.16

 89%|█████████████████████████████████████████████████████████████████████████▊         | 8/9 [15:59<02:32, 152.16s/it]

start training RightGBM
[1]	training's quantile: 0.145821	valid_1's quantile: 0.138692
Training until validation scores don't improve for 100 rounds
[2]	training's quantile: 0.127546	valid_1's quantile: 0.118105
[3]	training's quantile: 0.114505	valid_1's quantile: 0.104455
[4]	training's quantile: 0.105079	valid_1's quantile: 0.0947955
[5]	training's quantile: 0.0972618	valid_1's quantile: 0.0871868
[6]	training's quantile: 0.0910216	valid_1's quantile: 0.0813553
[7]	training's quantile: 0.0859984	valid_1's quantile: 0.0767727
[8]	training's quantile: 0.0819796	valid_1's quantile: 0.0730615
[9]	training's quantile: 0.0781788	valid_1's quantile: 0.06957
[10]	training's quantile: 0.074752	valid_1's quantile: 0.0663042
[11]	training's quantile: 0.0719073	valid_1's quantile: 0.0637394
[12]	training's quantile: 0.0692018	valid_1's quantile: 0.06112
[13]	training's quantile: 0.0666453	valid_1's quantile: 0.058907
[14]	training's quantile: 0.0645891	valid_1's quantile: 0.0569063
[15]	trainin

100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [19:42<00:00, 131.35s/it]


  -- completed



#### [prediction]

In [6]:
import lightgbm as lgb

# Output directory for the per-quantile prediction CSVs. Named `pred_dir`
# so the built-in `dir` is not shadowed.
pred_dir = os.path.join("submission_uncertainty","RightGBM_quantileReg")
os.makedirs(pred_dir, exist_ok=True)


def predict_wide(model, features, target_index, value_col):
    """Predict with `model` and reshape the result to one row per id.

    Parameters
    ----------
    model : object exposing ``predict(features)`` (here a ``lgb.Booster``).
    features : feature table passed straight to ``model.predict``.
    target_index : index of the matching target data; after ``reset_index``
        it must provide the columns ``"id"`` and ``"d"`` used for the pivot.
    value_col : name given to the prediction column before pivoting.

    Returns
    -------
    pandas.DataFrame with columns ``id, F1 .. F28``.

    The predictions are placed in a NEW frame. The original cell wrote them
    into ``data[...][1]`` in place, which silently destroyed the target
    values for any later cell (or re-run of the training cell).
    """
    long_df = pd.DataFrame(model.predict(features),
                           index=target_index,
                           columns=[value_col]).reset_index()
    wide_df = pd.pivot(long_df,
                       index="id",
                       columns="d",
                       values=value_col).reset_index()
    # Assumes the pivot yields exactly 28 day columns in chronological order.
    wide_df.columns = ["id"] + [f"F{i}" for i in range(1, 29)]
    return wide_df


print("start prediction")
qlist = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])
for q in tqdm(qlist):

    # Shared file-name stem for this quantile's model and prediction files.
    stem = "RightGBM_"+f"{q:.3f}"+"_"+feature_maker.name
    model = lgb.Booster(model_file=os.path.join(model_dir,stem+".mdl"))

    # validation quantile predict
    valid = predict_wide(model,
                         data["validation"][0],
                         data["validation"][1].index,
                         feature_maker.target_col)
    valid["id"] = valid["id"].str.replace("_evaluation","_validation")
    # The row index is written too (pandas default), keeping the on-disk
    # format unchanged for the submission cell that reads these files.
    valid.to_csv(os.path.join(pred_dir,stem+"_valid.csv"))

    # evaluation quantile predict
    evalu = predict_wide(model,
                         data["evaluation"][0],
                         data["evaluation"][1].index,
                         feature_maker.target_col)
    evalu.to_csv(os.path.join(pred_dir,stem+"_evalu.csv"))


print("  -- completed")

start prediction


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:59<00:00,  6.57s/it]


  -- completed


#### [submission]

In [7]:
# Hierarchy keys for every series id; merged onto the predictions before
# they are handed to get_agg_df.
sales_train_val = pd.read_csv(os.path.join("rawdata","sales_train_evaluation.csv"))
sales_train_val = sales_train_val[["id","item_id","dept_id","cat_id","store_id","state_id"]]

# Directory holding the per-quantile prediction CSVs. Named `sub_dir` so the
# built-in `dir` is not shadowed.
sub_dir = os.path.join("submission_uncertainty","RightGBM_quantileReg")
os.makedirs(sub_dir, exist_ok=True)

cols = ["id"] + [f"F{i}" for i in range(1, 29)]


def load_aggregated(csv_name, id_suffix, restore_eval_ids=False):
    """Load one prediction CSV, aggregate it and tag the ids with `id_suffix`.

    Parameters
    ----------
    csv_name : file name inside ``sub_dir``.
    id_suffix : string appended to every aggregated id
        (``"_<quantile>_validation"`` or ``"_<quantile>_evaluation"``).
    restore_eval_ids : when True, ``"_validation"`` in the ids is turned back
        into ``"_evaluation"`` so they match the ids of ``sales_train_val``.

    Returns
    -------
    The frame returned by ``get_agg_df`` with a ``"quantile"`` column added
    and ``"id"`` extended by `id_suffix`.
    """
    frame = pd.read_csv(os.path.join(sub_dir,csv_name))
    if restore_eval_ids:
        frame["id"] = frame["id"].str.replace("_validation","_evaluation")
    frame = pd.merge(frame,sales_train_val,on="id",how="left")
    frame = get_agg_df(frame)
    frame["quantile"] = id_suffix
    frame["id"] = frame["id"].str.cat(frame["quantile"])
    return frame


qlist = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])

# Collect the pieces and concatenate once. Growing `sub` with pd.concat
# inside the loop re-copies everything accumulated so far on each iteration.
parts = []
for q in tqdm(qlist):
    stem = "RightGBM_"+f"{q:.3f}"+"_"+feature_maker.name

    valid = load_aggregated(stem+"_valid.csv",
                            "_"+f"{q:.3f}"+"_validation",
                            restore_eval_ids=True)
    evalu = load_aggregated(stem+"_evalu.csv",
                            "_"+f"{q:.3f}"+"_evaluation")

    parts.extend([valid,evalu])

sub = pd.concat(parts)
# Keep only the submission columns; helper columns such as "quantile" are dropped.
sub = sub[cols]
sub = sub.set_index("id",drop=True)

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:11<00:00,  1.28s/it]


In [8]:
# The sample submission supplies the expected row order of the final file.
sample_path = os.path.join("submission_uncertainty","sample_submission.csv")
upred_sample = pd.read_csv(sample_path,index_col=0)

# Select the rows in the sample's id order (.loc raises KeyError when an id
# is missing from `sub`) and write the submission file.
submission_path = os.path.join("submission_uncertainty","RightGBM_quantileReg_"+feature_maker.name+".csv")
ordered_sub = sub.loc[upred_sample.index]
ordered_sub.to_csv(submission_path)
sub

Unnamed: 0_level_0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Total_X_0.005_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CA_X_0.005_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TX_X_0.005_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
WI_X_0.005_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CA_1_X_0.005_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HOUSEHOLD_2_516_TX_2_0.995_evaluation,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,...,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656,2.014656
HOUSEHOLD_2_516_TX_3_0.995_evaluation,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,...,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799,2.109799
HOUSEHOLD_2_516_WI_1_0.995_evaluation,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,...,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804,1.989804
HOUSEHOLD_2_516_WI_2_0.995_evaluation,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,...,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782,1.797782


In [None]:
from scipy.stats import norm

# Scratch cell. The original called norm(loc) with `loc` never defined, which
# raises NameError on a fresh kernel. The distribution parameters are now
# explicit (standard normal).
# NOTE(review): the intended use of this cell is not visible in the notebook;
# adjust loc/scale or delete the cell if it is no longer needed.
standard_normal = norm(loc=0.0, scale=1.0)
print(standard_normal)