## **LightGBM Quantile Regression**

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000
import gc

from tqdm import tqdm
from myUtils import *
from feature_generator import feature_v1, feature_v2, feature_v3, feature_v4
import xgboost as xgb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### [make features]

In [3]:
###################################################################
# make features
##################################################################
# The engineered feature set is cached as a pickle under "mydata"; it is only
# rebuilt from the base pickle when that cache file does not exist yet.
#feature_maker = feature_v1.FeaturesMaker_v1(target_col="item_cnt")
feature_maker = feature_v4.FeaturesMaker_v4(target_col="item_cnt")

base_data = "sales_train_eval_365"
data_path = os.path.join("mydata", "_".join([base_data, feature_maker.name]) + ".pickle")

if not os.path.exists(data_path):
    # Cache miss: load the base data, run the feature maker, then store the result.
    data = pd.read_pickle(os.path.join("mydata", base_data + ".pickle"))
    data = feature_maker.make_feature(data)

    with open(data_path, "wb") as f:
        pickle.dump(data, f)
else:
    # Cache hit: reuse the previously built features.
    # NOTE(review): pickle.load executes arbitrary code; only load files you produced yourself.
    with open(data_path, "rb") as f:
        data = pickle.load(f)
    print("data loaded")

-- column check completed --
  columns are satisfied
--  features_ver4  --
dim: 36
N: 11982570
-----------------


#### [training]
Train one LightGBM quantile-regression model for each target quantile.

In [4]:
# Train one LightGBM quantile regressor per target quantile and save each
# fitted booster under `model_dir`. A quantile whose model file already exists
# is skipped, so re-running this cell only trains what is missing.
import lightgbm as lgb

model_dir = os.path.join("models","RightGBM_quantileReg")
# exist_ok=True replaces the separate exists() check before creating the folder.
os.makedirs(model_dir, exist_ok=True)

qlist = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])
for q in tqdm(qlist):
    model_path = os.path.join(model_dir,"RightGBM_"+f"{q:.3f}"+"_"+feature_maker.name+".mdl")

    if os.path.exists(model_path):
        print("model already exists...")
        continue

    print("start training RightGBM")
    # With objective='quantile', `alpha` is the quantile level being fitted.
    # `min_child_samples` is LGBMRegressor's own argument for the minimum
    # number of samples per leaf. The scikit-learn-style `min_samples_leaf` /
    # `min_samples_split` names used previously are not LGBMRegressor
    # constructor arguments, and `min_samples_split` has no LightGBM
    # counterpart, so it is dropped.
    model = lgb.LGBMRegressor(objective='quantile',
                              alpha=q,
                              n_estimators=1000,
                              learning_rate=.1,
                              min_child_samples=9)

    # data["train"] / data["validation"] are indexed as (features, target).
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments are
    # only accepted by older LightGBM releases; newer ones expect callbacks.
    model.fit(X=data["train"][0], y=data["train"][1],
              eval_set=[data["train"],data["validation"]],
              early_stopping_rounds=100,
              verbose=True)
    model.booster_.save_model(model_path)

print("  -- completed\n")

TypeError: stat: path should be string, bytes, os.PathLike or integer, not builtin_function_or_method

#### [prediction]

In [None]:
# For every quantile, load the saved booster, predict the validation and
# evaluation splits, and write one wide CSV (id, F1..F28) per quantile and split.
import lightgbm as lgb


def _predict_wide(model, split):
    """Predict one ``(features, target)`` split and pivot it to one row per id.

    The predictions go into a new frame that only borrows the target's index,
    so the targets stored in ``data`` are never overwritten and this cell can
    be re-run (or training re-run afterwards) without corrupting them.
    Assumes the target index provides the "id" and "d" levels used by the
    pivot, and that there are exactly 28 distinct "d" values -- TODO confirm.
    """
    features, target = split[0], split[1]
    pred = pd.DataFrame(model.predict(features),
                        index=target.index,
                        columns=[feature_maker.target_col])
    wide = pd.pivot(pred.reset_index(),
                    index="id",
                    columns="d",
                    values=feature_maker.target_col)
    wide = wide.reset_index()
    wide.columns = ["id"] + [f"F{i}" for i in range(1, 29)]
    return wide


# Named `pred_dir` rather than `dir` so the builtin dir() is not shadowed.
pred_dir = os.path.join("submission_uncertainty","RightGBM_quantileReg")
os.makedirs(pred_dir, exist_ok=True)

print("start prediction")
qlist = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])
for q in tqdm(qlist):
    # Shared file stem for this quantile's model file and prediction CSVs.
    stem = "RightGBM_"+f"{q:.3f}"+"_"+feature_maker.name

    # `model_dir` is defined by the training cell.
    model_path = os.path.join(model_dir, stem+".mdl")
    model = lgb.Booster(model_file=model_path)

    # validation quantile predict
    valid = _predict_wide(model, data["validation"])
    valid["id"] = valid["id"].str.replace("_evaluation","_validation")
    valid.to_csv(os.path.join(pred_dir, stem+"_valid.csv"))

    # evaluation quantile predict
    evalu = _predict_wide(model, data["evaluation"])
    evalu.to_csv(os.path.join(pred_dir, stem+"_evalu.csv"))


print("  -- completed")

#### [submission]

In [None]:
# Read the per-quantile prediction CSVs back, attach the item hierarchy
# columns, aggregate with get_agg_df, tag every id with its quantile and
# split, and stack everything into one table indexed by id.
import lightgbm as lgb

sales_train_val = pd.read_csv(os.path.join("rawdata","sales_train_evaluation.csv"))
sales_train_val = sales_train_val[["id","item_id","dept_id","cat_id","store_id","state_id"]]

# Named `pred_dir` rather than `dir` so the builtin dir() is not shadowed.
pred_dir = os.path.join("submission_uncertainty","RightGBM_quantileReg")
os.makedirs(pred_dir, exist_ok=True)

cols = ["id"] + [f"F{i}" for i in range(1, 29)]

qlist = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])

# Collect the pieces and concatenate once after the loop; concatenating onto a
# growing frame inside the loop re-copies all earlier rows on every pass.
parts = []
for q in tqdm(qlist):
    stem = "RightGBM_"+f"{q:.3f}"+"_"+feature_maker.name

    valid = pd.read_csv(os.path.join(pred_dir, stem+"_valid.csv"))
    # The hierarchy table uses "_evaluation" ids, so map back before merging.
    valid["id"] = valid["id"].str.replace("_validation","_evaluation")
    valid = pd.merge(valid,sales_train_val,on="id",how="left")
    # get_agg_df is not defined in this notebook (presumably from myUtils) -- TODO confirm its output schema.
    valid = get_agg_df(valid)
    valid["quantile"] = "_"+f"{q:.3f}"+"_validation"
    valid["id"] = valid["id"].str.cat(valid["quantile"])

    evalu = pd.read_csv(os.path.join(pred_dir, stem+"_evalu.csv"))
    evalu = pd.merge(evalu,sales_train_val,on="id",how="left")
    evalu = get_agg_df(evalu)
    evalu["quantile"] = "_"+f"{q:.3f}"+"_evaluation"
    evalu["id"] = evalu["id"].str.cat(evalu["quantile"])

    parts.extend([valid, evalu])

sub = pd.concat(parts)
# Keep only the id and the 28 forecast columns.
sub = sub[cols]
sub = sub.set_index("id",drop=True)

In [None]:
# Reorder the rows to follow the sample submission's index and write the final
# submission file; the .loc lookup fails if any sample id is missing from `sub`.
upred_sample = pd.read_csv(
    os.path.join("submission_uncertainty","sample_submission.csv"),
    index_col=0,
)
sub.loc[upred_sample.index].to_csv(
    os.path.join("submission_uncertainty","RightGBM_quantileReg_"+feature_maker.name+".csv")
)
# Display the assembled table as the cell output.
sub