## Please input your directory for the top level folder
folder name : SUBMISSION MODEL

In [1]:
# Top-level project folder -- the only path that has to be edited by hand.
# Keep the trailing '/': the other directories are derived from this value
# by plain string concatenation.
dir_ = 'C:/Users/kimms/OneDrive/바탕 화면/소캡디/' # input only here

#### Setting the other directories (derived from `dir_`)

In [2]:
# All locations hang off dir_; every value keeps its trailing '/'.
raw_data_dir = f'{dir_}2. data/'
processed_data_dir = f'{raw_data_dir}processed/'
log_dir = f'{dir_}4. logs/'
model_dir = f'{dir_}5. models/'

In [4]:
ver = 'priv'  # run label
KKK = 0       # number of 28-day steps by which the training end is moved back

# Store identifiers, grouped by state prefix; one model is trained per entry.
STORES_IDS = (
    ['CA_1', 'CA_2', 'CA_3', 'CA_4']
    + ['TX_1', 'TX_2', 'TX_3']
    + ['WI_1', 'WI_2', 'WI_3']
)

In [5]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
import tqdm

from multiprocessing import Pool

warnings.filterwarnings('ignore')

In [6]:
################################## SEED 설정 ####################################
def seed_everything(seed=0):
    """Seed the stdlib ``random`` and NumPy global generators with ``seed``."""
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Run ``func`` over every item of ``t_split`` in worker processes.

    Args:
        func: Callable applied to one item of ``t_split``; its results are
            passed to ``pd.concat``, so each must be a pandas object.
        t_split: Sequence of work items, one task per item.

    Returns:
        The per-item results joined column-wise (``axis=1``) in the order
        of ``t_split``.

    The number of workers is the smaller of the module-level ``N_CORES``
    and ``len(t_split)``.
    """
    num_cores = min(N_CORES, len(t_split))
    # The context manager shuts the pool down even when a task raises, so a
    # failed run no longer leaves worker processes behind.
    with Pool(num_cores) as pool:
        results = pool.map(func, t_split)
    return pd.concat(results, axis=1)

In [7]:
############### Target Store에 해당하는 Data 로드############################

def get_data_by_store(store):
    """Assemble the feature table for a single store.

    The BASE, PRICE and CALENDAR pickles are joined column-wise, restricted
    to rows with ``d >= START_TRAIN`` and ``store_id == store``, and then
    extended with the ``mean_features`` columns of MEAN_ENC and with the
    columns of LAGS from position 3 onward.

    Args:
        store: Value matched against the ``store_id`` column.

    Returns:
        tuple: ``(df, features)``. ``df`` holds ``'id'``, ``'d'`` and
        ``TARGET`` followed by the feature columns, with a fresh 0..n-1
        index. ``features`` lists every column not in ``remove_features``.
    """
    # PRICE and CALENDAR contribute everything after their first two columns.
    data = pd.concat(
        [
            pd.read_pickle(BASE),
            pd.read_pickle(PRICE).iloc[:, 2:],
            pd.read_pickle(CALENDAR).iloc[:, 2:],
        ],
        axis=1,
    )

    in_window = data['d'] >= START_TRAIN
    in_store = data['store_id'] == store
    data = data[in_window & in_store]

    # Each extra source is cut down to the surviving rows before it is
    # attached, and released right after to keep memory low.
    extra_sources = (
        (MEAN_ENC, lambda frame: frame[mean_features]),
        (LAGS, lambda frame: frame.iloc[:, 3:]),
    )
    for path, select in extra_sources:
        extra = select(pd.read_pickle(path))
        extra = extra[extra.index.isin(data.index)]
        data = pd.concat([data, extra], axis=1)
        del extra

    features = [name for name in data.columns if name not in remove_features]
    data = data[['id', 'd', TARGET] + features].reset_index(drop=True)

    return data, features

# Recombine Test set after training
def get_base_test(store_ids=None, data_dir=None):
    """Load the per-store test pickles and stack them into one frame.

    Args:
        store_ids: Store identifiers to load, in output order. Defaults to
            the module-level ``STORES_IDS``.
        data_dir: Path prefix (including the trailing separator) of the
            ``test_<store_id>.pkl`` files. Defaults to the module-level
            ``processed_data_dir``.

    Returns:
        pandas.DataFrame: the rows of all requested stores with a
        ``store_id`` column naming the store each row came from and a fresh
        0..n-1 index. An empty frame is returned when no store is requested.
    """
    if store_ids is None:
        store_ids = STORES_IDS
    if data_dir is None:
        data_dir = processed_data_dir

    frames = []
    for store_id in store_ids:
        temp_df = pd.read_pickle(data_dir+'test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        frames.append(temp_df)

    if not frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated frame once
    # per store.
    return pd.concat(frames, ignore_index=True)


#################### 전처리를 위한 (이동평균선) 데이터 전처리 코드 ################################33

def make_lag(LAG_DAY):
    """Build the ``sales_lag_<LAG_DAY>`` feature from the global ``base_test``.

    Within each ``id`` group the ``TARGET`` column is shifted down by
    ``LAG_DAY`` rows, so a row receives the value found ``LAG_DAY`` rows
    earlier for the same id (NaN where no such row exists).

    Args:
        LAG_DAY: Number of rows to shift by.

    Returns:
        pandas.DataFrame: a single float16 column named
        ``sales_lag_<LAG_DAY>``, indexed like ``base_test``.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # GroupBy.shift is the built-in form of transform(lambda x: x.shift(n)).
    # Building the column directly also avoids assigning into a column
    # subset of base_test.
    lagged = base_test.groupby(['id'])[TARGET].shift(LAG_DAY)
    return lagged.astype(np.float16).to_frame(col_name)


def make_lag_roll(LAG_DAY):
    """Build a shifted rolling-mean feature from the global ``base_test``.

    Args:
        LAG_DAY: Two-item sequence ``[shift_day, roll_wind]``. Within each
            ``id`` group ``TARGET`` is shifted by ``shift_day`` rows and then
            averaged over a rolling window of ``roll_wind`` rows.

    Returns:
        pandas.DataFrame: a single column named
        ``rolling_mean_tmp_<shift_day>_<roll_wind>``, indexed like
        ``base_test``.
    """
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # The feature is computed straight from base_test and returned as its own
    # frame; nothing is assigned into a column subset of base_test.
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(col_name)

In [8]:
########################### Default model parameter설정 (기준값) ##########################
import lightgbm as lgb
# Baseline LightGBM settings (no hyper-parameter tuning applied).
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    boost_from_average=False,
    verbose=-1,
    n_estimators=1000,
)

In [9]:
########################### 전처리 변수 생성 및 데이터 불러오기 #########################

# ---- run-level settings ----
VER, SEED = 1, 42                # VER is embedded in the saved model file names
seed_everything(SEED)
lgb_params['seed'] = SEED
N_CORES = psutil.cpu_count()     # upper bound on worker processes


# ---- limits and constants ----
TARGET = 'sales'                 # label column
START_TRAIN, P_HORIZON = 0, 28   # first day kept on load / window length in days
END_TRAIN = 1941 - 28 * KKK      # last training day; KKK moves it back in 28-day steps
USE_AUX = False                  # not referenced in the visible cells


# Columns that never enter the model as features.
remove_features = [
    'id', 'state_id', 'store_id',
    'date', 'wm_yr_wk', 'd',
    TARGET,
]
# Mean-encoding columns picked from the MEAN_ENC pickle.
mean_features = [
    'enc_cat_id_mean',
    'enc_cat_id_std',
    'enc_dept_id_mean',
    'enc_dept_id_std',
    'enc_item_id_mean',
    'enc_item_id_std',
]

# ---- input locations ----
ORIGINAL = raw_data_dir
BASE = f'{processed_data_dir}grid_part_1.pkl'
PRICE = f'{processed_data_dir}grid_part_2.pkl'
CALENDAR = f'{processed_data_dir}grid_part_3.pkl'
LAGS = f'{processed_data_dir}lags_df_28.pkl'
MEAN_ENC = f'{processed_data_dir}mean_encoding_df.pkl'


# Specifications for the lag and rolling-window feature builders.
SHIFT_DAY = 28
N_LAGS = 15
# Consecutive lags SHIFT_DAY .. SHIFT_DAY + N_LAGS - 1.
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY + N_LAGS))
# Every [shift, window] combination, shift varying slowest.
ROLS_SPLIT = [
    [shift, window]
    for shift in (1, 7, 14)
    for window in (7, 14, 30, 60)
]

#### 최근 1년 데이터만 활용해서 basic model 학습 및 생성

In [12]:
########################### Train Basic Model (No Hyper parameter tuning) ####################

# Train one baseline model per store on the most recent slice of the data,
# and save both the model and the store's test frame.
for store_id in STORES_IDS:
    print('Train', store_id)

    # Feature table for this store plus the list of model input columns.
    grid_df, features_columns = get_data_by_store(store_id)

    # train: every day up to END_TRAIN
    # valid: the last P_HORIZON days that also satisfy train_mask, so these
    #        rows are not held out from the training mask
    # preds: days in (END_TRAIN-100, END_TRAIN+P_HORIZON]
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = (grid_df['d']>(END_TRAIN-100)) & (grid_df['d'] <= END_TRAIN+P_HORIZON)

    # NOTE(review): the window is one fifth of ALL rows loaded for the store
    # (len(train_mask) counts every row, not only training rows), taken from
    # the end of the training rows. It is a row count, not a calendar window,
    # and presumably relies on the rows being ordered by day -- confirm.
    latest_1y = int(len(train_mask)/5)

    # Materialise each subset once and reuse it for features and label.
    train_rows = grid_df[train_mask].iloc[-latest_1y:]
    train_data = lgb.Dataset(train_rows[features_columns],
                       label=train_rows[TARGET])

    valid_rows = grid_df[valid_mask]
    valid_data = lgb.Dataset(valid_rows[features_columns],
                       label=valid_rows[TARGET])
    del train_rows, valid_rows

    # Keep the prediction window only and drop the '_tmp_' columns.
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]

    # Blank the target after END_TRAIN. Series.mask works on a new column
    # instead of writing through a .values buffer, and uses TARGET rather
    # than a hard-coded column name.
    grid_df[TARGET] = grid_df[TARGET].mask(grid_df['d'] > END_TRAIN)

    grid_df.to_pickle(processed_data_dir+'test_'+store_id+'.pkl')
    del grid_df


    seed_everything(SEED)
    # NOTE(review): `verbose_eval` is only accepted by LightGBM < 4.0; newer
    # releases expect callbacks=[lgb.log_evaluation(100)] -- confirm the
    # installed version before upgrading.
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )

    # Top 25 features by importance for this store.
    display(pd.DataFrame({'name':estimator.feature_name(),
                          'imp':estimator.feature_importance()}).sort_values('imp',ascending=False).head(25))


    model_name = model_dir+'lgb_model_1y_basic_'+store_id+'_v'+str(VER)+'.bin'
    # Close the file deterministically so the model is fully flushed to disk.
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    del train_data, valid_data, estimator
    gc.collect()

    # Feature list of the most recently trained store.
    MODEL_FEATURES = features_columns

Train TX_2
[100]	valid_0's rmse: 2.00049
[200]	valid_0's rmse: 1.76819
[300]	valid_0's rmse: 1.7523
[400]	valid_0's rmse: 1.74721
[500]	valid_0's rmse: 1.74262
[600]	valid_0's rmse: 1.73905
[700]	valid_0's rmse: 1.73534
[800]	valid_0's rmse: 1.73139
[900]	valid_0's rmse: 1.7289
[1000]	valid_0's rmse: 1.72607
[1100]	valid_0's rmse: 1.72332
[1200]	valid_0's rmse: 1.72103
[1300]	valid_0's rmse: 1.71853
[1400]	valid_0's rmse: 1.71626
[1500]	valid_0's rmse: 1.71434
[1600]	valid_0's rmse: 1.71173
[1700]	valid_0's rmse: 1.70935
[1800]	valid_0's rmse: 1.70667
[1900]	valid_0's rmse: 1.70444
[2000]	valid_0's rmse: 1.7023
[2100]	valid_0's rmse: 1.70066
[2200]	valid_0's rmse: 1.69845
[2300]	valid_0's rmse: 1.69646
[2400]	valid_0's rmse: 1.69468
[2500]	valid_0's rmse: 1.69287
[2600]	valid_0's rmse: 1.69108
[2700]	valid_0's rmse: 1.68909
[2800]	valid_0's rmse: 1.68679
[2900]	valid_0's rmse: 1.68517
[3000]	valid_0's rmse: 1.6833


Unnamed: 0,name,imp
0,item_id,83429
23,tm_w,47910
59,rolling_std_180,47711
58,rolling_mean_180,46527
22,tm_d,43981
57,rolling_std_60,36933
7,price_std,35039
60,rolling_mean_tmp_1_7,34744
11,item_nunique,34400
55,rolling_std_30,34342


Train TX_3
[100]	valid_0's rmse: 2.08525
[200]	valid_0's rmse: 1.86047
[300]	valid_0's rmse: 1.82784
[400]	valid_0's rmse: 1.81555
[500]	valid_0's rmse: 1.80826
[600]	valid_0's rmse: 1.80245
[700]	valid_0's rmse: 1.79704
[800]	valid_0's rmse: 1.79239
[900]	valid_0's rmse: 1.78807
[1000]	valid_0's rmse: 1.78387
[1100]	valid_0's rmse: 1.78005
[1200]	valid_0's rmse: 1.77699
[1300]	valid_0's rmse: 1.77398
[1400]	valid_0's rmse: 1.76975
[1500]	valid_0's rmse: 1.7667
[1600]	valid_0's rmse: 1.76407
[1700]	valid_0's rmse: 1.76102
[1800]	valid_0's rmse: 1.75833
[1900]	valid_0's rmse: 1.75496
[2000]	valid_0's rmse: 1.752
[2100]	valid_0's rmse: 1.74919
[2200]	valid_0's rmse: 1.74586
[2300]	valid_0's rmse: 1.74309
[2400]	valid_0's rmse: 1.74072
[2500]	valid_0's rmse: 1.73809
[2600]	valid_0's rmse: 1.73532
[2700]	valid_0's rmse: 1.73306
[2800]	valid_0's rmse: 1.73063
[2900]	valid_0's rmse: 1.72787
[3000]	valid_0's rmse: 1.72578


Unnamed: 0,name,imp
0,item_id,50766
59,rolling_std_180,49046
23,tm_w,48660
58,rolling_mean_180,47808
22,tm_d,45377
57,rolling_std_60,38126
7,price_std,37724
34,enc_item_id_std,37621
33,enc_item_id_mean,37435
11,item_nunique,36984


Train WI_1
[100]	valid_0's rmse: 1.70606
[200]	valid_0's rmse: 1.6007
[300]	valid_0's rmse: 1.58403
[400]	valid_0's rmse: 1.57711
[500]	valid_0's rmse: 1.57248
[600]	valid_0's rmse: 1.5687
[700]	valid_0's rmse: 1.56545
[800]	valid_0's rmse: 1.56253
[900]	valid_0's rmse: 1.55979
[1000]	valid_0's rmse: 1.55734
[1100]	valid_0's rmse: 1.55505
[1200]	valid_0's rmse: 1.55272
[1300]	valid_0's rmse: 1.55054
[1400]	valid_0's rmse: 1.54838
[1500]	valid_0's rmse: 1.54636
[1600]	valid_0's rmse: 1.54419
[1700]	valid_0's rmse: 1.54228
[1800]	valid_0's rmse: 1.54047
[1900]	valid_0's rmse: 1.53836
[2000]	valid_0's rmse: 1.5366
[2100]	valid_0's rmse: 1.53459
[2200]	valid_0's rmse: 1.53273
[2300]	valid_0's rmse: 1.53098
[2400]	valid_0's rmse: 1.52918
[2500]	valid_0's rmse: 1.5275
[2600]	valid_0's rmse: 1.52585
[2700]	valid_0's rmse: 1.52427
[2800]	valid_0's rmse: 1.52263
[2900]	valid_0's rmse: 1.52096
[3000]	valid_0's rmse: 1.51921


Unnamed: 0,name,imp
0,item_id,84806
23,tm_w,50698
59,rolling_std_180,45715
58,rolling_mean_180,45550
22,tm_d,44076
57,rolling_std_60,35448
56,rolling_mean_60,32126
11,item_nunique,31911
55,rolling_std_30,31889
33,enc_item_id_mean,31339


Train WI_2
[100]	valid_0's rmse: 3.04743
[200]	valid_0's rmse: 2.72308
[300]	valid_0's rmse: 2.65931
[400]	valid_0's rmse: 2.62936
[500]	valid_0's rmse: 2.61304
[600]	valid_0's rmse: 2.59991
[700]	valid_0's rmse: 2.59035
[800]	valid_0's rmse: 2.58341
[900]	valid_0's rmse: 2.57764
[1000]	valid_0's rmse: 2.57264
[1100]	valid_0's rmse: 2.56812
[1200]	valid_0's rmse: 2.56282
[1300]	valid_0's rmse: 2.55696
[1400]	valid_0's rmse: 2.5512
[1500]	valid_0's rmse: 2.54781
[1600]	valid_0's rmse: 2.54343
[1700]	valid_0's rmse: 2.53918
[1800]	valid_0's rmse: 2.53537
[1900]	valid_0's rmse: 2.52979
[2000]	valid_0's rmse: 2.52458
[2100]	valid_0's rmse: 2.51924
[2200]	valid_0's rmse: 2.51439
[2300]	valid_0's rmse: 2.50984
[2400]	valid_0's rmse: 2.50557
[2500]	valid_0's rmse: 2.50204
[2600]	valid_0's rmse: 2.49722
[2700]	valid_0's rmse: 2.49346
[2800]	valid_0's rmse: 2.49005
[2900]	valid_0's rmse: 2.48588
[3000]	valid_0's rmse: 2.48258


Unnamed: 0,name,imp
59,rolling_std_180,49805
23,tm_w,47815
58,rolling_mean_180,47694
22,tm_d,45772
34,enc_item_id_std,40957
33,enc_item_id_mean,40473
7,price_std,38225
57,rolling_std_60,38199
11,item_nunique,37298
3,release,33423


Train WI_3
[100]	valid_0's rmse: 2.17226
[200]	valid_0's rmse: 1.90832
[300]	valid_0's rmse: 1.87104
[400]	valid_0's rmse: 1.85385
[500]	valid_0's rmse: 1.84264
[600]	valid_0's rmse: 1.83593
[700]	valid_0's rmse: 1.82966
[800]	valid_0's rmse: 1.8245
[900]	valid_0's rmse: 1.82006
[1000]	valid_0's rmse: 1.81673
[1100]	valid_0's rmse: 1.81378
[1200]	valid_0's rmse: 1.80971
[1300]	valid_0's rmse: 1.80626
[1400]	valid_0's rmse: 1.80242
[1500]	valid_0's rmse: 1.79827
[1600]	valid_0's rmse: 1.79468
[1700]	valid_0's rmse: 1.79158
[1800]	valid_0's rmse: 1.78789
[1900]	valid_0's rmse: 1.78471
[2000]	valid_0's rmse: 1.78269
[2100]	valid_0's rmse: 1.77947
[2200]	valid_0's rmse: 1.77707
[2300]	valid_0's rmse: 1.77531
[2400]	valid_0's rmse: 1.77206
[2500]	valid_0's rmse: 1.76936
[2600]	valid_0's rmse: 1.7666
[2700]	valid_0's rmse: 1.7642
[2800]	valid_0's rmse: 1.7622
[2900]	valid_0's rmse: 1.75965
[3000]	valid_0's rmse: 1.75598


Unnamed: 0,name,imp
59,rolling_std_180,50080
23,tm_w,48984
58,rolling_mean_180,48460
22,tm_d,47389
34,enc_item_id_std,42169
33,enc_item_id_mean,42136
7,price_std,40720
11,item_nunique,39752
57,rolling_std_60,38304
6,price_min,35608


#### 최근 3년 데이터만 활용해서 basic model 학습 및 생성

In [None]:
########################### Train Basic Model (No Hyper parameter tuning) ####################

# Train one baseline model per store on a three-times larger recent slice
# of the data, and save both the model and the store's test frame.
for store_id in STORES_IDS:
    print('Train', store_id)

    # Feature table for this store plus the list of model input columns.
    grid_df, features_columns = get_data_by_store(store_id)

    # train: every day up to END_TRAIN
    # valid: the last P_HORIZON days that also satisfy train_mask, so these
    #        rows are not held out from the training mask
    # preds: days in (END_TRAIN-100, END_TRAIN+P_HORIZON]
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = (grid_df['d']>(END_TRAIN-100)) & (grid_df['d'] <= END_TRAIN+P_HORIZON)

    # NOTE(review): the window is three fifths of ALL rows loaded for the
    # store (len(train_mask) counts every row, not only training rows), taken
    # from the end of the training rows. It is a row count, not a calendar
    # window, and presumably relies on the rows being ordered by day -- confirm.
    latest_3y = int(len(train_mask)/5)*3

    # Materialise each subset once and reuse it for features and label.
    train_rows = grid_df[train_mask].iloc[-latest_3y:]
    train_data = lgb.Dataset(train_rows[features_columns],
                       label=train_rows[TARGET])

    valid_rows = grid_df[valid_mask]
    valid_data = lgb.Dataset(valid_rows[features_columns],
                       label=valid_rows[TARGET])
    del train_rows, valid_rows

    # Keep the prediction window only and drop the '_tmp_' columns.
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]

    # Blank the target after END_TRAIN. Series.mask works on a new column
    # instead of writing through a .values buffer, and uses TARGET rather
    # than a hard-coded column name.
    grid_df[TARGET] = grid_df[TARGET].mask(grid_df['d'] > END_TRAIN)

    grid_df.to_pickle(processed_data_dir+'test_'+store_id+'.pkl')
    del grid_df


    seed_everything(SEED)
    # NOTE(review): `verbose_eval` is only accepted by LightGBM < 4.0; newer
    # releases expect callbacks=[lgb.log_evaluation(100)] -- confirm the
    # installed version before upgrading.
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )

    # Top 25 features by importance for this store.
    display(pd.DataFrame({'name':estimator.feature_name(),
                          'imp':estimator.feature_importance()}).sort_values('imp',ascending=False).head(25))


    model_name = model_dir+'lgb_model_3y_basic_'+store_id+'_v'+str(VER)+'.bin'
    # Close the file deterministically so the model is fully flushed to disk.
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    del train_data, valid_data, estimator
    gc.collect()

    # Feature list of the most recently trained store.
    MODEL_FEATURES = features_columns