## Enter the path of your top-level project folder
Folder name: SUBMISSION MODEL

In [19]:
# Absolute path of the top-level project folder. It must end with '/',
# because the data, log, model and submission directories are built by
# appending a sub-folder name directly to this string.
dir_ = 'C:/Users/kimms/OneDrive/바탕 화면/소캡디/' # the only path that should need editing

#### Set the other directories (derived from the top-level folder)

In [20]:
# Sub-directories of the project, built by plain string concatenation onto
# dir_ (which therefore has to end with a path separator).
raw_data_dir = dir_+'2. data/'                  # raw inputs; sample_submission.csv is read from here
processed_data_dir = dir_+'2. data/processed/'  # pickled grids, lag/encoding frames and per-store test sets
log_dir = dir_+'4. logs/'                       # not referenced again in the visible part of this notebook
model_dir = dir_+'5. models/'                   # the per-store model files are loaded from below this folder
submission_dir = dir_+'6. submissions/'         # presumably the output folder for submission CSVs

In [21]:
####################################################################################
########################### 1-1. recursive model by store ##########################
####################################################################################

In [22]:
# ver: run label; it is not referenced again in the visible part of this notebook.
# KKK: number of 28-day blocks subtracted from day 1941 when END_TRAIN is computed.
ver, KKK = 'priv', 0
# Store identifiers; one test pickle and one model file are loaded per entry.
STORES_IDS = ['CA_1','CA_2','CA_3','CA_4','TX_1','TX_2','TX_3','WI_1','WI_2','WI_3']

In [23]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import os, sys, gc, time, warnings, pickle, psutil, random

from multiprocessing import Pool

warnings.filterwarnings('ignore')

In [24]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Seed the global ``random`` and NumPy generators with one value.

    seed -- value passed unchanged to ``random.seed`` and then to
            ``np.random.seed`` (default 0).
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Apply ``func`` to every item of ``t_split`` in worker processes.

    The per-item results are concatenated column-wise (``axis=1``) in the
    order of ``t_split`` and returned as one object.

    func    -- callable handed to ``Pool.map``; called once per work item and
               expected to return something ``pd.concat`` accepts.
    t_split -- sized sequence of work items.

    The pool uses at most ``N_CORES`` processes (module-level setting) and
    never more than there are work items.
    """
    # Built-in min on plain ints; Pool receives a Python int, not a NumPy scalar.
    num_cores = int(min(N_CORES, len(t_split)))
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, t_split), axis=1)
    finally:
        # Release the workers even when func (or the concat) raises, so a
        # failed run does not leave idle processes behind.
        pool.close()
        pool.join()
    return df

In [25]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    """Assemble the modelling frame for a single store.

    store -- value compared against the ``store_id`` column.

    Returns ``(frame, features)``: ``frame`` holds the columns ``id``, ``d``
    and TARGET followed by every feature column, with a fresh 0..n-1 index;
    ``features`` lists the column names that are not in ``remove_features``.
    """
    # Read and concatenate the basic features. The first two columns of the
    # price and calendar parts are dropped before the column-wise join.
    parts = [pd.read_pickle(BASE)]
    parts.extend(pd.read_pickle(path).iloc[:, 2:] for path in (PRICE, CALENDAR))
    grid = pd.concat(parts, axis=1)
    del parts

    # Keep only rows from START_TRAIN onwards that belong to this store.
    in_scope = (grid['d'] >= START_TRAIN) & (grid['store_id'] == store)
    grid = grid[in_scope]

    # Mean encodings (selected by name) and lag features (everything after
    # the first three columns), each cut down to the surviving row index
    # before being attached.
    extras = (
        (MEAN_ENC, lambda frame: frame[mean_features]),
        (LAGS, lambda frame: frame.iloc[:, 3:]),
    )
    for path, pick_columns in extras:
        extra = pick_columns(pd.read_pickle(path))
        extra = extra[extra.index.isin(grid.index)]
        grid = pd.concat([grid, extra], axis=1)
        del extra

    features = [name for name in grid.columns if name not in remove_features]
    grid = grid[['id', 'd', TARGET] + features].reset_index(drop=True)

    return grid, features

# Recombine Test set after training
def get_base_test():
    """Load the per-store test pickles and stack them into one frame.

    For every entry of ``STORES_IDS`` the file
    ``processed_data_dir + 'test_<store>.pkl'`` is read and its ``store_id``
    column is set to that store. The frames are stacked in ``STORES_IDS``
    order under a fresh 0..n-1 index.

    Returns an empty DataFrame when ``STORES_IDS`` is empty.
    """
    frames = []
    for store_id in STORES_IDS:
        temp_df = pd.read_pickle(processed_data_dir+'test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        frames.append(temp_df)

    if not frames:
        return pd.DataFrame()

    # One concat over all frames: avoids re-copying the accumulated rows for
    # every store and does not depend on concatenating onto an empty frame.
    return pd.concat(frames, ignore_index=True)


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Return TARGET shifted by ``LAG_DAY`` rows within each ``id`` group.

    LAG_DAY -- number of rows to shift by; also used in the column name.

    Reads the module-level ``base_test`` frame without modifying it.
    Returns a one-column DataFrame named ``sales_lag_<LAG_DAY>`` (float16)
    that carries the same index as ``base_test``.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # GroupBy.shift is the built-in form of transform(lambda x: x.shift(n)),
    # and building the result directly avoids assigning a column onto a
    # selection of base_test.
    lagged = base_test.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return lagged.to_frame(col_name)


def make_lag_roll(LAG_DAY):
    """Return a shifted rolling mean of TARGET computed within each ``id``.

    LAG_DAY -- indexable pair: ``LAG_DAY[0]`` is the number of rows to shift
               by, ``LAG_DAY[1]`` the rolling window length.

    Reads the module-level ``base_test`` frame without modifying it.
    Returns a one-column DataFrame named
    ``rolling_mean_tmp_<shift>_<window>`` with the same index as
    ``base_test``.
    """
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Compute straight from base_test: no intermediate column selection to
    # assign onto, and the unused 'd' column is no longer copied.
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean())
    return rolled.to_frame(col_name)

In [26]:
########################### Vars
#################################################################################
# Run identity and reproducibility
VER = 1                          # version number embedded in the model file names
SEED = 42                        # value handed to seed_everything
seed_everything(SEED)
N_CORES = psutil.cpu_count()     # upper bound on worker processes in df_parallelize_run


# Limits and constants
TARGET      = 'sales'            # column that is lagged and predicted
START_TRAIN = 0                  # rows with d below this value are dropped when loading a store
END_TRAIN   = 1941 - 28*KKK      # predictions are written for days END_TRAIN+1 .. END_TRAIN+28
P_HORIZON   = 28                 # not referenced again in the visible part of this notebook
USE_AUX     = False              # when True, AUX_MODELS is prepended to each model path

# Columns that must not be used as model features
remove_features = [
    'id', 'state_id', 'store_id',
    'date', 'wm_yr_wk', 'd', TARGET,
]
# Columns taken from the mean-encoding frame
mean_features = [
    'enc_cat_id_mean', 'enc_cat_id_std',
    'enc_dept_id_mean', 'enc_dept_id_std',
    'enc_item_id_mean', 'enc_item_id_std',
]

# Input locations
ORIGINAL = raw_data_dir
BASE     = processed_data_dir+'grid_part_1.pkl'
PRICE    = processed_data_dir+'grid_part_2.pkl'
CALENDAR = processed_data_dir+'grid_part_3.pkl'
LAGS     = processed_data_dir+'lags_df_28.pkl'
MEAN_ENC = processed_data_dir+'mean_encoding_df.pkl'


# Splits for lag creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY+N_LAGS))
# [shift, window] pairs in the order make_lag_roll indexes them
ROLS_SPLIT = [[shift, window]
              for shift in (1, 7, 14)
              for window in (7, 14, 30, 60)]

In [27]:
# Only the feature-name list is needed; the frame loaded for the last store
# is never bound to a name, so it can be reclaimed straight away.
MODEL_FEATURES = get_data_by_store(STORES_IDS[-1])[1]
gc.collect()

0

In [31]:
########################### Predict
#################################################################################

all_preds = pd.DataFrame()

# Join back the Test dataset with
# a small part of the training data
# to make recursive features.
# base_test has to stay a module-level name: make_lag_roll reads it.
base_test = get_base_test()

main_time = time.time()

# Load each store's model once instead of once per store per day, and close
# every file handle after reading.
estimators = {}
for store_id in STORES_IDS:
    model_path = model_dir+'1y_hy/'+'lgb_model_1y_hy_'+store_id+'_v'+str(VER)+'.bin'
    if USE_AUX:
        # NOTE(review): AUX_MODELS is not defined in the visible part of this
        # notebook; this branch raises NameError unless it is set elsewhere.
        model_path = AUX_MODELS + model_path
    # pickle.load can execute code stored in the file: load trusted models only.
    with open(model_path, 'rb') as model_file:
        estimators[store_id] = pickle.load(model_file)

for PREDICT_DAY in range(1,29):
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    grid_df = base_test.copy()

    # Rolling features are rebuilt every day because base_test now contains
    # the predictions written for the previous days.
    # Multiprocessing alternative:
    #     grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)
    temp = pd.concat([make_lag_roll(a) for a in ROLS_SPLIT], axis=1)
    grid_df = pd.concat([grid_df, temp], axis=1)
    del temp; gc.collect()

    # The day mask does not depend on the store, so build it once per day;
    # it is also needed after the store loop.
    day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)

    for store_id in STORES_IDS:
        store_mask = base_test['store_id']==store_id
        mask = (day_mask)&(store_mask)
        # A single .loc assignment writes into base_test itself. The chained
        # form base_test[TARGET][mask] = ... may write to a temporary copy
        # (and always does under pandas copy-on-write), which would leave the
        # next day's lag features without today's predictions.
        base_test.loc[mask, TARGET] = estimators[store_id].predict(grid_df[mask][MODEL_FEATURES])

    # Collect today's predictions as column F<day>, keyed by id.
    temp_df = base_test.loc[day_mask, ['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()

    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))

    del temp_df

all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  1.99 min round |  1.99 min total |  37709.71 day sales |
Predict | Day: 2
##########  2.00 min round |  3.99 min total |  35408.55 day sales |
Predict | Day: 3
##########  2.03 min round |  6.02 min total |  35359.40 day sales |
Predict | Day: 4
##########  2.07 min round |  8.09 min total |  34725.87 day sales |
Predict | Day: 5
##########  1.95 min round |  10.04 min total |  38555.11 day sales |
Predict | Day: 6
##########  1.96 min round |  12.00 min total |  47668.06 day sales |
Predict | Day: 7
##########  2.00 min round |  14.00 min total |  50953.37 day sales |
Predict | Day: 8
##########  2.04 min round |  16.05 min total |  33690.81 day sales |
Predict | Day: 9
##########  1.80 min round |  17.85 min total |  35243.33 day sales |
Predict | Day: 10
##########  1.80 min round |  19.64 min total |  39426.73 day sales |
Predict | Day: 11
##########  2.02 min round |  21.66 min total |  40112.08 day sales |
Predict | Day: 12
##########  2.02 min round 

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.765438,0.773216,0.883342,0.748277,0.970336,1.157535,0.880323,0.523075,0.648505,...,0.984496,1.003382,1.021467,0.991271,0.745482,1.095049,0.901004,0.884636,0.933845,1.258377
1,HOBBIES_1_002_CA_1_evaluation,0.115489,0.207677,0.205412,0.377207,0.356161,0.193226,0.433168,0.214815,0.360568,...,0.244183,0.244032,0.122706,0.175335,0.100577,0.124490,0.365859,0.387632,0.249490,0.331498
2,HOBBIES_1_003_CA_1_evaluation,0.656870,0.537632,0.688507,0.444122,0.630250,0.830574,0.525434,0.431679,0.777113,...,0.628360,0.915335,0.464038,0.549969,0.263767,0.299183,0.312992,0.366238,0.447227,0.538170
3,HOBBIES_1_004_CA_1_evaluation,1.164363,0.953318,0.924742,0.771392,1.295684,3.237632,4.451479,1.488893,1.223270,...,1.790949,2.982593,3.044284,1.571704,1.223399,1.297601,0.911706,1.333253,2.170806,2.281030
4,HOBBIES_1_005_CA_1_evaluation,0.688540,1.340234,0.837617,0.626225,0.815646,1.296828,0.963141,0.795634,0.580813,...,1.379032,2.449875,1.662034,0.977617,1.208917,0.978521,0.852351,0.927087,1.170598,1.692568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.206952,0.504257,0.354498,0.644927,0.430747,0.800774,0.836473,0.427760,0.656800,...,0.581686,1.073315,1.112075,0.720511,0.288623,0.891884,0.394869,0.344574,0.489050,0.814594
30486,FOODS_3_824_WI_3_evaluation,0.296564,0.372903,0.130460,0.102485,0.091623,0.095330,0.105178,0.062949,0.100256,...,0.237230,0.253214,0.221222,0.109358,0.182877,0.217662,0.121330,0.088931,0.105601,0.124014
30487,FOODS_3_825_WI_3_evaluation,0.475983,0.625393,0.366402,0.249458,0.479075,0.353051,0.721615,0.446963,0.445937,...,0.558437,0.602998,0.639031,0.680563,1.433785,0.779301,0.490823,0.837721,0.536551,0.670362
30488,FOODS_3_826_WI_3_evaluation,1.030575,1.238772,1.209289,1.247480,0.790638,1.444323,1.285143,1.022136,1.139895,...,1.334480,1.073078,1.186813,1.777549,2.142995,1.437035,1.138643,1.193031,1.297430,1.537085


In [32]:
########################### Export
#################################################################################
# Take the id column of the sample submission as the row template, attach
# the F1..F28 predictions by id, and fill ids without a prediction with 0.
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
# Write below submission_dir (derived from dir_) instead of a second
# hard-coded absolute path, so dir_ stays the only location to edit.
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(submission_dir+'1y_hy.csv', index=False)