In [None]:
from google.colab import drive


# Mount Google Drive inside the Colab VM (force_remount=True requests a fresh mount).
drive.mount('/content/gdrive', force_remount = True)
# Base folder on the mounted drive; later cells build every input/output path as `dataset_path + <name>`.
dataset_path = '/content/gdrive/My Drive/M5_forecasting/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [None]:
# Display the installed LightGBM version so the run can be reproduced with the same release.
lgb.__version__

'2.2.3'

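The dtypes used to load the calendar and price CSVs are declared below: text columns are read as pandas categoricals and numeric columns as compact integer/float types.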


In [None]:
# dtypes for calendar.csv: text columns are loaded as categoricals, numeric ones as int16.
CAL_DTYPES = {
    **dict.fromkeys(
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2", "weekday"],
        "category",
    ),
    **dict.fromkeys(
        ["wm_yr_wk", "wday", "month", "year", "snap_CA", "snap_TX", "snap_WI"],
        "int16",
    ),
}
# dtypes for sell_prices.csv (store_id is kept so prices can be joined per store).
PRICE_DTYPES = dict(
    store_id="category",
    item_id="category",
    wm_yr_wk="int16",
    sell_price="float32",
)

In [None]:
# Let wide feature frames display up to 50 columns instead of being truncated.
pd.set_option("display.max_columns", 50)

The forecast horizon, the last training day (`tr_last`) and the first date to predict (`fday`) are set here.

In [None]:
# Forecast horizon in days. NOTE: later cells hard-code 28 rather than reading `h`.
h = 28
# Days of history kept before the first forecast day when the test frame is built
# (used as `tr_last - max_lags` in create_dt and `fday - max_lags days` at prediction time).
max_lags = 57
# Index of the last training day column (d_<tr_last>) and the first date to forecast.
tr_last = 1941
fday = datetime(year=2016, month=5, day=23)
# Alternative setting kept for reference:
# tr_last = 1913
# fday = datetime(2016, 4, 25)
fday

datetime.datetime(2016, 5, 23, 0, 0)

The three CSV files are combined into a single dataframe. In sales_train_evaluation.csv each day is a column (`d_1`, `d_2`, ...), whereas the calendar and price files store time along the rows, so the sales table is first melted into one row per (id, day). It is then merged with the calendar on `d` and with the prices on `store_id`, `item_id` and `wm_yr_wk`.

In [None]:
def _encode_categories(frame, columns):
    """Replace each categorical column in `columns` by zero-based int16 codes, in place.

    Codes come from `.cat.codes`; the column minimum is then subtracted, so a
    column containing missing values (code -1) ends up with missing -> 0 and
    every real category shifted up by one.
    """
    for col in columns:
        frame[col] = frame[col].cat.codes.astype("int16")
        frame[col] -= frame[col].min()


def create_dt(is_train = True, nrows = None, first_day = 1200, horizon = 28):
    """Load prices, calendar and sales and return them as one long-format frame.

    Parameters
    ----------
    is_train : bool
        True  -> load day columns from max(1, first_day) up to `tr_last`.
        False -> load only from max(tr_last - max_lags, first_day) and append
                 `horizon` empty (NaN) day columns after `tr_last` to be
                 filled with predictions.
    nrows : int or None
        Passed to `pd.read_csv` for the sales file (None reads every row).
    first_day : int
        Lower bound on the first day column to load.
    horizon : int
        Number of future day columns to append when `is_train` is False
        (default 28, the value that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) with sales, calendar columns and sell_price.
        Both merges are inner joins, so rows without a matching price are dropped.

    Reads the globals `dataset_path`, `PRICE_DTYPES`, `CAL_DTYPES`, `tr_last`
    and `max_lags`; prints the frame shape before and after reshaping.
    """
    prices = pd.read_csv(dataset_path + "sell_prices.csv", dtype=PRICE_DTYPES)
    _encode_categories(prices, [col for col, kind in PRICE_DTYPES.items() if kind == "category"])

    cal = pd.read_csv(dataset_path + "calendar.csv", dtype=CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    _encode_categories(cal, [col for col, kind in CAL_DTYPES.items() if kind == "category"])

    start_day = max(1 if is_train else tr_last - max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    catcols = ['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
    dtype = {numcol: "float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(dataset_path + "sales_train_evaluation.csv",
                     nrows=nrows, usecols=catcols + numcols, dtype=dtype)
    print(dt.shape)
    # NOTE(review): the price merge below joins on the integer codes of item_id and
    # store_id, which only line up if both files yield the same code for the same
    # label -- presumably true when both contain the same label set; verify when
    # `nrows` truncates the sales file.
    _encode_categories(dt, [col for col in catcols if col != "id"])

    if not is_train:
        # Placeholder columns for the days to forecast; filled in by the prediction cells.
        for day in range(tr_last + 1, tr_last + horizon + 1):
            dt[f"d_{day}"] = np.nan

    # Wide (one column per day) -> long (one row per id/day).
    dt = pd.melt(dt,
                 id_vars=catcols,
                 value_vars=[col for col in dt.columns if col.startswith("d_")],
                 var_name="d",
                 value_name="sales")
    dt = dt.merge(cal, on="d", copy=False)
    dt = dt.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], copy=False)
    print(dt.shape)
    return dt

Lagged sales and prices, and rolling means of those lags, are computed per id. Calendar features derived from the date (week of year, day of month) are also added.

In [None]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to `dt` in place.

    Expects the columns `id`, `sales`, `sell_price` and `date` (datetime).
    Shifts and rolling windows are applied row-wise within each `id`, in the
    frame's current row order, so rows are assumed to be in date order per id.

    Columns added (in this order):
      * lag_<L>, price_lag_<L>                   for L in 1, 7, 14, 28
      * rmean_<L>_<W>, price_rmean_<L>_<W>       for W in 1, 7, 14 and each L
      * week, mday (and any of wday/month/year not already present),
        all cast to int16.

    Returns `dt` (the same object) for convenience; existing callers that
    ignore the return value are unaffected.
    """
    lags = [1, 7, 14, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    lag_price = [f"price_lag_{lag}" for lag in lags]

    # The grouped source columns do not change inside the loop, so build them once.
    sales_by_id = dt[["id", "sales"]].groupby("id")["sales"]
    price_by_id = dt[["id", "sell_price"]].groupby("id")["sell_price"]
    for lag, lag_col, price_lag in zip(lags, lag_cols, lag_price):
        dt[lag_col] = sales_by_id.shift(lag)
        dt[price_lag] = price_by_id.shift(lag)
    print("LAGS DONE")

    wins = [1, 7, 14]
    for win in wins:
        for lag, lag_col, price_lag in zip(lags, lag_cols, lag_price):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(
                lambda x: x.rolling(win).mean())
            dt[f"price_rmean_{lag}_{win}"] = dt[["id", price_lag]].groupby("id")[price_lag].transform(
                lambda x: x.rolling(win).mean())
    print("ROLLINGS DONE")

    # feature name -> attribute of the `.dt` accessor used when the column is missing
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
            continue
        accessor = dt["date"].dt
        if date_feat_func == "weekofyear" and hasattr(accessor, "isocalendar"):
            # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is
            # its replacement and yields the same ISO week number.
            values = accessor.isocalendar().week
        else:
            values = getattr(accessor, date_feat_func)
        dt[date_feat_name] = values.astype("int16")

    return dt


In [None]:
# Lower bound on the first day column (d_<n>) loaded for training; passed to create_dt as `first_day`.
FIRST_DAY = 1

In [None]:
# Build the long-format training frame (sales melted to one row per id/day, joined with calendar and prices).
df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape


(30490, 1947)
(46881677, 22)


'(30490, 1570)\n(40718219, 22)\n(40718219, 22)'

In [None]:
# The merge keys and the weekday label are not used after the joins.
df = df.drop(columns=['wm_yr_wk', 'weekday', 'd'])

One feature CSV per store is created and saved (inputs for the 10-model setup).

In [None]:
#for 10 models: build and save one feature CSV per store
store_ids = list(range(10))
for store_id in store_ids:
    df_part = df[df['store_id'] == store_id]
    print(store_id)
    # Field 3 of the "_"-split id is used as the state code (it must match one of
    # the snap_<STATE> column names). The original took an arbitrary id from a set
    # of all ids, i.e. it already assumed every row of a store shares the state,
    # so reading the first row is equivalent and avoids building the set.
    state = df_part['id'].iloc[0].split('_')[3]
    # Drop the SNAP flags of the other states and the (constant) state_id.
    snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
    snap_cols.remove('snap_' + state)
    snap_cols.append('state_id')
    print(snap_cols)
    use_cols = df_part.columns[~df_part.columns.isin(snap_cols)]
    # Explicit copy: create_fea adds columns in place, which on a slice of `df`
    # triggers pandas' SettingWithCopyWarning.
    df_part = df_part[use_cols].copy()
    create_fea(df_part)
    out_name = "input_" + str(store_id) + ".csv"
    print(df_part.shape)
    # Rows without a full lag/rolling history contain NaN features and are discarded.
    df_part = df_part.dropna()
    print(df_part.shape)
    df_part.to_csv(dataset_path + out_name, index=False)


One feature CSV per (store, category) pair is created and saved (inputs for the 30-model setup).

In [None]:
#for 30 models: build and save one feature CSV per (store, category)
# NOTE(review): the state/department cell writes to the same
# "input/input_<a>_<b>.csv" names, so files with a, b in 0..2 are overwritten
# by whichever of the two cells runs last -- confirm this is intended.
store_ids = list(range(10))
cat_ids = list(range(3))
for store_id in store_ids:
    for cat_id in cat_ids:
        df_part = df[(df['store_id'] == store_id) & (df['cat_id'] == cat_id)]
        print(store_id, cat_id)
        # All rows of a store share the state, so the first id is enough
        # (the original read an arbitrary element of a set of all ids).
        state = df_part['id'].iloc[0].split('_')[3]
        snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
        # Only category code 2 keeps its own state's SNAP flag; the other
        # categories drop all three.
        # NOTE(review): presumably code 2 is the category meant to be SNAP-sensitive;
        # verify against how cat_id labels map to integer codes in create_dt.
        if cat_id == 2:
            snap_cols.remove('snap_' + state)
        snap_cols.append('state_id')
        print(snap_cols)
        use_cols = df_part.columns[~df_part.columns.isin(snap_cols)]
        # Explicit copy: create_fea adds columns in place (avoids SettingWithCopyWarning).
        df_part = df_part[use_cols].copy()
        create_fea(df_part)
        out_name = "input_" + str(store_id) + "_" + str(cat_id) + ".csv"
        print(df_part.shape)
        # Rows without a full lag/rolling history contain NaN features and are discarded.
        df_part = df_part.dropna()
        print(df_part.shape)

        df_part.to_csv(dataset_path + "input/" + out_name, index=False)


One feature CSV per (state, department) pair is created and saved (inputs for the 21-model setup).

In [None]:
#for 21 models: build and save one feature CSV per (state, department)
# NOTE(review): the store/category cell writes to the same
# "input/input_<a>_<b>.csv" names, so files with a, b in 0..2 are overwritten
# by whichever of the two cells runs last -- confirm this is intended.
state_ids = list(range(3))
dept_ids = list(range(7))
for state_id in state_ids:
    for dept_id in dept_ids:
        df_part = df[(df['state_id'] == state_id) & (df['dept_id'] == dept_id)]
        print(state_id, dept_id)
        snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
        # Every row is from one state, so the first id is enough
        # (the original read an arbitrary element of a set of all ids).
        state = df_part['id'].iloc[0].split('_')[3]
        # Departments made up only of category code 2 keep their own state's
        # SNAP flag; all others drop the three flags. state_id stays in the CSV
        # here and is excluded later by the training cell's `useless_cols`.
        if set(df_part['cat_id']) == {2}:
            snap_cols.remove('snap_' + state)
        print(snap_cols)
        use_cols = df_part.columns[~df_part.columns.isin(snap_cols)]
        # Explicit copy: create_fea adds columns in place (avoids SettingWithCopyWarning).
        df_part = df_part[use_cols].copy()
        create_fea(df_part)
        out_name = "input_" + str(state_id) + "_" + str(dept_id) + ".csv"
        print(df_part.shape)
        # Rows without a full lag/rolling history contain NaN features and are discarded.
        df_part = df_part.dropna()
        print(df_part.shape)
        df_part.to_csv(dataset_path + "input/" + out_name, index=False)


The LightGBM parameter sets are defined here. Different CSVs were trained with different parameter sets, chosen according to the validation score.

In [None]:
def _tweedie_params(learning_rate, n_estimators, early_stopping_round=None):
    """Build one LightGBM parameter dict.

    The four configurations differ only in learning rate, number of
    estimators and whether `early_stopping_round` is present; everything else
    is shared. The early-stopping key is added only when a value is given.
    """
    cfg = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.5,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'lambda_l2': 0.1,
        'learning_rate': learning_rate,
        'num_leaves': 2**11 - 1,
        'min_data_in_leaf': 2**12 - 1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'n_estimators': n_estimators,
        'boost_from_average': False,
        'verbose': 1,
    }
    if early_stopping_round is not None:
        cfg['early_stopping_round'] = early_stopping_round
    return cfg


params1 = _tweedie_params(learning_rate=0.03, n_estimators=2500)
params2 = _tweedie_params(learning_rate=0.075, n_estimators=5000, early_stopping_round=500)
params3 = _tweedie_params(learning_rate=0.07, n_estimators=4800)
params4 = _tweedie_params(learning_rate=0.03, n_estimators=5000, early_stopping_round=500)

In [None]:
import pickle

The models are trained here. The models used for prediction are trained on the entire data set, since it contains more information; a validation split was used only to choose the parameters.

In [None]:
#for 21 models: train one LightGBM model per (state, department) CSV
cat_feats = ['item_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "Discount", "quarter", "state_id", "cat_id", "dept_id"]
np.random.seed(0)
# NOTE(review): this cell never assigned `params`; it only ran when a value was
# left over from another cell and raised NameError on a fresh top-to-bottom run.
# params1 (the default of the store/category training cell) is assumed here --
# TODO confirm which parameter set was actually used for these models.
params = params1
# NOTE(review): the store/category training cell saves to the same
# "models/<i><j>.bin" names, so models with i, j in 0..2 are overwritten by
# whichever of the two cells runs last.
for i in range(3):
    for j in range(7):
        print(i, j)
        df_inp = pd.read_csv(dataset_path + "input/" + "input_" + str(i) + "_" + str(j) + ".csv")
        y_train = df_inp["sales"]
        train_cols = df_inp.columns[~df_inp.columns.isin(useless_cols)]
        X_train = df_inp[train_cols]

        # While tuning, a random 20% of the rows (np.random.choice without
        # replacement) was held out and passed as `valid_sets`; the final models
        # below are fitted on every row.
        train_data = lgb.Dataset(X_train, label=y_train,
                                 categorical_feature=cat_feats, free_raw_data=False)

        del df_inp, X_train, y_train
        gc.collect()

        # valid_sets is the training data itself, so the logged metric is a training score.
        m_lgb = lgb.train(params, train_data, valid_sets=[train_data], verbose_eval=100)
        model_name = dataset_path + "models/" + str(i) + str(j) + ".bin"
        # Context manager so the file is flushed and closed before the next model is trained.
        with open(model_name, 'wb') as model_file:
            pickle.dump(m_lgb, model_file)
        del m_lgb
        gc.collect()


In [None]:
#for 10 models: train one LightGBM model per store CSV
cat_feats = ['item_id', 'dept_id', 'cat_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "store_id", "Discount", "quarter"]
np.random.seed(10)
# NOTE(review): this cell never assigned `params`; it only ran when a value was
# left over from another cell and raised NameError on a fresh top-to-bottom run.
# params1 (the default of the store/category training cell) is assumed here --
# TODO confirm which parameter set was actually used for these models.
params = params1
for i in range(10):
    print("store-" + str(i))
    df_inp = pd.read_csv(dataset_path + "input_" + str(i) + ".csv")
    y_train = df_inp["sales"]
    train_cols = df_inp.columns[~df_inp.columns.isin(useless_cols)]
    X_train = df_inp[train_cols]
    print(X_train.shape)

    # While tuning, a random 20% of the rows (np.random.choice without
    # replacement) was held out and passed as `valid_sets`; the final models
    # below are fitted on every row.
    train_data = lgb.Dataset(X_train, label=y_train,
                             categorical_feature=cat_feats, free_raw_data=False)
    del df_inp, X_train, y_train
    gc.collect()

    # valid_sets is the training data itself, so the logged metric is a training score.
    m_lgb = lgb.train(params, train_data, valid_sets=[train_data], verbose_eval=100)
    model_name = dataset_path + str(i) + ".bin"
    # Context manager so the file is flushed and closed before the next model is trained.
    with open(model_name, 'wb') as model_file:
        pickle.dump(m_lgb, model_file)
    del m_lgb
    gc.collect()


In [None]:
#for 30 models: train one LightGBM model per (store, category) CSV
import pickle
cat_feats = ['item_id', 'dept_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "store_id", "Discount", "quarter", "cat_id"]
np.random.seed(10)
# (store, category) pairs that use a non-default parameter set; all others use params1.
param_overrides = {
    (2, 2): params3,
    (7, 2): params2,
    (9, 2): params2,
    (0, 2): params2,
    (1, 2): params4,
}
# NOTE(review): the state/department training cell saves to the same
# "models/<i><j>.bin" names, so models with i, j in 0..2 are overwritten by
# whichever of the two cells runs last.
for i in range(10):
    for j in range(3):
        params = param_overrides.get((i, j), params1)

        print("store-" + str(i) + "cat-" + str(j) + str(params['n_estimators']))
        df_inp = pd.read_csv(dataset_path + "input/" + "input_" + str(i) + "_" + str(j) + ".csv")
        y_train = df_inp["sales"]
        train_cols = df_inp.columns[~df_inp.columns.isin(useless_cols)]
        X_train = df_inp[train_cols]
        print(X_train.shape)

        # While tuning, a random 20% of the rows (np.random.choice without
        # replacement) was held out and passed as `valid_sets`; the final models
        # below are fitted on every row.
        train_data = lgb.Dataset(X_train, label=y_train,
                                 categorical_feature=cat_feats, free_raw_data=False)
        del df_inp, X_train, y_train
        gc.collect()

        # valid_sets is the training data itself, so the logged metric (and any
        # early stopping configured in params2/params4) is based on the training score.
        m_lgb = lgb.train(params, train_data, valid_sets=[train_data], verbose_eval=100)
        model_name = dataset_path + "models/" + str(i) + str(j) + ".bin"
        # Context manager so the file is flushed and closed before the next model is trained.
        with open(model_name, 'wb') as model_file:
            pickle.dump(m_lgb, model_file)
        del m_lgb
        gc.collect()


In [None]:
#for 10 models
# NOTE: this notebook defines get_test_data three times with different
# signatures; run this cell right before the 10-model prediction cell.
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "store_id", "Discount", "quarter"]
def get_test_data(df, ind, drop_cols=tuple(useless_cols)):
    """Return the model-input columns for the rows of store `ind`.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing at least `id` and `store_id`.
    ind : int
        Store code to select.
    drop_cols : sequence of str
        Non-feature columns to exclude. Defaults to the value `useless_cols`
        had when this cell ran, so later reassignments of that global by
        other cells do not change what this function drops.

    Returns
    -------
    pandas.DataFrame
        Rows of the store without `drop_cols`, `state_id` and the SNAP flags
        of the other states (the store's own snap_<STATE> column is kept).

    Raises IndexError when no row matches `ind`.
    """
    test_df = df.loc[df.store_id == ind]
    # Field 3 of the "_"-split id is used as the state code; all rows of a store
    # share it, so the first row is enough (no need to build a set of all ids).
    state = test_df['id'].iloc[0].split('_')[3]
    snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
    snap_cols.remove('snap_' + state)
    snap_cols.append('state_id')
    snap_cols.extend(drop_cols)
    use_cols = test_df.columns[~test_df.columns.isin(snap_cols)]
    return test_df[use_cols]

In [None]:
# for 30 models
# NOTE: this notebook defines get_test_data three times with different
# signatures; run this cell right before the 30-model prediction cell.
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "store_id", "Discount", "quarter", "cat_id"]
def get_test_data(df, ind, cat_ind, drop_cols=tuple(useless_cols)):
    """Return the model-input columns for the rows of store `ind` and category `cat_ind`.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing at least `id`, `store_id` and `cat_id`.
    ind : int
        Store code to select.
    cat_ind : int
        Category code to select.
    drop_cols : sequence of str
        Non-feature columns to exclude. Defaults to the value `useless_cols`
        had when this cell ran, so later reassignments of that global by
        other cells do not change what this function drops.

    Returns
    -------
    pandas.DataFrame
        Matching rows without `drop_cols` and `state_id`. For category code 2
        the store's own snap_<STATE> column is kept; for every other category
        all three SNAP columns are removed.

    Raises IndexError when no row matches.
    """
    test_df = df.loc[(df.store_id == ind) & (df.cat_id == cat_ind)]
    # Field 3 of the "_"-split id is used as the state code; all rows of a store
    # share it, so the first row is enough (no need to build a set of all ids).
    state = test_df['id'].iloc[0].split('_')[3]
    snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
    if cat_ind == 2:
        snap_cols.remove('snap_' + state)
    snap_cols.append('state_id')
    snap_cols.extend(drop_cols)
    use_cols = test_df.columns[~test_df.columns.isin(snap_cols)]
    return test_df[use_cols]

In [None]:
# for 21 models
# NOTE: this notebook defines get_test_data three times with different
# signatures; run this cell right before the 21-model prediction cell.
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "Discount", "quarter", "state_id", "cat_id", "dept_id"]
def get_test_data(df, ind, dept_ind, drop_cols=tuple(useless_cols)):
    """Return the model-input columns for the rows of state `ind` and department `dept_ind`.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing at least `id`, `state_id`, `dept_id` and `cat_id`.
    ind : int
        State code to select.
    dept_ind : int
        Department code to select.
    drop_cols : sequence of str
        Non-feature columns to exclude. Defaults to the value `useless_cols`
        had when this cell ran, so later reassignments of that global by
        other cells do not change what this function drops.

    Returns
    -------
    pandas.DataFrame
        Matching rows without `drop_cols`. When the selected rows consist only
        of category code 2, the state's own snap_<STATE> column is kept;
        otherwise all three SNAP columns are removed.

    Raises IndexError when no row matches.
    """
    test_df = df.loc[(df.state_id == ind) & (df.dept_id == dept_ind)]
    # Field 3 of the "_"-split id is used as the state code; every selected row
    # is from one state, so the first row is enough (no need to build a set of all ids).
    state = test_df['id'].iloc[0].split('_')[3]
    snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
    if set(test_df['cat_id']) == {2}:
        snap_cols.remove('snap_' + state)
    snap_cols.extend(drop_cols)
    use_cols = test_df.columns[~test_df.columns.isin(snap_cols)]
    return test_df[use_cols]

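Predictions are generated recursively: the forecast for each day is written back into the `sales` column, so it feeds the lag and rolling features used to predict the following days.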

In [None]:
%%time
#for 21 models: recursive 28-day forecast with one model per (state, department)
import pickle
#alphas = [1.028, 1.023, 1.018]
# Each alpha scales the predictions of one full forecast pass; passes are averaged with `weights`.
alphas = [1]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    te = te.drop(['wm_yr_wk', 'weekday', 'd'], axis=1)
    cols = [f"F{i}" for i in range(1,29)]
    for tdelta in range(28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Features for `day` need the preceding max_lags days, including
        # predictions already written back for earlier forecast days.
        tst = te[((te.date >= fday - timedelta(days=max_lags)) & (te.date <= day))].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day]

        for i in range(3):
            for j in range(7):
                tst_df = get_test_data(tst, i, j)
                print(i, j, tst_df.shape)
                model_path = dataset_path + "models/" + str(i) + str(j) + ".bin"
                # pickle.load can execute arbitrary code: only load files written
                # by this notebook's training cells. The handle is closed right away.
                with open(model_path, 'rb') as model_file:
                    estimator = pickle.load(model_file)
                res = alpha*estimator.predict(tst_df)
                # Relies on tst_df and the masked rows of `te` being in the same row order.
                te.loc[(te.date == day) & (te.state_id == i) & (te.dept_id==j), "sales"] = res

    # Long -> wide: one row per id with columns F1..F28.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


# The ids come from sales_train_evaluation.csv, so the copy must rename the
# "evaluation" suffix to "validation" (as the 30-model cell does). The previous
# "validation$" -> "evaluation" replacement matched nothing and wrote every id twice.
# regex=True is explicit because pandas >= 2.0 treats the pattern literally by default.
# NOTE(review): the 10-model cell writes to the same submission.csv path.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("evaluation$", "validation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv(dataset_path+"submission.csv",index=False)

In [None]:
%%time
#for 10 models: recursive 28-day forecast with one model per store
import pickle
#alphas = [1.028, 1.023, 1.018]
# Each alpha scales the predictions of one full forecast pass; passes are averaged with `weights`.
alphas = [1]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    te = te.drop(['wm_yr_wk', 'weekday', 'd'], axis=1)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Features for `day` need the preceding max_lags days, including
        # predictions already written back for earlier forecast days.
        tst = te[((te.date >= fday - timedelta(days=max_lags)) & (te.date <= day))].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day]

        for i in range(10):
            tst_df = get_test_data(tst, i)
            print(i, tst_df.shape)
            model_path = dataset_path + str(i) + ".bin"
            # pickle.load can execute arbitrary code: only load files written
            # by this notebook's training cells. The handle is closed right away.
            with open(model_path, 'rb') as model_file:
                estimator = pickle.load(model_file)
            res = alpha*estimator.predict(tst_df)
            # Relies on tst_df and the masked rows of `te` being in the same row order.
            te.loc[(te.date == day) & (te.store_id == i) , "sales"] = res

    # Long -> wide: one row per id with columns F1..F28.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


# The ids come from sales_train_evaluation.csv, so the copy must rename the
# "evaluation" suffix to "validation" (as the 30-model cell does). The previous
# "validation$" -> "evaluation" replacement matched nothing and wrote every id twice.
# regex=True is explicit because pandas >= 2.0 treats the pattern literally by default.
# NOTE(review): the 21-model cell writes to the same submission.csv path.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("evaluation$", "validation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv(dataset_path+"submission.csv",index=False)

In [None]:
%%time
#for 30 models: recursive 28-day forecast with one model per (store, category)
import pickle
#alphas = [1.028, 1.023, 1.018]
# Each alpha scales the predictions of one full forecast pass; passes are averaged with `weights`.
alphas = [1]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    te = te.drop(['wm_yr_wk', 'weekday', 'd'], axis=1)
    cols = [f"F{i}" for i in range(1,29)]
    for tdelta in range(28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Features for `day` need the preceding max_lags days, including
        # predictions already written back for earlier forecast days.
        tst = te[((te.date >= fday - timedelta(days=max_lags)) & (te.date <= day))].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day]

        for i in range(10):
            for j in range(3):
                tst_df = get_test_data(tst, i, j)
                print(i, j, tst_df.shape)
                model_path = dataset_path + "models/" + str(i) + str(j) + ".bin"
                # pickle.load can execute arbitrary code: only load files written
                # by this notebook's training cells. The handle is closed right away.
                with open(model_path, 'rb') as model_file:
                    estimator = pickle.load(model_file)
                res = alpha*estimator.predict(tst_df)
                # Relies on tst_df and the masked rows of `te` being in the same row order.
                te.loc[(te.date == day) & (te.store_id == i) & (te.cat_id==j), "sales"] = res

    # Long -> wide: one row per id with columns F1..F28.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


# Duplicate the forecasts under the "validation" ids. regex=True is explicit
# because pandas >= 2.0 treats the pattern literally by default, in which case
# "evaluation$" would match nothing and every id would be written twice.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("evaluation$", "validation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv(dataset_path+"sub.csv",index=False)