In [1]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import random
import gc
import lightgbm as lgb
import joblib
from lightgbm import LGBMRegressor
from hyperopt import hp, tpe, fmin, Trials, rand, anneal

In [2]:
%%time
# Folder holding the raw CSV files; the trailing slash lets file names be
# appended directly.
base_path = "./data_sets/"

# Read the four input tables: calendar, sales history, prices and the sample
# submission.
calendar = pd.read_csv(base_path + "calendar.csv")
train_eva = pd.read_csv(base_path + "train/sales_train_evaluation.csv")
sell_prices = pd.read_csv(base_path + "sell_prices.csv")
sample_sub = pd.read_csv(base_path + "sample_submission.csv")

CPU times: total: 312 ms
Wall time: 3.1 s


In [3]:
# Append zero-filled int16 placeholder columns d_1942 ... d_1969 to the sales
# table so that those days exist as rows once the table is melted.
for day in range(1942, 1970):
    train_eva[f"d_{day}"] = np.zeros(len(train_eva), dtype=np.int16)

In [8]:
def _smallest_fitting_dtype(series, candidates, limits, fallback):
    """Return the first dtype in ``candidates`` whose range strictly contains the series' min and max."""
    lowest, highest = series.min(), series.max()
    for candidate in candidates:
        bounds = limits(candidate)
        # Strict comparisons: a value equal to the dtype's limit moves on to
        # the next wider dtype. An empty / all-NaN column yields NaN here, so
        # every comparison is False and the fallback is used.
        if lowest > bounds.min and highest < bounds.max:
            return candidate
    return fallback


def downcast(df):
    """Shrink the column dtypes of ``df`` in place to save memory and return it.

    * Integer columns are cast to the narrowest of int8 / int16 / int32 /
      int64 that holds their values.
    * Float columns are cast to the narrowest of float16 / float32 / float64.
    * Text columns (object or string dtype): the column named ``'date'`` is
      parsed as ``%Y-%m-%d`` datetimes, every other one becomes ``category``.

    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col, dtype in df.dtypes.items():
        dtype_name = str(dtype)
        if 'int' in dtype_name:
            target = _smallest_fitting_dtype(
                df[col], (np.int8, np.int16, np.int32), np.iinfo, np.int64)
            df[col] = df[col].astype(target)
        elif 'float' in dtype_name:
            target = _smallest_fitting_dtype(
                df[col], (np.float16, np.float32), np.finfo, np.float64)
            df[col] = df[col].astype(target)
        elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            # read_csv delivers text as object (or string) dtype. Comparing
            # the dtype with np.str_ never matched, so this branch used to be
            # dead code.
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

In [9]:
%%time
print("Downcasting data")
# Compress the three large tables one after another, in the same order as
# before; downcast() returns the frame it was given.
train_eva, sell_prices, calendar = (
    downcast(table) for table in (train_eva, sell_prices, calendar)
)

Downcasting data
CPU times: total: 78.1 ms
Wall time: 410 ms


In [10]:
%%time
print("Melting data")
# Wide -> long: the six identifier columns are kept, every remaining column
# name goes into "d" and its value into "sold".
df = train_eva.melt(
    id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
    var_name="d",
    value_name="sold",
)

Melting data
CPU times: total: 516 ms
Wall time: 3.23 s


In [11]:
%%time
print("Merging data")
# Left joins keep every sales row even when no calendar / price row matches.
# Calendar attributes are attached through the day label "d".
df = df.merge(calendar, how="left", on="d")
# Prices are attached through the (store_id, item_id, wm_yr_wk) key.
df = df.merge(sell_prices, how="left", on=["store_id", "item_id", "wm_yr_wk"])

Merging data
CPU times: total: 5.09 s
Wall time: 37.5 s


In [12]:
%%time
print("Implement features")
# SNAP (Supplemental Nutrition Assistance Program) flag: 1 when at least one of
# snap_CA / snap_TX / snap_WI is set on the row, otherwise 0.
# NOTE(review): the flag is not restricted to the row's own state_id - confirm
# that this is intended.
df["snap"] = (df["snap_CA"] + df["snap_TX"] + df["snap_WI"] >= 1).astype(np.int8)

# Strip the two-character prefix from the day label and store the day as int16.
df["d"] = df["d"].str.slice(2).astype(np.int16)

# Missing prices are replaced by the median price of the same id.
df["sell_price"] = df["sell_price"].fillna(
    df.groupby("id")["sell_price"].transform("median")
)

# Weekend flag: 1 for wday values below 3.
# presumably wday 1 and 2 are Saturday and Sunday - verify against calendar.csv
df["weekend"] = (df["wday"] < 3).astype(np.int8)

# Remove the columns that are not used from here on.
df = df.drop(
    columns=[
        "date", "weekday", "wm_yr_wk",
        "event_name_2", "event_type_2",
        "snap_CA", "snap_TX", "snap_WI",
    ]
)

Implement features
CPU times: total: 2.41 s
Wall time: 15.4 s


In [19]:
# Label Encoder
print("Label Encoding")
# The cross-validation code further down filters on integer store codes
# (data["store_id"] == i) and translates codes back to names through d_store
# and d_id, so this encoding has to run on a fresh kernel.
label_cols = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id",
              "event_name_1", "event_type_1"]
# Guard against re-running the cell: once "id" is numeric the columns are
# already encoded and the existing d_id / d_store must not be overwritten.
if not pd.api.types.is_numeric_dtype(df["id"]):
    for col in label_cols:
        # Works for both object and category columns (no-op for the latter).
        df[col] = df[col].astype("category")
    # code -> original label, built from the category list instead of zipping
    # over every row of the frame.
    d_id = dict(enumerate(df["id"].cat.categories))
    d_store = dict(enumerate(df["store_id"].cat.categories))
    for col in label_cols:
        # Missing values (e.g. days without an event) get the code -1.
        df[col] = df[col].cat.codes

Label Encoding


In [20]:
%%time
print("Calulating Lags and Rolling mean")
# Every lag is larger than 28, presumably so that a lag feature never looks
# inside the 28-day window that is predicted - confirm.
lags = [29, 30, 31, 32, 33, 34, 35, 40, 55, 60, 65, 180]
# Build the grouped view once and reuse it for every lag.
sold_by_series = df.groupby(
    ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    as_index=False,
)['sold']
for lag in lags:
    df[f'sold_lag_{lag}'] = sold_by_series.shift(lag).astype(np.float16)

# Rolling means over sales shifted back by 28 days, one column per window.
sold_by_id = df.groupby(['id'])['sold']
for window in (7, 14, 30, 60, 180):
    df[f'rolling_mean_{window}'] = sold_by_id.transform(
        lambda sales, w=window: sales.shift(28).rolling(w).mean()
    )

# Drop the helper objects so they do not keep the sales data alive once df is
# deleted later in the notebook.
del sold_by_series, sold_by_id

Calulating Lags and Rolling mean
CPU times: total: 37 s
Wall time: 4min 20s


In [None]:
# Drop the first 28 + 180 = 208 days: the longest rolling feature shifts by 28
# days and then averages over 180, so earlier rows have no complete history.
df = df.loc[df["d"] > 28 + 180]

In [None]:
# Persist the engineered feature table to data.pkl, then drop the in-memory
# reference and run the garbage collector so the memory can be reclaimed
# before the table is read back from disk.
df.to_pickle('data.pkl')
del df
gc.collect()

In [None]:
%%time
# Read the feature table written to data.pkl back into memory.
data = pd.read_pickle('data.pkl')
# Keep only id / day / sales for two day ranges: days 1914-1941 and the
# zero-filled placeholder days from 1942 on.
valid = data.loc[(data['d'] >= 1914) & (data['d'] < 1942), ['id', 'd', 'sold']]
test = data.loc[data['d'] >= 1942, ['id', 'd', 'sold']]

In [None]:
# LightGBM hyperparameters per store, copied over from the separate
# hyperparameter-tuning notebook.
# Keys are store names; cross_validation() looks the entries up by store name
# and casts max_depth and num_leaves to integers before training.
# presumably these values are the result of the hyperopt search - confirm
best_params_store = (
{'CA_1': {'colsample_bytree': 0.8,
  'learning_rate': 0.2,
  'max_depth': 7.0,
  'min_child_weight': 300.0,
  'num_leaves': 200.0,
  'subsample': 0.9},
 'CA_2': {'colsample_bytree': 0.9,
  'learning_rate': 0.30000000000000004,
  'max_depth': 8.0,
  'min_child_weight': 200.0,
  'num_leaves': 200.0,
  'subsample': 0.8},
 'CA_3': {'colsample_bytree': 0.9,
  'learning_rate': 0.2,
  'max_depth': 8.0,
  'min_child_weight': 300.0,
  'num_leaves': 250.0,
  'subsample': 0.9},
 'CA_4': {'colsample_bytree': 0.9,
  'learning_rate': 0.30000000000000004,
  'max_depth': 7.0,
  'min_child_weight': 300.0,
  'num_leaves': 200.0,
  'subsample': 0.9},
 'TX_1': {'colsample_bytree': 0.8,
  'learning_rate': 0.2,
  'max_depth': 7.0,
  'min_child_weight': 300.0,
  'num_leaves': 200.0,
  'subsample': 1.0},
 'TX_2': {'colsample_bytree': 1.0,
  'learning_rate': 0.30000000000000004,
  'max_depth': 8.0,
  'min_child_weight': 300.0,
  'num_leaves': 200.0,
  'subsample': 0.9},
 'TX_3': {'colsample_bytree': 0.8,
  'learning_rate': 0.2,
  'max_depth': 7.0,
  'min_child_weight': 300.0,
  'num_leaves': 250.0,
  'subsample': 0.9},
 'WI_1': {'colsample_bytree': 0.9,
  'learning_rate': 0.2,
  'max_depth': 8.0,
  'min_child_weight': 400.0,
  'num_leaves': 250.0,
  'subsample': 0.8},
 'WI_2': {'colsample_bytree': 1.0,
  'learning_rate': 0.2,
  'max_depth': 8.0,
  'min_child_weight': 200.0,
  'num_leaves': 250.0,
  'subsample': 0.9},
 'WI_3': {'colsample_bytree': 0.9,
  'learning_rate': 0.2,
  'max_depth': 8.0,
  'min_child_weight': 400.0,
  'num_leaves': 250.0,
  'subsample': 0.9}})

In [None]:
%%time
# This cell repeats the load done by an identical cell earlier in the notebook.
# Re-reading the large pickle is skipped when `data` is already in memory; on
# a fresh kernel, or when the cell is run on its own, the table is loaded.
if "data" not in globals():
    data = pd.read_pickle('data.pkl')
# Keep only id / day / sales for days 1914-1941 and for days from 1942 on.
valid = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]
test = data[data['d']>=1942][['id','d','sold']]

In [None]:
def cross_validation(data, valid_first_day, params, day_start, d_store=None, sample_sub=None, d_id=None):
    """Train one LightGBM model per store and predict a 28-day validation window.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table with at least the columns ``id``, ``d``, ``store_id``
        and ``sold``. ``store_id`` must hold the integer store codes 0-9;
        every column except ``sold`` is passed to LightGBM as a feature.
    valid_first_day : int
        First day of the window ``[valid_first_day, valid_first_day + 28)``
        that is predicted.
    params : dict
        Per-store hyperparameters keyed by store name; each entry provides
        ``learning_rate``, ``subsample``, ``colsample_bytree``,
        ``min_child_weight``, ``max_depth`` and ``num_leaves``.
    day_start : int
        First training day (inclusive); training ends the day before
        ``valid_first_day``.
    d_store : dict, optional
        Maps store code -> store name. Defaults to the notebook-level
        ``d_store``, looked up when the function is called.
    sample_sub : pandas.DataFrame, optional
        Sample submission; the ``id`` values of its first 30490 rows fix the
        rows and row order of the result. Defaults to the notebook-level
        ``sample_sub``, looked up when the function is called.
    d_id : dict, optional
        Maps id code -> id string. Defaults to the notebook-level ``d_id``,
        looked up when the function is called.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``F1`` ... ``F28`` holding the predictions.
    """
    # Resolve the notebook-level lookups at call time. Binding them as default
    # argument values froze whatever existed when the cell was defined and
    # raised NameError when they had not been created yet.
    if d_store is None:
        d_store = globals()["d_store"]
    if sample_sub is None:
        sample_sub = globals()["sample_sub"]
    if d_id is None:
        d_id = globals()["d_id"]

    horizon = 28
    valid_end_day = valid_first_day + horizon  # exclusive

    in_window = (data['d'] >= valid_first_day) & (data['d'] < valid_end_day)
    # Work on an independent copy with a float target column: predictions are
    # floats and must not be written into an integer slice of `data`.
    valid = data.loc[in_window, ['id', 'd', 'sold']].copy()
    valid['sold'] = valid['sold'].astype(np.float64)
    print(f"Valid first day {valid_first_day} predicting")
    for store_code in range(10):
        # Forecast each store separately.
        store_df = data[data["store_id"] == store_code]
        store_params = params[d_store[store_code]]

        # Training rows: [day_start, valid_first_day).
        is_train = (store_df['d'] >= day_start) & (store_df['d'] < valid_first_day)
        # Validation rows: [valid_first_day, valid_first_day + 28).
        is_valid = (store_df['d'] >= valid_first_day) & (store_df['d'] < valid_end_day)

        X_train = store_df.loc[is_train].drop('sold', axis=1)
        y_train = store_df.loc[is_train, 'sold']
        X_valid = store_df.loc[is_valid].drop('sold', axis=1)
        y_valid = store_df.loc[is_valid, 'sold']

        train_set = lgb.Dataset(X_train, y_train)
        valid_set = lgb.Dataset(X_valid, y_valid)

        # NOTE(review): early stopping monitors the same window that is scored
        # afterwards, so the resulting CV score is optimistic.
        model = lgb.train(
            params={'objective': 'tweedie',
                    'force_row_wise': True,
                    'verbose': -1,
                    'n_estimators': 1000,
                    'learning_rate': store_params["learning_rate"],
                    'subsample': store_params["subsample"],
                    'colsample_bytree': store_params["colsample_bytree"],
                    'min_child_weight': store_params["min_child_weight"],
                    'max_depth': int(store_params["max_depth"]),
                    'num_leaves': int(store_params["num_leaves"])},
            train_set=train_set,
            valid_sets=[valid_set],
            # The `early_stopping_rounds` / `verbose_eval` keyword arguments
            # were removed from lgb.train in LightGBM 4.0; the callback is the
            # supported way to request early stopping.
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )

        # X_valid keeps the row labels of `data`, so the predictions land on
        # the matching rows of `valid`.
        valid.loc[X_valid.index, "sold"] = model.predict(X_valid)

    # Back from integer codes to the original id strings, one column per day.
    valid["id"] = valid["id"].map(d_id)
    valid = valid.pivot(index="id", columns="d", values="sold").reset_index()
    valid["id"] = valid["id"].str.replace("evaluation", "validation", regex=False)

    # presumably the first 30490 rows of the sample submission are the
    # "validation" ids - verify against sample_submission.csv
    submission_ids = sample_sub[["id"]][:30490]

    f_col = ["id"] + [f"F{i}" for i in range(1, horizon + 1)]

    print(f"Valid testset from day {valid.columns[1]} to day {valid.columns[-1]}")

    out_val = pd.merge(left=submission_ids, right=valid, on="id")
    out_val.columns = f_col

    return out_val

def avg_rmsse_score(out_val, train_eva, valid_first_day, day_start):
    """Average, over all rows, of sqrt(model MSE / naive MSE).

    Parameters
    ----------
    out_val : pandas.DataFrame
        Predictions; the first column is skipped and the remaining columns
        are compared with the 28 days starting at ``valid_first_day``.
    train_eva : pandas.DataFrame
        Sales table whose day columns are labelled with integer day numbers.
    valid_first_day : int
        First day of the scored 28-day window.
    day_start : int
        First day of the history used for the naive one-step forecast; the
        history ends the day before ``valid_first_day``.

    Returns
    -------
    float
        Mean of the per-row scaled errors.

    Notes
    -----
    assumes the rows of ``out_val`` and ``train_eva`` are in the same order -
    TODO confirm. A row whose history never changes has a naive MSE of 0 and
    produces inf / nan.
    """
    last_valid_day = valid_first_day + 28 - 1
    print(f"Scoring from {valid_first_day} to {last_valid_day}")
    print(f"Naive first day {day_start}")

    history_days = list(range(day_start, valid_first_day))
    horizon_days = list(range(valid_first_day, valid_first_day + 28))

    # Naive forecast = previous day's value, so its error is the day-to-day
    # change; int32 keeps the squared differences from overflowing.
    history = train_eva[history_days].to_numpy().astype(np.int32)
    day_to_day_change = history[:, :-1] - history[:, 1:]
    naive_mse = (day_to_day_change ** 2).mean(axis=1)

    forecast = out_val.iloc[:, 1:].to_numpy()
    actual = train_eva[horizon_days].to_numpy()
    model_mse = ((forecast - actual) ** 2).mean(axis=1)

    return np.sqrt(model_mse / naive_mse).mean()

In [None]:
# Keep the six identifier column names and relabel the day columns with the
# integers 1 ... 1969, so avg_rmsse_score can select days by number.
train_eva.columns = [*train_eva.columns[:6], *range(1, 1970)]

In [None]:
# Validation folds, identified by their first day, and an empty score store.
# NOTE(review): both names are assigned again at the top of the CV loop cell
# that follows.
cv_score = {}
valid_first_days_list = [1858, 1886, 1914]

In [None]:
%%time
# First day of each 28-day validation fold.
valid_first_days_list = [1858, 1886, 1914]
# First training day handed to cross_validation for the first fold.
day_start = 209
# First history day used for the naive benchmark in avg_rmsse_score.
day_start_naive = 1
# Fold's first validation day -> score of that fold.
cv_score = dict()
for i in valid_first_days_list:
    out_df_cv = cross_validation(data=data, valid_first_day=i, day_start=day_start, params=best_params_store)
    cv_score[i] = avg_rmsse_score(out_val=out_df_cv, train_eva=train_eva, valid_first_day=i, day_start=day_start_naive)
    # Slide both window starts forward by 28 days for the next fold.
    day_start += 28
    day_start_naive += 28

In [None]:
# Score of every fold, followed by the mean over all stored folds.
for first_day in valid_first_days_list:
    print(f"Score {first_day}", cv_score[first_day])

print("CV score", np.mean([*cv_score.values()]))