In [None]:
import lightgbm as lgb
from lightgbm import LGBMRegressor as gbm
from sklearn.ensemble import RandomForestRegressor
from tqdm.auto import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Restrict the training frame to the rows of store number 3; every experiment
# below works on this single-store subset.
data = train_data[train_data['store_nbr'].eq(3)]
data

In [None]:
def cross_validation_method(df,method,test_start_date,train_size=365*2,test_size=15,cv_steps=30,window_shift=1):
    """Build the (start, end) date windows for time-series cross-validation.

    Parameters
    ----------
    df : any
        Not read by this function; kept so existing callers keep working.
    method : {'sliding', 'expanding'}
        'sliding' moves the left edge of the training window forward with
        every fold, 'expanding' keeps the left edge fixed so the training
        window grows.
    test_start_date : datetime-like
        First day of the first validation window.
    train_size : int
        Number of days between the initial training start and test_start_date.
    test_size : int
        Length of every validation window in days (both ends inclusive).
    cv_steps : int
        Number of folds to generate.
    window_shift : int
        Number of days the windows move forward between consecutive folds.

    Returns
    -------
    (train_indices_list, cv_indices_list)
        Two lists of length cv_steps holding (start_date, end_date) tuples of
        datetime.date objects; both bounds are inclusive.

    Raises
    ------
    ValueError
        If method is neither 'sliding' nor 'expanding'.
    """
    if method not in ('sliding', 'expanding'):
        # Previously an unknown method silently produced two empty lists.
        raise ValueError(
            "method must be 'sliding' or 'expanding', got {!r}".format(method))

    train_indices_list=[]
    cv_indices_list=[]

    for step in range(cv_steps):
        # Days by which this fold is moved forward from the first fold.
        offset = step * window_shift
        # Only the sliding scheme drags the training start along with the fold.
        start_offset = offset if method == 'sliding' else 0

        train_start_date=(test_start_date - pd.Timedelta(train_size - start_offset,'days')).date()
        # Training ends the day before validation starts.
        train_end_date=(test_start_date + pd.Timedelta(offset - 1, 'days')).date()
        train_indices_list.append((train_start_date,train_end_date))

        cv_start_date=(test_start_date + pd.Timedelta(offset,'days')).date()
        # Inclusive end: a window of test_size days ends test_size - 1 days after its start.
        cv_end_date=(test_start_date + pd.Timedelta(offset + test_size - 1, 'days')).date()
        cv_indices_list.append((cv_start_date,cv_end_date))

    return train_indices_list, cv_indices_list

# Preview the fold boundaries produced by the expanding scheme.
# pd.Timestamp is used because the pd.datetime alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
cross_validation_method(data,
                        method='expanding',
                        test_start_date=pd.Timestamp(2017,1,1),
                        train_size=365*3,
                        test_size=15,
                        cv_steps=30)

In [None]:
# Data Processing

def cartesian(df1,df2):
    """Return the cartesian product of two dataframes.

    Every row of df1 is paired with every row of df2 by merging both frames
    on a temporary constant column, which is dropped from the result.
    Neither input frame is modified.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        len(df1) * len(df2) rows holding the columns of df1 followed by the
        columns of df2.
    """
    # Pick a temporary join-column name that neither frame already uses.
    key='key'
    while key in df1.columns or key in df2.columns:
        key='_'+key
    key_d={key:0}
    # All rows share the same key value, so the merge pairs every row with every row.
    return pd.merge(
        df1.assign(**key_d), df2.assign(**key_d), on=key).drop(key,axis=1)


def data_preprocessing(df,holiday_events_df,oil_df,lag_days=[1,7,30],rolling_days=[7,30,60]):
    """Build the modelling table: calendar, lag, rolling, oil and holiday features.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales rows; the columns 'date', 'store_nbr', 'family' and 'sales' are read.
    holiday_events_df : pandas.DataFrame
        Holiday/event calendar; the columns 'date' and 'type' are read.
    oil_df : pandas.DataFrame
        Oil prices with the columns 'date' and 'dcoilwtico'.
    lag_days : list of int
        One 'lag_<n>' column is created per entry.
    rolling_days : list of int
        One 'rolling_mean_<n>' column is created per entry.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns added. None of the three
        input frames is modified.
    """
    # Parse dates on a copy so the caller's frame stays untouched and the merge
    # with the datetime calendar below also works when 'date' holds strings.
    df = df.assign(date=pd.to_datetime(df['date']))

    # Full daily calendar for every store in the data.
    date_df = pd.DataFrame({'date': pd.date_range(start=df['date'].min(), end=df['date'].max())})
    unique_store_df=pd.DataFrame({'store_nbr':df['store_nbr'].unique()})
    date_df = cartesian(date_df,unique_store_df)

    df=date_df.merge(df,how='left',on=['date','store_nbr'])
    # NOTE(review): this filter also removes the empty calendar rows created by
    # the merge above (NaN > 0 is False), so the forward-fill of 'sales' further
    # down finds nothing to fill - confirm whether that is intended.
    # .copy() makes the column assignments below act on an independent frame
    # rather than on a slice of the merged one.
    df=df[df['sales']>0.0].copy()

    enc=OrdinalEncoder()
    df['family']=enc.fit_transform(df[['family']])

    #Create date features
    df['day_of_month']=df['date'].dt.day
    df['day_of_week']=df['date'].dt.dayofweek
    df['day_of_year']=df['date'].dt.dayofyear
    df['month']=df['date'].dt.month
    df['year']=df['date'].dt.year
    # dayofweek counts Monday=0 ... Sunday=6, so the weekend is 5 and 6.
    # The previous '> 5' test flagged Sundays only.
    df['is_weekend']=(df['day_of_week'] >= 5).astype(np.int8)

    #handling null values
    df['sales']=df.groupby(['store_nbr','day_of_week'])['sales'].ffill()

    #creating lag features
    # NOTE(review): the groups include day_of_week, so each shifted position is
    # the previous row of the same weekday, not the previous calendar day -
    # confirm that lag_days / rolling_days are meant to count such rows.
    SHIFT = 15
    weekday_groups = ['store_nbr','family','day_of_week']
    for lag in lag_days:
        df[f'lag_{lag}']=df.groupby(weekday_groups)['sales'].transform(lambda x: x.shift(SHIFT+lag))

    #creating rolling features
    for window in rolling_days:
        df[f'rolling_mean_{window}']=df.groupby(weekday_groups)['sales'].transform(lambda x:x.shift(SHIFT).rolling(window,min_periods=1).mean())

    #merging oil data (on a renamed copy, the caller's oil frame is not changed)
    oil_df = oil_df.assign(date=pd.to_datetime(oil_df['date'])).rename(columns={"dcoilwtico": "oil_price"})
    df=df.merge(oil_df,how='left',on='date')

    #filling in missing values
    df['oil_price']=df['oil_price'].ffill()
    if df['oil_price'].isna().any():
        # Rows ahead of the first known price have nothing to forward-fill from.
        # As before they receive the rounded mean price of 2013-01-02; when that
        # day is absent or empty the next known price is used instead of
        # failing on round(NaN).
        reference_price=df.loc[df['date']=='2013-01-02','oil_price'].mean()
        if pd.notna(reference_price):
            df['oil_price']=df['oil_price'].fillna(round(reference_price))
        else:
            df['oil_price']=df['oil_price'].bfill()

    #merging holiday data
    # Every date that has a calendar entry with a type is flagged with 1, all
    # other dates with 0. The flag is derived with isin() rather than a merge
    # because a date can have several calendar entries, and merging would
    # duplicate the sales rows of such a date.
    # NOTE(review): entries of every type are flagged alike - verify this is
    # wanted for all types present in the calendar.
    typed_entries = holiday_events_df['type'].notna()
    flagged_dates = pd.to_datetime(holiday_events_df.loc[typed_entries,'date']).unique()
    df['day_type']=df['date'].isin(flagged_dates).astype(int)

    return df

In [None]:
def model_CV(df,cv_method,cv_start_date,lgb_params,train_size=365*2,test_size=15,cv_steps=5,window_shift=1,lag_days=[1,7,30],rolling_days=[7,30,60]):
    """Cross-validate a LightGBM regressor over rolling date windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling table containing 'date', 'sales' and the feature columns;
        every column other than 'sales' and 'date' is used as a feature.
    cv_method : {'sliding', 'expanding'}
        Windowing scheme forwarded to cross_validation_method.
    cv_start_date : datetime-like
        First day of the first validation window.
    lgb_params : dict
        Parameters applied to the regressor through set_params.
    train_size, test_size, cv_steps, window_shift : int
        Forwarded unchanged to cross_validation_method.
    lag_days, rolling_days : list of int
        Not read by this function; kept for backward compatibility.

    Returns
    -------
    (pred_df, feature_importance_df, mape)
        pred_df and mape belong to the LAST fold only; feature_importance_df
        holds the importance of every feature averaged over all folds.

    Raises
    ------
    ValueError
        If no fold was generated (for example cv_steps=0).
    """
    train_indices_list, valid_indices_list = cross_validation_method(df,
                        cv_method,test_start_date=cv_start_date,
                        train_size=train_size,
                        test_size=test_size,
                        cv_steps=cv_steps,window_shift=window_shift)

    features = df.columns[~df.columns.isin(['sales','date'])]
    # Each name is listed once, and only names that are actual feature columns
    # are handed to LightGBM as categorical.
    candidate_cat_features = ['family','onpromotion','day_of_month','day_of_week','month','year','day_of_year','is_weekend']
    cat_features = [name for name in candidate_cat_features if name in features]

    feature_importance_df=pd.DataFrame()
    feature_importance_df['name']=features
    feature_importance_df['imp']=0
    fold=0

    avg_mape = 0
    avg_wmape = 0
    pred_df = None
    mape = None
    for train_dates, valid_dates in tqdm(zip(train_indices_list,valid_indices_list), total=len(train_indices_list)):

        # Both window bounds are inclusive.
        train_mask = (df['date'] >= pd.to_datetime(train_dates[0])) & (df['date'] <= pd.to_datetime(train_dates[1]))
        valid_mask = (df['date'] >= pd.to_datetime(valid_dates[0])) & (df['date'] <= pd.to_datetime(valid_dates[1]))

        X_train, y_train = df.loc[train_mask, features], df.loc[train_mask, 'sales']
        X_valid, y_valid = df.loc[valid_mask, features], df.loc[valid_mask, 'sales']

        # NOTE(review): the fit-time arguments early_stopping_rounds / verbose
        # and the constructor flag silent belong to the pre-4.0 LightGBM API;
        # they have to become callbacks when the library is upgraded.
        model = gbm(silent=True, verbose=-1)
        model.set_params(**lgb_params)
        model.fit(X=X_train,y=y_train,
                  categorical_feature = cat_features,
                  eval_set=[(X_valid,y_valid)],
                  early_stopping_rounds = 2000,
                  eval_metric = 'mape',                                  #mape - mean absolute percentage error, its a KPI like RMSE
                  verbose = False)

        pred_df=pd.DataFrame({'target':y_valid,'pred':model.predict(X_valid)})
        pred_df['pred']=pred_df['pred'].round(3)
        # Weights are kept unrounded: with many validation rows each weight is
        # tiny, and rounding to 3 decimals stops them from summing to 1, which
        # distorts the weighted error.
        pred_df['weight']=pred_df['target']/pred_df['target'].sum()
        pred_df['mape']=((pred_df['target']-pred_df['pred']).abs()/pred_df['target']).round(3)
        pred_df['wmape']=pred_df['weight']*pred_df['mape']
        mape=pred_df['mape'].mean()
        wmape=pred_df['wmape'].sum()

        avg_mape += mape
        avg_wmape += wmape

        print(mape,wmape)

        feature_importance_df['imp']+=model.feature_importances_
        fold += 1

    if fold == 0:
        raise ValueError('No cross-validation folds were generated; cv_steps must be at least 1')

    feature_importance_df['imp']=feature_importance_df['imp']/fold

    # The WMAPE figure used to print avg_mape a second time.
    print(f'Average MAPE: {avg_mape/fold}, Average WMAPE: {avg_wmape/fold}')

    return pred_df, feature_importance_df, mape

In [None]:
#lgb params
lgb_params = {'boosting_type':'gbdt',
             'objective':'mape',
             'n_estimators':1000,
             'learning_rate':0.1,
             'num_leaves':127,
             'max_bin':127,
             'feature_fraction':0.8,
             'bagging_fraction':0.8,
             'verbose': -1
             }

#Preprocessing data -
df=data_preprocessing(data,holidays,oil,lag_days = [1, 7, 14],rolling_days =  [7, 30, 60])

# Experiment: sliding window, 5 folds, 1000 trees.
# pd.Timestamp is used because the pd.datetime alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
# NOTE(review): model_CV does not read lag_days / rolling_days; the lag columns
# in df are the ones built by data_preprocessing above ([1, 7, 14]).
pred_df, feat_df, mape=model_CV(df,
                                cv_method = 'sliding',
                                cv_start_date = pd.Timestamp(2017,1,1),
                                lgb_params = lgb_params,
                                train_size = 365*3,
                                test_size =15,
                                cv_steps = 5,
                                lag_days = [1,2,3,4,5,6,7,14,30],
                                rolling_days = [7,30,60]
                               )

In [None]:
# LightGBM parameters for this run: 100 trees.
lgb_params = {'boosting_type':'gbdt',
             'objective':'mape',
             'n_estimators':100,
             'learning_rate':0.1,
             'num_leaves':127,
             'max_bin':127,
             'feature_fraction':0.8,
             'bagging_fraction':0.8,
             'verbose': -1
             }

#Preprocessing data -
df=data_preprocessing(data,holidays,oil,lag_days = [1, 7, 14],rolling_days =  [7, 30, 60])

# Experiment: sliding window, 30 folds, 100 trees.
# pd.Timestamp is used because the pd.datetime alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
# NOTE(review): model_CV does not read lag_days / rolling_days; the lag columns
# in df are the ones built by data_preprocessing above ([1, 7, 14]).
pred_df, feat_df, mape=model_CV(df,
                                cv_method = 'sliding',
                                cv_start_date = pd.Timestamp(2017,1,1),
                                lgb_params = lgb_params,
                                train_size = 365*3,
                                test_size =15,
                                cv_steps = 30,
                                lag_days = [1,2,3,4,5,6,7,14,30],
                                rolling_days = [7,30,60]
                               )


In [None]:
# LightGBM parameters for this run: 100 trees.
lgb_params = {'boosting_type':'gbdt',
             'objective':'mape',
             'n_estimators':100,
             'learning_rate':0.1,
             'num_leaves':127,
             'max_bin':127,
             'feature_fraction':0.8,
             'bagging_fraction':0.8,
             'verbose': -1
             }

#Preprocessing data -
df=data_preprocessing(data,holidays,oil,lag_days = [1, 7, 14],rolling_days =  [7, 30, 60])

# Experiment: expanding window, 15 folds, 100 trees.
# pd.Timestamp is used because the pd.datetime alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
# NOTE(review): model_CV does not read lag_days / rolling_days; the lag columns
# in df are the ones built by data_preprocessing above ([1, 7, 14]).
pred_df, feat_df, mape=model_CV(df,
                                cv_method = 'expanding',
                                cv_start_date = pd.Timestamp(2017,1,1),
                                lgb_params = lgb_params,
                                train_size = 365*3,
                                test_size =15,
                                cv_steps = 15,
                                lag_days = [1,2,3,4,5,6,7,14,30],
                                rolling_days = [7,30,60]
                               )

In [None]:
# LightGBM parameters for this run: 500 trees.
lgb_params = {'boosting_type':'gbdt',
             'objective':'mape',
             'n_estimators':500,
             'learning_rate':0.1,
             'num_leaves':127,
             'max_bin':127,
             'feature_fraction':0.8,
             'bagging_fraction':0.8,
             'verbose': -1
             }

#Preprocessing data -
df=data_preprocessing(data,holidays,oil,lag_days = [1, 7, 14],rolling_days =  [7, 30, 60])

# Experiment: expanding window, 15 folds, 500 trees.
# pd.Timestamp is used because the pd.datetime alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
# NOTE(review): model_CV does not read lag_days / rolling_days; the lag columns
# in df are the ones built by data_preprocessing above ([1, 7, 14]).
pred_df, feat_df, mape=model_CV(df,
                                cv_method = 'expanding',
                                cv_start_date = pd.Timestamp(2017,1,1),
                                lgb_params = lgb_params,
                                train_size = 365*3,
                                test_size =15,
                                cv_steps = 15,
                                lag_days = [1,2,3,4,5,6,7,14,30],
                                rolling_days = [7,30,60]
                               )