In [9]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [37]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error 
from lightgbm import LGBMRegressor
import pickle
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root of sklearn's mean squared error between the two inputs."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [11]:
# from google.colab import drive

# drive.mount('/content/drive')
# %cd '/content/drive/Shared drives/Predictive Analysis- Walmart'

In [12]:
# Read the calendar, price and sales tables from the Colab working directory.
calendar_ = pd.read_csv('/content/calendar.csv')
sell_prices_ = pd.read_csv('/content/sell_prices.csv')
sales_train_evaluation_ = pd.read_csv('/content/sales_train_evaluation.csv')
# Store ids in order of first appearance; later cells process one store at a time.
STORES_IDS = sales_train_evaluation_['store_id'].unique().tolist()

In [13]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered. Integer columns are tried against int8, int16, int32 and int64
    in that order; float columns against float16 and float32, falling back to
    float64. A dtype "fits" only when the column's min and max lie strictly
    inside its representable range, so a column touching a bound moves on to
    the next wider type.

    Args:
        df: DataFrame to shrink; its columns are reassigned on this object.
        verbose: when true, print the resulting footprint and the percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)
    megabyte = 1024**2

    mem_before = df.memory_usage().sum() / megabyte
    for name in df.columns:
        kind = df[name].dtypes
        if kind not in numerics:
            continue
        lo = df[name].min()
        hi = df[name].max()
        if str(kind)[:3] == 'int':
            # No candidate matching (value sits on the int64 bound) leaves the column untouched.
            target = next(
                (t for t in int_ladder if lo > np.iinfo(t).min and hi < np.iinfo(t).max),
                None,
            )
            if target is not None:
                df[name] = df[name].astype(target)
        else:
            # NaN bounds fail every comparison, so all-NaN columns end up float64.
            target = next(
                (t for t in float_ladder if lo > np.finfo(t).min and hi < np.finfo(t).max),
                np.float64,
            )
            df[name] = df[name].astype(target)
    mem_after = df.memory_usage().sum() / megabyte

    if verbose:
        saved_pct = 100 * (mem_before - mem_after) / mem_before
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, saved_pct))
    return df

In [14]:
def data_processing(x, calendar=None, sell_prices=None):
  """Turn a wide sales table into a long, model-ready feature table.

  Steps, in order:
    1. append empty (NaN, float16) columns d_1942..d_1969 for the days to forecast;
    2. melt to one row per (id, d) with the value in ``demand``;
    3. left-join the calendar on ``d`` and the prices on
       (item_id, store_id, wm_yr_wk); missing prices are filled with the
       mean price of the same ``id``;
    4. add ``is_weekend`` (wday <= 2), ``month_day`` and ``month_week_number``;
    5. add per-id demand lags of 28, 35 and 42 rows;
    6. replace every remaining NaN with 0 (this includes the forecast-day
       demand, the lag warm-up rows and prices that are missing for a whole id);
    7. integer-encode the event and id columns into ``<name>_`` columns
       (codes follow the sorted unique values) and drop the originals;
    8. derive the integer ``day`` from ``d`` and drop ``d``, ``date``, ``weekday``.

  Args:
    x: wide sales frame with the six id columns and one ``d_<n>`` column per
      day. It is not modified; the function works on a copy.
    calendar: calendar frame; defaults to the notebook-level ``calendar_``.
    sell_prices: price frame; defaults to the notebook-level ``sell_prices_``.

  Returns:
    A new DataFrame whose first column is ``demand`` and last column is ``day``.
  """
  if calendar is None:
    calendar = calendar_
  if sell_prices is None:
    sell_prices = sell_prices_

  id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
  event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

  # Callers pass a boolean-mask slice of the global table; copying avoids
  # writing the placeholder columns into (a view of) the caller's data.
  x = x.copy()
  # Adding columns for the days d_1942 to d_1969 as NaN for which we need to forecast sales
  for i in range(1942, 1970):
    col = 'd_' + str(i)
    x[col] = np.nan
    x[col] = x[col].astype(np.float16)

  # Melting to convert one date to one observation
  df = pd.melt(x, id_vars=id_cols, var_name='d', value_name='demand')
  # Merge calendar and sell_prices data
  df = pd.merge(df, calendar, on='d', how='left')
  df = pd.merge(df, sell_prices, on=['item_id', 'store_id', 'wm_yr_wk'], how='left')
  # transform() returns a result aligned to df's index, so it can be used
  # directly as the fill value (groupby.apply may return a group-keyed index).
  id_mean_price = df.groupby('id')['sell_price'].transform('mean')
  df['sell_price'] = df['sell_price'].fillna(id_mean_price)

  # Fill events N/A (plain reassignment; chained inplace fillna is unreliable)
  df[event_cols] = df[event_cols].fillna('no_event')

  # Create is_weekend feature
  df['is_weekend'] = (df['wday'] <= 2).astype(np.int8)

  # Create month_day feature: third '-'-separated field of the date string
  df['month_day'] = df['date'].str.split('-').str[2].astype(np.int8)

  # Create month_week_number feature
  df['month_week_number'] = ((df['month_day'] - 1) // 7 + 1).astype(np.int8)

  # Lags: rows are in melt order, i.e. day order within each id
  for lag in (28, 35, 42):
    df['lag_' + str(lag)] = df.groupby(['id'])['demand'].shift(lag)
  df = df.fillna(0)

  # Encoding: sorted-unique integer codes, one new '<name>_' column per source column
  encoded_cols = event_cols + id_cols
  for col in encoded_cols:
    df[col + '_'] = pd.factorize(df[col], sort=True)[0]
  df = df.drop(columns=encoded_cols)

  # Convert string day label 'd_<n>' to int day
  df['day'] = df['d'].str.split('_').str[1].astype(np.int16)

  # Cleanup - drop redundant columns
  df = df.drop(['d', 'date', 'weekday'], axis=1)

  return df

In [16]:
# Downcast the wide sales table once, then build and persist the feature
# table of every store separately so later cells can load one store at a time.
sales_train_evaluation_ = reduce_mem_usage(sales_train_evaluation_)
for store_id in STORES_IDS:
  store_mask = sales_train_evaluation_['store_id'] == store_id
  x = sales_train_evaluation_.loc[store_mask]
  file_path = '/content/' + store_id + '.csv'
  df = data_processing(x)
  df.to_csv(file_path, index=False)

Mem. usage decreased to 96.13 Mb (0.0% reduction)


In [34]:
# Define loss function
def tweedie_eval(y_pred, y_true, p=1.5):
    """Per-observation Tweedie loss ``-y * mu**(1-p) / (1-p) + mu**(2-p) / (2-p)``.

    Args:
        y_pred: predicted means ``mu`` (array-like of positive values).
        y_true: observed values, either array-like or an object exposing
            ``get_label()`` that returns them.
        p: Tweedie variance power. The formula divides by ``1 - p`` and
            ``2 - p``, so ``p`` must be neither 1 nor 2.

    Returns:
        numpy array with one loss value per observation.
    """
    # Dataset-style containers hand out their labels through get_label().
    if hasattr(y_true, 'get_label'):
        y_true = y_true.get_label()
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # np.power, not np.exp: the second positional argument of np.exp is the
    # output buffer, so np.exp(y_pred, 1 - p) raises instead of exponentiating.
    a = y_true * np.power(y_pred, 1 - p) / (1 - p)
    b = np.power(y_pred, 2 - p) / (2 - p)
    loss = -a + b
    return loss


def custom_split(X, y, groups):
  """Translate day-level CV folds into row-level folds of ``X``.

  Args:
    X: feature frame with an integer ``day`` column.
    y: unused; kept so the signature mirrors a splitter's ``split(X, y, groups)``.
    groups: iterable of ``(train_index, test_index)`` numpy arrays holding
      0-based day positions. Position ``k`` is taken to mean day ``k + 1``,
      i.e. days are assumed to be numbered contiguously from 1.

  Yields:
    ``(train_rows, test_rows)``: arrays of positional row indices into ``X``.
    Positions (not index labels) are yielded so the folds stay valid even
    when ``X`` does not carry a default 0..n-1 index.
  """
  day_of_row = X['day']  # loop-invariant, look it up once
  for train_index, test_index in groups:
    train_rows = np.flatnonzero(day_of_row.isin(train_index + 1).to_numpy())
    test_rows = np.flatnonzero(day_of_row.isin(test_index + 1).to_numpy())
    yield train_rows, test_rows

def _tweedie_score(y_true, y_pred, **kwargs):
    """Adapter between make_scorer's calling convention and tweedie_eval.

    make_scorer invokes its metric as ``score_func(y_true, y_pred, **kwargs)``
    and expects a single number, whereas tweedie_eval is declared as
    ``(y_pred, y_true, p)``. The arguments are swapped here and the result is
    reduced to its mean.
    """
    return float(np.mean(tweedie_eval(y_pred, y_true, **kwargs)))


# The Tweedie value is a loss, so register it as lower-is-better
# (make_scorer assumes higher-is-better unless told otherwise).
tweedie = make_scorer(_tweedie_score, greater_is_better=False)

In [50]:
# Hyperparameter tuning with RandomizedSearchCV: one search and one saved
# model per store, validated on five consecutive 28-day windows.
# NOTE(review): the search has no random_state, so sampled candidates (and the
# saved model) differ between runs - confirm whether that is acceptable.
from scipy.stats import reciprocal

for store_id in STORES_IDS:
  file_path = '/content/'+store_id+'.csv'
  df1 = pd.read_csv(file_path)
  df1 = reduce_mem_usage(df1)
  # remove testing data
  df1 = df1[df1['day']<1942]

  # Fresh 0..n-1 index so row positions and index labels coincide.
  df = df1.reset_index(drop=True)
  # Select features by name, the same way the prediction cell builds x_test,
  # instead of relying on 'demand' being the first column.
  X = df.drop(columns=['demand'])
  y = df['demand']

  # Folds are drawn over the distinct days and mapped back to rows by custom_split.
  tscv = TimeSeriesSplit(n_splits=5, test_size=28)
  groups = tscv.split(df1['day'].unique())

  lgb = LGBMRegressor( objective='tweedie')

  lgb_grid = {
      'tweedie_variance_power': [1.1],
      'learning_rate': reciprocal(3e-3, 3e-1),
            'max_depth': list(range(50,70)) ,
            'n_estimators': list(range(100,300)),
            'num_leaves': list(range(150,300)) }

  lgb_reg = RandomizedSearchCV(lgb, param_distributions=lgb_grid,
                            n_jobs=-1, scoring ='neg_root_mean_squared_error', cv=custom_split(X, y, groups))
  lgb_reg.fit(X, y)
  # best_score_ is the negated RMSE, hence the negative numbers in the log.
  print(store_id, '\t', lgb_reg.best_params_,lgb_reg.best_score_)
  model_path = '/content/lgb_model_'+store_id+'.bin'
  # Context manager so the model file is flushed and closed before the next store.
  with open(model_path, 'wb') as model_file:
    pickle.dump(lgb_reg.best_estimator_, model_file)


Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe475194c50>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe475c00a10>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

CA_1 	 {'learning_rate': 0.050610394940121636, 'max_depth': 67, 'n_estimators': 280, 'num_leaves': 194, 'tweedie_variance_power': 1.1} -2.30774943648447
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe475194750>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe476381490>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

CA_2 	 {'learning_rate': 0.051583904830467577, 'max_depth': 53, 'n_estimators': 280, 'num_leaves': 222, 'tweedie_variance_power': 1.1} -2.144873011113553
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe476573050>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe476913d50>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

CA_3 	 {'learning_rate': 0.07582967729206709, 'max_depth': 54, 'n_estimators': 270, 'num_leaves': 208, 'tweedie_variance_power': 1.1} -3.124439480568764
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe475194950>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe47593a8d0>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

CA_4 	 {'learning_rate': 0.1478995769975061, 'max_depth': 62, 'n_estimators': 130, 'num_leaves': 299, 'tweedie_variance_power': 1.1} -1.4947936396158847
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe476492850>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe476821390>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

TX_1 	 {'learning_rate': 0.16804816033695238, 'max_depth': 53, 'n_estimators': 150, 'num_leaves': 192, 'tweedie_variance_power': 1.1} -1.9312170398919613
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe475194350>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe47655e490>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

TX_2 	 {'learning_rate': 0.036646621908299024, 'max_depth': 50, 'n_estimators': 270, 'num_leaves': 257, 'tweedie_variance_power': 1.1} -2.184581615599202
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe4634cded0>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe4752718d0>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

TX_3 	 {'learning_rate': 0.16132262516317375, 'max_depth': 59, 'n_estimators': 240, 'num_leaves': 154, 'tweedie_variance_power': 1.1} -2.1470173603077294
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe475194e50>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe476459ad0>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

WI_1 	 {'learning_rate': 0.04500479865741088, 'max_depth': 64, 'n_estimators': 220, 'num_leaves': 225, 'tweedie_variance_power': 1.1} -1.8347939897611092
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe475194650>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe474e74310>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

WI_2 	 {'learning_rate': 0.0612933414734137, 'max_depth': 50, 'n_estimators': 290, 'num_leaves': 200, 'tweedie_variance_power': 1.1} -3.285028287799137
Mem. usage decreased to 206.11 Mb (82.7% reduction)


RandomizedSearchCV(cv=<generator object custom_split at 0x7fe475194050>,
                   estimator=LGBMRegressor(objective='tweedie'), n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe475608110>,
                                        'max_depth': [50, 51, 52, 53, 54, 55,
                                                      56, 57, 58, 59, 60, 61,
                                                      62, 63, 64, 65, 66, 67,
                                                      68, 69],
                                        'n_estimators': [100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200, 210,
                                                         220, 230, 240, 250,
                                                         260, 270, 280, 290],
                                       

WI_3 	 {'learning_rate': 0.05896493503811511, 'max_depth': 65, 'n_estimators': 160, 'num_leaves': 203, 'tweedie_variance_power': 1.1} -2.352375351361978


In [51]:
# Initialize submission dataframe: one row per series, empty F1..F28 columns.
pred_test = sales_train_evaluation_[['id', 'store_id']].copy()
for i in range(1, 29):
    # float32 rather than float16: float16 carries ~3 significant digits, so
    # forecasts above ~32 could not even hold the two decimals they are
    # rounded to before being written out.
    pred_test['F'+str(i)] = np.full(len(pred_test), np.nan, dtype=np.float32)

In [52]:
# Make test predictions by store: load the store's feature table and tuned
# model, then fill F1..F28 for that store's rows, one forecast day at a time.
# NOTE(review): the tuning cell fits on reduce_mem_usage(...)-downcast features
# while this cell predicts on the raw CSV values - confirm that is intended.
for store_id in STORES_IDS:
  file_path = '/content/'+store_id+'.csv'
  df = pd.read_csv(file_path)
  x_test = df.loc[df['day'] >= 1942].drop(columns=['demand'])
  model_path = '/content/lgb_model_'+store_id+'.bin'
  # pickle runs arbitrary code on load; only load model files written by this notebook.
  with open(model_path, 'rb') as model_file:
    lgb = pickle.load(model_file)
  store_rows = pred_test['store_id'] == store_id
  for k, i in enumerate(range(1942, 1970), start=1):
    col = 'F' + str(k)
    day_pred = lgb.predict(x_test[x_test['day'] == i])
    # Single .loc write: chained `pred_test[col][mask] = ...` may assign into a
    # temporary copy and leave pred_test unchanged. Cast to the column dtype
    # so the write is not treated as an incompatible-dtype assignment.
    pred_test.loc[store_rows, col] = day_pred.astype(pred_test[col].dtype)

prediction_test = pred_test.round(2)

In [53]:
# Output the final submission file: the first 30490 rows of the sample
# submission are kept unchanged and the predictions are appended below them.
import time
current_timestamp = int(time.time())
# errors='ignore' keeps this cell re-runnable: on a second run 'store_id' is
# already gone and a plain drop would raise KeyError.
prediction_test = prediction_test.drop(columns='store_id', errors='ignore')
sample_submission = pd.read_csv('/content/sample_submission.csv')
sample_validation = sample_submission.iloc[:30490,:]
final = pd.concat([sample_validation, prediction_test])
# Timestamped name so successive runs do not overwrite each other.
file_path = '/content/prediction_result' + str(current_timestamp) + '.csv'
final.to_csv(file_path,index=False)

In [54]:
# ########################### TEST Predict ############################################

# df=df.drop(['demand'],axis=1)

# #Loading Already Trained LightGBM Regressor Model for Computation
# with open('/content/lgb_model.pkl','rb') as f:
#     lgb=pickle.load(f)

# pred_test=pd.DataFrame()
# pred_test['id']=x['id'] 
# j=1
# k=1
# for i in range(1942,1970):
#     pred_test['F'+str(k)]=lgb.predict(x_test[x_test['day']==(i)]) 
#     k+=1
    
# prediction_test = np.round(pred_test,2) 

# # Loop over each prediction day
# # As rolling lags are the most timeconsuming
# # we will calculate it for whole day
# for PREDICT_DAY in range(1,29):    
#     print('Predict | Day:', PREDICT_DAY)

#     for store_id in STORES_IDS:
        
#         # Read all our models and make predictions
#         # for each day/store pairs
#         model_path = '/content/lgb_model_'+store_id+'.bin' 
        
#         estimator = pickle.load(open(model_path, 'rb'))
        
#         day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
#         store_mask = base_test['store_id']==store_id
        
#         mask = (day_mask)&(store_mask)
#         base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
#     # Make good column naming and add 
#     # to all_preds DataFrame
#     temp_df = base_test[day_mask][['id',TARGET]]
#     temp_df.columns = ['id','F'+str(PREDICT_DAY)]
#     if 'id' in list(all_preds):
#         all_preds = all_preds.merge(temp_df, on=['id'], how='left')
#     else:
#         all_preds = temp_df.copy()
        
#     print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
#                   ' %0.2f min total |' % ((time.time() - main_time) / 60),
#                   ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
#     del temp_df
    
# all_preds = all_preds.reset_index(drop=True)
# all_preds