# Prediction Pipeline

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
# local imports
from data_tools import MonthlyData
from model_tools import PredictorData
from tree_models import LGBM_Predictor
from utils import Utils

In [2]:
# Read the environment-specific configuration from the project's YAML file.
config = Utils.read_config_for_env(config_path='../config/config.yml')
# Build the data container that the predictor and the helper functions below work on.
# NOTE(review): the refresh_* flags presumably reuse cached artefacts instead of
# recomputing them (the cell output shows all_ts.parquet being loaded) — confirm
# against PredictorData.
pred_data = PredictorData(
    config,
    refresh_monthly=False,
    refresh_ts_features=False,
    clean_strategy='olrem_for_all',
    # Alternative split strategies, kept for reference:
    # split_strategy='random',
    # split_strategy='last_months_val',
    split_strategy='months',
    # Number of lagged months; the new-month helpers below read this as
    # pred_data.num_lag_mon when sizing the moving-average window.
    num_lag_mon=3,
    val_ratio=0.2)

Loading data.. Done.
Fixing data schemas.. Done.
Loading /home/onur/WORK/DS/repos/shop_sales_prediction/data/all_ts.parquet


In [3]:
# Capture the raw feature names before any transformation is applied.
# They are used later to select the same columns for the new month.
pred_data.split_X_y()
X_raw_features = pred_data.X_train.columns

In [4]:
# Instantiate the predictor around the prepared data.
# The trained model itself is loaded from disk in a later cell.
lgbm_predictor = LGBM_Predictor(
    pred_data=pred_data)

In [5]:
# Fit the transformer.
# Per the cell output this splits train/val, fit-transforms X_train and
# transforms X_val. The result overwrites lgbm_predictor.pred_data, so this
# cell feeds its own output back in if it is run twice.
# NOTE(review): confirm whether split_transform is safe to re-run.
lgbm_predictor.pred_data = lgbm_predictor.split_transform(
        lgbm_predictor.pred_data,
        lgbm_predictor.transformer)

Splitting train-val
Fit-transforming X_train
Transforming X_val


In [6]:
# Load a previously trained model by file name.
# The commented-out line is an alternative saved model, kept for reference.
# model = lgbm_predictor.load_model('lgbm_model_20240609_212645.pkl')
model = lgbm_predictor.load_model('lgbm_model_20240609_204756.pkl')
# Sanity check: score the loaded model on the transformed validation split
# and report the RMSE (square root of sklearn's mean_squared_error).
pred_val = model.predict(lgbm_predictor.pred_data.X_val)
print("Validation RMSE: ", np.sqrt(mse(lgbm_predictor.pred_data.y_val, pred_val)))

Validation RMSE:  0.4633272407560897


In [7]:
def create_next_month_base(self, cols2copy):
    """Build the base feature rows for the month after the last one in ``self.df``.

    The rows of the last period in ``self.df`` are copied, every price/amount
    column is shifted one lag back (``price`` -> ``price_l1`` -> ``price_l2``
    -> ``price_l3``, and likewise for ``amount_item`` and ``amount_cat``),
    the index is advanced by one period, and ``year``/``month`` columns are
    derived from the new index.

    Args:
        self: Object exposing ``df``, a DataFrame whose index supports
            ``+ 1`` and the ``.year``/``.month`` accessors (the notebook
            output shows a monthly ``PeriodIndex``).
        cols2copy: Column labels to carry over from the last period.

    Returns:
        A new DataFrame (``self.df`` is left untouched) with the renamed
        columns plus ``year`` and ``month``, indexed by the next period.

    Raises:
        IndexError: If ``self.df`` has no rows.
    """
    source_df = self.df
    # Read the last period from the object that was passed in, not from a
    # notebook-level global, so the helper works for any data container.
    last_period = source_df.index[-1]
    # A boolean mask always yields a DataFrame, even when the last period
    # holds a single row (a scalar .loc lookup would return a Series there).
    last_month_rows = source_df.loc[source_df.index == last_period, cols2copy]

    # Shift every lag one step back; rename() applies the whole mapping at
    # once, so 'price' -> 'price_l1' does not cascade into 'price_l2'.
    lag_shift = {}
    for base in ('price', 'amount_item', 'amount_cat'):
        lag_shift[f'{base}_l2'] = f'{base}_l3'
        lag_shift[f'{base}_l1'] = f'{base}_l2'
        lag_shift[base] = f'{base}_l1'
    df_NM = last_month_rows.rename(columns=lag_shift)

    # Advance the index by one period, then add year and month from it.
    df_NM.index = df_NM.index + 1
    df_NM['year'] = df_NM.index.year
    df_NM['month'] = df_NM.index.month
    return df_NM

def extend_df_NM(self, df_NM, cols2copy, num_lag_mon):
    """Prepend the most recent history months of ``self.df`` to ``df_NM``.

    The last ``num_lag_mon - 2`` distinct periods of ``self.df`` are stacked
    on top of ``df_NM``. When ``num_lag_mon`` is 2 or less no history is
    prepended.

    Args:
        self: Object exposing ``df``, the historical DataFrame.
        df_NM: Rows for the new month, appended after the history rows.
        cols2copy: Unused; kept so existing callers keep working.
        num_lag_mon: Number of lagged months; determines how much history
            is prepended.

    Returns:
        The row-wise concatenation of the selected history rows (all columns
        of ``self.df``) and ``df_NM``.
    """
    # Kept as the original (negative) slice offset so the progress message
    # stays exactly as before.
    num_months_to_extend = 2-num_lag_mon
    print(f'extending by {num_months_to_extend} months')

    # Number of history months to prepend. Slicing with a non-negative
    # offset ([0:] or [1:]) would select (almost) the whole history instead
    # of none, so that case is handled explicitly.
    n_history = max(num_lag_mon - 2, 0)
    if n_history > 0:
        indices = self.df.index.unique()[-n_history:]
        df_ext = self.df.loc[indices, self.df.columns]
    else:
        # Empty frame with the same columns, so the concatenated result keeps
        # the same column set regardless of how much history is added.
        df_ext = self.df.iloc[0:0]

    print(f'df_ext has the periods: {df_ext.index.unique()}')
    df_NM_ext = pd.concat([df_ext, df_NM], axis=0)
    return df_NM_ext

def add_moving_average_features(self, df, cols2copy):
    """Extend ``df`` with history rows and add moving-average columns.

    ``df`` is first passed through ``extend_df_NM`` using
    ``self.num_lag_mon``; the extended frame is then handed to
    ``self.monthly_data.add_ma_features`` with a single window of
    ``self.num_lag_mon - 1`` over the first-lag price/amount columns.

    Args:
        self: Object exposing ``num_lag_mon``, ``monthly_data`` and whatever
            ``extend_df_NM`` reads from it.
        df: Rows for the new month.
        cols2copy: Forwarded unchanged to ``extend_df_NM``.

    Returns:
        Whatever ``self.monthly_data.add_ma_features`` returns for the
        extended frame.
    """
    extended = extend_df_NM(self, df, cols2copy, self.num_lag_mon)
    print(f'The extended df has the periods: {extended.index.unique()}')

    ma_window = self.num_lag_mon-1
    first_lag_cols = ['price_l1', 'amount_item_l1', 'amount_cat_l1']
    return self.monthly_data.add_ma_features(
        extended,
        mas_to_include=[ma_window],
        ma_features=first_lag_cols,
    )

def return_X_lastmonth(df, features):
    """Select ``features`` for the rows carrying the final index label of ``df``.

    Args:
        df: Frame whose last index label identifies the rows to keep.
        features: Column labels to return.

    Returns:
        The ``.loc`` selection for that label: a DataFrame when the label
        occurs on several rows, a Series when it occurs only once.
    """
    final_label = df.index[-1]
    last_month_selection = df.loc[final_label, features]
    return last_month_selection
    

In [8]:
# Columns carried over from the last observed month: the identifiers plus the
# current value and the first two lags of price, item amount and category
# amount. create_next_month_base shifts each of these one lag back.
cols2copy = ['shop_id', 'item_id', 'item_category_id', 
                 'price', 'price_l1', 'price_l2',
                 'amount_item', 'amount_item_l1', 'amount_item_l2',
                 'amount_cat', 'amount_cat_l1', 'amount_cat_l2']

# Build the rows for the month after the last one in pred_data.df, then add
# the moving-average features (this also prepends recent history rows).
df_NM = create_next_month_base(pred_data, cols2copy)
df_NM_ext_wMA = add_moving_average_features(pred_data, df_NM, cols2copy)


extending by -1 months
df_ext has the periods: PeriodIndex(['2015-09'], dtype='period[M]', name='monthly_period')
The extended df has the periods: PeriodIndex(['2015-09', '2015-10'], dtype='period[M]', name='monthly_period')


In [9]:
# Keep only the new month's rows (the last index label) and restrict them to
# the raw training feature columns captured earlier.
X_newmonth = return_X_lastmonth(df_NM_ext_wMA, X_raw_features)

In [10]:
# Preview the new-month feature rows before they are transformed.
X_newmonth.head()

Unnamed: 0_level_0,shop_id,item_id,item_category_id,year,month,price_l1,price_l2,price_l3,amount_item_l1,amount_item_l2,amount_item_l3,amount_cat_l1,amount_cat_l2,amount_cat_l3,price_l1_ma2,amount_item_l1_ma2,amount_cat_l1_ma2
monthly_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-10,99,100000,1040,2015,10,189.0,189.0,189.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,0.0,0.0
2015-10,99,100001,1076,2015,10,14580.0,14580.0,14580.0,0.0,0.0,0.0,0.0,0.0,0.0,14580.0,0.0,0.0
2015-10,99,100002,1040,2015,10,189.0,189.0,189.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,0.0,0.0
2015-10,99,100003,1040,2015,10,257.0,257.0,257.0,0.0,0.0,0.0,0.0,0.0,0.0,257.0,0.0,0.0
2015-10,99,100004,1040,2015,10,189.0,189.0,189.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,0.0,0.0


In [11]:
# Apply the transformer fitted on X_train to the new-month features
# (transform only; it is not refitted here).
X_nm_tr = lgbm_predictor.transformer.transform(X_newmonth)

In [12]:
# Predict the new month's target with the loaded model; the result has one
# value per row of X_nm_tr.
pred_newmonth = model.predict(X_nm_tr)

In [13]:
# Re-display the untransformed feature rows after transform/predict; the
# output matches the earlier preview of X_newmonth.
X_newmonth.head()

Unnamed: 0_level_0,shop_id,item_id,item_category_id,year,month,price_l1,price_l2,price_l3,amount_item_l1,amount_item_l2,amount_item_l3,amount_cat_l1,amount_cat_l2,amount_cat_l3,price_l1_ma2,amount_item_l1_ma2,amount_cat_l1_ma2
monthly_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-10,99,100000,1040,2015,10,189.0,189.0,189.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,0.0,0.0
2015-10,99,100001,1076,2015,10,14580.0,14580.0,14580.0,0.0,0.0,0.0,0.0,0.0,0.0,14580.0,0.0,0.0
2015-10,99,100002,1040,2015,10,189.0,189.0,189.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,0.0,0.0
2015-10,99,100003,1040,2015,10,257.0,257.0,257.0,0.0,0.0,0.0,0.0,0.0,0.0,257.0,0.0,0.0
2015-10,99,100004,1040,2015,10,189.0,189.0,189.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,0.0,0.0


In [17]:
# Keep only the identifier columns, rename them to the key names used for the
# merge with the test set, and attach the model output as 'amount'.
Xy_newmonth = (
    X_newmonth[['shop_id', 'item_id']]
    .rename(columns={'shop_id': 'shop', 'item_id': 'item'})
    .assign(amount=pred_newmonth)
)
Xy_newmonth.head()

Unnamed: 0_level_0,shop,item,amount
monthly_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-10,99,100000,-0.002976
2015-10,99,100001,0.003163
2015-10,99,100002,-0.002976
2015-10,99,100003,-0.002976
2015-10,99,100004,-0.002976


In [15]:
# The test subset from the raw data; it is merged with the predictions on
# the 'shop' and 'item' columns in the next step.
test_df = pred_data.raw_data.test

In [26]:
# Merge the predictions onto the test subset by (shop, item).
# A left join keeps every test row; pairs without a prediction get NaN in
# 'amount', which would make the later integer cast fail.
# NOTE(review): confirm every test pair is present in Xy_newmonth and that
# (shop, item) is unique there, otherwise rows are dropped to NaN or duplicated.
test_wpreds = pd.merge(test_df, Xy_newmonth, on=['shop', 'item'], how='left')


In [28]:
# Write the unrounded predictions to <root_data_path>/test_preds.csv,
# without the DataFrame index.
import os
test_wpreds.to_csv(
    os.path.join(config['root_data_path'], 'test_preds.csv'),
    index=False)

In [32]:
# Round the predicted amounts to the nearest whole number and store them as
# integers (in place on test_wpreds); the cast fails if any amount is NaN.
test_wpreds['amount'] = test_wpreds['amount'].round().astype(int)
test_wpreds


Unnamed: 0,shop,item,amount
0,101,100031,0
1,101,100486,0
2,101,100787,0
3,101,100794,0
4,101,100968,0
...,...,...,...
31526,158,122087,3
31527,158,122088,2
31528,158,122091,2
31529,158,122100,0


In [33]:
# Write the rounded, integer predictions to
# <root_data_path>/test_preds_rounded.csv, without the DataFrame index.
# `os` is imported again here so the cell runs on its own.
import os
test_wpreds.to_csv(
    os.path.join(config['root_data_path'], 'test_preds_rounded.csv'),
    index=False
)