In [4]:
import pandas as pd
import numpy as np
import pickle
import warnings
from joblib import Parallel, delayed

from sktime.benchmarking.forecasting import ForecastingBenchmark
from sktime.split import temporal_train_test_split

from sktime.forecasting.theta import ThetaForecaster
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import mean_absolute_error

# Import KNeighborsRegressor for KNN
from sklearn.neighbors import KNeighborsRegressor

# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Import LightGBM
import lightgbm as lgb

# Import Prophet
from prophet import Prophet
from sktime.forecasting.fbprophet import Prophet as SktimeProphet

from sktime.split import ExpandingWindowSplitter

# hide warnings
warnings.filterwarnings("ignore")

In [65]:
def _reduce_memory_usage(df, category=True, n_jobs=1):
    """
    Iterate through all columns of a DataFrame and modify the datatype.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame whose memory usage needs to be optimized.
    category : bool, optional
        If True, convert object types to category. Default is True.
    n_jobs : int, optional
        The number of parallel jobs to run for optimizing columns. Default is 1.

    Returns
    -------
    df : pd.DataFrame
        The optimized DataFrame with reduced memory usage.
    """
    
    def optimize_column(col):
        col_type = col.dtype
        if pd.api.types.is_numeric_dtype(col):
            c_min = col.min()
            c_max = col.max()
            if pd.api.types.is_integer_dtype(col):
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    return col.astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    return col.astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    return col.astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    return col.astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    return col.astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    return col.astype(np.float32)
                else:
                    return col.astype(np.float64)
        elif col_type == object:
            if category:
                return col.astype('category')
        return col

    optimized_columns = Parallel(n_jobs=n_jobs)(
        delayed(optimize_column)(df[col]) for col in df.columns
    )

    for col, optimized_col in zip(df.columns, optimized_columns):
        df[col] = optimized_col

    return df

In [66]:
with open('dataset_with_events.pkl', 'rb') as file:
    data = pickle.load(file)
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,id,item_id,day,sales,wm_yr_wk,wday,month,year,event_name_1,event_name_2,event_type_1,event_type_2,sell_price
state_id,store_id,cat_id,dept_id,date,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,1,1,2011-01-29,FOODS_1_001_CA_1_validation,FOODS_1_001,1,3,11101,1,1,2011,,,,,2.000000
1,1,1,1,2011-01-29,FOODS_1_002_CA_1_validation,FOODS_1_002,1,0,11101,1,1,2011,,,,,7.878906
1,1,1,1,2011-01-29,FOODS_1_003_CA_1_validation,FOODS_1_003,1,0,11101,1,1,2011,,,,,2.880859
1,1,1,1,2011-01-29,FOODS_1_004_CA_1_validation,FOODS_1_004,1,0,11101,1,1,2011,,,,,
1,1,1,1,2011-01-29,FOODS_1_005_CA_1_validation,FOODS_1_005,1,3,11101,1,1,2011,,,,,2.939453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,10,3,7,2016-04-24,HOUSEHOLD_2_512_WI_3_validation,HOUSEHOLD_2_512,1913,0,11613,2,4,2016,,,,,3.970703
3,10,3,7,2016-04-24,HOUSEHOLD_2_513_WI_3_validation,HOUSEHOLD_2_513,1913,0,11613,2,4,2016,,,,,2.779297
3,10,3,7,2016-04-24,HOUSEHOLD_2_514_WI_3_validation,HOUSEHOLD_2_514,1913,0,11613,2,4,2016,,,,,19.546875
3,10,3,7,2016-04-24,HOUSEHOLD_2_515_WI_3_validation,HOUSEHOLD_2_515,1913,0,11613,2,4,2016,,,,,1.969727


In [67]:
data['sell_price'] = data['sell_price'].fillna(0)

In [None]:

f=lambda x: 1 if x==7 or x==1 or x==2 else 0
data['is_high_sale_day']=data['wday'].map(f) 

f=lambda x: 1 if x==2 or x==3 or x==4 or x==5 else 0
data['is_high_sale_months']=data['month'].map(f) 

f=lambda x: 1 if x=='Christmas' else 0
data['is_christmas']=data['event_name_1'].map(f)

In [None]:
# Adding lag features
lags=[1,7,14,21,28]
for i in lags:
    data['lag_'+str(i)]=data.groupby(['item_id'])['sales'].shift(i)

data = _reduce_memory_usage(data)

In [None]:
# Adding window feature
window=[7,14,21,28]
for i in window:
    data['rolling_mean_'+str(i)]=data.groupby(['item_id'])['sales'].transform(lambda s: s.rolling(i).mean())

In [None]:
# if you see in dataframe there are lot of NaN values which got created. replacing them with "0".
lags=['lag_1','lag_7', 'lag_14','lag_21', 'lag_28']
for i in lags:
    data[i]=data[i].fillna(0)

window=['rolling_mean_7','rolling_mean_14', 'rolling_mean_7' ,'rolling_mean_21', 'rolling_mean_28']
for i in window:
    data[i]=data[i].fillna(0)

In [None]:
# Expading mean
data['expanding_mean']=data.groupby(['id'])['sales'].transform(lambda s: s.expanding().mean())

data['expanding_mean']=data['expanding_mean'].fillna(0)

In [None]:
cols_to_convert = ['event_type_1', 'event_type_2']
data[cols_to_convert] = data[cols_to_convert].apply(lambda x: x.astype('category').cat.codes + 1)

In [None]:
data=data.drop(
    ['event_name_1','event_name_2', 'item_id','wday','month', 'wm_yr_wk','year'],
    axis = 1,
)

In [None]:
data = _reduce_memory_usage(data)

In [None]:
with open('final_dataset.pkl', 'wb') as file:
    pickle.dump(data, file)


In [5]:
with open('final_dataset.pkl', 'rb') as file:
    final_data = pickle.load( file)


In [6]:
final_data = final_data[final_data['id'] == 'FOODS_1_001_CA_1_validation']

In [12]:
final_data = final_data.reset_index(4)

In [19]:
X =final_data.drop(columns=['day','sales'], axis=1)
y = final_data[['date','day','sales']]

In [20]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,id,event_type_1,event_type_2,sell_price,is_high_sale_day,is_high_sale_months,is_christmas,lag_1,lag_7,lag_14,lag_21,lag_28,rolling_mean_7,rolling_mean_14,rolling_mean_21,rolling_mean_28,expanding_mean
state_id,store_id,cat_id,dept_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,1,2011-01-29,FOODS_1_001_CA_1_validation,0,0,2.000000,1,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3.000000
1,1,1,1,2011-01-30,FOODS_1_001_CA_1_validation,0,0,2.000000,1,0,0,3.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.500000
1,1,1,1,2011-01-31,FOODS_1_001_CA_1_validation,0,0,2.000000,0,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.000000
1,1,1,1,2011-02-01,FOODS_1_001_CA_1_validation,0,0,2.000000,0,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.000000
1,1,1,1,2011-02-02,FOODS_1_001_CA_1_validation,0,0,2.000000,0,1,0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.599609
1,1,1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1,1,1,2016-04-20,FOODS_1_001_CA_1_validation,0,0,2.240234,0,1,0,1.0,1.0,3.0,1.0,2.0,1.286133,1.071289,1.286133,1.178711,0.786133
1,1,1,1,2016-04-21,FOODS_1_001_CA_1_validation,0,0,2.240234,0,1,0,1.0,1.0,0.0,0.0,0.0,1.142578,1.071289,1.286133,1.178711,0.785645
1,1,1,1,2016-04-22,FOODS_1_001_CA_1_validation,0,0,2.240234,1,1,0,0.0,0.0,1.0,4.0,1.0,1.286133,1.071289,1.142578,1.178711,0.786133
1,1,1,1,2016-04-23,FOODS_1_001_CA_1_validation,0,0,2.240234,1,1,0,1.0,2.0,0.0,0.0,1.0,1.142578,1.142578,1.190430,1.178711,0.786133


In [21]:
y

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,day,sales
state_id,store_id,cat_id,dept_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,1,2011-01-29,1,3
1,1,1,1,2011-01-30,2,0
1,1,1,1,2011-01-31,3,0
1,1,1,1,2011-02-01,4,1
1,1,1,1,2011-02-02,5,4
1,1,1,...,...,...,...
1,1,1,1,2016-04-20,1909,1
1,1,1,1,2016-04-21,1910,0
1,1,1,1,2016-04-22,1911,1
1,1,1,1,2016-04-23,1912,1
