In [1]:
import pandas as pd
from datetime import datetime
from typing import List, cast, Literal, Tuple
from sklearn.multioutput import RegressorChain
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from xgboost import XGBRegressor
import lightgbm as lgb
import optuna

**LOAD DATA WITH CORRECT TYPES**

In [2]:
stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
holidays_events = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv",dtype={
            "type": "category",
            "locale": "category",
            "locale_name": "category",
            "description": "category",
            "transferred": "bool",
            "date": "period[D]"
        }
)
store_sales = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv",
        dtype={
            "store_nbr": "category",
            "family": "category",
            "sales": "float32",
            "onpromotion": "uint32",
            "date": "period[D]"
        }
)
query = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv",
        dtype={
            "store_nbr": "category",
            "family": "category",
            "onpromotion": "uint32",
            "date": "period[D]"
        }
)
oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv", 
        dtype={            
            "date": "period[D]"
        }
)
transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv")

**ADDITIONAL HELPER FUNCTIONS**

In [3]:
main_index: List[str] = ["date", "store_nbr", "family"]
secondary_index: List[str] = ["store_nbr", "family"]
unique_families = store_sales["family"].unique()
unique_stores = store_sales["store_nbr"].unique()

def timeline(reset = False):
    global timeline_index
    if reset is True:
        timeline_index = 1
    else :
        try:
            timeline_index += 1
        except Exception:
            timeline_index = 1
        
    print("{}.Current time: {}".format(timeline_index,datetime.now().strftime("%H:%M:%S")))
timeline()

1.Current time: 17:37:59


In [4]:
def make_mw_in_groups(
    df: pd.DataFrame,
    groupby: List[str] = [],
    column: str = "",
    window: List[int] | int = 30,
    center: List[bool] | bool  = False,
    min_periods: List[int] | int  = 1,
    aggregator: List[Literal["mean","sum","median","std","var"]] | Literal["mean","sum","median","std","var"] = "mean",
) -> pd.DataFrame:
    df = df.copy(deep=True)
    if isinstance(window,int):
        window = [window]
        
    window = list(filter(lambda x : x != 0, window))
    if len(window) == 0 :
        raise ValueError("Window value must be non-zero!")
    if isinstance(center,bool):
        center = [center] * len(window)
    if isinstance(min_periods,int):
        min_periods = [min_periods]*len(window)
    if isinstance(aggregator,str):
        aggregator = [aggregator]*len(window)
        
    
    def create_mw_columns(group):
        ma_group = pd.DataFrame(index=group.index)
        for index, val in enumerate(window):
            type_name = "lag" if val > 0 else "lead"
            if val < 0 :
                ma_group[f"{column}_{type_name}_{aggregator[index]}_{-val}"] = group[column].rolling(window=-val, center=center[index], min_periods=min_periods[index]).aggregate(aggregator[index]).shift(val)
            else:
                ma_group[f"{column}_{type_name}_{aggregator[index]}_{val}"] = group[column].shift(1).rolling(window=val, center=center[index], min_periods=min_periods[index]).aggregate(aggregator[index])
        
        return ma_group
    
    return cast(pd.DataFrame,df.reset_index(groupby).groupby(groupby, observed=True).apply(create_mw_columns, include_groups=False).reset_index(groupby).set_index(groupby, append=True).sort_index())


def make_shift_in_groups(
    df: pd.DataFrame,
    groupby: List[str] = [],
    column: str = "",
    shift: List[int] | int = 1,
) -> pd.DataFrame:
    df = df.copy(deep=True)
   
    if isinstance(shift,int):
        shift = [shift]
    
    shift = list(filter(lambda el: el != 0, shift))
    
    if len(shift) == 0:
        raise ValueError(
            "Shift value must be non-zero!"
        )

    def create_shifted_columns(group):
        shifted_group = pd.DataFrame(index=group.index)
        for val in shift:

            shifted_group[f"{column}_{'lead' if val < 0 else 'lag'}_{abs(val)}"] = (
                group[column].shift(val)
            )

        return shifted_group

    shifted_df = cast(
        pd.DataFrame,
        df.reset_index(groupby)
        .groupby(groupby, observed=True)
        .apply(create_shifted_columns, include_groups=False).reset_index(groupby).set_index(groupby, append=True).sort_index(),
    )

    return shifted_df

In [5]:
TRAIN_START = "2014-01-01"
TRAIN_END = "2017-07-29"

TEST_DAY = "2017-07-30"

N_STEP_PREDICTION = 16

**DATA PREPARATION**

In [6]:
def combine_data(store_sales, query, oil, holidays_events) -> pd.DataFrame:
    store_sales = store_sales.copy(deep=True)
    query = query.copy(deep=True)
    oil = oil.copy(deep=True)
    holidays_events.copy(deep=True)
    
    data = pd.concat([
        store_sales.set_index(main_index),
        query.set_index(main_index)
    ], axis=0 )
    
    holidays_to_consider = holidays_events[
        (holidays_events["transferred"].eq(False))
        & holidays_events["locale"].isin(["National"])
    ].drop_duplicates(keep="first", subset=["date"])
    
    all_periods = pd.period_range("2013-01-01", "2017-08-31")
    oil_prices = (
        oil.set_index("date")
        .reindex(all_periods)
        .rename_axis("date").ffill().bfill()
        .rename(columns={"dcoilwtico":"oil"})
    )
    
    data_combined = (
        data.reset_index(secondary_index)
        .join(oil_prices)
    )
    data_combined["is_holiday"] = data_combined.index.get_level_values("date").isin(holidays_to_consider["date"])
    data_combined = data_combined.set_index(secondary_index, append=True)
    return data_combined

data = combine_data(store_sales, query, oil, holidays_events)
display(data)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,sales,onpromotion,oil,is_holiday
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,1,AUTOMOTIVE,0,0.0,0,93.14,True
2013-01-01,1,BABY CARE,1,0.0,0,93.14,True
2013-01-01,1,BEAUTY,2,0.0,0,93.14,True
2013-01-01,1,BEVERAGES,3,0.0,0,93.14,True
2013-01-01,1,BOOKS,4,0.0,0,93.14,True
...,...,...,...,...,...,...,...
2017-08-31,9,POULTRY,3029395,,1,47.26,False
2017-08-31,9,PREPARED FOODS,3029396,,0,47.26,False
2017-08-31,9,PRODUCE,3029397,,1,47.26,False
2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,3029398,,9,47.26,False


**FEATURE ENGINEERING**

In [10]:
def prepare_days_since_last_paycheck(date: pd.PeriodIndex) -> List[int]:

    days_since_paycheck = [1] * len(date)
    for i, period in enumerate(date):
        if period.day == 15 or period.day == period.days_in_month:
            continue
        elif period.day < 15:
            days_since_paycheck[i] = period.day + 1
        else:
            days_since_paycheck[i] = period.day - 15 + 1

    return days_since_paycheck


def engineer_features(data) -> pd.DataFrame:
    data = data.copy(deep=True)
    data["year"] = data.index.get_level_values("date").year
    data["month"] = data.index.get_level_values("date").month
    data["day_of_week"] = data.index.get_level_values("date").day_of_week
    data["end_of_year"] = data.index.get_level_values("date").month >= 11
#     data["family_cat"] = data.index.get_level_values("family")
#     data["store_cat"] = data.index.get_level_values("store_nbr")
    
    data["days_since_last_paycheck"] = prepare_days_since_last_paycheck(date=data.index.get_level_values("date"))
    
    grouped_lags_leads: List[pd.DataFrame] = [
        make_shift_in_groups(
            df = data,
            groupby = secondary_index,
            column = "sales",
            shift = [1,2]
        ),
        make_shift_in_groups(
            df = data,
            groupby = secondary_index,
            column = "onpromotion",
            shift = [-i for i in range(1,17)]
        ),
        make_mw_in_groups(
            df = data,
            groupby = secondary_index,
            column = "sales",
            window = [7,14,28],
            aggregator = "mean",
            center = False
        ),
        make_mw_in_groups(
            df = data.set_index("day_of_week",append=True),
            groupby = ["store_nbr","family","day_of_week"],
            column = "sales",
            # --- NOTICE ---
            # window references the weeks
            window = 10,
            aggregator = "mean"
        ).reset_index("day_of_week", drop=True),
        make_mw_in_groups(
            df = data,
            groupby = secondary_index,
            column = "oil",
            window = [-7, -N_STEP_PREDICTION]
        ),
    ]
    
    data_lag_lead = data.join(grouped_lags_leads)
    
    category_cols = ["year","month","day_of_week","end_of_year"]
    data_lag_lead[category_cols] = data_lag_lead[category_cols].astype("category")
    
    return data_lag_lead
        
eng_feat_set = engineer_features(data)
eng_feat_set

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,sales,onpromotion,oil,is_holiday,year,month,day_of_week,end_of_year,days_since_last_paycheck,...,onpromotion_lead_13,onpromotion_lead_14,onpromotion_lead_15,onpromotion_lead_16,sales_lag_mean_7,sales_lag_mean_14,sales_lag_mean_28,sales_lag_mean_10,oil_lead_mean_7,oil_lead_mean_16
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2013-01-01,1,AUTOMOTIVE,0,0.0,0,93.14,True,2013,1,1,False,2,...,0.0,0.0,0.0,0.0,,,,,93.125714,93.554375
2013-01-01,1,BABY CARE,1,0.0,0,93.14,True,2013,1,1,False,2,...,0.0,0.0,0.0,0.0,,,,,93.125714,93.554375
2013-01-01,1,BEAUTY,2,0.0,0,93.14,True,2013,1,1,False,2,...,0.0,0.0,0.0,0.0,,,,,93.125714,93.554375
2013-01-01,1,BEVERAGES,3,0.0,0,93.14,True,2013,1,1,False,2,...,0.0,0.0,0.0,0.0,,,,,93.125714,93.554375
2013-01-01,1,BOOKS,4,0.0,0,93.14,True,2013,1,1,False,2,...,0.0,0.0,0.0,0.0,,,,,93.125714,93.554375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-31,9,POULTRY,3029395,,1,47.26,False,2017,8,3,False,1,...,,,,,,,436.513148,327.136497,,
2017-08-31,9,PREPARED FOODS,3029396,,0,47.26,False,2017,8,3,False,1,...,,,,,,,123.308385,98.218875,,
2017-08-31,9,PRODUCE,3029397,,1,47.26,False,2017,8,3,False,1,...,,,,,,,1627.045542,1205.129868,,
2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,3029398,,9,47.26,False,2017,8,3,False,1,...,,,,,,,150.153846,38.875000,,


**PREPARE TRAIN & TEST SETS**

In [11]:
def split_train_test(
    df : pd.DataFrame,
    train_start: str,
    train_end: str,
    test_day: str
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    
    df = df.copy()
    display(df.columns)
    
    col_x_drop = ["id","sales"]
    target_cols = ["sales"]
    
    X_train = df.loc[train_start: train_end].drop(columns=col_x_drop)
    X_test = df.loc[test_day: test_day].drop(columns=col_x_drop)
    
    y = make_shift_in_groups(
        df = data,
        groupby = secondary_index,
        column = "sales",
        shift = [-i for i in range(1, N_STEP_PREDICTION+1)]
    )
    y_train = y.loc[train_start: train_end]
    y_test = y.loc[test_day: test_day]
    
    X_train_d = pd.get_dummies(X_train, drop_first = True)
    X_test_d = pd.get_dummies(X_test, drop_first = True)
    return X_train_d, X_test_d, y_train, y_test
    
    
X_train, X_test, y_train, y_test = split_train_test(eng_feat_set, TRAIN_START, TRAIN_END, TEST_DAY)

display(X_train, X_test, y_train,y_test)

Index(['id', 'sales', 'onpromotion', 'oil', 'is_holiday', 'year', 'month',
       'day_of_week', 'end_of_year', 'days_since_last_paycheck', 'sales_lag_1',
       'sales_lag_2', 'onpromotion_lead_1', 'onpromotion_lead_2',
       'onpromotion_lead_3', 'onpromotion_lead_4', 'onpromotion_lead_5',
       'onpromotion_lead_6', 'onpromotion_lead_7', 'onpromotion_lead_8',
       'onpromotion_lead_9', 'onpromotion_lead_10', 'onpromotion_lead_11',
       'onpromotion_lead_12', 'onpromotion_lead_13', 'onpromotion_lead_14',
       'onpromotion_lead_15', 'onpromotion_lead_16', 'sales_lag_mean_7',
       'sales_lag_mean_14', 'sales_lag_mean_28', 'sales_lag_mean_10',
       'oil_lead_mean_7', 'oil_lead_mean_16'],
      dtype='object')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,onpromotion,oil,is_holiday,days_since_last_paycheck,sales_lag_1,sales_lag_2,onpromotion_lead_1,onpromotion_lead_2,onpromotion_lead_3,onpromotion_lead_4,...,month_10,month_11,month_12,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,end_of_year_True
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2014-01-01,1,AUTOMOTIVE,0,98.17,True,2,2.000000,1.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
2014-01-01,1,BABY CARE,0,98.17,True,2,0.000000,0.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
2014-01-01,1,BEAUTY,0,98.17,True,2,1.000000,3.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
2014-01-01,1,BEVERAGES,0,98.17,True,2,840.000000,1001.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
2014-01-01,1,BOOKS,0,98.17,True,2,0.000000,0.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-29,9,POULTRY,0,49.72,False,15,458.345001,294.811005,1.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
2017-07-29,9,PREPARED FOODS,1,49.72,False,15,64.197998,92.336998,1.0,0.0,0.0,1.0,...,False,False,False,False,False,False,False,True,False,False
2017-07-29,9,PRODUCE,8,49.72,False,15,1164.241943,1138.975952,7.0,6.0,151.0,8.0,...,False,False,False,False,False,False,False,True,False,False
2017-07-29,9,SCHOOL AND OFFICE SUPPLIES,7,49.72,False,15,5.000000,6.000000,8.0,6.0,9.0,7.0,...,False,False,False,False,False,False,False,True,False,False


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,onpromotion,oil,is_holiday,days_since_last_paycheck,sales_lag_1,sales_lag_2,onpromotion_lead_1,onpromotion_lead_2,onpromotion_lead_3,onpromotion_lead_4,...,month_10,month_11,month_12,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,end_of_year_True
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2017-07-30,1,AUTOMOTIVE,0,49.72,False,16,4.000000,7.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,1,BABY CARE,0,49.72,False,16,0.000000,0.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,1,BEAUTY,0,49.72,False,16,3.000000,2.000000,0.0,0.0,1.0,1.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,1,BEVERAGES,16,49.72,False,16,2161.000000,2358.000000,24.0,26.0,25.0,7.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,1,BOOKS,0,49.72,False,16,0.000000,0.000000,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-30,9,POULTRY,1,49.72,False,16,425.854004,458.345001,1.0,0.0,0.0,1.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,9,PREPARED FOODS,1,49.72,False,16,152.591003,64.197998,0.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,9,PRODUCE,7,49.72,False,16,1740.156982,1164.241943,6.0,151.0,8.0,8.0,...,False,False,False,False,False,False,False,False,True,False
2017-07-30,9,SCHOOL AND OFFICE SUPPLIES,8,49.72,False,16,21.000000,5.000000,6.0,9.0,7.0,6.0,...,False,False,False,False,False,False,False,False,True,False


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales_lead_1,sales_lead_2,sales_lead_3,sales_lead_4,sales_lead_5,sales_lead_6,sales_lead_7,sales_lead_8,sales_lead_9,sales_lead_10,sales_lead_11,sales_lead_12,sales_lead_13,sales_lead_14,sales_lead_15,sales_lead_16
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2014-01-01,1,AUTOMOTIVE,3.000000,0.000000,0.000000,1.000000,6.000000,2.000000,0.000000,2.000000,2.000000,4.000000,2.000000,3.000000,1.000000,2.000000,4.000000,2.000000
2014-01-01,1,BABY CARE,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2014-01-01,1,BEAUTY,6.000000,5.000000,1.000000,2.000000,2.000000,2.000000,1.000000,4.000000,0.000000,5.000000,1.000000,1.000000,1.000000,6.000000,1.000000,4.000000
2014-01-01,1,BEVERAGES,2524.000000,2192.000000,2282.000000,989.000000,2417.000000,1963.000000,2285.000000,1982.000000,2059.000000,2152.000000,998.000000,2275.000000,1911.000000,2503.000000,1747.000000,2085.000000
2014-01-01,1,BOOKS,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-29,9,POULTRY,517.911011,470.513000,570.195984,455.652985,425.946014,604.973999,546.205994,612.789001,459.039978,358.131989,333.131989,291.820984,525.223999,383.386993,412.458008,283.428986
2017-07-29,9,PREPARED FOODS,145.490005,61.939999,50.462997,89.135002,83.426003,117.189003,162.324005,155.975006,119.900002,112.954002,123.464996,111.930000,112.099998,129.903992,105.168999,114.120003
2017-07-29,9,PRODUCE,1882.588013,1517.552002,2470.460938,1333.885010,1364.578003,1401.673950,1853.345947,1876.197998,1675.090942,2299.715088,1310.447998,1036.438965,1453.078003,1419.264038,1693.607056,1348.425049
2017-07-29,9,SCHOOL AND OFFICE SUPPLIES,41.000000,56.000000,203.000000,149.000000,139.000000,136.000000,154.000000,195.000000,146.000000,170.000000,83.000000,148.000000,140.000000,138.000000,200.000000,182.000000


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales_lead_1,sales_lead_2,sales_lead_3,sales_lead_4,sales_lead_5,sales_lead_6,sales_lead_7,sales_lead_8,sales_lead_9,sales_lead_10,sales_lead_11,sales_lead_12,sales_lead_13,sales_lead_14,sales_lead_15,sales_lead_16
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-07-30,1,AUTOMOTIVE,8.000000,5.000000,4.000000,3.000000,8.000000,5.000000,6.000000,7.000000,4.000000,7.000000,9.000000,1.000000,6.000000,1.000000,1.000000,4.000000
2017-07-30,1,BABY CARE,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2017-07-30,1,BEAUTY,3.000000,4.000000,2.000000,5.000000,7.000000,3.000000,2.000000,5.000000,2.000000,4.000000,10.000000,1.000000,3.000000,1.000000,6.000000,4.000000
2017-07-30,1,BEVERAGES,2414.000000,2627.000000,2645.000000,2037.000000,2479.000000,2093.000000,968.000000,2086.000000,2418.000000,2311.000000,2377.000000,1006.000000,1659.000000,803.000000,2201.000000,1942.000000
2017-07-30,1,BOOKS,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2017-07-30,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-30,9,POULTRY,470.513000,570.195984,455.652985,425.946014,604.973999,546.205994,612.789001,459.039978,358.131989,333.131989,291.820984,525.223999,383.386993,412.458008,283.428986,438.132996
2017-07-30,9,PREPARED FOODS,61.939999,50.462997,89.135002,83.426003,117.189003,162.324005,155.975006,119.900002,112.954002,123.464996,111.930000,112.099998,129.903992,105.168999,114.120003,154.552994
2017-07-30,9,PRODUCE,1517.552002,2470.460938,1333.885010,1364.578003,1401.673950,1853.345947,1876.197998,1675.090942,2299.715088,1310.447998,1036.438965,1453.078003,1419.264038,1693.607056,1348.425049,2419.729004
2017-07-30,9,SCHOOL AND OFFICE SUPPLIES,56.000000,203.000000,149.000000,139.000000,136.000000,154.000000,195.000000,146.000000,170.000000,83.000000,148.000000,140.000000,138.000000,200.000000,182.000000,121.000000


In [12]:
def create_model(**params) :
    # --- NOTICE ---
    # It will allow for easy parameters manipulation and model changes
    model = RegressorChain(XGBRegressor(**params))
    return model
    
def validate_model(X_train, X_test, y_train, y_test) -> Tuple[pd.DataFrame, float]:
    model = create_model().fit(X_train, y_train)
    y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y_train.columns).clip(0.0)
    
    rsmle = mean_squared_log_error(y_test, y_pred)
    print(f"RSMLE: {rsmle}")
    
    return y_pred, rsmle

In [None]:
print(datetime.now().strftime("%H:%M:%S"))
y_test_pred, rsmle = validate_model(X_train, X_test, y_train, y_test)
print(datetime.now().strftime("%H:%M:%S"))

17:44:08


***LOGBOOK***

**1.Initial setup without one-hot encodings:** RSMLE: 0.6812447830583148

**2.Initial setup with one-hot encodings:** RSMLE: 0.6145283461216959

**3.Initial setup without DOWM_10:** RSMLE: 0.6559099951029017

**4.Initial setup with one-hot encodings but no family and store_nbr:** RSMLE: 0.6302813566155643




In [None]:
y_test_pred

**OPTUNA OPTUMALIZATION**

In [None]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=50),  # Number of trees in the ensemble
        "max_depth": trial.suggest_int("max_depth", 3, 10),  # Maximum depth of each tree
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),  # Learning rate
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),  # Subsample ratio of the training instances
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),  # Subsample ratio of columns when constructing each tree
        "gamma": trial.suggest_float("gamma", 0.01, 10.0, log=True),  # Minimum loss reduction required to make a further partition on a leaf node of the tree
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),  # L1 regularization term on weights
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),  # L2 regularization term on weights
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 100, log=True),  # Minimum sum of instance weight (hessian) needed in a child
    }
    model = create_model(**params)
    model = model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)

    return rmse

study = optuna.create_study(direction="minimize", study_name="MultiOutputModelOptymalization")
study.optimize(objective, n_trials=100)

best_params = study.best_params
print((f"Best hyperparameters: {best_params}"))