# Prediction Pipeline: Tree-Based

In [None]:
import itertools
import os
import pandas as pd
import parquet

from tree_predictor import TreePredictor
from utils import Utils

In [None]:
# Read the project configuration from the YAML file; the path is relative,
# so it is resolved against the current working directory.
# NOTE(review): presumably returns environment-specific settings — confirm in utils.
config = Utils.read_config_for_env(config_path='../config/config.yml')
# Build the predictor from that configuration. Later cells read `rawdata`,
# `df_daily_train`, `df_daily_val` and `num_lag_mon` from this object.
treepredictor = TreePredictor(config)

## Data Preparation

In [None]:
def get_monthly_data(df_daily, df_shops, df_items, splitname, refresh):
    """Return the base monthly table for one split, backed by a parquet cache.

    The cache path is ``config['root_data_path']`` joined with
    ``config[f'fn_{splitname}_base']``. When that file exists and ``refresh``
    is falsy it is read back; otherwise the table is rebuilt with
    ``prep_monthly_data`` and written to the same path.

    Args:
        df_daily: Daily transactions passed on to ``prep_monthly_data``.
        df_shops: Shop table passed on to ``prep_monthly_data``.
        df_items: Item table passed on to ``prep_monthly_data``.
        splitname: Split label used to pick the config key (e.g. 'train').
        refresh: When truthy, ignore any cached file and rebuild.

    Returns:
        The base monthly DataFrame.
    """
    cache_path = os.path.join(
        config['root_data_path'],
        config[f'fn_{splitname}_base'],
    )
    if refresh or not os.path.exists(cache_path):
        print(f'Creating {cache_path}')
        monthly = prep_monthly_data(df_daily, df_shops, df_items)
        monthly.to_parquet(cache_path)
        return monthly
    print(f'Loading {cache_path}')
    return pd.read_parquet(cache_path)

def prep_monthly_data(df_daily, df_shops, df_items):
    """Build the monthly feature table from daily transactions.

    Args:
        df_daily: Daily transactions (aggregated per month by the helpers).
        df_shops: Shop table; its 'shop_id' values span the shop axis.
        df_items: Item table with 'item_id' and 'item_category_id'.

    Returns:
        DataFrame indexed by 'monthly_period' holding item-level columns
        (suffix '_item' where names clash), category-level columns
        (suffix '_cat'), and 'year' / 'month'.
    """
    # Item level: every shop / item / month combination, zero amount where
    # nothing was sold.
    items_monthly = create_items_df_monthly(df_daily, df_shops, df_items)

    # Rows that had no transaction carry no category yet; look it up by item.
    items_monthly = add_category_to_df(items_monthly, df_items)

    # Rows that had no transaction carry no price either. The fill starts
    # with the shop-item mean and falls back to the item mean, then the
    # category mean, then the global mean.
    items_monthly = add_avg_shopitem_price_to_df(
        items_monthly, df_daily, 'mean shop-item-specific price')

    # Category level: every shop / category / month combination, used as an
    # additional amount feature.
    categories_monthly = create_categories_df_monthly(
        df_daily, df_shops, df_items)

    # Attach the category amounts to each item row.
    merged = items_monthly.merge(
        categories_monthly,
        how='left',
        on=['shop_id', 'item_category_id', 'monthly_period'],
        suffixes=('_item', '_cat'))

    # Calendar features, then index by period.
    merged = add_time_features(merged)
    return merged.set_index('monthly_period')

def create_items_df_monthly(df_daily, df_shops, df_items):
    """Return monthly item rows for every shop / item / month combination.

    Daily transactions are aggregated per month, then left-joined onto the
    full grid of shops (from ``df_shops``), items (from ``df_items``) and the
    months present in the aggregated transactions. Combinations without a
    transaction get amount 0; their other columns stay missing.
    """
    key_columns = ['shop_id', 'item_id', 'monthly_period']
    transactions = convert_items_daily_to_monthly(df_daily)
    full_grid = create_df_all(
        columns=key_columns,
        shops=df_shops['shop_id'].unique(),
        items=df_items['item_id'].unique(),
        dates=transactions['monthly_period'].unique(),
    )
    return create_df_with_zero_sales(transactions, full_grid, key_columns)

def convert_items_daily_to_monthly(df):
    """Aggregate daily rows to one row per shop, item and month.

    Per group: first 'item_category_id', mean 'price', summed 'amount'.
    The grouping keys are returned as ordinary columns.
    """
    group_keys = ['shop_id', 'item_id', 'monthly_period']
    aggregations = {
        'item_category_id': 'first',
        'price': 'mean',
        'amount': 'sum',
    }
    return df.groupby(group_keys).agg(aggregations).reset_index()

def create_df_all(columns, shops, items, dates):
    """Return the full Cartesian grid of shops x items x dates.

    Args:
        columns: Three column names, in the order shop, item, date.
        shops, items, dates: Iterables of values to combine.

    Returns:
        DataFrame with one row per combination; dates vary fastest.
    """
    grid = itertools.product(shops, items, dates)
    return pd.DataFrame(list(grid), columns=columns)

def create_df_all_limited(df, coldict):
    """Cross every observed shop-item pair with every observed date.

    Unlike a full shop x item x date grid, only shop-item pairs that actually
    occur in ``df`` are expanded.

    Args:
        df: Frame containing the shop, item and date columns.
        coldict: Mapping with keys 'shops', 'items' and 'date' whose values
            are the corresponding column names in ``df``.

    Returns:
        DataFrame with columns [date, shop, item] and a fresh RangeIndex.
        Pairs appear in order of first occurrence and dates vary fastest.
        An empty ``df`` yields an empty frame with the same three columns.
    """
    shop_col = coldict['shops']
    item_col = coldict['items']
    date_col = coldict['date']

    shop_item_pairs = df[[shop_col, item_col]].drop_duplicates()
    dates = pd.DataFrame({date_col: df[date_col].unique()})

    # A cross merge keeps the original column dtypes and also works when
    # either side is empty (the previous tuple/ndarray round-trip raised on
    # empty input).
    df_all = shop_item_pairs.merge(dates, how='cross')

    # Keep the historical column order: date first, then shop, then item.
    return df_all[[date_col, shop_col, item_col]]

def create_df_with_zero_sales(df, df_all, columns):
    """Left-join observed rows onto the full grid and zero-fill 'amount'.

    Args:
        df: Observed (aggregated) rows.
        df_all: Grid of all key combinations to keep.
        columns: Key columns to join on.

    Returns:
        One row per grid row; 'amount' is 0 where no observation matched,
        other non-key columns stay missing there.
    """
    merged = df_all.merge(df, how='left', on=columns)
    return merged.fillna({'amount': 0})

def create_categories_df_monthly(df_daily, df_shops, df_items):
    """Return monthly category rows for every shop / category / month.

    Daily transactions are summed per shop, category and month, then
    left-joined onto the full grid of shops (from ``df_shops``), categories
    (from ``df_items``) and the months present in the aggregated
    transactions. Combinations without a transaction get amount 0.
    """
    key_columns = ['shop_id', 'item_category_id', 'monthly_period']
    transactions = convert_categories_daily_to_monthly(df_daily)
    full_grid = create_df_all(
        columns=key_columns,
        shops=df_shops['shop_id'].unique(),
        items=df_items['item_category_id'].unique(),
        dates=transactions['monthly_period'].unique(),
    )
    return create_df_with_zero_sales(transactions, full_grid, key_columns)

def convert_categories_daily_to_monthly(df):
    """Sum 'amount' per shop, item category and month.

    The grouping keys are returned as ordinary columns.
    """
    group_keys = ['shop_id', 'item_category_id', 'monthly_period']
    totals = df.groupby(group_keys)[['amount']].sum()
    return totals.reset_index()

def add_category_to_df(df_monthly, df_items):
    """Fill missing 'item_category_id' values from the item table.

    Rows that already have a category are kept as they are. Rows without one
    have the column dropped and re-attached through a left merge with
    ``df_items`` on 'item_id'. Both parts are concatenated and sorted by
    month, shop and item. The number of rows still lacking a category is
    printed.

    Returns:
        The recombined, sorted DataFrame.
    """
    has_category = df_monthly['item_category_id'].notna()
    known = df_monthly[has_category]
    unknown = df_monthly[~has_category].drop(columns=['item_category_id'])

    recovered = unknown.merge(df_items, how='left', on='item_id')

    combined = pd.concat([known, recovered], ignore_index=True)
    combined = combined.sort_values(by=['monthly_period', 'shop_id', 'item_id'])

    count_missing_cats = int(combined['item_category_id'].isna().sum())
    print(f'after the operation, count of rows with missing categories: {count_missing_cats}')
    return combined

def add_avg_shopitem_price_to_df(df_monthly, df_daily_train, method):
    """Fill missing 'price' values with mean prices from the daily data.

    Rows with a missing price are merged with the mean price produced by
    ``get_mean_price`` for ``method``. If prices are still missing afterwards
    the function calls itself with the next, coarser method:
    shop-item mean -> item mean -> category mean. After the category mean,
    any remaining gaps receive the global mean price of ``df_daily_train``.

    Args:
        df_monthly: Monthly table with a 'price' column.
        df_daily_train: Daily rows from which the mean prices are computed.
        method: One of 'mean shop-item-specific price',
            'mean item-specific price', 'mean category-specific price'.

    Returns:
        The table with prices filled. When nothing was missing, only the
        row selection with present prices is returned (not re-sorted).

    Raises:
        ValueError: If ``method`` is not one of the known names.
    """
    price_missing = df_monthly['price'].isna()
    priced = df_monthly[~price_missing]
    unpriced = df_monthly[price_missing]
    print(f'{len(priced)} and {len(unpriced)} rows with filled and missing prices, respectively.')

    # Nothing to fill.
    if len(unpriced) == 0:
        return priced

    print(f'Filling missing with {method}')
    mean_price, merge_columns = get_mean_price(df_daily_train, method)
    refilled = unpriced.drop(columns=['price']).merge(
        mean_price, how='left', on=merge_columns)

    result = pd.concat([priced, refilled], ignore_index=True)
    result = result.sort_values(by=['monthly_period', 'shop_id', 'item_id'])

    count_missing_price = int(result['price'].isna().sum())
    print(f'after the operation, count of rows with missing price: {count_missing_price}')
    if count_missing_price == 0:
        return result

    # Each method hands over to the next, coarser one.
    if method == 'mean shop-item-specific price':
        return add_avg_shopitem_price_to_df(
            result, df_daily_train, 'mean item-specific price')
    if method == 'mean item-specific price':
        return add_avg_shopitem_price_to_df(
            result, df_daily_train, 'mean category-specific price')
    if method == 'mean category-specific price':
        # Last resort: a single global mean.
        print(f'Filling missing with global average price')
        global_avg_price = df_daily_train[['price']].mean().values[0]
        result.loc[result['price'].isna(), 'price'] = float(global_avg_price)
        return result
    raise ValueError(f'Uknown method: {method}')

def get_mean_price(df_daily_train, method):
    """Return the mean price per group together with the grouping columns.

    Args:
        df_daily_train: Daily rows with 'price' and the relevant id columns.
        method: 'mean shop-item-specific price' (group by shop and item),
            'mean item-specific price' (group by item) or
            'mean category-specific price' (group by item category).

    Returns:
        Tuple ``(mean_price, merge_columns)``: a DataFrame with the grouping
        columns plus the mean 'price', and the list of grouping columns.

    Raises:
        ValueError: If ``method`` is not one of the names above.
    """
    grouping_by_method = (
        ('mean shop-item-specific price', ['shop_id', 'item_id']),
        ('mean item-specific price', ['item_id']),
        ('mean category-specific price', ['item_category_id']),
    )
    for known_method, key_columns in grouping_by_method:
        if method == known_method:
            merge_columns = key_columns
            break
    else:
        raise ValueError(f'Uknown method: {method}')

    mean_price = (
        df_daily_train[merge_columns + ['price']]
        .groupby(merge_columns)
        .mean()
        .reset_index()
    )
    return mean_price, merge_columns

# Year and month can help capturing the trend and the seasonality, respectively
def add_time_features(df_monthly):
    """Add 'year' and 'month' columns taken from 'monthly_period'.

    The frame is modified in place and also returned.
    """
    for part in ('year', 'month'):
        df_monthly[part] = getattr(df_monthly['monthly_period'].dt, part)
    return df_monthly

def get_ts_data(df_base, splitname, refresh):
    """Return the monthly table with lag / moving-average features, cached.

    The cache path is ``config['root_data_path']`` joined with
    ``config[f'fn_{splitname}_ts']``. When that file exists and ``refresh``
    is falsy it is read back. Otherwise the features are added with
    ``add_lag_ma_features`` using ``treepredictor.num_lag_mon`` lags, the
    first ``num_lag_mon`` distinct index values (in order of appearance) are
    dropped, and the result is written to the cache path.

    NOTE(review): dropping the first index values assumes the rows arrive
    ordered by period — confirm against the producer of ``df_base``.
    """
    cache_path = os.path.join(
        config['root_data_path'],
        config[f'fn_{splitname}_ts'],
    )
    if refresh or not os.path.exists(cache_path):
        print(f'Creating {cache_path}')
        lag_count = treepredictor.num_lag_mon
        featured = add_lag_ma_features(df_base, lags_to_include=lag_count)
        # The leading periods have no complete lag history.
        leading_periods = featured.index.unique()[:lag_count]
        trimmed = featured.drop(leading_periods)
        trimmed.to_parquet(cache_path)
        return trimmed
    print(f'Loading {cache_path}')
    return pd.read_parquet(cache_path)

def add_lag_ma_features(
        df_monthly,
        lags_to_include=3,
        lag_features=('price', 'amount_item', 'amount_cat'),
        mas_to_include=(2,),
        ma_features=('price_l1', 'amount_item_l1', 'amount_cat_l1')
        ):
    """Add lag and moving-average columns for the given features.

    Defaults are immutable tuples rather than lists so that no mutable
    object is shared between calls; any iterable may still be passed.

    Args:
        df_monthly: Monthly table containing 'shop_id', 'item_id' and the
            feature columns. The helpers add the new columns to this frame.
        lags_to_include: Number of lags (1..N) created per lag feature.
        lag_features: Columns for which lag columns ``<name>_l<k>`` are added.
        mas_to_include: Rolling window sizes for the moving averages.
        ma_features: Columns for which ``<name>_ma<w>`` columns are added.
            The defaults refer to first-lag columns, so they require
            ``lags_to_include >= 1``.

    Returns:
        The frame with the additional columns.
    """
    # Lags first: the default moving-average inputs are the '_l1' columns.
    for feature in lag_features:
        df_monthly = add_feature_lags(
            df_monthly,
            feature,
            lags_to_include)
    for feature in ma_features:
        df_monthly = add_feature_moving_averages(
            df_monthly,
            feature,
            mas_to_include)
    return df_monthly

def add_feature_lags(df, column, lagcount):
    """Add lagged copies of ``column`` within each shop-item group.

    For every lag k in 1..lagcount a column ``<column>_l<k>`` is added that
    holds the value k rows earlier in the same ('shop_id', 'item_id') group,
    in the frame's current row order. The first k rows of each group are
    missing.

    Args:
        df: Frame with 'shop_id', 'item_id' and ``column``; modified in place.
        column: Name of the column to lag.
        lagcount: Number of lags to create; values below 1 add nothing.

    Returns:
        The same frame, with the lag columns added.
    """
    if lagcount < 1:
        return df
    # The grouping does not depend on the lag, so build it once.
    grouped = df.groupby(['shop_id', 'item_id'])[column]
    for lag in range(1, lagcount + 1):
        df[f'{column}_l{lag}'] = grouped.shift(lag)
    return df

def add_feature_moving_averages(df, column, windows):
    """Add rolling means of ``column`` within each shop-item group.

    For every window size w a column ``<column>_ma<w>`` is added holding the
    mean of the current and the previous w-1 rows of the same
    ('shop_id', 'item_id') group, in the frame's current row order. Rows
    without a full window are missing.

    Args:
        df: Frame with 'shop_id', 'item_id' and ``column``; modified in place.
        column: Name of the column to average.
        windows: Iterable of window sizes; an empty iterable adds nothing.

    Returns:
        The same frame, with the moving-average columns added.
    """
    windows = list(windows)
    if not windows:
        return df
    # The grouping does not depend on the window, so build it once.
    grouped = df.groupby(['shop_id', 'item_id'])[column]
    for window in windows:
        df[f'{column}_ma{window}'] = grouped.transform(
            lambda values: values.rolling(window=window).mean())
    return df

## Prepare monthly training and validation data

In [None]:
# Data needed for both train and val
# Shop table; the preparation functions read only its 'shop_id' column.
df_shops= treepredictor.rawdata.shop_list
# Item-to-category lookup, restricted to the two columns used downstream and
# copied so it is independent of the predictor's raw item list.
df_items = treepredictor.rawdata.item_list[['item_id', 'item_category_id']].copy()

In [None]:
def data_prep_pipeline(
        df_daily,
        df_shops,
        df_items,
        splitname,
        refresh):
    """Run the two preparation stages for one data split.

    First the base monthly table is obtained with ``get_monthly_data``, then
    the lag / moving-average features are added with ``get_ts_data``. Both
    stages receive the same ``splitname`` and ``refresh`` values.

    Args:
        df_daily: Daily transactions of the split.
        df_shops: Shop table.
        df_items: Item table with category ids.
        splitname: Split label (e.g. 'train' or 'val').
        refresh: When truthy, rebuild instead of loading cached files.

    Returns:
        The monthly table with time-series features.
    """
    # Stage 1: base monthly data.
    monthly_base = get_monthly_data(
        df_daily, df_shops, df_items, splitname, refresh)

    # TODO: scaling

    # TODO: OHE

    # Stage 2: lag and moving-average features.
    return get_ts_data(monthly_base, splitname, refresh)

### Training data

In [None]:
# Columns of the daily frame that the preparation functions read.
# This list is reused by the validation cell below.
columns_needed = ['monthly_period', 'shop_id', 'item_id', 'item_category_id', 'amount', 'price']
# Work on a copy so the predictor's own daily frame is left untouched.
df_daily_train = treepredictor.df_daily_train[columns_needed].copy()
# df_daily_train.info()
# refresh=False: reuse the cached parquet files for the 'train' split when they exist.
df_train = data_prep_pipeline(
    df_daily_train,
    df_shops,
    df_items,
    'train',
    refresh=False)

# create X and y
# Target: the item-level monthly amount.
train_y = df_train['amount_item']
# Remove the target and the other same-month columns from the features.
# The columns that remain (including the lag / moving-average ones) form X.
df_train.drop(columns=['price', 'amount_item', 'amount_cat'], axis=1, inplace=True)
# NOTE: train_X is the same object as df_train; no copy is made.
train_X = df_train


### Validation data

In [None]:
# Same preparation as for training; `columns_needed` comes from the training cell.
# Work on a copy so the predictor's own daily frame is left untouched.
df_daily_val = treepredictor.df_daily_val[columns_needed].copy()
# df_daily_val.info()
# refresh=False: reuse the cached parquet files for the 'val' split when they exist.
df_val = data_prep_pipeline(
    df_daily_val,
    df_shops,
    df_items,
    'val',
    refresh=False)

# create X and y
# Target: the item-level monthly amount.
val_y = df_val['amount_item']
# Remove the target and the other same-month columns from the features.
df_val.drop(columns=['price', 'amount_item', 'amount_cat'], axis=1, inplace=True)
# NOTE: val_X is the same object as df_val; no copy is made.
val_X = df_val