# Prediction Pipeline: Tree-Based

In [None]:
import itertools
import os
import pandas as pd
import parquet

from tree_predictor import TreePredictor
from utils import Utils

In [None]:
# Read the project configuration through Utils.read_config_for_env and hand
# the result to TreePredictor, which the rest of the notebook works through.
config_path = '../config/config.yml'
config = Utils.read_config_for_env(config_path=config_path)
treepredictor = TreePredictor(config)

## Data Preparation

## Prepare monthly training and validation data

In [None]:
# TODO: when ready, move this to tree_predictor
def data_prep_pipeline(
        df_daily,
        splitname,
        refresh):
    """Turn daily data for one split into the monthly time-series frame.

    Chains two calls on the notebook-level ``treepredictor.data`` object:
    ``get_monthly_data`` produces the base monthly data, and ``get_ts_data``
    extends it with lag and moving-average features, using
    ``treepredictor.num_lag_mon``.

    Args:
        df_daily: Daily data handed to ``get_monthly_data``.
        splitname: Name of the split; forwarded to both calls (this notebook
            passes ``'train'`` and ``'val'``).
        refresh: Forwarded unchanged to both calls.
            NOTE(review): presumably forces recomputation instead of reusing
            stored results -- confirm in tree_predictor.

    Returns:
        The object returned by ``get_ts_data``.
    """
    data_api = treepredictor.data

    # Step 1: base monthly data.
    monthly = data_api.get_monthly_data(df_daily, splitname, refresh)

    # Step 2: monthly data with lag and moving-average features.
    return data_api.get_ts_data(
        monthly, splitname, refresh, treepredictor.num_lag_mon)

### Training data

In [None]:
# Columns of the daily data that the preparation pipeline is fed with.
columns_needed = [
    'monthly_period',
    'shop_id',
    'item_id',
    'item_category_id',
    'amount',
    'price',
]

# Work on a copy so the predictor's own daily frame is left untouched.
df_daily_train = treepredictor.df_daily_train[columns_needed].copy()
df_train = data_prep_pipeline(df_daily_train, 'train', refresh=False)

# Split into target (y) and features (X). X_train is the same object as
# df_train, so the in-place drop below is visible through both names.
target_col = 'amount_item'
non_feature_cols = ['price', target_col, 'amount_cat']
y_train = df_train[target_col]
X_train = df_train
X_train.drop(columns=non_feature_cols, inplace=True)

### Validation data

In [None]:
# Same preparation as for the training split, applied to the validation data.
# Work on a copy so the predictor's own daily frame is left untouched.
df_daily_val = treepredictor.df_daily_val[columns_needed].copy()
df_val = data_prep_pipeline(df_daily_val, 'val', refresh=False)

# Split into target (y) and features (X). X_val is the same object as
# df_val, so the in-place drop below is visible through both names.
target_col = 'amount_item'
non_feature_cols = ['price', target_col, 'amount_cat']
y_val = df_val[target_col]
X_val = df_val
X_val.drop(columns=non_feature_cols, inplace=True)

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training features
# only and then applied to both splits, so no validation statistics leak
# into the transformation.
# NOTE(review): by default `transform` returns arrays rather than DataFrames,
# so column names are not carried into the model -- confirm this is intended.
scaler = StandardScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

## Basic LightGBM model

In [None]:
import lightgbm as lgb
# from lightgbm import LGBMRegressor 
import numpy as np
import shap
from sklearn.metrics import mean_squared_error as mse 
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

In [None]:
# Hyper-parameters of the LightGBM regression model.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=30,
    learning_rate=0.01,
    feature_fraction=0.9,
)

# Upper bound on boosting rounds; training stops earlier once the validation
# metric has not improved for 5 consecutive rounds.
num_round = 100
callback_early_stopping = lgb.early_stopping(5)

# LightGBM datasets: training features X_train with labels y_train, and
# validation features X_val with labels y_val. The validation set references
# the training set so both are constructed consistently.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Train, evaluating on the validation set and logging the metric per round.
model = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    callbacks=[callback_early_stopping, lgb.log_evaluation()],
)

In [None]:
# Predict on both splits with the trained model.
pred_train = model.predict(X_train)
pred_val = model.predict(X_val)

# Report the Root Mean Squared Error (square root of the MSE) per split.
for rmse_label, y_true, y_pred in (
        ("Training RMSE: ", y_train, pred_train),
        ("Validation RMSE: ", y_val, pred_val)):
    print(rmse_label, np.sqrt(mse(y_true, y_pred)))