# Prediction Pipeline: Tree-Based

In [38]:
import itertools
import os
import pandas as pd
import parquet

from tree_predictor import TreePredictor
from utils import Utils

In [39]:
# Read the settings through Utils.read_config_for_env and build the predictor
# from them. Running this cell prints the data loading / cleaning progress
# captured in the cell output.
CONFIG_PATH = '../config/config.yml'
config = Utils.read_config_for_env(config_path=CONFIG_PATH)
treepredictor = TreePredictor(config)

Loading data.. Done.
Fixing data schemas.. Done.
Cleaning training data
Checking for negative values in ['price', 'amount'].. Count of rows marked as invalid: 6469
Checking for outliers in ['price', 'amount'].. Count of rows marked as invalid: 40138
Count of cleaned rows: 46248
Cleaning validation data
Checking for negative values in ['price', 'amount'].. Count of rows marked as invalid: 1507
Count of cleaned rows: 1507
Prepared daily raw data.


## Data Preparation

## Prepare monthly training and validation data

In [40]:
# TODO: when ready, move this to tree_predictor
def data_prep_pipeline(
        df_daily,
        splitname,
        refresh,
        predictor=None):
    """Run the monthly data preparation steps for one data split.

    Chains two steps of ``predictor.data``: ``get_monthly_data`` builds the
    base monthly data from the daily data, then ``get_ts_data`` adds the lag
    and moving-average features using ``predictor.num_lag_mon``.

    Args:
        df_daily: Daily data for the split; handed to ``get_monthly_data``.
        splitname: Name of the split (the notebook uses ``'train'`` and
            ``'val'``); forwarded to both steps.
        refresh: Forwarded unchanged to both steps.
            NOTE(review): presumably ``False`` reuses the cached parquet
            files and ``True`` recomputes them -- confirm in tree_predictor.
        predictor: Object exposing ``data.get_monthly_data``,
            ``data.get_ts_data`` and ``num_lag_mon``. When omitted, the
            notebook-level ``treepredictor`` is used, so existing calls keep
            working unchanged.

    Returns:
        The result of ``get_ts_data`` for the split.
    """
    if predictor is None:
        # Fall back to the notebook global only when no predictor is injected.
        predictor = treepredictor

    # get base monthly data
    df_base = predictor.data.get_monthly_data(df_daily, splitname, refresh)

    # get monthly data with lag and ma features
    df_ts = predictor.data.get_ts_data(
        df_base, splitname, refresh, predictor.num_lag_mon)

    return df_ts

### Training data

In [41]:
# Columns selected from the daily data before it goes through the pipeline.
# The validation cell reuses this list.
columns_needed = ['monthly_period', 'shop_id', 'item_id', 'item_category_id', 'amount', 'price']
df_daily_train = treepredictor.df_daily_train[columns_needed].copy()
df_train = data_prep_pipeline(
    df_daily_train,
    'train',
    refresh=False)

# create X and y
y_train = df_train['amount_item']
# Drop the target (`amount_item`) together with `price` and `amount_cat`.
# A non-mutating drop leaves the frame returned by the pipeline untouched;
# `axis=1` is redundant when `columns=` is given. `df_train` is rebound so it
# still names the feature frame, exactly as before.
df_train = df_train.drop(columns=['price', 'amount_item', 'amount_cat'])
X_train = df_train

Loading /Users/Onur/opt/MLrepos/shop_sales_prediction/data/train_base.parquet
Loading /Users/Onur/opt/MLrepos/shop_sales_prediction/data/train_ts.parquet


### Validation data

In [42]:
# Same column selection as for the training split (`columns_needed` is defined
# in the training-data cell).
df_daily_val = treepredictor.df_daily_val[columns_needed].copy()
df_val = data_prep_pipeline(
    df_daily_val,
    'val',
    refresh=False)

# create X and y
y_val = df_val['amount_item']
# Drop the target (`amount_item`) together with `price` and `amount_cat`.
# A non-mutating drop leaves the frame returned by the pipeline untouched;
# `axis=1` is redundant when `columns=` is given. `df_val` is rebound so it
# still names the feature frame, exactly as before.
df_val = df_val.drop(columns=['price', 'amount_item', 'amount_cat'])
X_val = df_val

Loading /Users/Onur/opt/MLrepos/shop_sales_prediction/data/val_base.parquet
Loading /Users/Onur/opt/MLrepos/shop_sales_prediction/data/val_ts.parquet


### Scaling

In [43]:
from sklearn.preprocessing import StandardScaler
# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits.
# NOTE(review): with default scikit-learn settings `transform` returns NumPy
# arrays, so X_train / X_val no longer carry column names after this cell.
scaler = StandardScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

## Basic LightGBM model

In [44]:
import lightgbm as lgb
# from lightgbm import LGBMRegressor 
import numpy as np
import shap
from sklearn.metrics import mean_squared_error as mse 
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

In [None]:
# Wrap the training features X_train and target y_train in a LightGBM Dataset.
train_data = lgb.Dataset(data=X_train, label=y_train)

# Build the validation Dataset from X_val / y_val, with the training Dataset
# as its reference for consistent evaluation.
val_data = lgb.Dataset(
    data=X_val,
    label=y_val,
    reference=train_data,
)
# The parameter dictionary for the LightGBM regression model is defined in the next cell.

In [45]:
# Settings for a gradient-boosted regression model evaluated with RMSE.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=30,
    learning_rate=0.1,
    feature_fraction=0.9,
)

# Stop training once the validation score has not improved for 5 rounds.
callback_early_stopping = lgb.early_stopping(stopping_rounds=5)
num_round = 100  # passed to lgb.train as the number of boosting rounds

model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    callbacks=[callback_early_stopping, lgb.log_evaluation()],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.466542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2610
[LightGBM] [Info] Number of data points in the train set: 31924800, number of used features: 17
[LightGBM] [Info] Start training from score 0.078381
[1]	valid_0's rmse: 1.65917
Training until validation scores don't improve for 5 rounds
[2]	valid_0's rmse: 1.64354
[3]	valid_0's rmse: 1.62982
[4]	valid_0's rmse: 1.61804
[5]	valid_0's rmse: 1.60776
[6]	valid_0's rmse: 1.59815
[7]	valid_0's rmse: 1.59186
[8]	valid_0's rmse: 1.58492
[9]	valid_0's rmse: 1.57901
[10]	valid_0's rmse: 1.57547
[11]	valid_0's rmse: 1.57279
[12]	valid_0's rmse: 1.56934
[13]	valid_0's rmse: 1.56753
[14]	valid_0's rmse: 1.5655
[15]	valid_0's rmse: 1.56399
[16]	valid_0's rmse: 1.56182
[17]	valid_0's rmse: 1.55994
[18]	valid_0's rmse: 1.55767
[19]	valid_0's rmse

In [46]:
# Score both splits with the trained model.
pred_train, pred_val = (model.predict(features) for features in (X_train, X_val))

# Root Mean Squared Error (RMSE) of the predictions on each split.
rmse_train = np.sqrt(mse(y_train, pred_train))
rmse_val = np.sqrt(mse(y_val, pred_val))
print("Training RMSE: ", rmse_train)
print("Validation RMSE: ", rmse_val)

Training RMSE:  0.547277644868107
Validation RMSE:  1.54360680064235


In [56]:
# Largest prediction on the training split, as a quick range check.
np.max(pred_train)

95.28733265883206