In [276]:
import pandas as pd
from tabulate import tabulate
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import joblib
from datetime import datetime

In [280]:
class XGBoostModel:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=5, subsample=1.0, colsample_bytree=1.0, alpha=0, lambda_=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.alpha = alpha
        self.lambda_ = lambda_
        self.model_qty = None
        self.label_encoder_store = LabelEncoder()
        self.label_encoder_dept = LabelEncoder()

    def train(self, training_data):
        # Encode categorical variables
        training_data['store'] = self.label_encoder_store.fit_transform(training_data['store'])
        training_data['item_dept'] = self.label_encoder_dept.fit_transform(training_data['item_dept'])

        # Define features and target variables for item_qty prediction
        X_train = training_data[['date_id', 'item_dept', 'store', 'item_qty_lag_1', 'net_sales_lag_1', 
                                'item_qty_lag_2', 'net_sales_lag_2']]
        y_train_qty = training_data['item_qty']

        # Convert date_id to numerical value for XGBoost
        X_train['date_id'] = pd.to_datetime(X_train['date_id']).map(pd.Timestamp.toordinal)

        # Train the XGBoost model for item_qty
        self.model_qty = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            alpha=self.alpha,
            reg_lambda=self.lambda_
        )
        self.model_qty.fit(X_train, y_train_qty)

        # Predict on the training dataset
        y_train_pred_qty = self.model_qty.predict(X_train)

        # Calculate metrics for item_qty
        mse_qty = mean_squared_error(y_train_qty, y_train_pred_qty)
        rmse_qty = np.sqrt(mse_qty)
        r2_qty = r2_score(y_train_qty, y_train_pred_qty)

        # Print the results
        print(f"Item Qty - MSE: {mse_qty}, RMSE: {rmse_qty}, R-squared: {r2_qty}")

    def evaluate(self, test_data):
        # Encode categorical variables
        test_data['store'] = self.label_encoder_store.fit_transform(test_data['store'])
        test_data['item_dept'] = self.label_encoder_dept.fit_transform(test_data['item_dept'])

        # Define features and target variables for item_qty prediction
        X_test = test_data[['date_id', 'item_dept', 'store', 'item_qty_lag_1', 'net_sales_lag_1', 
                                'item_qty_lag_2', 'net_sales_lag_2']]
        y_test_qty = test_data['item_qty']


        # Convert date_id to numerical value for XGBoost
        X_test['date_id'] = pd.to_datetime(X_test['date_id']).map(pd.Timestamp.toordinal)

        # Predict and evaluate for item_qty
        y_test_qty = test_data['item_qty']
        y_pred_qty = self.model_qty.predict(X_test)
        mse_qty = mean_squared_error(y_test_qty, y_pred_qty)
        rmse_qty = np.sqrt(mse_qty)
        r2_qty = r2_score(y_test_qty, y_pred_qty)

        print(f"Item Quantity Prediction - MSE: {mse_qty}, RMSE: {rmse_qty}, R²: {r2_qty}")

    def save_model(self, model_path='xgb_model_qty.pkl'):
        # Save the model
        joblib.dump((self.model_qty, self.label_encoder_store, self.label_encoder_dept), model_path)
        print(f"Model saved to {model_path}")

In [278]:
training_data = pd.read_csv("../data/training_model_data.csv")
testing_data = pd.read_csv("../data/testing_model_data.csv")
xgb_model = XGBoostModel(n_estimators=150, learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8, alpha=10, lambda_=1)
trained_xgb_model = xgb_model.train(training_data)
tested_xgb_model = xgb_model.evaluate(testing_data)

# Save the model
xgb_model.save_model('../src/models/xgb_model_qty.pkl')

Item Qty - MSE: 13640.168807877744, RMSE: 116.79113325881269, R-squared: 0.9856336520186134
Item Quantity Prediction - MSE: 229868.0489398246, RMSE: 479.44556410485706, R²: 0.8431927503457206
Model saved to ../../src/models/xgb_model_qty.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['date_id'] = pd.to_datetime(X_train['date_id']).map(pd.Timestamp.toordinal)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['date_id'] = pd.to_datetime(X_test['date_id']).map(pd.Timestamp.toordinal)


In [279]:
# # Load the model
# xgb_model.load_model('../../src/models/xgb_model_qty.pkl')
# # Predict the item_qty
# xgb_model.predict('2022-02-04', 'Beverages', 'ABC')

Model loaded from ../../src/models/xgb_model_qty.pkl
Predicted item_qty: 1633.808837890625


1633.8088