In [51]:
import pandas as pd
from tabulate import tabulate
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import joblib
from datetime import datetime

In [52]:
class XGBoostModel:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=5, subsample=1.0, colsample_bytree=1.0, alpha=0, lambda_=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.alpha = alpha
        self.lambda_ = lambda_
        self.model_sales = None
        self.label_encoder_store = LabelEncoder()
        self.label_encoder_dept = LabelEncoder()

    def train(self, training_data):
        # Encode categorical variables
        training_data['store'] = self.label_encoder_store.fit_transform(training_data['store'])
        training_data['item_dept'] = self.label_encoder_dept.fit_transform(training_data['item_dept'])

        # Define features and target variables for item_qty prediction
        X_train = training_data[['date_id', 'item_dept', 'store', 'item_qty_lag_1', 'net_sales_lag_1', 
                                'item_qty_lag_2', 'net_sales_lag_2']]
        y_train_sales = training_data['net_sales']

        # Convert date_id to numerical value for XGBoost
        X_train['date_id'] = pd.to_datetime(X_train['date_id']).map(pd.Timestamp.toordinal)

        # Train the XGBoost model for net_sales
        self.model_sales = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            alpha=self.alpha,
            reg_lambda=self.lambda_
        )
        self.model_sales.fit(X_train, y_train_sales)

        # Predict on the training dataset
        y_train_pred_sales = self.model_sales.predict(X_train)

        # Calculate metrics for net_sales
        mse_sales = mean_squared_error(y_train_sales, y_train_pred_sales)
        rmse_sales = np.sqrt(mse_sales)
        r2_sales = r2_score(y_train_sales, y_train_pred_sales)

        # Print the results
        print(f"Net Sales - MSE: {mse_sales}, RMSE: {rmse_sales}, R-squared: {r2_sales}")

    def evaluate(self, test_data):
        # Encode categorical variables
        test_data['store'] = self.label_encoder_store.fit_transform(test_data['store'])
        test_data['item_dept'] = self.label_encoder_dept.fit_transform(test_data['item_dept'])

        # Define features and target variables for net_sales prediction
        X_test = test_data[['date_id', 'item_dept', 'store', 'item_qty_lag_1', 'net_sales_lag_1', 
                                'item_qty_lag_2', 'net_sales_lag_2']]
        y_test_sales = test_data['net_sales']


        # Convert date_id to numerical value for XGBoost
        X_test['date_id'] = pd.to_datetime(X_test['date_id']).map(pd.Timestamp.toordinal)

        # Predict and evaluate for net_sales
        y_test_sales = test_data['net_sales']
        y_pred_sales = self.model_sales.predict(X_test)
        mse_sales = mean_squared_error(y_test_sales, y_pred_sales)
        rmse_sales = np.sqrt(mse_sales)
        r2_sales = r2_score(y_test_sales, y_pred_sales)

        print(f"Item Quantity Prediction - MSE: {mse_sales}, RMSE: {rmse_sales}, R²: {r2_sales}")

    def save_model(self, model_path='xgb_model_sales.pkl'):
        # Save the model
        joblib.dump((self.model_sales, self.label_encoder_store, self.label_encoder_dept), model_path)
        print(f"Model saved to {model_path}")

In [53]:
training_data = pd.read_csv("../data/training_model_data.csv")
testing_data = pd.read_csv("../data/testing_model_data.csv")
xgb_model = XGBoostModel(n_estimators=150, learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8, alpha=10, lambda_=1)
trained_xgb_model = xgb_model.train(training_data)
tested_xgb_model = xgb_model.evaluate(testing_data)

# Save the model
xgb_model.save_model('../src/models/xgb_model_sales.pkl')

Net Sales - MSE: 745920691.2935708, RMSE: 27311.54867988212, R-squared: 0.9741291301088193
Item Quantity Prediction - MSE: 15824310815.791012, RMSE: 125794.71696295918, R²: 0.6344770468526446
Model saved to ../../src/models/xgb_model_sales.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['date_id'] = pd.to_datetime(X_train['date_id']).map(pd.Timestamp.toordinal)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['date_id'] = pd.to_datetime(X_test['date_id']).map(pd.Timestamp.toordinal)


In [54]:
# # Load the model
# xgb_model.load_model('../src/models/xgb_model_sales.pkl')
# # Predict the item_qty
# xgb_model.predict('2022-02-02', 'Beverages', 'ABC')

Model loaded from ../../src/models/xgb_model_sales.pkl
Predicted net_sales: 294050.96875
