In [2]:
import os
import sys

# Make the project's helper modules in ../src importable from this notebook.
sys.path.append(os.path.abspath('../src'))

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from tabulate import tabulate

from data_preprocessing_utils import data_for_training
from utils import compute_error



We start by loading the pre-processed data

In [3]:
# Read the pre-processed sales / purchase-price table from disk.
# NOTE(review): the folder name is spelled 'prepocessing' — presumably it
# matches the directory on disk; confirm before "fixing" the spelling.
sales_and_purchase_prices = pd.read_csv(
    '../data/prepocessing/sales_and_purchase_prices.csv'
)

# Show the first rows as the cell output.
sales_and_purchase_prices.head()

Unnamed: 0,InventoryId,Store,Brand,Description,Size,SalesQuantity,SalesDollars,SalesPrice,SalesDate,Volume,Classification,ExciseTax,VendorNo,VendorName,Amount,PurchasePrice
0,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750.0,1,16.49,16.49,2016-01-01,750,1,0.79,12546,JIM BEAM BRANDS COMPANY,17.28,10.65
1,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750.0,2,32.98,16.49,2016-01-02,750,1,1.57,12546,JIM BEAM BRANDS COMPANY,34.55,10.65
2,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750.0,1,16.49,16.49,2016-01-03,750,1,0.79,12546,JIM BEAM BRANDS COMPANY,17.28,10.65
3,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750.0,1,14.49,14.49,2016-01-08,750,1,0.79,12546,JIM BEAM BRANDS COMPANY,15.28,10.65
4,1_HARDERSFIELD_1005,1,1005,Maker's Mark Combo Pack,750.0,2,69.98,34.99,2016-01-09,375,1,0.79,12546,JIM BEAM BRANDS COMPANY,70.77,27.34


# We aggregate sales by month (per product, store and classification)

In [4]:

# Drop the columns that are not used for prediction. VendorName is kept on
# purpose because it is aggregated below.
# errors="ignore" keeps this cell safe to re-run: the frame is reassigned to
# its own name, so on a second execution the columns are already gone and a
# plain drop would raise KeyError.
sales_and_purchase_prices = sales_and_purchase_prices.drop(
    columns=["InventoryId", "Brand", "Volume", "VendorNo", "Amount", "SalesDollars"],
    errors="ignore",
)

# Convert the date column to datetime so the .dt accessor can be used below.
sales_and_purchase_prices['SalesDate'] = pd.to_datetime(sales_and_purchase_prices['SalesDate'])

# Create new columns for the calendar year and month of each sale.
sales_and_purchase_prices['Year'] = sales_and_purchase_prices['SalesDate'].dt.year
sales_and_purchase_prices['Month'] = sales_and_purchase_prices['SalesDate'].dt.month

# Group the rows by year, month, product description, store and classification,
# then compute one aggregate per remaining column for each group.
# NOTE(review): 'first' keeps the value of whichever row comes first in the
# group. In the preview of the loaded data, ExciseTax and SalesPrice differ
# between transactions of the same product in the same month (0.79 vs 1.57,
# 16.49 vs 14.49), so these aggregates depend on row order — confirm that
# this is intended.
sales_month_aggregated = (
    sales_and_purchase_prices
    .groupby(['Year', 'Month', 'Description', 'Store', 'Classification'])
    .agg({
        'SalesQuantity': 'sum',      # monthly units sold
        'ExciseTax': 'first',
        'Size': 'mean',
        'PurchasePrice': 'first',
        'SalesPrice': 'first',
        'VendorName': 'first',
        # Add any other columns you want to aggregate here
    })
    .reset_index()
)

sales_month_aggregated.head()

Unnamed: 0,Year,Month,Description,Store,Classification,SalesQuantity,ExciseTax,Size,PurchasePrice,SalesPrice,VendorName
0,2016,1,(RI) 1,1,1,1,0.79,750.0,26.92,36.99,JIM BEAM BRANDS COMPANY
1,2016,1,(RI) 1,22,1,1,0.79,750.0,26.92,36.99,JIM BEAM BRANDS COMPANY
2,2016,1,(RI) 1,33,1,1,0.79,750.0,26.92,36.99,JIM BEAM BRANDS COMPANY
3,2016,1,(RI) 1,34,1,3,0.79,750.0,26.92,36.99,JIM BEAM BRANDS COMPANY
4,2016,1,(RI) 1,35,1,1,0.79,750.0,26.92,36.99,JIM BEAM BRANDS COMPANY


# Obtain the data in the format required by the models, then train classical machine-learning models: Random Forest, Linear Regression, Lasso, Ridge, ElasticNet, and KNN.

In [5]:
# Work on a copy so the aggregated frame displayed earlier is not modified
# by anything the helper does to its input.
data = sales_month_aggregated.copy()

# Get the model input data through the project helper data_for_training.
# NOTE(review): X_val / y_val are returned but not used in this cell.
# NOTE(review): the SettingWithCopyWarning in this cell's output quotes a
# scaler.fit_transform assignment that does not appear in this cell —
# presumably it is raised inside data_for_training; confirm and fix there.
X_train, X_test, X_val, y_train, y_test, y_val = data_for_training(data)

# Models
rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
lr_model = LinearRegression()
lasso_model = Lasso(alpha=0.1)
ridge_model = Ridge(alpha=1.0)
elasticnet_model = ElasticNet(alpha=0.1, l1_ratio=0.5)
knn_model = KNeighborsRegressor(n_neighbors=5)

# Display name -> estimator, in the order the rows appear in the summary table.
models = {
    "Random Forest": rf_model,
    "Linear Regression": lr_model,
    "Lasso": lasso_model,
    "Ridge": ridge_model,
    "ElasticNet": elasticnet_model,
    "KNN": knn_model,
}

# Train models
for model in models.values():
    model.fit(X_train, y_train)

# Prediction on the test split, keyed by display name.
test_predictions = {
    name: model.predict(X_test) for name, model in models.items()
}

# Error metrics on the test split. compute_error (project helper) is unpacked
# below as (rmse, corr, mae, rae, rrse, mape, r2), matching the table headers.
test_metrics = {
    name: compute_error(y_test, y_pred)
    for name, y_pred in test_predictions.items()
}

# Keep the per-model names this cell has always defined, so that any other
# cell relying on them continues to work.
y_test_pred_rf = test_predictions["Random Forest"]
y_test_pred_lr = test_predictions["Linear Regression"]
y_test_pred_lasso = test_predictions["Lasso"]
y_test_pred_ridge = test_predictions["Ridge"]
y_test_pred_elasticnet = test_predictions["ElasticNet"]
y_test_pred_knn = test_predictions["KNN"]

rmse_rf, corr_rf, mae_rf, rae_rf, rrse_rf, mape_rf, r2_rf = test_metrics["Random Forest"]
rmse_lr, corr_lr, mae_lr, rae_lr, rrse_lr, mape_lr, r2_lr = test_metrics["Linear Regression"]
rmse_lasso, corr_lasso, mae_lasso, rae_lasso, rrse_lasso, mape_lasso, r2_lasso = test_metrics["Lasso"]
rmse_ridge, corr_ridge, mae_ridge, rae_ridge, rrse_ridge, mape_ridge, r2_ridge = test_metrics["Ridge"]
rmse_elasticnet, corr_elasticnet, mae_elasticnet, rae_elasticnet, rrse_elasticnet, mape_elasticnet, r2_elasticnet = test_metrics["ElasticNet"]
rmse_knn, corr_knn, mae_knn, rae_knn, rrse_knn, mape_knn, r2_knn = test_metrics["KNN"]

# Table: one row per model, model name followed by its seven metrics.
summary_data = [[name, *metrics] for name, metrics in test_metrics.items()]

headers = ["Model", "RMSE", "Correlation", "MAE", "RAE", "RRSE", "MAPE", "R2"]
table = tabulate(summary_data, headers=headers, tablefmt="grid")

print("Summary of Model Performances:")
print(table)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['ExciseTax', 'Size', 'PurchasePrice', 'SalesPrice']] = scaler.fit_transform(X_train[['ExciseTax', 'Size', 'PurchasePrice', 'SalesPrice']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[['ExciseTax', 'S

Summary of Model Performances:
+-------------------+---------+---------------+----------+----------+----------+----------+-----------+
| Model             |    RMSE |   Correlation |      MAE |      RAE |     RRSE |     MAPE |        R2 |
| Random Forest     | 26.6074 |      0.801028 |  9.14725 | 0.502143 | 0.603771 |  84.6752 | 0.635461  |
+-------------------+---------+---------------+----------+----------+----------+----------+-----------+
| Linear Regression | 27.2808 |      0.785528 | 11.5406  | 0.633526 | 0.619051 | 221.928  | 0.616776  |
+-------------------+---------+---------------+----------+----------+----------+----------+-----------+
| Lasso             | 37.4797 |      0.584521 | 16.1726  | 0.887803 | 0.850483 | 361.208  | 0.276679  |
+-------------------+---------+---------------+----------+----------+----------+----------+-----------+
| Ridge             | 27.2367 |      0.786461 | 11.342   | 0.622625 | 0.61805  | 215.645  | 0.618015  |
+-------------------+---------+--