In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [11]:
# See the data and shape, remove residual index column
# No NAs

sales = pd.read_csv("./training.csv")
sales= sales.iloc[:,1:]
print("Shape: ",sales.shape)
print("NA values", len(sales[sales.isna().any(axis=1)]))
sales.head()

Shape:  (640840, 9)
NA values 0


Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,366,4,2013-04-18,517,1,0,0,0,4422
1,394,6,2015-04-11,694,1,0,0,0,8297
2,807,4,2013-08-29,970,1,1,0,0,9729
3,802,2,2013-05-28,473,1,1,0,0,6513
4,726,4,2013-10-10,1068,1,1,0,0,10882


In [12]:
# Mapping all non-zero state holiday to 1
print(sales['state_holiday'].value_counts())
sales['state_holiday'] = sales['state_holiday'].apply(lambda x: 0 if str(x) == '0' else 1)

state_holiday
0    621160
a     12842
b      4214
c      2624
Name: count, dtype: int64


In [13]:
# Create year and month and describe data
sales["date"] = pd.to_datetime(sales["date"])
sales["year"] = sales["date"].dt.year
sales["month"] = sales["date"].dt.month
sales.drop(columns='date', inplace=True)

sales.describe()
# day_of_week: categorical
# open: boolean
# promotion: boolean
# school_holiday: boolean
# year: spans from 2013-2015
# nb_customers_on_day: numerical

Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,year,month
count,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0
mean,558.211348,4.000189,633.398577,0.830185,0.381718,0.03071,0.178472,5777.469011,2013.832351,5.846555
std,321.878521,1.996478,464.094416,0.37547,0.485808,0.17253,0.38291,3851.338083,0.777335,3.326202
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.0,1.0
25%,280.0,2.0,405.0,1.0,0.0,0.0,0.0,3731.0,2013.0,3.0
50%,558.0,4.0,609.0,1.0,0.0,0.0,0.0,5746.0,2014.0,6.0
75%,837.0,6.0,838.0,1.0,1.0,0.0,0.0,7860.0,2014.0,8.0
max,1115.0,7.0,5458.0,1.0,1.0,1.0,1.0,41551.0,2015.0,12.0


In [14]:
# XGBOOST

import os
os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/opt/libomp/lib'
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

df = sales.copy()

# Encode categorical columns
for col in ['store_ID', 'day_of_week', 'year', 'month']:
    
    df[col] = LabelEncoder().fit_transform(df[col])

X = df[['store_ID', 'day_of_week', 'nb_customers_on_day', 'open', 'promotion',
       'state_holiday', 'school_holiday', 'year', 'month']]
y = df['sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost with categorical awareness
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    enable_categorical=True,  
    tree_method='hist',      
    max_depth=6,
    learning_rate=0.1,
    n_estimators=300
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
print("R²:", r2_score(y_test, y_pred))



R²: 0.9313978552818298


# Real Data Predictions

In [15]:
# XGBOOST

real_data = pd.read_csv("./REAL_DATA.csv")
read_data_original = real_data.copy()

real_data['state_holiday'] = real_data['state_holiday'].apply(lambda x: 0 if str(x) == '0' else 1)

real_data["date"] = pd.to_datetime(real_data["date"], dayfirst=True)
real_data["year"] = real_data["date"].dt.year
real_data["month"] = real_data["date"].dt.month
real_data.drop(columns='date', inplace=True)
real_data.drop(columns='index', inplace=True)

y_pred_real = xgb_model.predict(real_data)
y_pred_real_clipped = np.clip(y_pred_real, 0, None).tolist()
read_data_original['sales_predict'] = y_pred_real_clipped
read_data_original.head()

Unnamed: 0,index,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales_predict
0,272371,415,7,01/03/2015,0,0,0,0,0,54.924526
1,558468,27,7,29/12/2013,0,0,0,0,0,0.0
2,76950,404,3,19/03/2014,657,1,1,0,0,6815.095703
3,77556,683,2,29/01/2013,862,1,0,0,0,7090.768555
4,456344,920,3,19/03/2014,591,1,1,0,0,6135.796387
