## 1.Problem Definition
Forecast store sales on data from Corporación Favorita, a large Ecuadorian-based grocery retailer.

## 2.Data
Data is gotten from kaggle: https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data

## 3.Evaluation 
Evaluation is based on the Root Mean Squared Logarithmic Error

## 4.Features
The training data, comprising time series of features store_nbr, family, and onpromotion as well as the target sales.
* `store_nbr` identifies the store at which the products are sold.
* `family` identifies the type of product sold.
* `sales` gives the total sales for a product family at a particular store at a given date. Fractional values are possible since products can be sold in fractional units (1.5 kg of cheese, for instance, as opposed to 1 bag of chips).
* `onpromotion` gives the total number of items in a product family that were being promoted at a store at a given date.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor


In [2]:
df = pd.read_csv('train.csv', parse_dates=['date'], dayfirst=[True])
df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [3]:
df.describe()

Unnamed: 0,id,store_nbr,sales,onpromotion
count,3000888.0,3000888.0,3000888.0,3000888.0
mean,1500444.0,27.5,357.7757,2.60277
std,866281.9,15.58579,1101.998,12.21888
min,0.0,1.0,0.0,0.0
25%,750221.8,14.0,0.0,0.0
50%,1500444.0,27.5,11.0,0.0
75%,2250665.0,41.0,195.8473,0.0
max,3000887.0,54.0,124717.0,741.0


In [4]:
df.corr()

Unnamed: 0,id,store_nbr,sales,onpromotion
id,1.0,0.000301,0.085784,0.20626
store_nbr,0.000301,1.0,0.041196,0.007286
sales,0.085784,0.041196,1.0,0.427923
onpromotion,0.20626,0.007286,0.427923,1.0


In [5]:
df['dayofweek'] = df['date'].dt.day_of_week
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dayofweek
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,1
1,1,2013-01-01,1,BABY CARE,0.0,0,1
2,2,2013-01-01,1,BEAUTY,0.0,0,1
3,3,2013-01-01,1,BEVERAGES,0.0,0,1
4,4,2013-01-01,1,BOOKS,0.0,0,1


In [6]:
df.tail()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dayofweek
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,1
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,1
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.0,8,1
3000887,3000887,2017-08-15,9,SEAFOOD,16.0,0,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 7 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
 6   dayofweek    int64         
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 160.3+ MB


In [8]:
# Convert column type from object to category
df['family'] = df['family'].astype('category')

In [9]:
# Select categorical columns
cat_columns = df.select_dtypes('category').columns
cat_columns

Index(['family'], dtype='object')

In [10]:
# Convert categorical column values to codes/int
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dayofweek
0,0,2013-01-01,1,0,0.000,0,1
1,1,2013-01-01,1,1,0.000,0,1
2,2,2013-01-01,1,2,0.000,0,1
3,3,2013-01-01,1,3,0.000,0,1
4,4,2013-01-01,1,4,0.000,0,1
...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,28,438.133,0,1
3000884,3000884,2017-08-15,9,29,154.553,1,1
3000885,3000885,2017-08-15,9,30,2419.729,148,1
3000886,3000886,2017-08-15,9,31,121.000,8,1


In [11]:
from xgboost import XGBRFRegressor

  from pandas import MultiIndex, Int64Index


In [12]:
# Split data
X = df.drop(['sales'], axis=1)
y = df['sales']

In [13]:
# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
np.random.seed(42)

In [15]:
# Instantiate model
model = XGBRFRegressor()

In [16]:
X_train.info(), y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400710 entries, 1768446 to 1307968
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       int8          
 4   onpromotion  int64         
 5   dayofweek    int64         
dtypes: datetime64[ns](1), int64(4), int8(1)
memory usage: 112.2 MB
<class 'pandas.core.series.Series'>
Int64Index: 2400710 entries, 1768446 to 1307968
Series name: sales
Non-Null Count    Dtype  
--------------    -----  
2400710 non-null  float64
dtypes: float64(1)
memory usage: 36.6 MB


(None, None)

In [17]:
X_train['date'] = pd.to_numeric(X_train['date'])
X_train

Unnamed: 0,id,date,store_nbr,family,onpromotion,dayofweek
1768446,1768446,1442880000000000000,29,9,1,1
2051401,2051401,1456704000000000000,18,22,0,0
2457441,2457441,1476403200000000000,10,30,1,4
1613170,1613170,1435363200000000000,21,31,0,5
238141,238141,1368489600000000000,40,13,0,1
...,...,...,...,...,...,...
1008680,1008680,1405987200000000000,11,2,0,1
306930,306930,1371859200000000000,20,30,0,5
2011330,2011330,1454716800000000000,43,13,0,5
2309872,2309872,1469232000000000000,20,4,0,5


In [18]:
X_train['family'] = pd.to_numeric(X_train['family'])
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400710 entries, 1768446 to 1307968
Data columns (total 6 columns):
 #   Column       Dtype
---  ------       -----
 0   id           int64
 1   date         int64
 2   store_nbr    int64
 3   family       int8 
 4   onpromotion  int64
 5   dayofweek    int64
dtypes: int64(5), int8(1)
memory usage: 112.2 MB


In [19]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 2400710 entries, 1768446 to 1307968
Series name: sales
Non-Null Count    Dtype  
--------------    -----  
2400710 non-null  float64
dtypes: float64(1)
memory usage: 36.6 MB


In [20]:
# Fit model
model.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBRFRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bytree=1, enable_categorical=False, gamma=0, gpu_id=-1,
               importance_type=None, interaction_constraints='',
               max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
               monotone_constraints='()', n_estimators=100, n_jobs=12,
               num_parallel_tree=100, objective='reg:squarederror',
               predictor='auto', random_state=0, reg_alpha=0,
               scale_pos_weight=1, tree_method='exact', validate_parameters=1,
               verbosity=None)

In [21]:
X_test['family'] = pd.to_numeric(X_test['family'])
X_test['date'] = pd.to_numeric(X_test['date'])
X_test

Unnamed: 0,id,date,store_nbr,family,onpromotion,dayofweek
1664143,1664143,1437782400000000000,51,19,0,5
2345855,2345855,1470960000000000000,3,17,0,4
2253969,2253969,1466467200000000000,51,3,35,1
2539373,2539373,1480377600000000000,1,23,0,1
1568102,1568102,1433116800000000000,8,8,1,0
...,...,...,...,...,...,...
2336017,2336017,1470441600000000000,53,13,2,5
2100940,2100940,1459036800000000000,8,28,0,6
1568135,1568135,1433116800000000000,9,8,3,0
2104982,2104982,1459296000000000000,21,11,0,2


In [22]:
model.score(X_test, y_test)

0.6854849976904935

In [23]:
from sklearn.metrics import mean_squared_log_error

In [24]:
y_preds = model.predict(X_test)
y_preds

array([  44.42998 ,   16.137194, 4222.749   , ..., 1110.1058  ,
        208.19527 , 3304.1907  ], dtype=float32)

In [25]:
msle = mean_squared_log_error(y_test, y_preds)
msle

7.359175848470636

In [26]:
rmsle = np.sqrt(msle)
rmsle

2.7127800958556585

In [27]:
from xgboost import XGBRegressor

In [28]:
model_2 = XGBRegressor()

In [29]:
model_2.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [30]:
model_2.score(X_train, y_train)

0.9013547372009645

In [31]:
y_preds_2 = abs(model_2.predict(X_test))
y_preds_2

array([  47.39261 ,   73.83027 , 3924.751   , ..., 1085.9867  ,
         69.314354, 5069.695   ], dtype=float32)

In [32]:
msle_2 = mean_squared_log_error(y_test, y_preds_2)
msle_2

5.525370019852095

In [33]:
rmsle_2 = np.sqrt(msle_2)
rmsle_2

2.3506105632052483