In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

In [2]:
# Import dataset
print("Loading dataset...")
df = pd.read_csv('Walmart_Store_sales.csv')
print("...Done.")
print()

Loading dataset...
...Done.



In [3]:
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
df['Weekly_Sales'] = df['Weekly_Sales'].round(2)
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df['day_of_week'] = df['Date'].dt.day_of_week
df = df.dropna(subset=['Weekly_Sales'])
df = df.sort_values(by=['Store'])

In [4]:
df = df.convert_dtypes()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, 73 to 110
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         136 non-null    Int64         
 1   Date          118 non-null    datetime64[ns]
 2   Weekly_Sales  136 non-null    Float64       
 3   Holiday_Flag  125 non-null    Int64         
 4   Temperature   121 non-null    Float64       
 5   Fuel_Price    124 non-null    Float64       
 6   CPI           125 non-null    Float64       
 7   Unemployment  122 non-null    Float64       
 8   year          118 non-null    Int64         
 9   month         118 non-null    Int64         
 10  day           118 non-null    Int64         
 11  day_of_week   118 non-null    Int64         
dtypes: Float64(5), Int64(6), datetime64[ns](1)
memory usage: 15.3 KB


In [6]:
#Enlèvement des Outliers
remove_outlier=['Temperature','Fuel_Price','CPI','Unemployment']

for col in remove_outlier:
    mean = df[col].mean()
    std= df[col].std()

    mask = np.abs((df[col] - mean) <= 3* std) & ((df[col] - mean) >= - 3* std)
    df = df[mask]

display(df.head())
print()
display(df.describe(include='all'))

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,day_of_week
73,1,2010-08-27,1449142.92,,85.22,2.619,211.567306,7.787,2010,8,27,4
44,1,2010-02-12,1641957.44,1.0,38.51,2.548,211.24217,8.106,2010,2,12,4
78,1,2011-11-18,1539483.7,0.0,62.25,3.308,218.220509,7.866,2011,11,18,4
13,1,2012-03-16,1677472.78,0.0,64.74,3.734,221.211813,7.348,2012,3,16,4
95,1,2010-05-14,1494251.5,0.0,74.78,2.854,210.337426,7.808,2010,5,14,4





Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,day_of_week
count,90.0,80,90.0,80.0,90.0,90.0,90.0,90.0,80.0,80.0,80.0,80.0
mean,9.9,2011-05-17 11:06:00,1233864.509,0.075,61.061,3.318444,179.524905,7.389733,2010.8875,6.3625,16.125,4.0
min,1.0,2010-02-05 00:00:00,268929.03,0.0,18.79,2.548,126.128355,5.143,2010.0,1.0,1.0,4.0
25%,4.0,2010-08-04 06:00:00,561724.0475,0.0,45.3425,2.81475,132.602339,6.64225,2010.0,4.0,10.0,4.0
50%,9.0,2011-05-16 12:00:00,1260826.1,0.0,61.45,3.468,197.166416,7.419,2011.0,6.0,16.5,4.0
75%,15.75,2012-02-18 18:00:00,1807159.02,0.0,75.7925,3.73775,214.855374,8.099,2012.0,8.25,23.25,4.0
max,20.0,2012-10-19 00:00:00,2771397.17,1.0,91.65,4.17,226.968844,9.342,2012.0,12.0,31.0,4.0
std,6.204475,,664725.013301,0.265053,17.74604,0.484399,39.554303,0.982729,0.826672,3.028321,8.521566,0.0


In [7]:
# Separate target variable Y from features X
target_name = 'Weekly_Sales'
features_list = ['Store', 'Temperature','Fuel_Price','CPI','Unemployment', 'Holiday_Flag', 'year', 'month', 'day', 'day_of_week']

print("Separating labels from features...")
Y = df.loc[:,target_name]
X = df.loc[:,features_list] # All columns are kept, except the target
print("...Done.")
print(Y.head())
print()
print(X.head())
print()

Separating labels from features...
...Done.
73    1449142.92
44    1641957.44
78     1539483.7
13    1677472.78
95     1494251.5
Name: Weekly_Sales, dtype: Float64

    Store  Temperature  Fuel_Price         CPI  Unemployment  Holiday_Flag  \
73      1        85.22       2.619  211.567306         7.787          <NA>   
44      1        38.51       2.548   211.24217         8.106             1   
78      1        62.25       3.308  218.220509         7.866             0   
13      1        64.74       3.734  221.211813         7.348             0   
95      1        74.78       2.854  210.337426         7.808             0   

    year  month  day  day_of_week  
73  2010      8   27            4  
44  2010      2   12            4  
78  2011     11   18            4  
13  2012      3   16            4  
95  2010      5   14            4  



In [8]:
# First : always divide dataset into train set & test set !!


print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# test_size indicates the proportion of rows from X and Y that will go into the test dataset while 
# maintaining the correspondance between the rows from X and Y 

# random_state is an argument that can be found in all functions that have a pseudo-random behaviour
# if random_state is not stated the function will derive a different random result everytime the cell 
# runs, if random_state is given a value the results will be the same everytime the cell runs while
# each different value of radom_state will derive a specific result
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [9]:
# *Temperature*, *Fuel_price*, *CPI* and *Unemployment*
# Create pipeline for numeric features
numeric_features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'year', 'month', 'day', 'day_of_week'] # Names of numeric columns in X_train/X_test
numeric_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='median')), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = ['Store', 'Holiday_Flag'] # Names of categorical columns in X_train/X_test
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

In [10]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()

Performing preprocessings on train set...
     Store  Temperature  Fuel_Price         CPI  Unemployment  Holiday_Flag  \
17      18        21.33       2.788  131.527903         9.202             0   
107      8        33.34       2.548  214.621419         6.299             1   
136      4        84.59       3.469    129.1125         5.644          <NA>   
45       2        54.63       3.555  220.275944         7.057             0   
59      14        36.85       3.695  189.842483         8.424             0   

     year  month   day  day_of_week  
17   <NA>   <NA>  <NA>         <NA>  
107  2010      2    12            4  
136  2011      7     8            4  
45   2012      2    24            4  
59   2012      2    17            4  
...Done.
[[-2.27582047 -1.16285602 -1.11596452  1.7922351   0.0696733  -0.05386943
   0.04276567  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0

In [12]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score obtained with a Lasso model
print("3-fold cross-validation...")
regressor = Lasso()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.8459115572086736
The standard deviation is :  0.14322689171326167


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [16]:
# Perform grid search
print("Grid search...")
regressor = Lasso()
# Grid of values to be tested
params = {
    'alpha': [0.1, 0.3, 0.5, 0.7, 1.0], # 0 corresponds to no regularization
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Grid search...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


...Done.
Best hyperparameters :  {'alpha': 1.0}
Best R2 score :  0.9244808027451266


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [14]:
# Print R^2 scores
print("R2 score on training set : ", gridsearch.score(X_train, Y_train))
print("R2 score on test set : ", gridsearch.score(X_test, Y_test))

R2 score on training set :  0.9851311111922835
R2 score on test set :  0.947314382948029
