# Initialisation

Import the libraries, load the settings and load the data.

In [1]:
import datetime
import json
import pickle

import numpy as np
from sklearn import (kernel_ridge, linear_model, model_selection, pipeline,
                     preprocessing, tree)

import utils


path_base = "../"

# Read the JSON settings file; its entries give the store-info, training-data
# and model paths, all relative to path_base.
settings_path = path_base+"SETTINGS.json"
with open(settings_path, "r") as settings_file:
    settings = json.load(settings_file)
    print("Settings loaded.")

# Per-store information, indexed later by generateFeatureVector
store_info_path = path_base+settings["STORE_INFO_PATH"]
store_info = utils.loadStoreInfo(store_info_path)
print("Store information: {} lines loaded".format(len(store_info)))

# Training entries together with their two label series (sales, customers)
training_data_path = path_base+settings["TRAIN_DATA_PATH"]
(input_data, output_sales, output_customers) = utils.loadTrainingData(training_data_path)
print("Training data: {} lines loaded".format(len(input_data)))


Settings loaded.
Store information: 1115 lines loaded
Training data: 1017209 lines loaded


# 1. Creation of the features

Here we define the function that generates a feature vector in $\mathbb{R}^{D}$ from an entry.

In [2]:
# Dimension of the feature space (length of every feature vector)
D = 38


def generateFeatureVector(entry):
    """Encode one data entry as a NumPy feature vector of length ``D``.

    Fields read from ``entry``:
        entry[0]: store id, used as a 1-based index into ``store_info``
        entry[1]: day of the week, 1-based
        entry[2]: date object (``.day``, ``.month``, subtraction with
                  ``datetime.date``)
        entry[3]: open flag (compared to the integer 1)
        entry[4]: promo flag (compared to the integer 1)
        entry[5]: state-holiday code ('a', 'b' or 'c'; anything else = none)
        entry[6]: school-holiday flag (compared to the string '1')

    Feature layout:
        0-6    one-hot day of the week
        7-9    one-hot part of the month (days 1-10, 11-20, 21 and later)
        10-21  one-hot month of the year
        22     store open
        23     promo
        24-26  public holiday / Easter / Christmas
        27     school holiday
        28-31  one-hot store type ('a', 'b', 'c', other)
        32-34  one-hot assortment ('a', 'b', other)
        35     competition distance
        36     days since the competition started (0 if unknown or not yet started)
        37     promo2

    Relies on the module-level ``store_info`` table and on ``D``.
    """
    features = np.zeros((D))
    date = entry[2]
    store = store_info[entry[0]-1]

    ## Day features

    # 0-6: day of the week
    features[entry[1]-1] = 1.0

    # 7-9: which third of the month the day falls in
    day_of_month = date.day
    if day_of_month <= 10:
        month_part = 7
    elif day_of_month <= 20:
        month_part = 8
    else:
        month_part = 9
    features[month_part] = 1.0

    # 10-21: month of the year
    features[9+date.month] = 1.0

    # 22: store open that day, 23: promo running
    features[22] = 1.0 if entry[3] == 1 else 0.0
    features[23] = 1.0 if entry[4] == 1 else 0.0

    # 24-26: state holiday kind (public holiday, Easter, Christmas)
    holiday_slot = {'a': 24, 'b': 25, 'c': 26}.get(entry[5])
    if holiday_slot is not None:
        features[holiday_slot] = 1.0

    # 27: school holiday
    # NOTE(review): compared to the string '1' while the open/promo flags are
    # compared to the integer 1 - presumably the loader keeps this field as
    # text; confirm against utils.loadTrainingData.
    if entry[6] == '1':
        features[27] = 1.0

    ## Store features

    # 28-31: store type, any value other than 'a'/'b'/'c' lands in slot 31
    features[{'a': 28, 'b': 29, 'c': 30}.get(store[1], 31)] = 1.0

    # 32-34: assortment, any value other than 'a'/'b' lands in slot 34
    features[{'a': 32, 'b': 33}.get(store[2], 34)] = 1.0

    # 35: competition distance
    features[35] = store[3]

    # 36: days since the competition started; a month of -1 marks "unknown"
    if store[4] != -1:
        # Only year and month are available, so the first of the month is used
        competition_start = datetime.date(store[5], store[4], 1)
        elapsed_days = (date - competition_start).days
        if elapsed_days > 0:
            features[36] = elapsed_days

    # 37: promo2
    features[37] = store[6]

    return features

# 2. Creation of the training set / test set


In [3]:
# Share of the entries handed to the splitter as the training portion
# (presumably a percentage - confirm against utils.separateTrainingSet)
pourcentage_training_set = 80

# Split the entries and both label series into a training and a test part
split_result = utils.separateTrainingSet(
    input_data, output_sales, output_customers, pourcentage_training_set)
(tr_input, tr_label_sales, tr_label_customers,
 te_input, te_label_sales, te_label_customers) = split_result

# Only the sales labels are used as regression targets below
training_labels = np.array(tr_label_sales)
test_labels = np.array(te_label_sales)

N = training_labels.shape[0]
N_test = test_labels.shape[0]

# One feature vector per row, written into pre-allocated (rows, D) matrices
training_set = np.zeros((N, D))
for row in range(N):
    training_set[row] = generateFeatureVector(tr_input[row])

test_set = np.zeros((N_test, D))
for row in range(N_test):
    test_set[row] = generateFeatureVector(te_input[row])


# Summary
print("Dimensions of the training set: ({},{})".format(*training_set.shape))
print("Dimensions of the test set: ({},{})".format(*test_set.shape))



Dimensions of the training set: (813767,38)
Dimensions of the test set: (203442,38)


# 3. Model training

### Mean value

The simplest predictor: we always predict the mean value of the training labels. This gives us a baseline against which to judge the performance of the other predictors. 


In [4]:
# Constant baseline: every test point is predicted as the mean training label
mean_prediction = np.full(test_labels.shape[0], training_labels.mean())
baseline_rmspe = utils.compute_RMSPE(test_labels, mean_prediction)
print("RMSPE if we predict the mean value: {}".format(baseline_rmspe))

RMSPE if we predict the mean value: 0.48604770543968084


### Linear regression

First, we test a simple linear regression. We expect it to perform poorly, among other reasons because of outliers.

In [5]:
# Linear regression with normalization.
# The `normalize=` argument of LinearRegression was removed in scikit-learn 1.2,
# so the features are standardised by an explicit StandardScaler step instead.
# The scaler is fitted on the training set only and re-applied at predict time.
model_linreg_n = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.LinearRegression(fit_intercept=True, n_jobs=2))
model_linreg_n.fit(training_set, training_labels)

y_hat_n = model_linreg_n.predict(test_set)

print("Linear regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_n)))


Linear regression with normalisation: RMSPE = 0.5067171084438546


The linear regression performs even worse than the mean predictor... 

### Ridge regression

Then, a more complicated model: a ridge regression to handle outliers. 

In [6]:
# Candidate regularisation strengths, expressed on the scale the original
# `normalize=True` option used.
ridge_alphas = np.array([0.0001,0.001,0.003,0.01,0.1,1.0,10])

# `normalize=` was removed from RidgeCV in scikit-learn 1.2. The old option
# divided each centred column by its L2 norm, whereas StandardScaler divides by
# the standard deviation; the two differ by a factor sqrt(n_samples), so the
# same penalty is obtained by multiplying alpha by n_samples.
n_train_samples = training_set.shape[0]
scaled_alphas = ridge_alphas * n_train_samples

model_ridgereg = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("ridge", linear_model.RidgeCV(alphas=scaled_alphas)),
])

model_ridgereg.fit(training_set, training_labels)

# Report the selected value on the original (unscaled) alpha grid
selected_scaled_alpha = model_ridgereg.named_steps["ridge"].alpha_
selected_index = int(np.argmin(np.abs(scaled_alphas - selected_scaled_alpha)))
print("Hyperparameter selected: alpha = {}".format(
    float(ridge_alphas[selected_index])))

y_hat_rr = model_ridgereg.predict(test_set)

print("Ridge regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_rr)))

Hyperparameter selected: alpha = 0.0001
Ridge regression with normalisation: RMSPE = 0.5066239580387693


The ridge regression does not perform well either: the RMSPE is still higher than in the mean predictor case.

### Kernel Ridge Regression
Then a regression with different kernels to better handle non-linearities. 

Unfortunately, we do not have enough RAM to fit this implementation on the whole training set. For this reason, the model is trained on only a small portion of it (the first 800 points); it is still evaluated on the whole test set.

In [7]:
# Normalize dataset: statistics come from the training set only and are
# re-used for the test set.
data_means = training_set.mean(axis=0)
data_std = training_set.std(axis=0)

# A constant feature column has a standard deviation of 0; dividing by 1 there
# maps it to 0 instead of producing NaN/inf.
safe_std = np.where(data_std == 0, 1.0, data_std)

# Broadcasting applies the per-column statistics to every row at once
# (replaces a Python loop over each row of both matrices).
n_training_set = (training_set - data_means) / safe_std
n_test_set = (test_set - data_means) / safe_std


# Number of leading training points used to fit the kernel model
N_KRIDGE_SAMPLES = 800

# Train model: 5-fold grid search over kernel type and hyperparameters.
# The kernel/gamma/coef0 passed to KernelRidge are only starting values;
# each of them is overridden by the grid.
model_kridge = model_selection.GridSearchCV(
    kernel_ridge.KernelRidge(kernel="sigmoid", gamma=0.1, coef0=0.1),
    cv=5,
    param_grid={"kernel": ["rbf","polynomial", "sigmoid"],
                "alpha": [1e-4,3e-4,1e-3,1e-2,1e-1],
                "gamma": np.logspace(-4,1,6),
                "coef0": np.logspace(-4,2,7)})


model_kridge.fit(n_training_set[:N_KRIDGE_SAMPLES,:],
                 training_labels[:N_KRIDGE_SAMPLES])

# Evaluate model on the full (normalised) test set
y_hat_kridge = model_kridge.predict(n_test_set)

print("Kernel ridge regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_kridge)))




Kernel ridge regression with normalisation: RMSPE = 0.5075511329024066


This model does not perform well either, probably (but not only) because of the small number of data points used to train it. 

### Decision Tree Regression

Finally, with a decision tree:

In [11]:
# Create model.
# The tree builder permutes the candidate features at every split, so without
# a fixed seed two runs on the same data can produce different trees (and a
# different pickled model). Fixing random_state makes the run reproducible.
model_tree = tree.DecisionTreeRegressor(random_state=0)

# Train model on the whole training set
model_tree.fit(training_set, training_labels)

# Evaluate model
y_hat_tree = model_tree.predict(test_set)

print("Decision tree regression: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_tree)))


Decision tree regression: RMSPE = 0.22404878635603767


The RMSPE on the test set is much better. 

# Save model
We save the decision tree regression model using pickle.

In [10]:
# Serialise the fitted decision tree to the model path given in the settings
model_path = path_base+settings["MODEL_PATH"]
with open(model_path, "wb") as model_file:
    pickle.dump(model_tree, model_file)
