# Initialisation

Import the libraries, then load the settings and the data.

In [1]:
import json

import numpy as np
from sklearn import linear_model
from sklearn import pipeline
from sklearn import preprocessing

import utils


# Prefix prepended to every path used in this notebook
path_base = "../"

# Read the JSON settings file
settings_path = path_base + "SETTINGS.json"
with open(settings_path, 'r') as settings_file:
    settings = json.load(settings_file)
print("Settings loaded.")

# Read the store information from the path given in the settings
store_info_path = path_base + settings["STORE_INFO_PATH"]
store_info = utils.loadStoreInfo(store_info_path)
print("Store information: {} lines loaded".format(len(store_info)))

# Read the training data (inputs, sales outputs, customers outputs)
# from the path given in the settings
train_data_path = path_base + settings["TRAIN_DATA_PATH"]
(input_data, output_sales, output_customers) = utils.loadTrainingData(train_data_path)
print("Training data: {} lines loaded".format(len(input_data)))

Settings loaded.
Store information: 1115 lines loaded
Training data: 1017209 lines loaded


# 1. Creation of the features

Here we define the function that generates a vector of $\mathbb{R}^{D}$ from each entry.

In [2]:
# Dimension of the feature space (number of components built below)
D = 5

def generateFeatureVector(entry):
    """Build the feature vector of a single entry.

    The components are, in order: entry[1] (day of week), then the day,
    month and year of the date stored in entry[2], then entry[4] (promo).

    Returns a one-dimensional NumPy array with D components.
    """
    date = entry[2]
    day_of_week, promo = entry[1], entry[4]

    components = [day_of_week, date.day, date.month, date.year, promo]
    return np.array(components)


# 2. Creation of the training set / test set


In [9]:
# Value handed to utils.separateTrainingSet to size the training set
pourcentage_training_set = 80

# Split the entries into training data and test data
split = utils.separateTrainingSet(
    input_data, output_sales, output_customers, pourcentage_training_set)
(tr_input, tr_label_sales, tr_label_customers,
 te_input, te_label_sales, te_label_customers) = split

# Sales labels as NumPy arrays
training_labels = np.array(tr_label_sales)
test_labels = np.array(te_label_sales)

# Number of samples on each side of the split
N = training_labels.shape[0]
N_test = test_labels.shape[0]


def _buildFeatureMatrix(entries, n_rows):
    """Stack the feature vectors of entries[0..n_rows-1] into an (n_rows, D) array."""
    matrix = np.zeros((n_rows, D))
    for row in range(n_rows):
        matrix[row, :] = generateFeatureVector(entries[row])
    return matrix


# Feature matrices of the training set and of the test set
training_set = _buildFeatureMatrix(tr_input, N)
test_set = _buildFeatureMatrix(te_input, N_test)

# Summary
print("Dimensions of the training set: ({},{})".format(*training_set.shape))
print("Dimensions of the test set: ({},{})".format(*test_set.shape))

# First training sample, as a sanity check
print(training_set[0])


Dimensions of the training set: (813767,5)
Dimensions of the test set: (203442,5)
[    3.    18.     9.  2013.     0.]


# 3. Model training

### Linear regression

First, we test a simple linear regression; we expect it to perform poorly because of outliers.

In [13]:
# Baseline: predict the mean of the training labels for every test sample
baseline_prediction = training_labels.mean() * np.ones(test_labels.shape[0])
print("RMSPE avec la moyenne: {}".format(
    utils.compute_RMSPE(test_labels, baseline_prediction)))

# Linear regression on the raw features
model_linreg = linear_model.LinearRegression()
model_linreg.fit(training_set, training_labels)

print("Coefficients: ")
print(model_linreg.coef_)

y_hat = model_linreg.predict(test_set)

print("Linear regression without normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat)))

# Linear regression on standardised features.
# The `normalize` argument of LinearRegression was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises a TypeError there),
# so the scaling is done by an explicit StandardScaler step, as the
# scikit-learn deprecation notice recommends. The scaler is fitted on the
# training set only and re-applied to the test set by the pipeline.
model_linreg_n = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("linreg", linear_model.LinearRegression(fit_intercept=True, n_jobs=2)),
])
model_linreg_n.fit(training_set, training_labels)

y_hat_n = model_linreg_n.predict(test_set)

print("Linear regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_n)))


RMSPE avec la moyenne: 0.47578643688932665
Coefficients: 
[ -644.78175258    10.74656958    67.62666673   160.43544296  2561.97327287]
Linear regression without normalisation: RMSPE = 0.4420407944318239
Linear regression with normalisation: RMSPE = 0.4420407944317316


The RMSPE of the linear regression is about 0.44, only slightly better than the 0.48 obtained by always predicting the mean value of the sales in the training set.

### Ridge regression

Then, a more complicated model: a ridge regression to handle outliers. 

In [18]:
# Candidate regularisation strengths, on the scale of the former
# `normalize=True` option of RidgeCV
alphas = [0.001, 0.01, 0.1, 1.0, 10]
n_samples = training_set.shape[0]

# The `normalize` argument of RidgeCV was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the features are standardised by an explicit
# StandardScaler step. `normalize=True` divided each centred column by its
# L2 norm (sqrt(n_samples) times its standard deviation); per the
# scikit-learn deprecation notice, the same penalty is obtained on
# standardised features by multiplying each alpha by n_samples.
model_ridgereg = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("ridge", linear_model.RidgeCV(
        alphas=[alpha * n_samples for alpha in alphas])),
])

model_ridgereg.fit(training_set, training_labels)

# Report the selected strength on the original grid rather than the rescaled one
alpha_fitted = model_ridgereg.named_steps["ridge"].alpha_
alpha_selected = min(
    alphas, key=lambda alpha: abs(alpha * n_samples - alpha_fitted))
print("Hyperparameter selected: alpha = {}".format(alpha_selected))

y_hat_rr = model_ridgereg.predict(test_set)

print("Ridge regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_rr)))

Hyperparameter selected: alpha = 0.001
Ridge regression with normalisation: RMSPE = 0.4419641337685197
