# Initialisation

Import the libraries, load the settings and load the data.

In [1]:
import datetime
import json
import pickle

import numpy as np
from sklearn import linear_model, kernel_ridge, tree
from sklearn import pipeline, preprocessing

import utils


# Root of the project, relative to this notebook
path_base = "../"

# Read the JSON settings file; it holds the paths used below
with open(path_base+"SETTINGS.json", 'r') as settings_file:
    settings = json.load(settings_file)
print("Settings loaded.")

# Per-store information (one row per store)
store_info_path = path_base+settings["STORE_INFO_PATH"]
store_info = utils.loadStoreInfo(store_info_path)
print("Store information: {} lines loaded".format(len(store_info)))

# Training entries with their two targets (sales and customers)
training_data_path = path_base+settings["TRAIN_DATA_PATH"]
loaded = utils.loadTrainingData(training_data_path)
(input_data, output_sales, output_customers) = loaded
print("Training data: {} lines loaded".format(len(input_data)))

# Show the first store row as a sanity check
print(store_info[0])

Settings loaded.
Store information: 1115 lines loaded
Training data: 1017209 lines loaded
[1, 'c', 'a', 1270, 9, 2008, 0, -1, -1, '']


# 1. Creation of the features

Here we define the function that generates a vector of $\mathbb{R}^{D}$ from each entry.

In [10]:
# Define dimension of the feature space
D = 38


def generateFeatureVector(entry, stores=None):
    """Encode one data entry as a feature vector of length D.

    Parameters
    ----------
    entry : sequence
        One row of the data set, read positionally as
        [store id, day of week, date, open, promo, state holiday,
        school holiday]. The store id and the day of week are used as
        1-based indices, the date must expose ``.day`` / ``.month`` and
        support subtraction with a ``datetime.date``.
    stores : sequence of sequences, optional
        Store table indexed by ``store id - 1``. Each row is read as
        [id, store type, assortment, competition distance,
        competition start month, competition start year, promo2, ...].
        Defaults to the notebook-level ``store_info``.

    Returns
    -------
    numpy.ndarray
        Vector of shape (D,). Layout:
        0-6 day of week, 7-9 third of the month, 10-21 month,
        22 open, 23 promo, 24-26 state holiday ('a', 'b', 'c'),
        27 school holiday, 28-31 store type, 32-34 assortment,
        35 competition distance, 36 days since competition started,
        37 promo2.
    """
    if stores is None:
        # Fall back on the table loaded in the initialisation cell
        stores = store_info

    # Look the store row up once instead of once per feature
    store = stores[entry[0]-1]
    date = entry[2]

    vector = np.zeros((D))

    ## Day features:

    # Features 0 to 6: Day of the week (1-based in the data)
    vector[entry[1]-1] = 1.0

    # Features 7 to 9: beginning [1;10], middle [11;20] or end (> 20)
    # of the month; exactly one of the three is set
    if date.day <= 10:
        vector[7] = 1.0
    elif date.day <= 20:
        vector[8] = 1.0
    else:
        vector[9] = 1.0

    # Features 10 to 21: month of the year
    vector[9+date.month] = 1.0

    # Feature 22: is the store open on that day ?
    if entry[3] == 1:
        vector[22] = 1.0

    # Feature 23: promo ?
    if entry[4] == 1:
        vector[23] = 1.0

    # Features 24 to 26: public holiday ('a'), Easter ('b'), Christmas ('c');
    # any other code leaves the three features at 0
    holiday_index = {'a': 24, 'b': 25, 'c': 26}.get(entry[5])
    if holiday_index is not None:
        vector[holiday_index] = 1.0

    # Feature 27: School holiday (stored as the string '1')
    if entry[6] == '1':
        vector[27] = 1.0

    ## Store Features:

    # Features 28-31: store type ('a', 'b', 'c', anything else -> 31)
    vector[{'a': 28, 'b': 29, 'c': 30}.get(store[1], 31)] = 1.0

    # Features 32-34: Assortment ('a', 'b', anything else -> 34)
    vector[{'a': 32, 'b': 33}.get(store[2], 34)] = 1.0

    # Feature 35: Competition Distance
    vector[35] = store[3]

    # Feature 36: Days since competition started (>0).
    # A month of -1 marks a missing opening date; the feature then stays 0.
    if store[4] != -1:
        starting_day = datetime.date(
            store[5],  # Year
            store[4],  # Month
            1)         # Day (no data, so 1 by default)

        delta = date - starting_day
        if delta.days > 0:
            vector[36] = delta.days

    # Feature 37: Promo2
    vector[37] = store[6]

    return vector

# 2. Creation of the training set / test set


In [11]:
# Share of the entries (in percent) handed to the training set
pourcentage_training_set = 80

# Split the entries into training data and test data
split = utils.separateTrainingSet(
    input_data, output_sales, output_customers, pourcentage_training_set)
(tr_input, tr_label_sales, tr_label_customers,
 te_input, te_label_sales, te_label_customers) = split

# The models below predict the sales, so the sales are the labels
training_labels = np.array(tr_label_sales)
test_labels = np.array(te_label_sales)

N = training_labels.shape[0]
N_test = test_labels.shape[0]


def _build_feature_matrix(entries, n_rows):
    """Stack the feature vectors of the first n_rows entries into an (n_rows, D) array."""
    rows = [generateFeatureVector(entries[k]) for k in range(n_rows)]
    return np.array(rows).reshape(n_rows, D)


# Generate features of data set and test set
training_set = _build_feature_matrix(tr_input, N)
test_set = _build_feature_matrix(te_input, N_test)

# Summary
print("Dimensions of the training set: ({},{})".format(*training_set.shape))
print("Dimensions of the test set: ({},{})".format(*test_set.shape))

# First feature vector next to the raw entry it was built from
print(training_set[0,:])
print(tr_input[0])

# Range check on one one-hot column (day of week = 5)
friday_column = training_set[:,4]
print(friday_column.min())
print(friday_column.max())


Dimensions of the training set: (813767,38)
Dimensions of the test set: (203442,38)
[  0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   1.00000000e+00   1.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   1.00000000e+00   0.00000000e+00   0.00000000e+00   2.38000000e+03
   3.27600000e+03   1.00000000e+00]
[89, 4, datetime.date(2013, 6, 20), 1, 1, '0', '0']
0.0
1.0


# 3. Model training

### Linear regression

First, we test a simple linear regression; we expect it to perform poorly because of outliers.

In [12]:
# Baseline: always predict the mean of the training sales
print("RMSPE avec la moyenne: {}".format(
    utils.compute_RMSPE(test_labels, training_labels.mean()*np.ones((test_labels.shape[0])))))

# Linear regression without normalization
model_linreg = linear_model.LinearRegression()
model_linreg.fit(training_set, training_labels)

print("Coefficients: ")
print(model_linreg.coef_)

# Compare the first predictions with the true sales
y_hat = model_linreg.predict(test_set)
print(y_hat[:20])
print(test_labels[:20])


print("Linear regression without normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat)))

# Linear regression with normalization.
# The `normalize=True` constructor argument was removed from scikit-learn
# (deprecated in 1.0, gone in 1.2), so the scaling is done by an explicit
# StandardScaler step. The pipeline learns the scaling on the training set
# and re-applies it automatically at prediction time.
model_linreg_n = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.LinearRegression(fit_intercept=True, n_jobs=2))
model_linreg_n.fit(training_set, training_labels)

y_hat_n = model_linreg_n.predict(test_set)

print("Linear regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_n)))


RMSPE avec la moyenne: 0.473894537235089
Coefficients: 
[ -2.78114898e+11  -2.78114899e+11  -2.78114899e+11  -2.78114899e+11
  -2.78114899e+11  -2.78114899e+11  -2.78114898e+11   7.08737767e+01
  -6.34429678e+01  -7.53454185e+00  -2.75682655e+02  -2.41113261e+02
  -5.47802172e+01   6.42614628e+01   1.22036397e+02   5.58720365e+01
  -1.46361087e+02  -3.77771636e+02  -3.54348325e+02  -2.98751505e+02
   1.38649499e+02   1.36803499e+03   6.59189541e+03   2.23516820e+03
  -7.77244298e+01  -1.16148470e+03  -6.21613036e+02   2.86666451e+02
  -1.15325909e+03   3.71660701e+03  -1.24238545e+03  -1.32099790e+03
   7.50030742e+02  -2.20061603e+03   1.45059353e+03  -1.97143555e-02
   1.17187500e-02  -6.29538822e+02]
[ -259.67272949  -762.80987549   162.51940918  7470.62182617  8026.99810791
  4619.69616699  8064.4364624   5309.05963135   272.81811523  5191.47558594
  6832.00323486  5513.52990723  7622.22991943  8275.05871582  8282.97821045
  5703.82165527  7407.6038208   6360.90283203  8632.4415893

The order of magnitude of the RMSPE with linear regression is 0.45, slightly better than just predicting the mean value of the sales in the training set. 

### Ridge regression

Then, a more complicated model: a ridge regression to handle outliers. 

In [13]:
# Candidate regularisation strengths, on the scale used when the features
# are scaled to unit L2 norm (what the former `normalize=True` did)
ridge_alphas = [0.0001,0.001,0.003,0.01,0.1,1.0,10]

# `normalize=True` was removed from scikit-learn (deprecated in 1.0, gone in
# 1.2). StandardScaler divides each column by its standard deviation rather
# than by its L2 norm, i.e. the columns come out sqrt(n_samples) times larger,
# so the penalty has to be multiplied by n_samples to stay equivalent.
n_ridge_samples = training_set.shape[0]
scaled_ridge_alphas = [alpha * n_ridge_samples for alpha in ridge_alphas]

model_ridgereg = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.RidgeCV(alphas=scaled_ridge_alphas))

model_ridgereg.fit(training_set, training_labels)

# Report the selected value on the original (unscaled) grid; the nearest
# grid point is used to avoid floating-point noise from dividing back.
selected_alpha = model_ridgereg.named_steps["ridgecv"].alpha_
selected_index = int(np.argmin(np.abs(np.array(scaled_ridge_alphas) - selected_alpha)))
print("Hyperparameter selected: alpha = {}".format(
    ridge_alphas[selected_index]))

y_hat_rr = model_ridgereg.predict(test_set)

print("Ridge regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_rr)))

Hyperparameter selected: alpha = 0.0001
Ridge regression with normalisation: RMSPE = 0.49867351629144274


### Kernel Ridge Regression
Next, a kernel ridge regression, trying different kernels to better handle non-linearities.

In [15]:
# Kernel configurations tried previously (kept for reference):
#model_kridge = kernel_ridge.KernelRidge(alpha=0.001,kernel='rbf',gamma=0.01)
#model_kridge = kernel_ridge.KernelRidge(alpha=0.001,kernel='rbf',gamma=0.0001)
#model_kridge = kernel_ridge.KernelRidge(alpha=0.001,kernel='rbf',gamma=0.001)
#model_kridge = kernel_ridge.KernelRidge(alpha=0.001,kernel='linear',degree=3, coef0=1)

model_kridge = kernel_ridge.KernelRidge(alpha=0.001,kernel="sigmoid", gamma=0.0002, coef0=0.00001)

# Number of training samples given to the kernel model. Kernel ridge builds
# an n_samples x n_samples kernel matrix, so only a small subset is used.
kridge_train_size = 200

# Normalize dataset, using statistics from the training set only
data_means = training_set.mean(axis=0)
data_std = training_set.std(axis=0)

# A constant column has a standard deviation of 0; dividing by 1 instead
# maps it to 0 rather than filling it with NaN.
safe_std = np.where(data_std == 0, 1.0, data_std)

# Broadcast over all rows at once instead of looping row by row in Python
n_training_set = training_set - data_means
n_training_set /= safe_std

n_test_set = test_set - data_means
n_test_set /= safe_std

# The raw arrays could be freed here to save memory. Left disabled because
# the decision tree cell further down still trains on training_set/test_set.
# del training_set
# del test_set
# del input_data
# del te_input
# del tr_input

# Train model
model_kridge.fit(n_training_set[:kridge_train_size,:], training_labels[:kridge_train_size])

# Evaluate model
y_hat_kridge = model_kridge.predict(n_test_set)

print("Kernel ridge regression with normalisation: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_kridge)))


Kernel ridge regression with normalisation: RMSPE = 1.9769184708763692


### Decision Tree Regression


In [21]:
# Create model.
# random_state is fixed because the tree permutes the features at every
# split, so ties between equally good splits would otherwise be broken
# differently from run to run (this is the model that gets saved below).
model_tree = tree.DecisionTreeRegressor(max_depth=20, random_state=0)

# Train model
model_tree.fit(training_set, training_labels)

# Evaluate model
y_hat_tree = model_tree.predict(test_set)

print("Decision tree regression: RMSPE = {}".format(
    utils.compute_RMSPE(test_labels, y_hat_tree)))


Decision tree regression: RMSPE = 0.2711855296082456


# Save model
We save the model using pickle.

In [22]:
# Serialise the trained decision tree to the path given in the settings
model_path = path_base+settings["MODEL_PATH"]
with open(model_path, "wb") as model_file:
    pickle.dump(model_tree, model_file)
