In [1]:
import pandas as pd 
import pickle
import numpy as np
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error
import tensorflow as tf
import xgboost as xgb
from tensorflow.keras.layers import LSTM

from sklearn.model_selection import GridSearchCV


In [2]:
data = []
with (open("saved_results/clients_data", "rb")) as openfile:
    while True:
        try:
            data.append(pickle.load(openfile))
        except EOFError:
            break

data[customer][train_test][feature]


2 steps - train individual models per customer, then 1 global model for all data

## Dataframe creation 

In [3]:
#One dataframe per household
dfs_train = [None]*25
dfs_test = [None]*25

for i in range(len(data[0])):
    #Create train dfs
    mat_train = np.matrix(data[0][i][0])
    dfs_train[i] = pd.DataFrame(mat_train)
    dfs_train[i]['pred'] = data[0][i][1].tolist()
    
    #Create test dfs
    mat = np.matrix(data[0][i][2])
    dfs_test[i] = pd.DataFrame(mat)
    dfs_test[i]['pred'] = data[0][i][3].tolist()

In [4]:
#One complete df 
tot_train = dfs_train[0]
tot_test = dfs_test[0]

for i in range(len(dfs_train)-1):
    tot_train = tot_train.append(dfs_train[i+1])
    tot_test = tot_test.append(dfs_train[i+1])

## Multi-linear regression 


In [25]:
#One household
X_train = dfs_train[0].loc[:, dfs_train[0].columns != 'pred']
X_test = dfs_test[0].loc[:, dfs_test[0].columns != 'pred']

y_train = dfs_train[0].loc[:, dfs_train[0].columns == 'pred']
y_test = dfs_test[0].loc[:, dfs_test[0].columns == 'pred']

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))



Adjusted R2 : 0.7495491609617009
MSE : 0.28565978932868724


In [28]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)
for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']
    
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    y_pred = regr.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.68700508300428
MSE : 0.31843413883381044


In [32]:
#All together 
X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.7878142498010648
MSE : 0.2135946090738038


## ANN Linear

In [39]:
#Initialising ANN
ann = tf.keras.models.Sequential()

#Adding First Hidden Layer
ann.add(tf.keras.layers.Dense(units=8))

#Adding Second Hidden Layer
ann.add(tf.keras.layers.Dense(units=4))

#Adding Output Layer
ann.add(tf.keras.layers.Dense(units=1))


#Compiling ANN
ann.compile(optimizer="adam",loss="MeanSquaredError")

X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

#Fitting ANN
ann.fit(X_train,y_train,batch_size=32,epochs = 15, verbose=0)

y_pred = ann.predict(X_test)
y_pred

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.7861675787959927
MSE : 0.21525221355135135


In [38]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)

for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    #Initialising ANN
    ann = tf.keras.models.Sequential()

    #Adding First Hidden Layer
    ann.add(tf.keras.layers.Dense(units=8))

    #Adding Second Hidden Layer
    ann.add(tf.keras.layers.Dense(units=4))

    #Adding Output Layer
    ann.add(tf.keras.layers.Dense(units=1))

    #Compiling ANN
    ann.compile(optimizer="adam",loss="MeanSquaredError")
    
    #Fitting ANN
    ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose=0)

    y_pred = ann.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.6881897774793262
MSE : 0.3172990707016136


## ANN Non-Linear

Tanh activation on the output

In [53]:
#Initialising ANN
ann = tf.keras.models.Sequential()

#Adding First Hidden Layer
ann.add(tf.keras.layers.Dense(units=8))

#Adding Second Hidden Layer
ann.add(tf.keras.layers.Dense(units=4))

#Adding Output Layer
ann.add(tf.keras.layers.Dense(units=1, activation = "tanh"))


#Compiling ANN
ann.compile(optimizer="adam",loss="MeanSquaredError")

X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

#Fitting ANN
ann.fit(X_train,y_train,batch_size=32,epochs = 15, verbose=0)

y_pred = ann.predict(X_test)
y_pred

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.7395269761195613
MSE : 0.2622024978484775


In [40]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)

for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    #Initialising ANN
    ann = tf.keras.models.Sequential()

    #Adding First Hidden Layer
    ann.add(tf.keras.layers.Dense(units=8))

    #Adding Second Hidden Layer
    ann.add(tf.keras.layers.Dense(units=4))

    #Adding Output Layer
    ann.add(tf.keras.layers.Dense(units=1, activation = "tanh"))

    
    
    #Compiling ANN
    ann.compile(optimizer="adam",loss="MeanSquaredError")
    
    #Fitting ANN
    ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose=0)

    y_pred = ann.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.6230623329643588
MSE : 0.3855005994468207


relu activation on the first hidden layer

In [75]:
#Initialising ANN
ann = tf.keras.models.Sequential()

#Adding First Hidden Layer
ann.add(tf.keras.layers.Dense(units=8, activation = "relu"))

#Adding Second Hidden Layer
ann.add(tf.keras.layers.Dense(units=4))

#Adding Output Layer
ann.add(tf.keras.layers.Dense(units=1))


#Compiling ANN
ann.compile(optimizer="adam",loss="MeanSquaredError")

X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

#Fitting ANN
ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose=0)

y_pred = ann.predict(X_test)
y_pred

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.8513771713056267
MSE : 0.14960964609854793


In [76]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)

for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    #Initialising ANN
    ann = tf.keras.models.Sequential()

    #Adding First Hidden Layer
    ann.add(tf.keras.layers.Dense(units=8, activation = "relu"))

    #Adding Second Hidden Layer
    ann.add(tf.keras.layers.Dense(units=4))

    #Adding Output Layer
    ann.add(tf.keras.layers.Dense(units=1))

    
    
    #Compiling ANN
    ann.compile(optimizer="adam",loss="MeanSquaredError")
    
    #Fitting ANN
    ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose=0)

    y_pred = ann.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.6893714111264815
MSE : 0.31636072979390706


## Non linear ANN with dropouts 

In [74]:
#Initialising ANN
ann = tf.keras.models.Sequential()

#Adding First Hidden Layer
ann.add(tf.keras.layers.Dense(units=8, activation = "relu"))

ann.add(tf.keras.layers.Dropout(0.2))

#Adding Second Hidden Layer
ann.add(tf.keras.layers.Dense(units=4))

#Adding Output Layer
ann.add(tf.keras.layers.Dense(units=1))


#Compiling ANN
ann.compile(optimizer="adam",loss="MeanSquaredError")

X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

#Fitting ANN
ann.fit(X_train,y_train,batch_size=32,epochs = 100)

y_pred = ann.predict(X_test)
y_pred

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)

for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    #Initialising ANN
    ann = tf.keras.models.Sequential()

    #Adding First Hidden Layer
    ann.add(tf.keras.layers.Dense(units=8))

    ann.add(tf.keras.layers.Dropout(0.2))

    #Adding Second Hidden Layer
    ann.add(tf.keras.layers.Dense(units=4))

    #Adding Output Layer
    ann.add(tf.keras.layers.Dense(units=1, activation = "tanh"))

    
    
    #Compiling ANN
    ann.compile(optimizer="adam",loss="MeanSquaredError")
    
    #Fitting ANN
    ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose=0)

    y_pred = ann.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

## XGBoost 

In [9]:
#One model
X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']



xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 4, alpha = 4, n_estimators = 300)

xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)


r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.9605589685626578
MSE : 0.039702909754441205


In [10]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)


for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                    max_depth = 4, alpha = 4, n_estimators = 300)

    xg_reg.fit(X_train, y_train)
    y_pred = xg_reg.predict(X_test)


    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.706939583177193
MSE : 0.29849921251596695


## LSTM 

In [22]:
X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']


#Initialising ANN
ann = tf.keras.models.Sequential()

#Adding First Hidden Layer
ann.add(LSTM(units=8, input_shape = (X_train.shape[1],1), return_sequences=True,activation="tanh",recurrent_activation="sigmoid"))

#Adding Second Hidden Layer
ann.add(LSTM(units=4, activation="tanh",recurrent_activation="sigmoid"))

#Adding Output Layer
ann.add(tf.keras.layers.Dense(units=1))

#Compiling ANN
ann.compile(optimizer="adam",loss="MeanSquaredError")

#Fitting ANN
ann.fit(X_train,y_train,batch_size=32,epochs = 100)

y_pred = ann.predict(X_test)
y_pred

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 99/100
Epoch 100/100
Adjusted R2 : 0.8282501456183259
MSE : 0.1728902292955227


In [25]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)

for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    #Initialising ANN
    ann = tf.keras.models.Sequential()

    #Adding First Hidden Layer
    ann.add(LSTM(units=8, input_shape = (X_train.shape[1],1), return_sequences=True,activation="tanh",recurrent_activation="sigmoid"))

    #Adding Second Hidden Layer
    ann.add(LSTM(units=4, activation="tanh",recurrent_activation="sigmoid"))

    #Adding Output Layer
    ann.add(tf.keras.layers.Dense(units=1))
    
    #Compiling ANN
    ann.compile(optimizer="adam",loss="MeanSquaredError")
    
    #Fitting ANN
    ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose = 0)

    y_pred = ann.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    print(i)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Adjusted R2 : 0.6192548902147478
MSE : 0.3885707114353605
