In [18]:
import pandas as pd 
import pickle
import numpy as np
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error
import tensorflow as tf
import xgboost as xgb

from sklearn.model_selection import GridSearchCV


In [3]:
data = []
with (open("saved_results/clients_data", "rb")) as openfile:
    while True:
        try:
            data.append(pickle.load(openfile))
        except EOFError:
            break

data[customer][train_test][feature]


2 steps - train individual models per customer, then 1 global model for all data

## Dataframe creation 

In [4]:
#One dataframe per household
dfs_train = [None]*25
dfs_test = [None]*25

for i in range(len(data[0])):
    #Create train dfs
    mat_train = np.matrix(data[0][i][0])
    dfs_train[i] = pd.DataFrame(mat_train)
    dfs_train[i]['pred'] = data[0][i][1].tolist()
    
    #Create test dfs
    mat = np.matrix(data[0][i][2])
    dfs_test[i] = pd.DataFrame(mat)
    dfs_test[i]['pred'] = data[0][i][3].tolist()

In [5]:
#One complete df 
tot_train = dfs_train[0]
tot_test = dfs_test[0]

for i in range(len(dfs_train)-1):
    tot_train = tot_train.append(dfs_train[i+1])
    tot_test = tot_test.append(dfs_train[i+1])

## Multi-linear regression 


In [25]:
#One household
X_train = dfs_train[0].loc[:, dfs_train[0].columns != 'pred']
X_test = dfs_test[0].loc[:, dfs_test[0].columns != 'pred']

y_train = dfs_train[0].loc[:, dfs_train[0].columns == 'pred']
y_test = dfs_test[0].loc[:, dfs_test[0].columns == 'pred']

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))



Adjusted R2 : 0.7495491609617009
MSE : 0.28565978932868724


In [28]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)
for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']
    
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    y_pred = regr.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.68700508300428
MSE : 0.31843413883381044


In [32]:
#All together 
X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.7878142498010648
MSE : 0.2135946090738038


## ANN Linear

In [37]:
#Initialising ANN
ann = tf.keras.models.Sequential()

#Adding First Hidden Layer
ann.add(tf.keras.layers.Dense(units=8))

#Adding Second Hidden Layer
ann.add(tf.keras.layers.Dense(units=4))

#Adding Output Layer
ann.add(tf.keras.layers.Dense(units=1))


#Compiling ANN
ann.compile(optimizer="adam",loss="MeanSquaredError")

X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

#Fitting ANN
ann.fit(X_train,y_train,batch_size=32,epochs = 15, verbose=0)

y_pred = ann.predict(X_test)
y_pred

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.7875811258924978
MSE : 0.21382928086523856


In [38]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)

for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    #Initialising ANN
    ann = tf.keras.models.Sequential()

    #Adding First Hidden Layer
    ann.add(tf.keras.layers.Dense(units=8))

    #Adding Second Hidden Layer
    ann.add(tf.keras.layers.Dense(units=4))

    #Adding Output Layer
    ann.add(tf.keras.layers.Dense(units=1))

    #Compiling ANN
    ann.compile(optimizer="adam",loss="MeanSquaredError")
    
    #Fitting ANN
    ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose=0)

    y_pred = ann.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.6881897774793262
MSE : 0.3172990707016136


## ANN Non-Linear

In [39]:
#Initialising ANN
ann = tf.keras.models.Sequential()

#Adding First Hidden Layer
ann.add(tf.keras.layers.Dense(units=8))

#Adding Second Hidden Layer
ann.add(tf.keras.layers.Dense(units=4))

#Adding Output Layer
ann.add(tf.keras.layers.Dense(units=1, activation = "tanh"))


#Compiling ANN
ann.compile(optimizer="adam",loss="MeanSquaredError")

X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']

#Fitting ANN
ann.fit(X_train,y_train,batch_size=32,epochs = 15, verbose=0)

y_pred = ann.predict(X_test)
y_pred

r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.740388700632667
MSE : 0.2613350517059652


In [40]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)

for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    #Initialising ANN
    ann = tf.keras.models.Sequential()

    #Adding First Hidden Layer
    ann.add(tf.keras.layers.Dense(units=8))

    #Adding Second Hidden Layer
    ann.add(tf.keras.layers.Dense(units=4))

    #Adding Output Layer
    ann.add(tf.keras.layers.Dense(units=1, activation = "tanh"))

    
    
    #Compiling ANN
    ann.compile(optimizer="adam",loss="MeanSquaredError")
    
    #Fitting ANN
    ann.fit(X_train,y_train,batch_size=32,epochs = 100, verbose=0)

    y_pred = ann.predict(X_test)
    
    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.6230623329643588
MSE : 0.3855005994468207


## XGBoost 

In [41]:
#One model
X_train = tot_train.loc[:, tot_train.columns != 'pred']
X_test = tot_test.loc[:, tot_test.columns != 'pred']


y_train = tot_train.loc[:, tot_train.columns == 'pred']
y_test = tot_test.loc[:, tot_test.columns == 'pred']



xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 4, alpha = 4, n_estimators = 80)

xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)


r2_adj = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
mse = mean_squared_error(y_test, y_pred)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))


Adjusted R2 : 0.9183415680840374
MSE : 0.08220062292739855


In [42]:
#One model per household
r_2 = [None]*len(dfs_train)
mserror = [None]*len(dfs_train)


for i in range(len(dfs_train)):
    X_train = dfs_train[i].loc[:, dfs_train[i].columns != 'pred']
    X_test = dfs_test[i].loc[:, dfs_test[i].columns != 'pred']


    y_train = dfs_train[i].loc[:, dfs_train[i].columns == 'pred']
    y_test = dfs_test[i].loc[:, dfs_test[i].columns == 'pred']

    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                    max_depth = 4, alpha = 4, n_estimators = 80)

    xg_reg.fit(X_train, y_train)
    y_pred = xg_reg.predict(X_test)


    r_2[i] = 1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test.columns)-1))
    mserror[i] = mean_squared_error(y_test, y_pred)
    
    
r2_adj = np.mean(r_2)
mse = np.mean(mserror)

print("Adjusted R2 : " + r2_adj.astype(str) + "\nMSE : " + mse.astype(str))

Adjusted R2 : 0.6987849609984348
MSE : 0.30678938212501394
