In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import sys
from sklearn import metrics # for the evaluation
from settings import CORR_GROUP, SEED1, SEED2, SEED3
from keras.callbacks import EarlyStopping
import tensorflow as tf
import logging

In [2]:
import AttentionBiLSTM
import SimpleGRU
import ConvLSTM


# One pre-built model per target column. The keys are the target names that the
# training loop passes to create_supervised_dataset and uses to index CORR_GROUP.
# NOTE(review): the create_model signatures live in the project modules and are
# not visible here; the last argument looks like the flattened input width
# (n_features * 15 history steps) -- TODO confirm against each module.
# NOTE(review): 'U_L1_N' and 'I_SUM' are built here but skipped by the training
# loop in this notebook.
models = {
    'P_SUM': AttentionBiLSTM.create_model(8, 0.5, 19*15),
    'U_L1_N': SimpleGRU.create_model(10, 6*15),
    'I_SUM': SimpleGRU.create_model(20, 15),
    'H_TDH_I_L3_N': AttentionBiLSTM.create_model(8, 0.1, 15),
    'F': ConvLSTM.create_model(3, 15),
    'C_phi_L3': SimpleGRU.create_model(70, 15)
}

In [3]:
def unique_cols(df):
    """Return a boolean mask flagging the constant columns of ``df``.

    A column is flagged ``True`` when every one of its values compares equal
    to the value in the first row. The mask has one entry per column, in
    column order.

    Indexing the first row raises ``IndexError`` when ``df`` has no rows.
    """
    values = df.to_numpy()  # df.values on pandas < 0.24
    first_row = values[0]
    # Broadcast the first row against every row, then reduce down each column.
    return (values == first_row).all(axis=0)


def create_supervised_dataset(df, target, feats, n_in=1, n_out=1):
    """Turn a time-indexed frame into a supervised-learning matrix.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    target : name of the column to forecast.
    feats : list of column names used as lagged inputs.
    n_in : number of past steps (t-n_in ... t-1) of ``feats`` to include.
    n_out : number of target steps (t ... t+n_out-1) to include.

    Returns
    -------
    2-D NumPy array. Each row contains the lagged feature blocks, oldest lag
    first, followed by the target step(s). Rows containing NaN after the
    shifting are dropped.
    """
    feature_count = len(feats)
    frames = []
    labels = []

    # Lagged inputs, from the oldest lag (t-n_in) to the most recent (t-1).
    for lag in range(n_in, 0, -1):
        frames.append(df[feats].shift(lag))
        labels.extend('var%d(t-%d)' % (idx + 1, lag) for idx in range(feature_count))

    # Target values at t, t+1, ... (a single variable, hence always "var1").
    for step in range(n_out):
        frames.append(df[target].shift(-step))
        labels.append('var1(t)' if step == 0 else 'var1(t+%d)' % step)

    # Assemble column-wise, label, and drop rows made incomplete by the shifts.
    combined = pd.concat(frames, axis=1)
    combined.columns = labels
    return combined.dropna().values

In [5]:
from settings import CORR_GROUP

# Load the measurements, using the 'ts' column as the index.
df = pd.read_csv("data/mongo_filtered_av101_mins.csv", index_col='ts')
#df = df.drop('Unnamed: 0', 1)
# Parse the index values into pandas datetimes.
df.index = pd.to_datetime(df.index)

# Keep only the columns that are not constant (unique_cols flags constant ones).
df_2 = df.loc[:,np.invert(unique_cols(df))]

# Average window
# df_2 = df.groupby(np.arange(len(df))//60).mean()

# Scale every remaining column with MinMaxScaler, then rebuild a DataFrame with
# the original column names and index.
# NOTE(review): the scaler is fit on all rows, before any train/validation
# split, so validation-set statistics influence the scaling -- confirm this is
# intended.
scaler = MinMaxScaler()
d = scaler.fit_transform(df_2)
scaled_df = pd.DataFrame(d, columns=df_2.columns, index=df_2.index)

# Early stopping watches the training 'loss' with a patience of 10 epochs.
# NOTE(review): this monitors training loss rather than 'val_loss' -- confirm.
callback = EarlyStopping(monitor='loss', patience=10)
# Number of past steps used as input (passed as n_in to create_supervised_dataset).
history_window = 15
# Number of target steps to predict (passed as n_out to create_supervised_dataset).
prediction_window = 1

# Train one pre-built neural model per target column and save it under models/.
for m in models:
    # These two targets are excluded from this training run.
    if m in ('U_L1_N', 'I_SUM'):
        continue
    # Each row: history_window lagged copies of the target's correlated
    # features, followed by the target value(s).
    values = create_supervised_dataset(scaled_df, m, CORR_GROUP[m], n_in=history_window, n_out=prediction_window)
    len_values = values.shape[0]
    # Chronological split: first 80% of the rows for training, remaining 20%
    # for validation (no shuffling, to keep the time order).
    n_train_seconds = int(0.8 * len_values)
    n_cv_seconds = len_values
    train = values[:n_train_seconds, :]
    cv = values[n_train_seconds:n_cv_seconds, :]

    # Split into inputs and output: the last column is the target.
    # This assumes prediction_window == 1 (a single target column).
    train_X, train_y = train[:, :-1], train[:, -1:]
    cv_X, cv_y = cv[:, :-1], cv[:, -1:]
    # Reshape inputs to 3D [samples, timesteps, features] with one timestep
    # holding the flattened history.
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    cv_X = cv_X.reshape((cv_X.shape[0], 1, cv_X.shape[1]))
    model = models[m]
    history = model.fit(train_X, train_y, epochs=100, batch_size=72, validation_data=(cv_X, cv_y), verbose=1, shuffle=False, callbacks=[callback])
    model.save('models/' + m + '_model.h5')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [9]:
import pickle
from sklearn.linear_model import LinearRegression

# Fit a linear-regression model for every target in CORR_GROUP that has no
# neural model, and pickle it under models/.
for k in CORR_GROUP:
    if k in models:
        continue
    values = create_supervised_dataset(scaled_df, k, CORR_GROUP[k], n_in=history_window, n_out=prediction_window)
    len_values = values.shape[0]
    # Chronological split: first 80% of the rows for training, remaining 20%
    # held out.
    n_train_seconds = int(0.8 * len_values)
    n_cv_seconds = len_values
    train = values[:n_train_seconds, :]
    cv = values[n_train_seconds:n_cv_seconds, :]  # held out; not used in this cell

    # Inputs are every column but the last; the last column is the target.
    # Integer indexing yields the 1-D target directly, which avoids
    # np.reshape's deprecated `newshape` keyword.
    train_X, train_y = train[:, :-1], train[:, -1]
    model = LinearRegression().fit(train_X, train_y)
    # A context manager guarantees the file is flushed and closed.
    with open(f'models/{k}_model.sav', 'wb') as model_file:
        pickle.dump(model, model_file)

In [4]:
import pandas as pd
from settings import OUTPUT_FILE
# For each column of the results table, print the 'model' index label that
# holds the column's minimum value.
results = pd.read_csv(OUTPUT_FILE, index_col='model')
best_by_group = results.idxmin().to_dict()
for group, best_model in best_by_group.items():
    print(f'{group}: {best_model}')

P_SUM: AutoKeras
U_L1_N: AutoKeras
I_SUM: AutoKeras
H_TDH_I_L3_N: AutoKeras
F: AutoKeras
ReacEc_L1: LinRegression
C_phi_L3: AutoKeras
ReacEc_L3: LinRegression
RealE_SUM: LinRegression
H_TDH_U_L2_N: AutoKeras
