In [1]:
#import tensorflow as tf
#gpu = len(tf.config.list_physical_devices('GPU'))>0
#print("GPU is", "available" if gpu else "NOT AVAILABLE")
import os
from support.utils import real_data_loading
import pandas as pd
from synthesizers.timeseries import TimeEncoder
from synthesizers import ModelParameters
from sklearn.decomposition import PCA#from sklearn.manifold import TSNE
import numpy as np
import matplotlib.gridspec as gridspec
from matplotlib import pyplot as plt


# Parameters

In [2]:
seq_len = 24        # Timesteps
n_seq = 8        # Features
hidden_dim = 8     # Hidden units for generator (GRU & LSTM).
                    # Also decides output_units for generator
gamma = 1           # Used for discriminator loss
noise_dim = 32      # Used by generator as a starter dimension
dim = 128           # UNUSED
batch_size = 8
learning_rate = 5e-4
beta_1 = 0          # UNUSED
beta_2 = 1          # UNUSED
data_dim = 28       # UNUSED

gan_args = ModelParameters(batch_size=batch_size,
                           lr=learning_rate,
                           noise_dim=noise_dim,
                           layers_dim=dim)

# Input train dataset

In [3]:
train_path = "data_train_24.csv"
train_df = pd.read_csv(train_path)


# Data transformations to be applied prior to be used with the synthesizer model
train_data = real_data_loading(train_df.values, seq_len=seq_len, n_signal=3)
print(len(train_data))#, train_data[0].shape)

6619902840
3902591


In [4]:
train_df.head()

Unnamed: 0,time,ACTON275,BOL5,Densidad2_,ECE7,GR,GR2,HALFAC3,IACCEL1,RX306
0,1000.0,-0.063492,-0.102564,-0.009916,-0.031746,-0.019536,-0.029304,-0.03663,2.288156,0.0
1,1000.01,-0.06105,-0.102564,-0.009916,-0.031746,-0.017094,-0.029304,-0.046398,2.288156,0.004884
2,1000.02,-0.056166,-0.100122,-0.009916,-0.031746,-0.019536,-0.034188,-0.043956,2.283272,0.0
3,1000.03,-0.06105,-0.102564,-0.009468,-0.029304,-0.019536,-0.03663,-0.041514,2.28083,0.0
4,1000.05,-0.065934,-0.102564,-0.007293,-0.029304,-0.019536,-0.031746,-0.043956,2.28083,0.0


In [5]:
#print(train_data[:][0][0])
#print(len(train_data[:][0][0]))

# Training

In [6]:
synth = TimeEncoder(model_parameters=gan_args, hidden_dim=hidden_dim, seq_len=seq_len, n_seq=n_seq, gamma=1, n_out=1)
synth.train(train_data, train_steps=1)

folders = os.listdir('models')

try:
    n = np.max([int(f.split('_')[1]) for f in folders])
    folder = os.path.join('models','model_'+str(n+1))
    os.mkdir(folder)
except:
    folder = os.path.join('models','model_1')
    os.mkdir(folder)

print('Model saved in:',folder)
synth.save(os.path.join(folder,'synth_energy.pkl'))

GPU
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
RealData (InputLayer)        [(8, 24, 8)]              0         
_________________________________________________________________
Embedder (Sequential)        (8, 24, 8)                1368      
_________________________________________________________________
Recovery (Sequential)        (8, 1)                    1489      
Total params: 2,857
Trainable params: 2,857
Non-trainable params: 0
_________________________________________________________________
Model: "Embedder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
GRU_1 (GRU)                  (8, 24, 8)                432       
_________________________________________________________________
GRU_2 (GRU)                  (8, 24, 8)                432       
_____________________________________

Emddeding network training: 100%|███████████████████████████████████████████████████████| 1/1 [03:07<00:00, 187.98s/it]


Model saved in: models\model_2


TypeError: cannot pickle 'weakref' object

# Loading (in case you want to use a trained model)

In [None]:
folder = r'models\model_12'
synth = TimeEncoder.load(os.path.join(folder,'synth_energy.pkl'))

# Input test dataset

In [None]:
test_path = "data_test_24.csv"
test_df = pd.read_csv(test_path)

# Data transformations to be applied prior to be used with the synthesizer model
test_data = real_data_loading(test_df.values, seq_len=seq_len)
print(len(test_data), test_data[0].shape)

# Synthetic data generation

In [None]:
synth_data = synth.sample(len(test_data))

In [None]:
cols = ['Open','High','Low','Close','Adj Close','Volume']

#Plotting some generated samples. Both Synthetic and Original data are still standartized with values between [0,1]
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))
axes=axes.flatten()

time = list(range(1,25))
obs = np.random.randint(len(energy_data))

for j, col in enumerate(cols):
    df = pd.DataFrame({'Real': test_data[obs][:, j],
                   'Synthetic': synth_data[obs][:, j]})
    df.plot(ax=axes[j],
            title = col,
            secondary_y='Synthetic data', style=['-', '--'])
fig.tight_layout()

In [None]:
sample_size = 250
idx = np.random.permutation(len(test_data))[:sample_size]

real_sample = np.asarray(test_data)[idx]
synthetic_sample = np.asarray(synth_data)[idx]

# For the purpose of comparison we need the data to be 2-Dimensional.
# For that reason we are going to use only two components for both the PCA and TSNE.
# synth_data_reduced: {ndarray: (7000, 24)}
# energy_data_reduced: {ndarray: (7000, 24)}
synth_data_reduced = real_sample.reshape(-1, seq_len)
energy_data_reduced = np.asarray(synthetic_sample).reshape(-1,seq_len)

n_components = 2
pca = PCA(n_components=n_components)
tsne = TSNE(n_components=n_components, n_iter=300)

# The fit of the methods must be done only using the real sequential data
pca.fit(energy_data_reduced)

# pca_real: {DataFrame: (7000, 2)}
# pca_synth: {DataFrame: (7000, 2)}
pca_real = pd.DataFrame(pca.transform(energy_data_reduced))
pca_synth = pd.DataFrame(pca.transform(synth_data_reduced))

# data_reduced: {ndarray: (14000, 24)}
data_reduced = np.concatenate((energy_data_reduced, synth_data_reduced), axis=0)

# tsne_results: {DataFrame: (14000, 2)}
tsne_results = pd.DataFrame(tsne.fit_transform(data_reduced))

In [None]:
fig = plt.figure(constrained_layout=True, figsize=(20,10))
spec = gridspec.GridSpec(ncols=2, nrows=1, figure=fig)

#TSNE scatter plot
ax = fig.add_subplot(spec[0,0])
ax.set_title('PCA results',
             fontsize=20,
             color='red',
             pad=10)

#PCA scatter plot
plt.scatter(pca_real.iloc[:, 0].values, pca_real.iloc[:,1].values,
            c='black', alpha=0.2, label='Original')
plt.scatter(pca_synth.iloc[:,0], pca_synth.iloc[:,1],
            c='red', alpha=0.2, label='Synthetic')
ax.legend()

ax2 = fig.add_subplot(spec[0,1])
ax2.set_title('TSNE results',
              fontsize=20,
              color='red',
              pad=10)

plt.scatter(tsne_results.iloc[:sample_size, 0].values, tsne_results.iloc[:sample_size,1].values,
            c='black', alpha=0.2, label='Original')
plt.scatter(tsne_results.iloc[sample_size:,0], tsne_results.iloc[sample_size:,1],
            c='red', alpha=0.2, label='Synthetic')

ax2.legend()

fig.suptitle('Validating synthetic vs real data diversity and distributions',
             fontsize=16,
             color='grey')

In [None]:
from tensorflow.keras import Input, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanAbsoluteError

#First implement a simple RNN model for prediction
def RNN_regression(units):
    opt = Adam(name='AdamOpt')
    loss = MeanAbsoluteError(name='MAE')
    model = Sequential()
    model.add(GRU(units=units,
                  name=f'RNN_1'))
    model.add(Dense(units=10,
                    activation='sigmoid',
                    name='OUT'))
    model.compile(optimizer=opt, loss=loss)
    return model

In [None]:
#Prepare the dataset for the regression model
stock_data=np.asarray(test_data)
#synth_data = synth_data[:len(stock_data)]
n_events = len(stock_data)

#Split data on train and test
idx = np.arange(n_events)
n_train = int(.75*n_events)
train_idx = idx[:n_train]
test_idx = idx[n_train:]

#Define the X for synthetic and real data
X_stock_train = stock_data[train_idx, :seq_len, :]
X_synth_train = synth_data[train_idx, :seq_len, :]

X_stock_test = stock_data[test_idx, :seq_len, :]
y_stock_test = stock_data[test_idx, -1, :]

#Define the y for synthetic and real datasets
y_stock_train = stock_data[train_idx, -1, :]
y_synth_train = synth_data[train_idx, -1, :]

print('Synthetic X train: {}'.format(X_synth_train.shape))
print('Real X train: {}'.format(X_stock_train.shape))

print('Synthetic y train: {}'.format(y_synth_train.shape))
print('Real y train: {}'.format(y_stock_train.shape))

print('Real X test: {}'.format(X_stock_test.shape))
print('Real y test: {}'.format(y_stock_test.shape))

In [None]:
#Training the model with the real train data
ts_real = RNN_regression(12)
early_stopping = EarlyStopping(monitor='val_loss')

real_train = ts_real.fit(x=X_stock_train,
                          y=y_stock_train,
                          validation_data=(X_stock_test, y_stock_test),
                          epochs=200,
                          batch_size=128,
                          callbacks=[early_stopping])

In [None]:
#Training the model with the synthetic data
ts_synth = RNN_regression(12)
synth_train = ts_synth.fit(x=X_synth_train,
                          y=y_synth_train,
                          validation_data=(X_stock_test, y_stock_test),
                          epochs=200,
                          batch_size=128,
                          callbacks=[early_stopping])

In [None]:
#Summarize the metrics here as a pandas dataframe
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_log_error
real_predictions = ts_real.predict(X_stock_test)
synth_predictions = ts_synth.predict(X_stock_test)

metrics_dict = {'r2': [r2_score(y_stock_test, real_predictions),
                       r2_score(y_stock_test, synth_predictions)],
                'MAE': [mean_absolute_error(y_stock_test, real_predictions),
                        mean_absolute_error(y_stock_test, synth_predictions)],
                'MRLE': [mean_squared_log_error(y_stock_test, real_predictions),
                         mean_squared_log_error(y_stock_test, synth_predictions)]}

results = pd.DataFrame(metrics_dict, index=['Real', 'Synthetic'])

results