analyze split_data method from data_formatter class(GenericDataFormatter)

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import tensorflow as tf
import tensorflow.compat.v1 as tf1
from tensorflow.compat.v1 import Session, ConfigProto
from tensorflow.python.eager.context import PhysicalDevice
from typing import Dict, List, Union, Generator
import os
from numpy import load

In [2]:
import sys
sys.path.insert(0, '..')

In [3]:
from data_formatters.base import GenericDataFormatter, InputTypes, DataTypes
from data_formatters.electricity import ElectricityFormatter

In [4]:
from expt_settings.configs import ExperimentConfig
from libs.hyperparam_opt import HyperparamOptManager
from libs.tft_model import TemporalFusionTransformer
import libs.utils as utils

In [5]:
 if tf.test.gpu_device_name(): 
    print('Default GPU Device:{}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device:/device:GPU:0


In [6]:
gpu: List[PhysicalDevice] = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpu[0], True)

In [7]:
# Tensorflow setup
default_keras_session: Session = tf1.keras.backend.get_session()
tf_config: ConfigProto = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=0)

Selecting GPU ID=0


In [8]:
file_path: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\data\electricity\data\electricity\hourly_electricity.csv'

In [9]:
raw_data: DataFrame = pd.read_csv(file_path, index_col=0)

  mask |= (ar1 == a)


In [10]:
config = ExperimentConfig('electricity', r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs')

In [11]:
formatter: ElectricityFormatter = config.make_data_formatter()

# SPLIT THE DATA

In [12]:
train, valid, test = formatter.split_data(raw_data)

Formatting train-valid-test splits.
Setting scalers with training data...


In [13]:
train.head()

Unnamed: 0,power_usage,t,days_from_start,categorical_id,date,id,hour,day,day_of_week,month,hours_from_start,categorical_day_of_week,categorical_hour
17544,-0.127174,26304.0,1096,0,2014-01-01 00:00:00,MT_001,-1.661325,1,-0.499719,1,-1.731721,2,0
17545,-0.050713,26305.0,1096,0,2014-01-01 01:00:00,MT_001,-1.516862,1,-0.499719,1,-1.731062,2,1
17546,-0.050713,26306.0,1096,0,2014-01-01 02:00:00,MT_001,-1.372399,1,-0.499719,1,-1.730403,2,2
17547,-0.050713,26307.0,1096,0,2014-01-01 03:00:00,MT_001,-1.227936,1,-0.499719,1,-1.729744,2,3
17548,-0.127174,26308.0,1096,0,2014-01-01 04:00:00,MT_001,-1.083473,1,-0.499719,1,-1.729085,2,4


In [14]:
train.columns

Index(['power_usage', 't', 'days_from_start', 'categorical_id', 'date', 'id',
       'hour', 'day', 'day_of_week', 'month', 'hours_from_start',
       'categorical_day_of_week', 'categorical_hour'],
      dtype='object')

In [15]:
raw_data.columns

Index(['power_usage', 't', 'days_from_start', 'categorical_id', 'date', 'id',
       'hour', 'day', 'day_of_week', 'month', 'hours_from_start',
       'categorical_day_of_week', 'categorical_hour'],
      dtype='object')

In [16]:
raw_data.head()

Unnamed: 0,power_usage,t,days_from_start,categorical_id,date,id,hour,day,day_of_week,month,hours_from_start,categorical_day_of_week,categorical_hour
17544,2.538071,26304.0,1096,MT_001,2014-01-01 00:00:00,MT_001,0,1,2,1,26304.0,2,0
17545,2.85533,26305.0,1096,MT_001,2014-01-01 01:00:00,MT_001,1,1,2,1,26305.0,2,1
17546,2.85533,26306.0,1096,MT_001,2014-01-01 02:00:00,MT_001,2,1,2,1,26306.0,2,2
17547,2.85533,26307.0,1096,MT_001,2014-01-01 03:00:00,MT_001,3,1,2,1,26307.0,2,3
17548,2.538071,26308.0,1096,MT_001,2014-01-01 04:00:00,MT_001,4,1,2,1,26308.0,2,4


In [17]:
train.head()

Unnamed: 0,power_usage,t,days_from_start,categorical_id,date,id,hour,day,day_of_week,month,hours_from_start,categorical_day_of_week,categorical_hour
17544,-0.127174,26304.0,1096,0,2014-01-01 00:00:00,MT_001,-1.661325,1,-0.499719,1,-1.731721,2,0
17545,-0.050713,26305.0,1096,0,2014-01-01 01:00:00,MT_001,-1.516862,1,-0.499719,1,-1.731062,2,1
17546,-0.050713,26306.0,1096,0,2014-01-01 02:00:00,MT_001,-1.372399,1,-0.499719,1,-1.730403,2,2
17547,-0.050713,26307.0,1096,0,2014-01-01 03:00:00,MT_001,-1.227936,1,-0.499719,1,-1.729744,2,3
17548,-0.127174,26308.0,1096,0,2014-01-01 04:00:00,MT_001,-1.083473,1,-0.499719,1,-1.729085,2,4


looks like the data_formatter scaled "power_usage", "hour", "day_of_week", "hours_from_start"

In [18]:
column_definitions = formatter.get_column_definition()

In [19]:
column_definitions

[('id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>),
 ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.TIME: 5>),
 ('power_usage', <DataTypes.REAL_VALUED: 0>, <InputTypes.TARGET: 0>),
 ('hour', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('day_of_week', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('categorical_id', <DataTypes.CATEGORICAL: 1>, <InputTypes.STATIC_INPUT: 3>)]

In [20]:
train_samples, valid_samples = formatter.get_num_samples_for_calibration()

In [21]:
# Sets up default params
fixed_params: Dict = formatter.get_experiment_params()
params: Dict = formatter.get_default_model_params()
params["model_folder"]: str = os.path.join(config.model_folder, "fixed")
model_folder = os.path.join(config.model_folder, "fixed")

In [22]:
# Sets up hyperparam manager
print("*** Loading hyperparm manager ***")
opt_manager = HyperparamOptManager({k: [params[k]] for k in params},
                                   fixed_params, model_folder)

*** Loading hyperparm manager ***


In [23]:
model_folder: str = opt_manager.hyperparam_folder

# ANALYZE PARAMETERS

In [24]:
fixed_params.keys()

dict_keys(['total_time_steps', 'num_encoder_steps', 'num_epochs', 'early_stopping_patience', 'multiprocessing_workers', 'column_definition', 'input_size', 'output_size', 'category_counts', 'input_obs_loc', 'static_input_loc', 'known_regular_inputs', 'known_categorical_inputs'])

In [25]:
#column definition
fixed_params.get('column_definition')

[('id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>),
 ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.TIME: 5>),
 ('power_usage', <DataTypes.REAL_VALUED: 0>, <InputTypes.TARGET: 0>),
 ('hour', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('day_of_week', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('categorical_id', <DataTypes.CATEGORICAL: 1>, <InputTypes.STATIC_INPUT: 3>)]

In [26]:
fixed_params.get('known_categorical_inputs')

[0]

# TRAINING

In [27]:
print("*** Running calibration ***")
print("Params Selected:")
for k in params:
    print("{}: {}".format(k, params[k]))
    
best_loss = np.Inf
for _ in range(1):
    tf1.reset_default_graph()
    with tf.Graph().as_default(), tf1.Session(config=tf_config) as sess:
        tf1.keras.backend.set_session(sess)
        
        params: Dict = opt_manager.get_next_parameters()
        model: TemporalFusionTransformer = TemporalFusionTransformer(params, use_cudnn=False)

        if not model.training_data_cached():
            model.cache_batched_data(train, "train", num_samples=train_samples)
            model.cache_batched_data(valid, "valid", num_samples=valid_samples)
            
        sess.run(tf1.global_variables_initializer())
        model.fit() 
        
        val_loss: Series = model.evaluate()
            
        if val_loss < best_loss:
            opt_manager.update_score(params, val_loss, model)
            best_loss = val_loss
        tf1.keras.backend.set_session(default_keras_session)

*** Running calibration ***
Params Selected:
dropout_rate: 0.1
hidden_layer_size: 160
learning_rate: 0.001
minibatch_size: 64
max_gradient_norm: 0.01
num_heads: 4
stack_size: 1
model_folder: C:\Users\Lorenzo\PycharmProjects\TFT\outputs\saved_models\electricity\fixed
Resetting temp folder...
*** TemporalFusionTransformer params ***
# dropout_rate = 0.1
# hidden_layer_size = 160
# learning_rate = 0.001
# max_gradient_norm = 0.01
# minibatch_size = 64
# model_folder = C:\Users\Lorenzo\PycharmProjects\TFT\outputs\saved_models\electricity\fixed
# num_heads = 4
# stack_size = 1
# total_time_steps = 192
# num_encoder_steps = 168
# num_epochs = 100
# early_stopping_patience = 5
# multiprocessing_workers = 5
# column_definition = [('id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>), ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.TIME: 5>), ('power_usage', <DataTypes.REAL_VALUED: 0>, <InputTypes.TARGET: 0>), ('hour', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>), ('da

Getting valid sampling locations.
Getting locations for MT_001
Getting locations for MT_002
Getting locations for MT_003
Getting locations for MT_004
Getting locations for MT_005
Getting locations for MT_006
Getting locations for MT_007
Getting locations for MT_008
Getting locations for MT_009
Getting locations for MT_010
Getting locations for MT_011
Getting locations for MT_012
Getting locations for MT_013
Getting locations for MT_014
Getting locations for MT_015
Getting locations for MT_016
Getting locations for MT_017
Getting locations for MT_018
Getting locations for MT_019
Getting locations for MT_020
Getting locations for MT_021
Getting locations for MT_022
Getting locations for MT_023
Getting locations for MT_024
Getting locations for MT_025
Getting locations for MT_026
Getting locations for MT_027
Getting locations for MT_028
Getting locations for MT_029
Getting locations for MT_030
Getting locations for MT_031
Getting locations for MT_032
Getting locations for MT_033
Getting l

Getting locations for MT_318
Getting locations for MT_319
Getting locations for MT_320
Getting locations for MT_321
Getting locations for MT_322
Getting locations for MT_323
Getting locations for MT_324
Getting locations for MT_325
Getting locations for MT_326
Getting locations for MT_327
Getting locations for MT_328
Getting locations for MT_329
Getting locations for MT_330
Getting locations for MT_331
Getting locations for MT_332
Getting locations for MT_333
Getting locations for MT_334
Getting locations for MT_335
Getting locations for MT_336
Getting locations for MT_337
Getting locations for MT_338
Getting locations for MT_339
Getting locations for MT_340
Getting locations for MT_341
Getting locations for MT_342
Getting locations for MT_343
Getting locations for MT_344
Getting locations for MT_345
Getting locations for MT_346
Getting locations for MT_347
Getting locations for MT_348
Getting locations for MT_349
Getting locations for MT_350
Getting locations for MT_351
Getting locati

Cached data "valid" updated
*** Fitting TemporalFusionTransformer ***
Wrapping into tensorflow Datasets
Instructions for updating:
Use `tf.data.Dataset.unbatch()`.
Using cached validation data
Train on 7032 steps, validate on 782 steps
Epoch 1/100
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Epoch 2/100
   0/7032 [..............................] - 0s 0s/step - loss: 3.1263e-05
Cannot load from C:\Users\Lorenzo\PycharmProjects\TFT\outputs\saved_models\electricity\fixed\tmp, skipping ...
Using cached validation data


NameError: name 'Series' is not defined

In [None]:
model_ckpt: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\saved_models\electricity\fixed\TemporalFusionTransformer.check'

In [None]:
folder: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\data\electricity\data\electricity'
train_size: int = 450000

In [None]:
def traindata_gen(folder: str, train_size: int , minibatch_size: int) -> Generator:
    data = load(os.path.join(folder, 'data.npy'))
    labels = load(os.path.join(folder, 'labels.npy'))
    active_flags = load(os.path.join(folder, 'active_flags.npy'))
    for i in range(train_size // minibatch_size + 1):
        upper = min((i + 1) * minibatch_size, train_size)

        yield data[i * minibatch_size:upper], np.concatenate([labels[i * minibatch_size:upper],
                                                                           labels[i * minibatch_size:upper],
                                                                           labels[i * minibatch_size:upper]], axis=-1), \
                      active_flags[i * minibatch_size:upper]

In [None]:
for data, labels, active_flags in traindata_gen(folder, train_size, 128):
    print(data.shape, labels.shape, active_flags.shape)