Analyze the `split_data` method of the data formatter class (`GenericDataFormatter`)

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import tensorflow as tf
import tensorflow.compat.v1 as tf1
from tensorflow.compat.v1 import Session, ConfigProto
from tensorflow.python.eager.context import PhysicalDevice
from typing import Dict, List, Union, Generator
import os
from numpy import load

In [2]:
import sys
sys.path.insert(0, '..')

In [3]:
from data_formatters.base import GenericDataFormatter, InputTypes, DataTypes
from data_formatters.erg_wind import ErgFormatter

In [4]:
from expt_settings.configs import ExperimentConfig
from libs.hyperparam_opt import HyperparamOptManager
from libs.tft_model import TemporalFusionTransformer
import libs.utils as utils

In [5]:
# Query the default GPU device once and reuse the result for both the check and
# the message; tf.test.gpu_device_name() returns an empty string when no GPU is
# available.
gpu_device_name: str = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device:{}'.format(gpu_device_name))
else:
    print("Please install GPU version of TF")

Default GPU Device:/device:GPU:0


In [6]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# The setting is applied to every visible GPU: indexing gpu[0] raised IndexError
# on a machine without a GPU and left additional GPUs with a different setting.
gpu: List[PhysicalDevice] = tf.config.experimental.list_physical_devices('GPU')
for physical_gpu in gpu:
    tf.config.experimental.set_memory_growth(physical_gpu, True)

In [7]:
# Tensorflow setup
# Keep a handle on the current Keras session so it can be restored after the
# calibration loop has swapped in its own session.
default_keras_session: Session = tf1.keras.backend.get_session()
# Session configuration that the calibration cell passes to tf1.Session.
tf_config: ConfigProto = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=0)

Selecting GPU ID=0


In [8]:
file_path: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\data\erg\erg_7farms_final.csv'

In [9]:
raw_data: DataFrame = pd.read_csv(file_path)

In [10]:
raw_data.head()

Unnamed: 0,energy_mw,time,Wind Speed,2m_devpoint [C],temperature [C],mean_sealev_pressure [hPa],surface pressure [hPa],precipitation [m],10_wind_speed,10_u_wind,...,days_from_start,id,hour,day,day_of_week,month,categorical_id,hours_from_start,categorical_day_of_week,categorical_hour
0,11.787686,2019-01-01 01:00:00,9.024996,-0.595344,3.746035,1023.987081,952.041642,0.017248,3.077197,-0.290841,...,0,BISACCIA2,1,1,1,1,BISACCIA2,0.0,1,1
1,12.321628,2019-01-01 02:00:00,9.115065,0.186824,4.068633,1023.939205,952.04334,0.066301,3.056552,-0.466334,...,0,BISACCIA2,2,1,1,1,BISACCIA2,1.0,1,2
2,12.21724,2019-01-01 03:00:00,8.807608,0.119856,3.750193,1023.588209,951.72926,0.053635,3.240812,-0.547045,...,0,BISACCIA2,3,1,1,1,BISACCIA2,2.0,1,3
3,12.117007,2019-01-01 04:00:00,9.551801,-0.312831,3.430814,1023.465573,951.629732,0.026092,3.616165,-0.753333,...,0,BISACCIA2,4,1,1,1,BISACCIA2,3.0,1,4
4,12.415503,2019-01-01 05:00:00,8.734134,-0.526966,3.453347,1023.853208,951.984117,0.016079,3.446475,-0.759338,...,0,BISACCIA2,5,1,1,1,BISACCIA2,4.0,1,5


In [11]:
config = ExperimentConfig('erg_wind', r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs')

In [12]:
formatter: ErgFormatter = config.make_data_formatter()

# SPLIT THE DATA

In [13]:
train, valid, test = formatter.split_data(raw_data)

Setting scalers with training data...


In [14]:
train.head()

Unnamed: 0,energy_mw,time,Wind Speed,2m_devpoint [C],temperature [C],mean_sealev_pressure [hPa],surface pressure [hPa],precipitation [m],10_wind_speed,10_u_wind,...,days_from_start,id,hour,day,day_of_week,month,categorical_id,hours_from_start,categorical_day_of_week,categorical_hour
0,1.128229,2019-01-01 01:00:00,1.064353,-1.362439,-1.210837,1.380884,0.953412,-0.230677,0.382889,-0.739538,...,0,BISACCIA2,-1.517353,1,-0.996175,1,0,-1.731744,1,1
1,1.223101,2019-01-01 02:00:00,1.095221,-1.242118,-1.174165,1.373247,0.9537,-0.079954,0.368875,-0.85095,...,0,BISACCIA2,-1.37287,1,-0.996175,1,0,-1.731175,1,2
2,1.204553,2019-01-01 03:00:00,0.989852,-1.25242,-1.210364,1.317254,0.900357,-0.118873,0.49395,-0.902189,...,0,BISACCIA2,-1.228386,1,-0.996175,1,0,-1.730607,1,3
3,1.186743,2019-01-01 04:00:00,1.244895,-1.31898,-1.246671,1.29769,0.883453,-0.203501,0.748738,-1.033151,...,0,BISACCIA2,-1.083902,1,-0.996175,1,0,-1.730039,1,4
4,1.239781,2019-01-01 05:00:00,0.964672,-1.35192,-1.244109,1.359528,0.943642,-0.234269,0.633553,-1.036963,...,0,BISACCIA2,-0.939418,1,-0.996175,1,0,-1.729471,1,5


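Comparing `train.head()` with `raw_data.head()`, it looks like the data formatter scaled the real-valued columns: the target `energy_mw` (this dataset's counterpart of "power_usage"), the weather features (e.g. "Wind Speed", "temperature [C]"), and "hour", "day_of_week", "hours_from_start". It also encoded `categorical_id` as an integer (`BISACCIA2` became `0`).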

In [None]:
column_definitions = formatter.get_column_definition()

In [None]:
column_definitions

In [None]:
train_samples, valid_samples = formatter.get_num_samples_for_calibration()

In [None]:
# Sets up default params
fixed_params: Dict = formatter.get_experiment_params()
params: Dict = formatter.get_default_model_params()
# Build the "fixed" model folder path once and reuse it for both the model
# params and the hyperparameter manager, so the two can never drift apart.
model_folder: str = os.path.join(config.model_folder, "fixed")
params["model_folder"] = model_folder

In [None]:
# Sets up hyperparam manager
# Every default value is wrapped in a single-element list, so each
# hyperparameter has exactly one candidate value (presumably a one-point search
# space -- confirm against HyperparamOptManager).
print("*** Loading hyperparm manager ***")
opt_manager = HyperparamOptManager({k: [params[k]] for k in params},
                                   fixed_params, model_folder)

In [None]:
model_folder: str = opt_manager.hyperparam_folder

# ANALYZE PARAMETERS

In [None]:
fixed_params.keys()

In [None]:
#column definition
fixed_params.get('column_definition')

In [None]:
fixed_params.get('known_categorical_inputs')

# TRAINING

In [None]:
print("*** Running calibration ***")
print("Params Selected:")
for k in params:
    print("{}: {}".format(k, params[k]))

# np.inf is the canonical spelling; the np.Inf alias was removed in NumPy 2.0.
best_loss = np.inf
for _ in range(1):
    # Each run gets a fresh graph and its own session.
    tf1.reset_default_graph()
    with tf.Graph().as_default(), tf1.Session(config=tf_config) as sess:
        tf1.keras.backend.set_session(sess)
        try:
            params: Dict = opt_manager.get_next_parameters()
            model: TemporalFusionTransformer = TemporalFusionTransformer(params, use_cudnn=False)

            # Build the batched caches only when there is no data.npy in the
            # model's data folder and the model reports no cached training data.
            data_file: str = os.path.join(model.data_folder, 'data.npy')
            if not os.path.exists(data_file) and not model.training_data_cached():
                model.cache_batched_data(train, "train", num_samples=train_samples)
                model.cache_batched_data(valid, "valid", num_samples=valid_samples)

            sess.run(tf1.global_variables_initializer())
            model.fit()

            val_loss: Series = model.evaluate()

            # Record the run only when it improves on the best loss seen so far.
            if val_loss < best_loss:
                opt_manager.update_score(params, val_loss, model)
                best_loss = val_loss
        finally:
            # Restore the original Keras session even if training raised, so
            # later cells do not keep using a session that is about to close.
            tf1.keras.backend.set_session(default_keras_session)

In [None]:
# Path to a saved TemporalFusionTransformer checkpoint (absolute, machine-specific).
# NOTE(review): this points at the 'electricity' experiment while the config
# created earlier in this notebook is 'erg_wind' -- confirm it is the intended checkpoint.
model_ckpt: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\saved_models\electricity\fixed\TemporalFusionTransformer.check'

In [None]:
# Folder that traindata_gen reads data.npy, labels.npy and active_flags.npy from
# (absolute, machine-specific).
# NOTE(review): this is an 'electricity' path while the experiment configured
# earlier in this notebook is 'erg_wind' -- confirm it is the intended data.
folder: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\data\electricity\data\electricity'
# NOTE(review): not referenced by any later cell visible here (traindata_gen
# defines its own local train_size) -- confirm whether this is still needed.
train_size: int = 450000

In [None]:
def traindata_gen(folder: str, minibatch_size: int) -> Generator:
    """Yield consecutive minibatches from the cached ``.npy`` arrays in ``folder``.

    The arrays ``data.npy``, ``labels.npy`` and ``active_flags.npy`` are opened
    read-only as memory maps, so only the slices that are actually consumed are
    read from disk.

    Args:
        folder: Directory containing ``data.npy``, ``labels.npy`` and
            ``active_flags.npy``.
        minibatch_size: Maximum number of rows (first axis) per batch; must be
            positive.

    Yields:
        Tuples ``(data_batch, labels_batch, active_flags_batch)``.
        ``labels_batch`` is the label slice repeated three times along the last
        axis (presumably one copy per model output, e.g. per quantile -- TODO
        confirm). The final batch may hold fewer than ``minibatch_size`` rows;
        empty batches are never yielded.

    Raises:
        ValueError: If ``minibatch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if minibatch_size <= 0:
        raise ValueError(
            "minibatch_size must be a positive integer, got {}".format(minibatch_size))

    data: np.memmap = load(os.path.join(folder, 'data.npy'), mmap_mode='r')
    labels: np.memmap = load(os.path.join(folder, 'labels.npy'), mmap_mode='r')
    active_flags: np.memmap = load(os.path.join(folder, 'active_flags.npy'), mmap_mode='r')

    num_rows: int = data.shape[0]
    # Stepping by the batch size gives exactly ceil(num_rows / minibatch_size)
    # batches. The previous `num_rows // minibatch_size + 1` produced an extra
    # empty batch whenever num_rows was an exact multiple of the batch size.
    for start in range(0, num_rows, minibatch_size):
        stop = min(start + minibatch_size, num_rows)
        label_batch = labels[start:stop]
        yield (data[start:stop],
               np.concatenate([label_batch, label_batch, label_batch], axis=-1),
               active_flags[start:stop])

In [None]:
# Count how many minibatches traindata_gen yields for a batch size of 64.
num_batches = 0
for data, labels, active_flags in traindata_gen(folder, 64):
    num_batches += 1
# Display the resulting batch count as the cell output.
num_batches

In [None]:
# Batch count expected for 50000 rows at a minibatch size of 64 (ceiling division).
# NOTE(review): 50000 does not match train_size = 450000 defined in an earlier
# cell -- confirm which row count is intended.
import math
math.ceil(50000/64)