Attempt to apply the trained model to the test set

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import tensorflow as tf
import tensorflow.compat.v1 as tf1
from tensorflow.compat.v1 import Session, ConfigProto
from tensorflow.python.eager.context import PhysicalDevice
from typing import Dict, List, Union, Generator
import os
from numpy import load

In [2]:
import sys
# Put the parent directory first on the import search path. The project
# packages imported in the following cells (data_formatters, expt_settings,
# libs) presumably live there -- confirm against the repository layout.
sys.path.insert(0, '..')

In [3]:
from data_formatters.base import GenericDataFormatter, InputTypes, DataTypes
from data_formatters.erg_wind import ErgFormatter

In [4]:
from expt_settings.configs import ExperimentConfig
from libs.hyperparam_opt import HyperparamOptManager
from libs.tft_model import TemporalFusionTransformer
import libs.utils as utils

In [5]:
# Query the default GPU device name once and reuse it for both the check and
# the message, instead of asking TensorFlow twice.
gpu_device_name: str = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device:{}'.format(gpu_device_name))
else:
    # An empty name means TensorFlow did not report a GPU device.
    print("Please install GPU version of TF")

Default GPU Device:/device:GPU:0


In [6]:
# Physical GPUs visible to TensorFlow (empty list when none is available).
gpu: List[PhysicalDevice] = tf.config.experimental.list_physical_devices('GPU')
if gpu:
    # Enable memory growth on the first GPU only, matching the GPU ID 0
    # that the rest of the notebook selects.
    tf.config.experimental.set_memory_growth(gpu[0], True)
else:
    # Indexing gpu[0] on an empty list would raise IndexError; report instead,
    # consistent with the non-fatal GPU check in the previous cell.
    print("No GPU visible to TensorFlow; skipping memory-growth setup")

In [7]:
# Tensorflow setup
# Keep a handle on the Keras session that is current at this point; the
# prediction cell re-installs it once it is done with its own session.
default_keras_session: Session = tf1.keras.backend.get_session()
# Session configuration built by the project helper, requesting GPU 0.
tf_config: ConfigProto = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=0)

Selecting GPU ID=0


In [8]:
# Absolute, machine-specific location of the input CSV.
# NOTE(review): hard-coded local path -- adjust before running on another machine.
file_path: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\data\erg_wind\data\erg_7farms_final.csv'

In [9]:
# Load the whole CSV into a DataFrame; the 'time' column is converted to
# datetimes in a later cell.
raw_data: DataFrame = pd.read_csv(file_path)

In [10]:
# Preview the first rows of the raw data.
raw_data.head()

Unnamed: 0,energy_mw,time,Wind Speed,2m_devpoint [C],temperature [C],mean_sealev_pressure [hPa],surface pressure [hPa],precipitation [m],10_wind_speed,10_u_wind,...,days_from_start,id,hour,day,day_of_week,month,categorical_id,hours_from_start,categorical_day_of_week,categorical_hour
0,11.787686,2019-01-01 01:00:00,9.024996,-0.595344,3.746035,1023.987081,952.041642,0.017248,3.077197,-0.290841,...,0,BISACCIA2,1,1,1,1,BISACCIA2,0.0,1,1
1,12.321628,2019-01-01 02:00:00,9.115065,0.186824,4.068633,1023.939205,952.04334,0.066301,3.056552,-0.466334,...,0,BISACCIA2,2,1,1,1,BISACCIA2,1.0,1,2
2,12.21724,2019-01-01 03:00:00,8.807608,0.119856,3.750193,1023.588209,951.72926,0.053635,3.240812,-0.547045,...,0,BISACCIA2,3,1,1,1,BISACCIA2,2.0,1,3
3,12.117007,2019-01-01 04:00:00,9.551801,-0.312831,3.430814,1023.465573,951.629732,0.026092,3.616165,-0.753333,...,0,BISACCIA2,4,1,1,1,BISACCIA2,3.0,1,4
4,12.415503,2019-01-01 05:00:00,8.734134,-0.526966,3.453347,1023.853208,951.984117,0.016079,3.446475,-0.759338,...,0,BISACCIA2,5,1,1,1,BISACCIA2,4.0,1,5


In [11]:
# Convert the 'time' column to datetime values; raw_data is rebound to a frame
# carrying the converted column so the cell can be re-run safely.
raw_data = raw_data.assign(time=raw_data['time'].astype('datetime64[s]'))

In [12]:
# Experiment configuration for the 'erg_wind' experiment.
# NOTE(review): the second argument is an absolute, machine-specific path
# (presumably the output root folder -- confirm against ExperimentConfig).
config = ExperimentConfig('erg_wind', r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs')

In [13]:
# Ask the experiment config for its data formatter (annotated as ErgFormatter).
formatter: ErgFormatter = config.make_data_formatter()

In [14]:
# CSV path as reported by the config; its parent directory is later handed to
# the model as 'data_folder' in the prediction cell.
data_csv_path: str = config.data_csv_path

# SPLIT DATA

In [15]:
# Split the raw data into train / validation / test partitions via the formatter.
# NOTE(review): the recorded output ("Setting scalers with training data...")
# suggests scalers are fitted during this call -- confirm in ErgFormatter.split_data.
train, valid, test = formatter.split_data(raw_data)

Setting scalers with training data...


In [16]:
# Column schema from the formatter; the recorded output below shows
# (column name, DataTypes member, InputTypes member) tuples.
column_definitions = formatter.get_column_definition()

In [17]:
# Display the column definitions for inspection.
column_definitions

[('id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>),
 ('time', <DataTypes.DATE: 2>, <InputTypes.TIME: 5>),
 ('energy_mw', <DataTypes.REAL_VALUED: 0>, <InputTypes.TARGET: 0>),
 ('hour', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('day_of_week', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('hours_from_start', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),
 ('Wind Speed', <DataTypes.REAL_VALUED: 0>, <InputTypes.OBSERVED_INPUT: 1>),
 ('2m_devpoint [C]',
  <DataTypes.REAL_VALUED: 0>,
  <InputTypes.OBSERVED_INPUT: 1>),
 ('temperature [C]',
  <DataTypes.REAL_VALUED: 0>,
  <InputTypes.OBSERVED_INPUT: 1>),
 ('mean_sealev_pressure [hPa]',
  <DataTypes.REAL_VALUED: 0>,
  <InputTypes.OBSERVED_INPUT: 1>),
 ('surface pressure [hPa]',
  <DataTypes.REAL_VALUED: 0>,
  <InputTypes.OBSERVED_INPUT: 1>),
 ('precipitation [m]',
  <DataTypes.REAL_VALUED: 0>,
  <InputTypes.OBSERVED_INPUT: 1>),
 ('10_wind_speed', <DataTypes.REAL_VALUED: 0>, <InputTypes.OBSERVED_INP

In [None]:
# Training / validation sample counts reported by the formatter.
# NOTE(review): neither value is referenced again in the visible cells.
train_samples, valid_samples = formatter.get_num_samples_for_calibration()

In [None]:
# Sets up default params
fixed_params: Dict = formatter.get_experiment_params()
params: Dict = formatter.get_default_model_params()
# Compute the "fixed" model directory once and reuse it: it is stored in the
# model params and also passed to the hyperparameter manager in the next cell.
model_folder: str = os.path.join(config.model_folder, "fixed")
params["model_folder"] = model_folder

In [None]:
# Sets up hyperparam manager
print("*** Loading hyperparm manager ***")
# Wrap every default parameter value in a one-element list before handing the
# mapping to the manager.
single_value_ranges = {name: [value] for name, value in params.items()}
opt_manager = HyperparamOptManager(single_value_ranges, fixed_params, model_folder)

In [None]:
# Re-point model_folder at the folder the hyperparameter manager reports.
model_folder: str = opt_manager.hyperparam_folder

In [None]:
# Preview the first rows of the test partition.
test.head()

We expect the model to be fed inputs starting from the following timestamps:

In [None]:
# Rows at positions 168-177 and the columns at positions 0, 1 and 25 of the
# test partition.
# NOTE(review): the positional indices are hard-coded; which columns they pick
# is not visible here -- confirm against test.columns.
test.iloc[168:178, [0,1,25]]

# PREDICTION

In [None]:
def extract_numerical_data(data: DataFrame) -> DataFrame:
    """Strips out forecast time and identifier columns.

    Args:
        data: Formatted predictions or targets frame.

    Returns:
        The same frame restricted to every column except "forecast_time"
        and "identifier".
    """
    non_numeric = {"forecast_time", "identifier"}
    return data[[col for col in data.columns if col not in non_numeric]]


print("*** Running tests ***")
tf1.reset_default_graph()
with tf.Graph().as_default(), tf1.Session(config=tf_config) as sess:
    tf1.keras.backend.set_session(sess)
    try:
        params: Dict = opt_manager.get_next_parameters()
        # 'exp_name' and 'data_folder' are only needed to construct the model,
        # so they are removed from params again right after.
        params['exp_name'] = 'erg_wind'
        params['data_folder'] = os.path.abspath(os.path.join(data_csv_path, os.pardir))
        model = TemporalFusionTransformer(params, use_cudnn=False)
        params.pop('exp_name', None)
        params.pop('data_folder', None)
        # load model
        model.load(opt_manager.hyperparam_folder, use_keras_loadings=True)

        # Validation loss is currently disabled:
        #     print("Computing best validation loss")
        #     val_loss: Series = model.evaluate(valid)

        print("Computing test loss")
        output_map: Dict = model.predict(test, return_targets=True)
        # Report what the message claims (the keys) plus the p50 shape; direct
        # indexing raises a clear KeyError if 'p50' is missing.
        print(f"Output map returned a dict with keys {list(output_map)}; "
              f"p50 shape {output_map['p50'].shape}")
        targets: DataFrame = formatter.format_predictions(output_map["targets"])
        p50_forecast: DataFrame = formatter.format_predictions(output_map["p50"])
        p90_forecast: DataFrame = formatter.format_predictions(output_map["p90"])

        # save all
        print("saving predictions and targets")
        targets.to_csv(os.path.join(opt_manager.hyperparam_folder, "targets.csv"), index=False)
        p50_forecast.to_csv(os.path.join(opt_manager.hyperparam_folder, "p50.csv"), index=False)
        p90_forecast.to_csv(os.path.join(opt_manager.hyperparam_folder, "p90.csv"), index=False)

        p50_loss = utils.numpy_normalised_quantile_loss(
            extract_numerical_data(targets), extract_numerical_data(p50_forecast),
            0.5)
        p90_loss = utils.numpy_normalised_quantile_loss(
            extract_numerical_data(targets), extract_numerical_data(p90_forecast),
            0.9)
    finally:
        # Always hand the default Keras session back, even when loading or
        # prediction fails, so later cells do not keep using a closed session.
        tf1.keras.backend.set_session(default_keras_session)

print()
print("Normalised Quantile Loss for Test Data: P50={}, P90={}".format(
    p50_loss.mean(), p90_loss.mean()))

In [None]:
# test.columns

In [None]:
# test.iloc[192:202, [0,20,21,24,23,2,3,4,5,6,7,8,9,10,11,12,13,14,15]]

# UNDERSTANDING SAVED PREDICTIONS

Start by converting the forecast time into a proper datetime format.

In [None]:
# Reload the CSV files written by the prediction cell, in the order
# p50, p90, targets, from the hyperparameter manager's folder.
p50_forecast, p90_forecast, targets = (
    pd.read_csv(os.path.join(opt_manager.hyperparam_folder, file_name))
    for file_name in ("p50.csv", "p90.csv", "targets.csv")
)

In [None]:
# Preview the first rows of the reloaded p90 forecast.
p90_forecast.head()

We cannot recover the dates because the time index was configured as "hours_from_start", which was scaled before training.
We should retry using the datetime-formatted column as the time index.