In [1]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp
from pandas.core.groupby import DataFrameGroupBy
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
import numpy as np
from numpy import ndarray
import pytz
from numpy import allclose
import time

In [2]:
import sys
# Make the parent directory importable so the project packages imported below
# (e.g. `etl`) resolve. Guarded so that re-running this cell does not keep
# prepending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.insert(0, '..')

In [3]:
from etl.ETL import db_connection, group_hourly

In [4]:
# Best-effort loading of environment variables from a ".env" file.
# Only the expected failures are caught (package missing, file unreadable);
# a bare `except:` would also swallow KeyboardInterrupt and real bugs.
try:
    from dotenv import load_dotenv

    # load_dotenv() does not raise when no ".env" file exists; it returns a
    # falsy value, so the result is checked to make the message below truthful.
    dotenv_loaded = bool(load_dotenv())
except (ImportError, OSError):
    dotenv_loaded = False

if not dotenv_loaded:
    print('No ".env" file or python-dotenv not installed... Using default env variables...')

In [5]:
def db_connection() -> Engine:
    """Create a SQLAlchemy engine for the PostgreSQL database described by env vars.

    Reads ``POSTGRES_DB_NAME``, ``POSTGRES_HOST``, ``POSTGRES_USERNAME``,
    ``POSTGRES_PASSWORD`` and ``POSTGRES_PORT`` from the environment and
    assembles a ``postgresql://user:password@host:port/dbname`` URL.

    NOTE(review): this definition shadows the ``db_connection`` imported from
    ``etl.ETL`` earlier in the notebook; confirm which one is meant to be used.
    NOTE(review): values are interpolated verbatim, so credentials containing
    URL-reserved characters (e.g. ``@`` or ``/``) presumably have to be stored
    URL-encoded in the environment; verify against the deployment config.

    Returns:
        Engine: the object returned by ``create_engine`` for the assembled URL.

    Raises:
        ValueError: if any of the required environment variables is unset.
    """
    settings: Dict[str, Optional[str]] = {
        'POSTGRES_DB_NAME': getenv('POSTGRES_DB_NAME'),
        'POSTGRES_HOST': getenv('POSTGRES_HOST'),
        'POSTGRES_USERNAME': getenv('POSTGRES_USERNAME'),
        'POSTGRES_PASSWORD': getenv('POSTGRES_PASSWORD'),
        'POSTGRES_PORT': getenv('POSTGRES_PORT'),
    }

    # An unset variable would otherwise be rendered as the literal text "None"
    # inside the URL and only fail later with a confusing error.
    missing: List[str] = [name for name, value in settings.items() if value is None]
    if missing:
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    dbname = settings['POSTGRES_DB_NAME']
    host = settings['POSTGRES_HOST']
    user = settings['POSTGRES_USERNAME']
    password = settings['POSTGRES_PASSWORD']
    port = settings['POSTGRES_PORT']

    postgres_str: str = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'

    engine: Engine = create_engine(postgres_str)

    return engine

# EXTRACT DATA FROM PREDICTIONS TABLE

Use `tft_testset_preds` for now; the production table will be `ml_predictions`.

Pick the last n months.

In [None]:
# Number of months to extract; the interim query below receives n-1 as the
# look-back from its reference month.
n: int = 3
# Predictions table to read from.
table: str = 'tft_testset_preds'

In [None]:
# Query template for the "live" case: predictions from the current calendar
# month and the two preceding ones. The single `{}` placeholder is the table name.
# Real timestamp bounds are used instead of comparing extract(month ...) values,
# because month numbers wrap at year boundaries (January - 2 = -1) and would
# also match the same months of other years.
# NOTE(review): month boundaries follow the database session time zone, as
# NOW() did in the extract-based form; confirm this suits forecast_time_utc.
preds_query: str = (
    "select * from {} "
    "where forecast_time_utc >= date_trunc('month', NOW()) - interval '2 months' "
    "and forecast_time_utc < date_trunc('month', NOW()) + interval '1 month'"
)

In [None]:
# Interim variant of the predictions query: the reference point is the fixed
# timestamp 2020-12-31 10:00:00 instead of NOW().
# Placeholders, in order: table name, number of months to look back.
# NOTE(review): the filter compares month numbers only, so rows from the same
# months of other years would match too; confirm the table holds a single year.
preds_query_interim: str = (
    "select * from {} where extract(month from forecast_time_utc) "
    "between extract(month from TO_TIMESTAMP('2020-12-31 10:00:00', 'YYYY-MM-DD HH:MI:SS')) - {} "
    "and extract(month from TO_TIMESTAMP('2020-12-31 10:00:00', 'YYYY-MM-DD HH:MI:SS'))"
)
interim_preds_sql: str = preds_query_interim.format(table, n - 1)
preds: DataFrame = pd.read_sql_query(interim_preds_sql, con=db_connection())

In [None]:
# Display the predictions frame loaded above.
preds

In [None]:
# gather min and max dates to match with energy query
lower = preds['forecast_time_utc'].min().strftime("%Y-%m-%d %H:%M:%S")
upper = (preds['forecast_time_utc'].max() + pd.Timedelta(hours=12)).strftime("%Y-%m-%d %H:%M:%S")

In [None]:
# Show the string bounds that will be passed to the energy query.
lower, upper

In [None]:
# Inspect the column dtypes of the predictions frame.
preds.dtypes

# EXTRACT DATA FROM ENERGY TABLE

The source table is `sorgenia_energy`.

In [None]:
# Plant identifiers kept from the energy table; rows for any other plant are
# filtered out after the hourly grouping.
farm_list: List = [
    'UP_PRCLCDPLRM_1',
    'UP_PRCLCDMZRD_1',
    'UP_PRCLCDPRZZ_1',
    'UP_PRCLCMINEO_1',
    'UP_PEPIZZA_1',
    'UP_MPNTLCSMBC_1',
    'UP_MPNTLCDMRN_1',
]

In [None]:
# Name of the table holding the measured energy values.
energy_table: str = "sorgenia_energy"

In [None]:
# Timed extraction of the measured energy for the prediction window.
t0 = time.perf_counter()
# Placeholders, in order: table name, lower bound, upper bound. The upper bound
# is exclusive and extended by one hour past `upper`.
# NOTE(review): presumably so that sub-hourly rows inside the final hour are
# still included before the hourly grouping; confirm against group_hourly.
query_energy: str = "SELECT * FROM {} WHERE start_date_utc >= '{}' and start_date_utc < to_timestamp('{}', 'YYYY-MM-DD HH24:MI:SS') + '1 hour'::interval;"
energy_sql: str = query_energy.format(energy_table, lower, upper)
energy_raw: DataFrame = pd.read_sql_query(energy_sql, con=db_connection())
energy_hourly: DataFrame = group_hourly(energy_raw)
# Keep only the plants listed in farm_list.
df_energy: DataFrame = energy_hourly[energy_hourly['plant_name_up'].isin(farm_list)]
t1 = time.perf_counter()
print("time elapsed is ", t1-t0)

In [None]:
# Preview the hourly energy frame after filtering to the selected plants.
df_energy.head()

In [None]:
# First and last timestamps actually present in the energy data, formatted the
# same way as `lower` / `upper` so the strings can be compared directly.
energy_ts_format = "%Y-%m-%d %H:%M:%S"
energy_times = df_energy['time']
lower_en = energy_times.min().strftime(energy_ts_format)
upper_en = energy_times.max().strftime(energy_ts_format)

In [None]:
# Sanity check: the energy data must span exactly the window derived from the
# predictions. Messages are attached so a failure shows which bound is off.
assert lower_en == lower, f"energy data starts at {lower_en}, expected {lower}"
assert upper_en == upper, f"energy data ends at {upper_en}, expected {upper}"

# Rearrange df_energy into lagged schema

In [None]:
def shift_lags(df: DataFrame, n_lags: int, col: str) -> DataFrame:
    """Append forward-looking copies of ``col`` as columns ``t1`` .. ``t{n_lags}``.

    Column ``t<k>`` holds the value of ``col`` found ``k`` rows further down.
    Every row that contains a NaN in any column is then removed, which
    includes the trailing rows that have no complete set of future values.

    Args:
        df: input frame; it is copied, not modified.
        n_lags: number of shifted columns to add.
        col: name of the column to shift.

    Returns:
        A new frame with the added columns and NaN-containing rows dropped.
    """
    lagged = df.copy()
    for step in range(1, n_lags + 1):
        # Negative shift pulls values from later rows up to the current row.
        lagged[f't{step}'] = lagged[col].shift(-step)

    return lagged.dropna(axis=0, how='any')

In [None]:
# Build the 12 shifted kwh columns separately for each plant, so the shifted
# values never come from a different plant's rows.
plant_groups = df_energy.groupby(by='plant_name_up')
df_energy = plant_groups.apply(shift_lags, 12, 'kwh')

In [None]:
# Remove the outermost index level (added by the grouped apply above).
flattened_index = df_energy.index.droplevel(0)
df_energy.index = flattened_index

In [None]:
# Preview the energy frame after the shifted t1..t12 columns were added.
df_energy.head()

In [None]:
# The un-shifted kwh column is no longer needed once the t-columns exist.
df_energy = df_energy.drop(columns=['kwh'])

In [None]:
# Preview the energy frame after the kwh column was dropped.
df_energy.head()

In [None]:
# Compare the dimensions of the truth frame and the predictions frame.
df_energy.shape, preds.shape

# COMPARE DF_ENERGY WITH TFT_TESTSET_TARGETS

EXTRACT tft_testset_targets

In [None]:
# Table holding the test-set targets used as a reference for df_energy.
target_table: str = 'tft_testset_targets'

In [None]:
# Load the targets with the same interim query as the predictions, over the
# same month look-back.
targets_sql: str = preds_query_interim.format(target_table, n - 1)
targets: DataFrame = pd.read_sql_query(targets_sql, con=db_connection())

In [None]:
# Order by plant, then by forecast time, with a fresh 0..n-1 index.
target_sort_keys = ['identifier', 'forecast_time_utc']
targets = targets.sort_values(by=target_sort_keys, ignore_index=True)

ARE TARGETS AND DF_ENERGY IDENTICAL?

In [None]:
# Value arrays over all rows, taking every column from the third onward.
# NOTE(review): no later cell visible here reads these two variables; the
# `compare` function builds its own per-farm slices. They may be removable.
tar_array: ndarray = targets.iloc[:, 2:].values
df_en_array: ndarray = df_energy.iloc[:, 2:].values

In [None]:
def compare(farm: str, targets: DataFrame, df_energy: DataFrame) -> bool:
    """Report whether the value columns of two per-farm frames match numerically.

    For both frames, the values from the third column onward are compared,
    excluding the last row, using ``allclose`` with its default tolerances.
    NOTE(review): the reason for excluding the last row is not evident from
    this notebook; confirm it is intentional.

    Arrays of different shape are reported as not equal. Without this check
    ``allclose`` would raise on incompatible shapes, or silently broadcast
    compatible-but-different shapes and could report a false match.

    Args:
        farm: identifier used only in the printed message.
        targets: reference frame for this farm.
        df_energy: frame to check against the reference.

    Returns:
        True if shapes match and all values are close, else False. The same
        verdict is also printed.
    """
    tar_array: ndarray = targets.iloc[:-1, 2:].values
    df_en_array: ndarray = df_energy.iloc[:-1, 2:].values

    same_shape: bool = tar_array.shape == df_en_array.shape
    arrays_equal: bool = same_shape and bool(allclose(tar_array, df_en_array))

    if arrays_equal:
        print(f"Arrays are equal for farm {farm}")
    else:
        print(f"Arrays aren't equal for farm {farm}")
    return arrays_equal

In [None]:
# Check each farm separately: print both slice shapes, then the verdict.
for farm in targets['identifier'].unique():
    target_rows = targets['identifier'] == farm
    target_farm = targets[target_rows]
    print(target_farm.shape)
    energy_rows = df_energy['plant_name_up'] == farm
    df_energy_farm = df_energy[energy_rows]
    print(df_energy_farm.shape)
    compare(farm, target_farm, df_energy_farm)

In [None]:
# sort both df by up and time
df_energy.sort_values(by=['plant_name_up', 'time'], ascending=True, ignore_index=True, inplace=True)
preds.sort_values(by=['identifier', 'forecast_time_utc'], ascending=True, ignore_index=True, inplace=True)

# APPLY MAPE CALCULATION

In [6]:
from inference.mape import rolling_mape

In [None]:
# Compute the rolling MAPE between the measured energy and the predictions.
# NOTE(review): 700 is presumably the rolling window length and the two strings
# the time / plant column names; confirm against inference.mape.rolling_mape.
df_mape: DataFrame = rolling_mape(df_energy, preds, 700, 'forecast_time_utc', 'plant_name_up')

In [None]:
# Order the MAPE rows by plant, then time, with a fresh 0..n-1 index.
df_mape = df_mape.sort_values(by=['identifier', 'time'], ignore_index=True)

In [None]:
# Single summary figure: the mean of the per-column means, taken over every
# column from the third onward (the first two columns are skipped).
df_mape.iloc[:, 2:].mean().mean()

# WRAP UP FROM IMPORTS

In [7]:
from inference.inference import GetDataMape
from inference.constants import columns, sorgenia_farms, preds_query_interim, query_energy
from inference.evaluate import boxplotter

In [8]:
# End-to-end run with the packaged helpers: fetch both frames, compute the
# rolling MAPE, print the summary figure and the elapsed wall time.
t0 = time.perf_counter()
getdata = GetDataMape(
    last_months=3,
    preds_table='tft_testset_preds',
    truth_table='sorgenia_energy',
    preds_query=preds_query_interim,
    query_energy=query_energy,
)
preds, truths = getdata.generate()
df_mape: DataFrame = rolling_mape(truths, preds, 700, 'forecast_time_utc', 'plant_name_up')
overall_mape = df_mape.iloc[:, 2:].mean().mean()
print(overall_mape)
t1 = time.perf_counter()
print("time elapsed is ", t1-t0)

45.038461279668155
time elapsed is  25.0459128


In [9]:
# Preview the MAPE frame produced by the wrap-up run.
df_mape.head()

Unnamed: 0,time,identifier,mape_t1,mape_t2,mape_t3,mape_t4,mape_t5,mape_t6,mape_t7,mape_t8,mape_t9,mape_t10,mape_t11,mape_t12
0,2020-10-01 00:00:00,UP_MPNTLCDMRN_1,27.605005,34.274829,38.776166,41.048545,41.662221,41.822887,42.108115,42.472731,42.718891,42.629015,42.413384,42.248339
1,2020-10-01 01:00:00,UP_MPNTLCDMRN_1,27.581282,34.284252,38.797733,41.072979,41.73888,41.907389,42.197657,42.564367,42.814373,42.738221,42.514021,42.331724
2,2020-10-01 02:00:00,UP_MPNTLCDMRN_1,27.588375,34.306613,38.821299,41.148814,41.819975,41.994704,42.287535,42.65811,42.921419,42.835787,42.593913,42.368555
3,2020-10-01 03:00:00,UP_MPNTLCDMRN_1,27.609576,34.337712,38.902855,41.235277,41.910618,42.086992,42.382701,42.766306,43.020451,42.91703,42.632285,42.344304
4,2020-10-01 04:00:00,UP_MPNTLCDMRN_1,27.667149,34.44187,39.010572,41.340539,42.014137,42.191327,42.499651,42.872641,43.107398,42.958222,42.608185,42.322214


In [10]:
# Draw the box plot of the MAPE frame and display it.
# NOTE(review): the return type of boxplotter is not visible here; `.show()` is
# assumed to render the figure in the notebook. Confirm.
figure = boxplotter(df_mape)
figure.show()