This notebook attempt to fill NaN on past targets with the last values per farm.
This would allow the model to run even without the full past week's hourly targets, albeit giving suboptimal predictions.

In [7]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
import numpy as np
from numpy import ndarray
import pytz

In [8]:
import sys
sys.path.insert(0, '..')

In [9]:
from etl.ETL import db_connection, group_hourly

In [10]:
try:
    from dotenv import load_dotenv

    load_dotenv()
except:
    print('No ".env" file or python-dotenv not installed... Using default env variables...')

In [11]:
def db_connection() -> Engine:
    dbname: Optional[str] = getenv('POSTGRES_DB_NAME')
    host: Optional[str] = getenv('POSTGRES_HOST')
    user: Optional[str] = getenv('POSTGRES_USERNAME')
    password: Optional[str] = getenv('POSTGRES_PASSWORD')
    port: Optional[str] = getenv('POSTGRES_PORT')
        
    postgres_str: str = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'
    
    engine: Engine = create_engine(postgres_str)
    
    return engine

In [12]:
startDate: str = (datetime.now(pytz.timezone('UTC')) - dt.timedelta(hours=167))
endDate: str = datetime.now(pytz.timezone('UTC'))
startDate, endDate 

(datetime.datetime(2021, 7, 6, 8, 5, 30, 885823, tzinfo=<UTC>),
 datetime.datetime(2021, 7, 13, 7, 5, 30, 885823, tzinfo=<UTC>))

In [13]:
targetDate: datetime = endDate + dt.timedelta(hours=12)
targetDates: List[datetime] = [endDate + dt.timedelta(hours=i) for i in range(1,13)]

In [14]:
start: str = startDate.strftime("%Y-%m-%d %H:00:00")
end: str = endDate.strftime("%Y-%m-%d %H:00:00")
targets: List[str] = [date.strftime("%Y-%m-%d %H:00:00") for date in targetDates]

In [15]:
start, end

('2021-07-06 08:00:00', '2021-07-13 07:00:00')

In [16]:
targets

['2021-07-13 08:00:00',
 '2021-07-13 09:00:00',
 '2021-07-13 10:00:00',
 '2021-07-13 11:00:00',
 '2021-07-13 12:00:00',
 '2021-07-13 13:00:00',
 '2021-07-13 14:00:00',
 '2021-07-13 15:00:00',
 '2021-07-13 16:00:00',
 '2021-07-13 17:00:00',
 '2021-07-13 18:00:00',
 '2021-07-13 19:00:00']

In [17]:
farm_list: List = ['UP_PRCLCDPLRM_1',
'UP_PRCLCDMZRD_1',
'UP_PRCLCDPRZZ_1',
'UP_PRCLCMINEO_1',
'UP_PEPIZZA_1',
'UP_MPNTLCSMBC_1',
'UP_MPNTLCDMRN_1']

# QUERY OBSERVED INPUTS

where are they?

table: "meteomatics_weather" (for start to end - 1)

table: "meteomatics_forecast_weather" (for end)

In [34]:
query1: str = "SELECT * FROM meteomatics_weather WHERE timestamp_utc between '{}' and '{}'"
observed_df1: DataFrame = pd.read_sql_query(query1.format(start, end), con=db_connection())
observed_df1.drop(['id'], axis=1, inplace=True)
observed_df1.rename(columns={'timestamp_utc': 'time'}, inplace=True)

In [35]:
query2: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc = '{}' and timestamp_query_utc = '{}'"
observed_df2: DataFrame = pd.read_sql_query(query2.format(end, end), con=db_connection())
observed_df2.drop(['id','timestamp_query_utc'], axis=1, inplace=True)
observed_df2.rename(columns={'forecast_timestamp_utc': 'time'}, inplace=True)

In [36]:
observed_df: DataFrame = pd.concat([observed_df1, observed_df2], axis=0, ignore_index=True)
observed_df: DataFrame = observed_df[observed_df['plant_code'].isin(farm_list)]
observed_df: DataFrame = observed_df.sort_values(by=['plant_code', 'time'], ascending=True, ignore_index=True)

In [37]:
observed_df

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,12.2,27.8,1015.6,953.8,0.0,126.1,638.2,2.4,2.9,50.0,48.9,4.4,5.4,4.3,3.8,0.0,0.0,0.0
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,11.5,29.8,1015.8,954.1,0.0,152.7,773.0,2.6,3.2,47.1,46.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,11.6,31.4,1015.8,954.1,0.0,170.5,863.0,2.8,3.4,49.0,48.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,12.2,32.4,1015.4,953.8,0.0,176.3,891.9,2.8,3.4,50.9,50.0,3.9,4.6,4.3,3.7,0.0,0.0,0.0
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,12.7,33.0,1015.1,953.6,0.0,172.2,871.2,2.8,3.2,51.3,50.4,3.8,4.3,4.3,3.8,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,UP_PRCLCMINEO_1,2021-07-13 03:00:00,11.4,21.9,1013.0,969.7,0.0,0.0,0.0,2.1,3.8,246.0,228.2,4.3,5.1,4.9,4.1,0.0,0.0,0.0
1172,UP_PRCLCMINEO_1,2021-07-13 04:00:00,10.8,22.0,1012.9,969.5,0.0,1.1,0.3,2.1,3.6,233.2,217.9,4.1,4.9,4.7,3.6,0.0,0.0,0.0
1173,UP_PRCLCMINEO_1,2021-07-13 05:00:00,11.0,24.3,1013.4,970.1,0.0,58.7,75.2,2.8,4.7,230.7,222.6,5.0,6.4,6.4,5.0,0.0,0.0,0.0
1174,UP_PRCLCMINEO_1,2021-07-13 06:00:00,10.6,27.5,1013.7,970.3,0.0,90.8,232.3,3.1,4.7,229.4,225.4,5.6,6.4,5.8,5.6,0.0,0.0,0.0


In [33]:
obs_maserio = observed_df[observed_df['plant_code']=='UP_MPNTLCDMRN_1']
obs_maserio

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,12.2,27.8,1015.6,953.8,0.0,126.1,638.2,2.4,2.9,50.0,48.9,4.4,5.4,4.3,3.8,0.0,0.0,0.0
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,11.5,29.8,1015.8,954.1,0.0,152.7,773.0,2.6,3.2,47.1,46.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,11.6,31.4,1015.8,954.1,0.0,170.5,863.0,2.8,3.4,49.0,48.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,12.2,32.4,1015.4,953.8,0.0,176.3,891.9,2.8,3.4,50.9,50.0,3.9,4.6,4.3,3.7,0.0,0.0,0.0
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,12.7,33.0,1015.1,953.6,0.0,172.2,871.2,2.8,3.2,51.3,50.4,3.8,4.3,4.3,3.8,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,UP_MPNTLCDMRN_1,2021-07-13 01:00:00,8.9,22.5,1013.4,952.8,0.0,0.0,0.0,1.7,3.0,233.7,222.1,4.0,4.9,4.9,4.0,0.0,0.0,0.0
162,UP_MPNTLCDMRN_1,2021-07-13 02:00:00,8.5,22.3,1012.7,952.2,0.0,0.0,0.0,2.0,3.4,209.7,206.9,4.8,5.7,5.7,4.8,0.0,0.0,0.0
163,UP_MPNTLCDMRN_1,2021-07-13 03:00:00,8.1,22.6,1012.2,951.7,0.0,0.0,0.0,2.2,4.4,209.7,208.9,5.3,6.3,6.3,5.3,0.0,0.0,0.0
164,UP_MPNTLCDMRN_1,2021-07-13 04:00:00,7.1,23.7,1012.0,951.4,0.0,0.0,0.0,2.4,5.4,209.2,205.9,5.7,7.4,7.4,5.7,0.0,0.0,0.0


In [38]:
query_tar: str = "SELECT * FROM sorgenia_energy WHERE start_date_utc >= '{}' and end_date_utc <= '{}'"
past_targets: DataFrame = pd.read_sql_query(query_tar.format(start, end), con=db_connection())
past_targets: DataFrame = group_hourly(past_targets)
past_targets: DataFrame = past_targets[past_targets['plant_name_up'].isin(farm_list)]

In [39]:
past_targets

Unnamed: 0,plant_name_up,time,kwh
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,0.00000
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,0.00000
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,0.06175
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,0.26850
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,0.16875
...,...,...,...
1302,UP_PRCLCMINEO_1,2021-07-12 23:00:00,0.98975
1303,UP_PRCLCMINEO_1,2021-07-13 00:00:00,3.38850
1304,UP_PRCLCMINEO_1,2021-07-13 01:00:00,0.13075
1305,UP_PRCLCMINEO_1,2021-07-13 02:00:00,0.00000


MERGE KWH TO observed_df ON time and plant_name

In [40]:
observed_df = observed_df.merge(past_targets, how='left', left_on=['plant_code', 'time'], right_on=['plant_name_up', 'time'])

In [43]:
tar_maserio = observed_df[observed_df['plant_name_up']=='UP_MPNTLCDMRN_1']
tar_maserio = tar_maserio.sort_values(by='time', ascending=True, ignore_index=True)

In [44]:
tar_maserio

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,...,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p,plant_name_up,kwh
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,12.2,27.8,1015.6,953.8,0.0,126.1,638.2,2.4,...,48.9,4.4,5.4,4.3,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.00000
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,11.5,29.8,1015.8,954.1,0.0,152.7,773.0,2.6,...,46.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.00000
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,11.6,31.4,1015.8,954.1,0.0,170.5,863.0,2.8,...,48.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.06175
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,12.2,32.4,1015.4,953.8,0.0,176.3,891.9,2.8,...,50.0,3.9,4.6,4.3,3.7,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.26850
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,12.7,33.0,1015.1,953.6,0.0,172.2,871.2,2.8,...,50.4,3.8,4.3,4.3,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.16875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,UP_MPNTLCDMRN_1,2021-07-12 23:00:00,10.0,23.6,1014.4,953.7,0.0,0.0,0.0,1.6,...,224.9,3.5,4.3,4.3,3.5,0.0,0.1,0.0,UP_MPNTLCDMRN_1,0.40325
160,UP_MPNTLCDMRN_1,2021-07-13 00:00:00,9.4,23.0,1014.1,953.5,0.0,0.0,0.0,1.7,...,235.4,3.9,4.7,4.7,3.9,0.0,0.1,0.0,UP_MPNTLCDMRN_1,0.65050
161,UP_MPNTLCDMRN_1,2021-07-13 01:00:00,8.9,22.5,1013.4,952.8,0.0,0.0,0.0,1.7,...,222.1,4.0,4.9,4.9,4.0,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.44800
162,UP_MPNTLCDMRN_1,2021-07-13 02:00:00,8.5,22.3,1012.7,952.2,0.0,0.0,0.0,2.0,...,206.9,4.8,5.7,5.7,4.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.37300


1- APPEND NEW DATES TO DF

2- FILL NAN ffill method