In [1]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
import numpy as np
from numpy import ndarray
import pytz

In [2]:
# Put the parent directory first on the import path.
# NOTE(review): presumably this is what makes the `etl` package imported in the next cell resolvable — confirm.
import sys
sys.path.insert(0, '..')

In [3]:
from etl.ETL import db_connection, group_hourly

In [4]:
# Load variables from a local ".env" file when python-dotenv is available.
# Only a missing python-dotenv package is tolerated: a bare `except:` would also
# swallow KeyboardInterrupt/SystemExit and genuine errors raised by load_dotenv().
try:
    from dotenv import load_dotenv
except ImportError:
    print('No ".env" file or python-dotenv not installed... Using default env variables...')
else:
    load_dotenv()

In [5]:
def db_connection() -> Engine:
    """Build a SQLAlchemy engine for the PostgreSQL database described by the environment.

    Reads POSTGRES_DB_NAME, POSTGRES_HOST, POSTGRES_USERNAME, POSTGRES_PORT
    (all required) and POSTGRES_PASSWORD (optional).

    Note: this definition shadows the `db_connection` imported from `etl.ETL`
    earlier in the notebook.

    Returns:
        Engine: the engine returned by `create_engine` for the assembled URL.

    Raises:
        ValueError: if any required environment variable is unset. Previously the
            literal text "None" was silently interpolated into the URL.
    """
    # Local import: only needed here, to escape the URL credentials.
    from urllib.parse import quote

    required = ('POSTGRES_DB_NAME', 'POSTGRES_HOST', 'POSTGRES_USERNAME', 'POSTGRES_PORT')
    missing = [name for name in required if getenv(name) is None]
    if missing:
        raise ValueError('Missing required environment variable(s): ' + ', '.join(missing))

    dbname: Optional[str] = getenv('POSTGRES_DB_NAME')
    host: Optional[str] = getenv('POSTGRES_HOST')
    user: Optional[str] = getenv('POSTGRES_USERNAME')
    password: Optional[str] = getenv('POSTGRES_PASSWORD')
    port: Optional[str] = getenv('POSTGRES_PORT')

    # Percent-encode the credentials so characters such as '@', ':' or '/' in a
    # user name or password cannot corrupt the URL structure.
    credentials: str = quote(user, safe='')
    if password is not None:
        credentials += ':' + quote(password, safe='')

    postgres_str: str = f'postgresql://{credentials}@{host}:{port}/{dbname}'

    return create_engine(postgres_str)

# WHAT WE HAVE TO QUERY

Let's start by defining the date intervals.

In [6]:
# Observation window: the 167 hours leading up to "now" (UTC).
# Both bounds are derived from ONE clock reading so the window is exactly 167 h
# wide and cannot straddle an hour boundary between two separate now() calls.
# The stdlib UTC tzinfo replaces pytz.timezone('UTC'); for the arithmetic and
# strftime calls below the two are equivalent.
endDate: datetime = datetime.now(dt.timezone.utc)
startDate: datetime = endDate - dt.timedelta(hours=167)
startDate, endDate

(datetime.datetime(2021, 7, 5, 15, 11, 53, 533497, tzinfo=<UTC>),
 datetime.datetime(2021, 7, 12, 14, 11, 53, 533497, tzinfo=<UTC>))

In [7]:
# Forecast horizon: the 12 hourly steps that follow the end of the observed window.
targetDates: List[datetime] = [endDate + dt.timedelta(hours=offset) for offset in range(1, 13)]
# The last step of the horizon, i.e. endDate + 12 hours.
targetDate: datetime = targetDates[-1]

CONVERT TO STRINGS, TRUNCATED TO THE START OF THE HOUR (xx:00:00)

In [8]:
# Format every timestamp as text, forcing minutes and seconds to zero so each
# value sits at the top of its hour.
hour_format: str = "%Y-%m-%d %H:00:00"
start: str = startDate.strftime(hour_format)
end: str = endDate.strftime(hour_format)
targets: List[str] = list(map(lambda moment: moment.strftime(hour_format), targetDates))

In [9]:
# Show the hour-truncated window bounds as strings.
start, end

('2021-07-05 15:00:00', '2021-07-12 14:00:00')

In [10]:
# Show the formatted target hours.
targets

['2021-07-12 15:00:00',
 '2021-07-12 16:00:00',
 '2021-07-12 17:00:00',
 '2021-07-12 18:00:00',
 '2021-07-12 19:00:00',
 '2021-07-12 20:00:00',
 '2021-07-12 21:00:00',
 '2021-07-12 22:00:00',
 '2021-07-12 23:00:00',
 '2021-07-13 00:00:00',
 '2021-07-13 01:00:00',
 '2021-07-13 02:00:00']

# QUERY OBSERVED INPUTS

where are they?

table: "meteomatics_weather" (for start to end - 1)

table: "meteomatics_forecast_weather" (for end)

In [11]:
# Observed weather for the half-open window [start, end).
# The row for `end` itself is taken from the forecast table in a later cell, so
# it is excluded here: an inclusive BETWEEN would duplicate that hour after the
# two frames are concatenated whenever the observed table already holds it.
query1: str = "SELECT * FROM meteomatics_weather WHERE timestamp_utc >= '{}' and timestamp_utc < '{}'"
observed_df1: DataFrame = (
    pd.read_sql_query(query1.format(start, end), con=db_connection())
    .drop(columns=['id'])                       # surrogate key is not a feature
    .rename(columns={'timestamp_utc': 'time'})  # common time column name across frames
)

In [12]:
# Inspect the observed weather frame (`id` dropped, `timestamp_utc` renamed to `time`).
observed_df1

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_PRCLCDPLRM_1,2021-07-05 15:00:00,14.7,28.5,1013.9,955.0,0.0,99.8,505.0,5.5,7.1,358.7,1.6,8.5,9.6,9.6,8.5,0.0,0.0,0.0
1,UP_PRCLCDMZRD_1,2021-07-05 15:00:00,16.5,31.7,1012.8,994.8,0.0,107.1,541.8,5.7,7.9,338.2,336.6,10.6,12.4,12.4,10.6,0.0,0.0,0.0
2,UP_PRCLCDPRZZ_1,2021-07-05 15:00:00,13.2,28.6,1013.7,947.0,0.0,97.6,494.2,5.6,7.4,355.5,356.7,8.8,10.3,10.3,8.8,0.0,0.0,0.0
3,UP_PRCLCMINEO_1,2021-07-05 15:00:00,11.9,33.3,1010.3,967.4,0.0,102.6,519.0,5.6,7.1,347.2,349.2,7.9,9.2,9.2,7.9,0.0,0.0,0.0
4,UP_PEPIZZA_1,2021-07-05 15:00:00,18.6,28.8,1011.2,987.0,0.0,93.4,472.5,6.6,9.5,277.9,276.6,10.7,12.0,12.0,10.7,5.5,0.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498,UP_PEPIZZA_1,2021-07-12 13:00:00,16.6,32.9,1014.9,990.5,0.0,158.4,801.5,3.5,4.9,271.6,270.5,5.3,6.6,6.0,4.9,0.0,2.8,0.0
1499,UP_MPNTLCSMBC_1,2021-07-12 13:00:00,11.1,27.8,1015.4,916.6,0.0,158.5,802.2,4.6,4.6,290.6,290.7,6.0,6.0,5.7,5.7,0.0,1.9,0.0
1500,UP_MPNTLCDMRN_1,2021-07-12 13:00:00,11.4,34.4,1015.3,954.0,0.0,159.1,805.2,2.8,3.3,56.5,56.0,4.0,4.5,4.5,4.0,0.0,0.0,0.0
1501,ESAPRO_P263,2021-07-12 13:00:00,12.2,37.6,1012.1,995.1,0.0,157.1,794.9,2.0,2.4,53.9,53.2,3.7,4.5,4.5,3.7,0.0,0.1,0.0


In [13]:
# Weather for the `end` hour, read from the forecast table: rows whose forecast
# timestamp and query timestamp both equal `end`.
query2: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc = '{}' and timestamp_query_utc = '{}'"
observed_df2: DataFrame = (
    pd.read_sql_query(query2.format(end, end), con=db_connection())
    .drop(columns=['id', 'timestamp_query_utc'])
    .rename(columns={'forecast_timestamp_utc': 'time'})
)

In [14]:
# Inspect the forecast-table rows used for the `end` hour.
observed_df2

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_PRCLCDMZRD_1,2021-07-12 14:00:00,12.8,35.0,1013.6,995.5,0.0,135.3,684.8,3.1,4.0,203.5,202.8,4.3,5.4,4.9,4.0,0.0,0.0,0.0
1,UP_PRCLCDPLRM_1,2021-07-12 14:00:00,10.9,34.5,1014.5,955.8,0.0,136.1,688.9,2.8,3.2,49.3,48.9,3.8,4.2,4.2,3.7,0.0,0.0,0.0
2,UP_PRCLCMINEO_1,2021-07-12 14:00:00,13.8,34.2,1013.3,970.3,0.0,135.0,683.2,3.1,3.8,216.9,214.3,4.3,5.2,4.9,4.1,0.0,0.0,0.0
3,UP_PEPIZZA_1,2021-07-12 14:00:00,16.5,32.1,1014.5,990.2,0.0,132.4,670.1,3.4,4.7,270.5,269.4,5.1,6.1,6.1,5.1,0.0,1.9,0.0
4,UP_MPNTLCSMBC_1,2021-07-12 14:00:00,10.2,27.2,1015.0,916.3,0.0,132.6,670.8,4.5,4.5,286.0,288.2,5.8,5.8,5.8,5.8,0.0,1.5,0.0
5,UP_MPNTLCDMRN_1,2021-07-12 14:00:00,11.6,34.0,1014.6,953.4,0.0,132.2,669.2,3.0,3.4,62.9,61.2,4.0,4.5,4.5,3.9,0.0,0.0,0.0
6,UP_PRCLCDPRZZ_1,2021-07-12 14:00:00,10.0,34.6,1014.5,948.2,0.0,133.3,674.4,1.6,1.8,24.7,26.1,2.6,3.0,2.3,2.0,0.0,0.0,0.0
7,ESAPRO_P263,2021-07-12 14:00:00,12.9,36.8,1011.8,994.8,0.0,129.8,656.7,3.4,4.3,60.6,58.9,5.2,6.6,6.6,5.2,0.0,0.0,0.0
8,ESAPRO_P259,2021-07-12 14:00:00,20.0,31.0,1011.3,1010.9,0.0,129.9,657.6,3.7,4.7,89.3,86.7,5.5,6.8,6.8,5.5,0.8,8.6,0.0


In [None]:
# Check the column dtypes of the `end`-hour frame.
observed_df2.dtypes

CONCAT THE TWO OBSERVED DF

In [15]:
# Plant codes kept in the dataset; rows for any other plant are filtered out downstream.
farm_list: List[str] = [
    'UP_PRCLCDPLRM_1',
    'UP_PRCLCDMZRD_1',
    'UP_PRCLCDPRZZ_1',
    'UP_PRCLCMINEO_1',
    'UP_PEPIZZA_1',
    'UP_MPNTLCSMBC_1',
    'UP_MPNTLCDMRN_1',
]

In [16]:
# Stack the two observed frames, keep only the plants in farm_list, and order
# the rows by plant and then by time with a fresh 0..n-1 index.
observed_df: DataFrame = (
    pd.concat([observed_df1, observed_df2], axis=0, ignore_index=True)
    .loc[lambda frame: frame['plant_code'].isin(farm_list)]
    .sort_values(by=['plant_code', 'time'], ascending=True, ignore_index=True)
)

In [17]:
# Inspect the combined observed frame (filtered to farm_list, sorted by plant_code and time).
observed_df

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_MPNTLCDMRN_1,2021-07-05 15:00:00,15.2,28.2,1014.1,952.5,0.0,98.2,496.7,4.9,6.3,356.1,358.8,7.8,8.8,8.8,7.8,0.5,0.0,0.0
1,UP_MPNTLCDMRN_1,2021-07-05 16:00:00,15.0,26.7,1014.2,952.7,0.0,70.2,355.1,4.5,5.8,0.0,2.7,7.8,8.8,8.4,7.4,0.4,0.0,0.0
2,UP_MPNTLCDMRN_1,2021-07-05 17:00:00,15.7,24.8,1014.3,952.8,0.0,76.4,117.7,3.8,4.9,6.4,9.5,6.5,7.4,6.9,6.0,0.4,0.0,0.0
3,UP_MPNTLCDMRN_1,2021-07-05 18:00:00,16.4,23.1,1014.3,952.8,0.0,22.7,0.0,3.3,4.3,11.1,14.4,6.0,6.9,6.6,5.8,0.0,0.0,0.0
4,UP_MPNTLCDMRN_1,2021-07-05 19:00:00,17.2,21.4,1014.7,953.2,0.0,0.0,0.0,2.6,3.6,16.2,20.0,5.8,6.6,6.1,5.1,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,UP_PRCLCMINEO_1,2021-07-12 10:00:00,4.3,34.6,1014.8,971.6,0.0,163.8,829.0,0.4,0.5,226.0,214.8,0.9,1.1,1.1,0.9,0.0,0.0,0.0
1172,UP_PRCLCMINEO_1,2021-07-12 11:00:00,5.9,35.4,1014.5,971.3,0.0,174.4,882.8,1.4,1.6,222.9,218.0,2.5,2.9,2.9,2.5,0.0,0.0,0.0
1173,UP_PRCLCMINEO_1,2021-07-12 12:00:00,8.6,35.7,1013.9,970.9,0.0,168.7,853.5,2.2,2.6,217.4,214.3,3.6,4.2,4.2,3.6,0.0,0.0,0.0
1174,UP_PRCLCMINEO_1,2021-07-12 13:00:00,11.7,35.0,1013.6,970.5,0.0,161.9,819.5,2.9,3.5,215.6,212.9,4.3,5.2,5.2,4.3,0.0,0.0,0.0


# QUERY TARGETS

where are they?

table: "sorgenia_energy"

In [27]:
# Energy readings whose interval lies inside [start, end], aggregated by the
# project's `group_hourly` helper and then restricted to the plants in farm_list.
query_tar: str = "SELECT * FROM sorgenia_energy WHERE start_date_utc >= '{}' and end_date_utc <= '{}'"
past_targets: DataFrame = group_hourly(
    pd.read_sql_query(query_tar.format(start, end), con=db_connection())
)
past_targets = past_targets.loc[past_targets['plant_name_up'].isin(farm_list)]

In [28]:
# Inspect the hourly targets restricted to farm_list.
past_targets

Unnamed: 0,plant_name_up,time,kwh
0,UP_MPNTLCDMRN_1,2021-07-05 15:00:00,1.67125
1,UP_MPNTLCDMRN_1,2021-07-05 16:00:00,1.90550
2,UP_MPNTLCDMRN_1,2021-07-05 17:00:00,0.82500
3,UP_MPNTLCDMRN_1,2021-07-05 18:00:00,0.30975
4,UP_MPNTLCDMRN_1,2021-07-05 19:00:00,0.16900
...,...,...,...
1278,UP_PRCLCMINEO_1,2021-07-12 03:00:00,0.23850
1279,UP_PRCLCMINEO_1,2021-07-12 04:00:00,0.01975
1280,UP_PRCLCMINEO_1,2021-07-12 05:00:00,0.00000
1281,UP_PRCLCMINEO_1,2021-07-12 06:00:00,0.00000


MERGE KWH INTO observed_df ON time AND PLANT (observed_df.plant_code = past_targets.plant_name_up)

In [None]:
# Left join: every observed weather row is kept, and picks up the target columns
# where a row with the same plant and time exists in past_targets.
observed_df = pd.merge(
    observed_df,
    past_targets,
    how='left',
    left_on=['plant_code', 'time'],
    right_on=['plant_name_up', 'time'],
)

In [None]:
# Inspect observed_df after the left merge with past_targets.
observed_df

In [None]:
# Forward-fill missing energy readings WITHIN each plant only.
# observed_df is ordered by plant_code then time, so a frame-wide forward fill
# would copy the last reading of one plant into the leading gaps of the next.
# GroupBy.ffill also replaces the deprecated fillna(method='ffill') spelling.
observed_df['kwh'] = observed_df.groupby('plant_code')['kwh'].ffill()
# The right-hand join key duplicates plant_code and is no longer needed.
observed_df = observed_df.drop(columns=['plant_name_up'])

In [None]:
# Inspect observed_df after filling kwh and dropping plant_name_up.
observed_df

# QUERY KNOWN INPUTS (FORECASTS)

where are they?

table: "meteomatics_forecast_weather" 

In [None]:
# query_fore: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc between '{}' and '{}'"
# known_df: DataFrame = pd.read_sql_query(query_fore.format(targets[0], targets[-1]), con=db_connection())
# known_df.drop(['id', 'timestamp_query_utc'], axis=1, inplace=True)
# # known_df.drop(['id'], axis=1, inplace=True)
# known_df: DataFrame = known_df.sort_values(by=['forecast_timestamp_utc','plant_code'], ascending=True, ignore_index=True)
# known_df: DataFrame = known_df[known_df['plant_code'].isin(farm_list)]

In [None]:
# Forecast weather for the target hours, reduced to one row per plant and hour.
# NOTE(review): `timestamp_query_utc` is presumably when the forecast was issued,
# so the smallest `diff` would be the most recently issued forecast — confirm.
query_fore: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc between '{}' and '{}'"
known_df: DataFrame = (
    pd.read_sql_query(query_fore.format(targets[0], targets[-1]), con=db_connection())
    .drop(columns=['id'])
    .sort_values(by=['forecast_timestamp_utc', 'plant_code'], ascending=True, ignore_index=True)
    # gap between the forecast timestamp and the query timestamp
    .assign(diff=lambda frame: frame['forecast_timestamp_utc'] - frame['timestamp_query_utc'])
    # smallest gap first, so keep='first' retains it for each (plant, hour) pair
    .sort_values('diff', ascending=True)
    .drop_duplicates(subset=['plant_code', 'forecast_timestamp_utc'], keep='first')
    # assert known_df['timestamp_query_utc'].unique() == pd.Timestamp(targets[0])
    .drop(columns=['timestamp_query_utc'])
    .loc[lambda frame: frame['plant_code'].isin(farm_list)]
    .rename(columns={'forecast_timestamp_utc': 'time'})
)

# TEST observed_df

In [None]:
# Observed rows for a single plant, used as a spot check.
maserio_ob = observed_df.loc[observed_df['plant_code'] == 'UP_MPNTLCDMRN_1']

In [None]:
# 167 hourly steps back from `end`, plus the `end` hour itself, gives 168 rows per plant.
assert maserio_ob.shape[0] == 168

In [None]:
# Forecast rows for the same plant, for comparison with the observed slice.
maserio_kn = known_df.loc[known_df['plant_code'] == 'UP_MPNTLCDMRN_1']

In [None]:
# Inspect the forecast rows for plant UP_MPNTLCDMRN_1.
maserio_kn

# CONCATENATE OBSERVED AND KNOWN

In [None]:
# Columns kept from both the observed and the forecast frames before they are
# concatenated: plant id, time, and the selected weather variables.
columns = [
    'plant_code',
    'time',
    'dew_point_2m_C',
    'temperature_2m_C',
    'msl_pressure_hPa',
    'sfc_pressure_hPa',
    'precipitation_1h_mm',
    'wind_speed_mean_10m_1h_ms',
    'wind_speed_mean_100m_1h_ms',
    'wind_dir_mean_100m_1h_d',
    'wind_dir_mean_10m_1h_d',
    'wind_gusts_10m_1h_ms',
    'wind_gusts_10m_ms',
]

In [None]:
# Project both frames onto the same column set, in the same order, so they can be stacked.
observed_df = observed_df.loc[:, columns]
known_df = known_df.loc[:, columns]

In [None]:
# Stack the observed and forecast rows, then order them by plant and time.
df: DataFrame = (
    pd.concat([observed_df, known_df], axis=0, ignore_index=True)
    .sort_values(['plant_code', 'time'], ascending=True, ignore_index=True)
)

In [None]:
# add other engineered features
timestamp_s: Series = df['time'].map(datetime.timestamp)

day: int = 24 * 60 * 60
year: float = 365.2425 * day

df['Day sin']: Series = np.sin(timestamp_s * (2 * np.pi / day))
df['Day cos']: Series = np.cos(timestamp_s * (2 * np.pi / day))
df['Year sin']: Series = np.sin(timestamp_s * (2 * np.pi / year))
df['Year cos']: Series = np.cos(timestamp_s * (2 * np.pi / year))
    
earliest_time: Timestamp = df.time.min()
df['t']: Series = (df['time'] - earliest_time).dt.seconds / 60 / 60 + (df['time'] - earliest_time).dt.days * 24
df['days_from_start']: Series = (df['time'] - earliest_time).dt.days
df["id"] = df["plant_code"]
df['hour']: Series = df["time"].dt.hour
df['day']: Series = df["time"].dt.day
df['day_of_week']: Series = df["time"].dt.dayofweek
df['month']: Series = df["time"].dt.month
df['categorical_id']: Series = df['id'].copy()
df['hours_from_start']: Series = df['t']
df['categorical_day_of_week']: Series = df['day_of_week'].copy()
df['categorical_hour']: Series = df['hour'].copy()

In [None]:
# Preview the first rows of the final feature frame.
df.head()