In [1]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
import numpy as np
from numpy import ndarray
import pytz

In [2]:
import sys

# Put the parent directory at the front of the module search path
# (presumably so project modules one level up can be imported — confirm).
parent_dir = '..'
sys.path.insert(0, parent_dir)

In [3]:
# Best-effort loading of a local ".env" file into the process environment.
# Only the expected failure modes are handled (package not installed, or the
# file cannot be read); a bare `except:` would also swallow KeyboardInterrupt
# and SystemExit and hide genuine bugs.
try:
    from dotenv import load_dotenv

    load_dotenv()
except (ImportError, OSError):
    print('No ".env" file or python-dotenv not installed... Using default env variables...')

In [4]:
def db_connection() -> Engine:
    """Create a SQLAlchemy engine for the PostgreSQL database configured via env vars.

    Reads ``POSTGRES_DB_NAME``, ``POSTGRES_HOST``, ``POSTGRES_USERNAME``,
    ``POSTGRES_PASSWORD`` and ``POSTGRES_PORT`` from the environment and
    assembles a ``postgresql://`` URL from them.

    Returns:
        Engine: the engine created from the assembled URL.

    Raises:
        EnvironmentError: if any of the variables listed above is unset
            (previously the literal text "None" ended up inside the URL).
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote

    settings: Dict[str, Optional[str]] = {
        'POSTGRES_DB_NAME': getenv('POSTGRES_DB_NAME'),
        'POSTGRES_HOST': getenv('POSTGRES_HOST'),
        'POSTGRES_USERNAME': getenv('POSTGRES_USERNAME'),
        'POSTGRES_PASSWORD': getenv('POSTGRES_PASSWORD'),
        'POSTGRES_PORT': getenv('POSTGRES_PORT'),
    }

    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise EnvironmentError(
            'Missing database environment variable(s): ' + ', '.join(missing)
        )

    dbname = settings['POSTGRES_DB_NAME']
    host = settings['POSTGRES_HOST']
    port = settings['POSTGRES_PORT']
    # Credentials may contain URL-reserved characters ("@", "/", ":", ...);
    # percent-encode them so the URL is parsed correctly.
    user = quote(settings['POSTGRES_USERNAME'], safe='')
    password = quote(settings['POSTGRES_PASSWORD'], safe='')

    postgres_str: str = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'

    engine: Engine = create_engine(postgres_str)

    return engine

# WHAT WE HAVE TO QUERY

Let's start by defining the date intervals.

In [5]:
# Number of hours to look back from "now"; together with the current hour this
# yields 168 hourly timestamps once both ends are truncated to the hour.
HISTORY_HOURS: int = 167

# Take a single snapshot of the current UTC time so both ends of the window
# derive from the same instant (two separate now() calls could straddle an
# hour boundary and shift the window by one hour).
endDate: datetime = datetime.now(dt.timezone.utc)
startDate: datetime = endDate - dt.timedelta(hours=HISTORY_HOURS)
startDate, endDate

(datetime.datetime(2021, 7, 5, 7, 32, 45, 485412, tzinfo=<UTC>),
 datetime.datetime(2021, 7, 12, 6, 32, 45, 485412, tzinfo=<UTC>))

In [6]:
# Hourly instants for the 12 hours that follow endDate (offsets +1h .. +12h).
hour_offsets = range(1, 13)
targetDates: List[datetime] = [endDate + dt.timedelta(hours=offset) for offset in hour_offsets]
# The furthest instant in that horizon: endDate + 12 hours.
targetDate: datetime = targetDates[-1]

CONVERT TO STRINGS, TRUNCATED TO THE START OF THE HOUR (xx:00:00)

In [7]:
# The format hard-codes minutes and seconds to "00", which truncates every
# timestamp to the start of its hour.
hour_fmt = "%Y-%m-%d %H:00:00"
start: str = startDate.strftime(hour_fmt)
end: str = endDate.strftime(hour_fmt)
targets: List[str] = [moment.strftime(hour_fmt) for moment in targetDates]

In [8]:
# Display the hour-truncated bounds of the history window.
start, end

('2021-07-05 07:00:00', '2021-07-12 06:00:00')

In [9]:
# Display the 12 hour-truncated target timestamps.
targets

['2021-07-12 07:00:00',
 '2021-07-12 08:00:00',
 '2021-07-12 09:00:00',
 '2021-07-12 10:00:00',
 '2021-07-12 11:00:00',
 '2021-07-12 12:00:00',
 '2021-07-12 13:00:00',
 '2021-07-12 14:00:00',
 '2021-07-12 15:00:00',
 '2021-07-12 16:00:00',
 '2021-07-12 17:00:00',
 '2021-07-12 18:00:00']

# QUERY OBSERVED INPUTS

where are they?

table: "meteomatics_weather" (hours from start to end - 1)

table: "meteomatics_forecast_weather" (for end)

In [10]:
# Rows of "meteomatics_weather" whose timestamp_utc lies between start and end;
# the "id" column is dropped right after loading.
query1: str = "SELECT * FROM meteomatics_weather WHERE timestamp_utc between '{}' and '{}'"
observed_df1: DataFrame = (
    pd.read_sql_query(query1.format(start, end), con=db_connection())
    .drop(columns=['id'])
)

In [11]:
# Display the rows loaded from "meteomatics_weather".
observed_df1

Unnamed: 0,plant_code,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_PRCLCDPLRM_1,2021-07-05 07:00:00,16.2,25.5,1013.6,954.6,0.0,91.2,461.5,2.8,3.9,290.5,291.1,7.4,8.8,7.1,6.1,7.8,0.0,0.0
1,UP_PRCLCDMZRD_1,2021-07-05 07:00:00,20.1,28.2,1012.5,994.3,0.0,89.1,450.8,5.1,7.4,330.4,330.0,9.8,11.3,10.9,9.8,2.6,0.0,0.0
2,UP_PRCLCDPRZZ_1,2021-07-05 07:00:00,15.3,24.4,1013.8,946.8,0.0,89.7,453.8,2.6,3.8,308.0,309.0,7.0,8.7,8.1,7.0,12.0,0.0,0.0
3,UP_PRCLCMINEO_1,2021-07-05 07:00:00,8.9,29.4,1010.9,967.7,0.0,93.1,471.4,2.6,3.6,260.7,249.0,8.0,8.6,6.7,5.6,0.0,0.0,0.0
4,UP_PEPIZZA_1,2021-07-05 07:00:00,19.0,27.4,1011.0,986.9,0.0,83.5,386.4,6.0,8.9,272.2,273.5,10.3,12.2,12.1,10.3,40.3,0.0,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498,UP_PEPIZZA_1,2021-07-12 05:00:00,20.2,22.4,1015.1,991.1,0.0,65.6,0.0,3.8,6.7,269.4,270.3,7.2,8.7,8.2,6.9,49.9,0.0,0.0
1499,UP_MPNTLCSMBC_1,2021-07-12 05:00:00,11.3,20.8,1015.5,916.0,0.0,66.5,83.0,4.9,4.1,294.0,289.7,6.5,5.5,5.0,6.0,18.8,0.0,0.0
1500,UP_MPNTLCDMRN_1,2021-07-12 05:00:00,6.1,23.3,1015.8,954.5,0.0,56.4,70.1,0.4,1.2,44.7,243.5,1.4,2.6,2.6,1.4,0.0,0.0,0.0
1501,ESAPRO_P263,2021-07-12 05:00:00,14.7,25.1,1014.1,997.1,0.0,66.7,99.8,3.1,5.6,243.0,236.7,6.1,7.8,6.4,5.0,0.0,0.0,0.0


In [12]:
# Rows of "meteomatics_forecast_weather" where both the forecast timestamp and
# the query timestamp equal `end`. The result is reshaped to the same columns
# as observed_df1: "id" and "timestamp_query_utc" are dropped and
# "forecast_timestamp_utc" is renamed to "timestamp_utc".
query2: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc = '{}' and timestamp_query_utc = '{}'"
observed_df2: DataFrame = (
    pd.read_sql_query(query2.format(end, end), con=db_connection())
    .drop(columns=['id', 'timestamp_query_utc'])
    .rename(columns={'forecast_timestamp_utc': 'timestamp_utc'})
)

In [13]:
# Display the rows loaded from "meteomatics_forecast_weather" for `end`.
observed_df2

Unnamed: 0,plant_code,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_PRCLCDMZRD_1,2021-07-12 06:00:00,13.1,27.1,1015.0,997.0,0.0,92.5,202.7,0.8,1.2,94.9,76.2,1.7,2.3,2.3,1.7,0.0,0.0,0.0
1,UP_PRCLCDPRZZ_1,2021-07-12 06:00:00,8.7,25.8,1016.6,949.9,0.0,88.9,227.8,1.2,3.0,26.2,28.6,3.0,4.2,4.2,3.0,0.0,0.0,0.0
2,UP_PRCLCMINEO_1,2021-07-12 06:00:00,10.2,26.0,1014.9,971.3,0.0,92.8,241.2,1.6,1.9,231.7,221.9,2.7,2.9,2.5,1.9,0.0,0.0,0.0
3,UP_PEPIZZA_1,2021-07-12 06:00:00,19.5,24.8,1015.4,991.2,0.0,100.7,230.5,4.0,6.3,271.1,270.8,6.9,8.2,7.8,6.4,14.3,0.0,0.0
4,ESAPRO_P263,2021-07-12 06:00:00,14.2,27.7,1014.3,997.2,0.0,91.4,254.4,2.4,3.9,246.8,243.6,5.0,6.4,3.7,3.3,0.0,0.0,0.0
5,ESAPRO_P259,2021-07-12 06:00:00,18.2,24.3,1013.0,1012.7,0.0,88.1,209.4,2.8,4.5,41.4,27.9,4.9,6.0,5.6,4.6,0.0,0.8,2.0
6,UP_PRCLCDPLRM_1,2021-07-12 06:00:00,7.7,26.0,1016.5,957.6,0.0,89.4,226.8,0.6,1.8,11.2,316.7,1.9,2.8,2.8,1.9,0.0,0.0,0.0
7,UP_MPNTLCSMBC_1,2021-07-12 06:00:00,11.8,22.6,1015.8,916.3,0.0,107.0,211.8,4.4,3.6,293.0,288.8,6.0,5.0,4.3,5.3,23.2,0.0,0.0
8,UP_MPNTLCDMRN_1,2021-07-12 06:00:00,7.6,26.3,1016.3,954.8,0.0,89.2,224.2,0.5,1.6,25.3,349.5,1.9,2.9,2.9,1.9,0.0,0.0,0.0


In [14]:
# Inspect the column dtypes of observed_df2.
observed_df2.dtypes

plant_code                            object
timestamp_utc                 datetime64[ns]
dew_point_2m_C                       float64
temperature_2m_C                     float64
msl_pressure_hPa                     float64
sfc_pressure_hPa                     float64
precipitation_1h_mm                  float64
diffuse_rad_w                        float64
direct_rad_w                         float64
wind_speed_mean_10m_1h_ms            float64
wind_speed_mean_100m_1h_ms           float64
wind_dir_mean_100m_1h_d              float64
wind_dir_mean_10m_1h_d               float64
wind_gusts_10m_1h_ms                 float64
wind_gusts_100m_1h_ms                float64
wind_gusts_100m_ms                   float64
wind_gusts_10m_ms                    float64
low_cloud_cover_p                    float64
medium_cloud_cover_p                 float64
high_cloud_cover_p                   float64
dtype: object

CONCATENATE THE TWO OBSERVED DATAFRAMES

In [15]:
# Plant codes kept in the analysis; rows for any other plant_code are
# filtered out of the frames below.
farm_list: List[str] = [
    'UP_PRCLCDPLRM_1',
    'UP_PRCLCDMZRD_1',
    'UP_PRCLCDPRZZ_1',
    'UP_PRCLCMINEO_1',
    'UP_PEPIZZA_1',
    'UP_MPNTLCSMBC_1',
    'UP_MPNTLCDMRN_1',
]

In [16]:
# Stack both observed frames, order by timestamp then plant, and keep only the
# plants listed in farm_list. The index is renumbered by the sort, not by the
# final filter, so the surviving rows keep gaps in their index.
observed_df: DataFrame = (
    pd.concat([observed_df1, observed_df2], axis=0, ignore_index=True)
    .sort_values(by=['timestamp_utc', 'plant_code'], ascending=True, ignore_index=True)
    .loc[lambda frame: frame['plant_code'].isin(farm_list)]
)

In [17]:
# Display the combined, sorted and plant-filtered observed frame.
observed_df

Unnamed: 0,plant_code,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
2,UP_MPNTLCDMRN_1,2021-07-05 07:00:00,17.3,25.7,1013.5,951.9,0.0,92.4,467.7,3.1,4.3,285.0,283.3,8.8,9.4,8.0,7.1,13.3,0.0,0.0
3,UP_MPNTLCSMBC_1,2021-07-05 07:00:00,16.4,22.1,1011.4,912.5,0.0,153.6,4.5,9.8,10.2,270.8,268.5,13.4,13.7,13.6,13.3,22.4,0.0,0.0
4,UP_PEPIZZA_1,2021-07-05 07:00:00,19.0,27.4,1011.0,986.9,0.0,83.5,386.4,6.0,8.9,272.2,273.5,10.3,12.2,12.1,10.3,40.3,0.0,0.2
5,UP_PRCLCDMZRD_1,2021-07-05 07:00:00,20.1,28.2,1012.5,994.3,0.0,89.1,450.8,5.1,7.4,330.4,330.0,9.8,11.3,10.9,9.8,2.6,0.0,0.0
6,UP_PRCLCDPLRM_1,2021-07-05 07:00:00,16.2,25.5,1013.6,954.6,0.0,91.2,461.5,2.8,3.9,290.5,291.1,7.4,8.8,7.1,6.1,7.8,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1507,UP_PEPIZZA_1,2021-07-12 06:00:00,19.5,24.8,1015.4,991.2,0.0,100.7,230.5,4.0,6.3,271.1,270.8,6.9,8.2,7.8,6.4,14.3,0.0,0.0
1508,UP_PRCLCDMZRD_1,2021-07-12 06:00:00,13.1,27.1,1015.0,997.0,0.0,92.5,202.7,0.8,1.2,94.9,76.2,1.7,2.3,2.3,1.7,0.0,0.0,0.0
1509,UP_PRCLCDPLRM_1,2021-07-12 06:00:00,7.7,26.0,1016.5,957.6,0.0,89.4,226.8,0.6,1.8,11.2,316.7,1.9,2.8,2.8,1.9,0.0,0.0,0.0
1510,UP_PRCLCDPRZZ_1,2021-07-12 06:00:00,8.7,25.8,1016.6,949.9,0.0,88.9,227.8,1.2,3.0,26.2,28.6,3.0,4.2,4.2,3.0,0.0,0.0,0.0


# QUERY KNOWN INPUTS (FORECASTS)

where are they?

table: "meteomatics_forecast_weather" 

In [44]:
# query_fore: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc between '{}' and '{}'"
# known_df: DataFrame = pd.read_sql_query(query_fore.format(targets[0], targets[-1]), con=db_connection())
# known_df.drop(['id', 'timestamp_query_utc'], axis=1, inplace=True)
# # known_df.drop(['id'], axis=1, inplace=True)
# known_df: DataFrame = known_df.sort_values(by=['forecast_timestamp_utc','plant_code'], ascending=True, ignore_index=True)
# known_df: DataFrame = known_df[known_df['plant_code'].isin(farm_list)]

In [49]:
# Forecast rows whose forecast timestamp falls inside the 12-hour target horizon.
query_fore: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc between '{}' and '{}'"
known_df: DataFrame = pd.read_sql_query(query_fore.format(targets[0], targets[-1]), con=db_connection())
known_df = known_df.drop(columns=['id'])
known_df = known_df.sort_values(by=['forecast_timestamp_utc', 'plant_code'], ascending=True, ignore_index=True)

# Lead time of each row: forecast timestamp minus the time the forecast was queried.
known_df['diff'] = known_df['forecast_timestamp_utc'] - known_df['timestamp_query_utc']
# For every (plant, forecast timestamp) pair keep the row with the smallest lead time.
known_df = known_df.sort_values('diff', ascending=True).drop_duplicates(subset=['plant_code', 'forecast_timestamp_utc'], keep='first')

# Every retained forecast must have been queried at the first target hour.
# Comparing `.unique()` (an array) to a scalar inside `assert` raises
# "truth value of an array is ambiguous" as soon as more than one query time
# is present, so check emptiness explicitly and compare element-wise instead.
expected_query_time = pd.Timestamp(targets[0])
assert not known_df.empty, 'no forecast rows returned for the target horizon'
assert (known_df['timestamp_query_utc'] == expected_query_time).all(), (
    f'forecast rows were not all queried at {expected_query_time}'
)

known_df = known_df.drop(columns=['timestamp_query_utc'])
known_df = known_df[known_df['plant_code'].isin(farm_list)]

# TEST observed_df

In [51]:
# Observed rows for the single plant 'UP_MPNTLCDMRN_1'.
maserio_ob = observed_df.loc[observed_df['plant_code'].eq('UP_MPNTLCDMRN_1')]

In [52]:
# One row per hour is expected: the 167-hour look-back plus the `end` hour itself.
expected_rows = 168
assert len(maserio_ob) == expected_rows

In [53]:
# Forecast rows for the single plant 'UP_MPNTLCDMRN_1'.
maserio_kn = known_df.loc[known_df['plant_code'].eq('UP_MPNTLCDMRN_1')]

In [54]:
# Display the forecast rows selected for 'UP_MPNTLCDMRN_1'.
maserio_kn

Unnamed: 0,plant_code,forecast_timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,...,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p,diff
67,UP_MPNTLCDMRN_1,2021-07-12 07:00:00,8.5,28.1,1016.2,954.7,0.0,81.6,412.9,1.2,...,36.7,38.6,2.9,3.7,3.7,2.9,0.0,0.0,0.0,0 days 00:00:00
280,UP_MPNTLCDMRN_1,2021-07-12 08:00:00,9.6,29.8,1016.7,955.1,0.0,112.5,569.5,2.1,...,41.3,43.5,3.6,4.0,4.0,3.6,0.0,0.0,0.0,0 days 01:00:00
484,UP_MPNTLCDMRN_1,2021-07-12 09:00:00,10.7,31.3,1017.1,955.4,0.0,136.2,689.4,2.5,...,39.0,41.5,3.6,4.0,4.0,3.4,0.0,0.0,0.0,0 days 02:00:00
679,UP_MPNTLCDMRN_1,2021-07-12 10:00:00,11.4,32.8,1017.0,955.4,0.0,146.1,739.2,2.5,...,41.5,44.1,3.4,4.0,3.8,3.3,0.0,0.0,0.0,0 days 03:00:00
865,UP_MPNTLCDMRN_1,2021-07-12 11:00:00,11.6,33.9,1016.8,955.2,0.0,132.6,827.9,2.4,...,46.3,48.4,3.4,3.8,3.8,3.4,0.0,0.0,0.0,0 days 04:00:00
1042,UP_MPNTLCDMRN_1,2021-07-12 12:00:00,11.5,34.5,1016.0,954.5,0.0,131.7,810.8,2.6,...,56.8,57.0,3.8,4.3,4.3,3.8,0.0,0.0,0.0,0 days 05:00:00
1210,UP_MPNTLCDMRN_1,2021-07-12 13:00:00,11.4,34.6,1015.2,954.0,0.0,127.5,740.4,2.9,...,66.5,64.8,4.1,4.7,4.7,4.1,0.0,0.0,0.0,0 days 06:00:00
1369,UP_MPNTLCDMRN_1,2021-07-12 14:00:00,11.3,34.3,1014.6,953.5,0.0,120.1,621.8,3.0,...,69.3,66.8,4.1,4.7,4.5,3.9,0.0,0.0,0.0,0 days 07:00:00
1519,UP_MPNTLCDMRN_1,2021-07-12 15:00:00,11.5,33.6,1014.1,953.1,0.0,109.9,466.0,2.6,...,61.0,59.0,3.9,4.5,3.7,3.2,0.0,0.0,0.0,0 days 08:00:00
1660,UP_MPNTLCDMRN_1,2021-07-12 16:00:00,12.5,32.3,1014.1,953.3,0.0,95.2,290.1,1.9,...,35.3,37.5,3.2,3.7,2.3,2.2,0.0,0.0,0.0,0 days 09:00:00
