In [11]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
import numpy as np
from numpy import ndarray
import pytz

In [12]:
import sys

# Put the parent directory first on the import path (presumably where the `etl`
# package imported further down lives — confirm). The guard keeps the cell
# idempotent: re-running it no longer prepends another duplicate '..' entry.
if sys.path[:1] != ['..']:
    sys.path.insert(0, '..')

In [13]:
from etl.ETL import db_connection, group_hourly

In [14]:
# Load variables from a local ".env" file when python-dotenv is available;
# otherwise rely on whatever is already set in the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    # Only a missing package is tolerated; any other failure should surface
    # instead of being hidden by a bare `except`.
    _dotenv_loaded = False
else:
    # load_dotenv() does not raise when no ".env" file is found: it reports the
    # outcome through its return value, so that is what has to be checked.
    _dotenv_loaded = load_dotenv()

if not _dotenv_loaded:
    print('No ".env" file or python-dotenv not installed... Using default env variables...')

In [15]:
def db_connection() -> Engine:
    """Create a SQLAlchemy engine for the PostgreSQL database named in the environment.

    Reads ``POSTGRES_DB_NAME``, ``POSTGRES_HOST``, ``POSTGRES_USERNAME``,
    ``POSTGRES_PASSWORD`` and ``POSTGRES_PORT`` and assembles the URL
    ``postgresql://user:password@host:port/dbname``.

    Returns:
        Engine: the engine returned by ``create_engine`` for that URL.

    Raises:
        EnvironmentError: if any of the variables is unset. Without this check
            an unset variable would be interpolated into the URL as the
            literal text ``None``.

    NOTE(review): this definition shadows the ``db_connection`` imported from
    ``etl.ETL`` earlier in the notebook — confirm which one is intended.
    """
    settings: Dict[str, Optional[str]] = {
        name: getenv(name)
        for name in (
            'POSTGRES_DB_NAME',
            'POSTGRES_HOST',
            'POSTGRES_USERNAME',
            'POSTGRES_PASSWORD',
            'POSTGRES_PORT',
        )
    }

    # Empty strings are left alone on purpose; only variables that are truly
    # absent are rejected.
    missing: List[str] = [name for name, value in settings.items() if value is None]
    if missing:
        raise EnvironmentError(
            'Missing database environment variables: ' + ', '.join(missing)
        )

    dbname: Optional[str] = settings['POSTGRES_DB_NAME']
    host: Optional[str] = settings['POSTGRES_HOST']
    user: Optional[str] = settings['POSTGRES_USERNAME']
    password: Optional[str] = settings['POSTGRES_PASSWORD']
    port: Optional[str] = settings['POSTGRES_PORT']

    postgres_str: str = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'

    engine: Engine = create_engine(postgres_str)

    return engine

# WHAT WE HAVE TO QUERY

Let's start by defining the date intervals.

In [16]:
# Rolling look-back window: from 167 hours before "now" up to "now", both as
# timezone-aware UTC datetimes.
# The clock is sampled exactly once so that both bounds derive from the same
# instant; two separate now() calls can straddle an hour boundary, which would
# stretch the hour-floored window by one extra hourly step.
now_utc: datetime = datetime.now(dt.timezone.utc)
startDate: datetime = now_utc - dt.timedelta(hours=167)
endDate: datetime = now_utc
startDate, endDate

(datetime.datetime(2021, 7, 6, 8, 57, 36, 194680, tzinfo=<UTC>),
 datetime.datetime(2021, 7, 13, 7, 57, 36, 194680, tzinfo=<UTC>))

In [17]:
# Forecast horizon: the twelve hourly steps that follow endDate (endDate + 1 h
# through endDate + 12 h).
targetDates: List[datetime] = [endDate + dt.timedelta(hours=step) for step in range(1, 13)]
# The furthest point of the horizon, i.e. endDate + 12 h.
targetDate: datetime = targetDates[-1]

CONVERT INTO STRINGS AND PUSH BACK TO THE START OF THE HOUR (xx:00:00)

In [18]:
# Render each timestamp as text truncated to the top of its hour: minutes and
# seconds are hard-coded to 00 in the format string.
hour_floor_format: str = "%Y-%m-%d %H:00:00"
start: str = startDate.strftime(hour_floor_format)
end: str = endDate.strftime(hour_floor_format)
targets: List[str] = [moment.strftime(hour_floor_format) for moment in targetDates]

In [19]:
start, end

('2021-07-06 08:00:00', '2021-07-13 07:00:00')

In [20]:
targets

['2021-07-13 08:00:00',
 '2021-07-13 09:00:00',
 '2021-07-13 10:00:00',
 '2021-07-13 11:00:00',
 '2021-07-13 12:00:00',
 '2021-07-13 13:00:00',
 '2021-07-13 14:00:00',
 '2021-07-13 15:00:00',
 '2021-07-13 16:00:00',
 '2021-07-13 17:00:00',
 '2021-07-13 18:00:00',
 '2021-07-13 19:00:00']

# QUERY OBSERVED INPUTS

where are they?

table: "meteomatics_weather" (for start to end - 1)

table: "meteomatics_forecast_weather" (for end)

In [21]:
# Observed weather for the look-back window [start, end).
# The upper bound is exclusive on purpose: the `end` hour itself is read from
# the forecast table in a later cell, so an inclusive BETWEEN would yield a
# duplicated `end` row per plant as soon as the observed table also holds it.
query1: str = "SELECT * FROM meteomatics_weather WHERE timestamp_utc >= '{}' and timestamp_utc < '{}'"
observed_df1: DataFrame = (
    pd.read_sql_query(query1.format(start, end), con=db_connection())
    .drop(columns=['id'])                       # surrogate key, not a feature
    .rename(columns={'timestamp_utc': 'time'})  # common time column name
)

In [22]:
observed_df1

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_PRCLCDPLRM_1,2021-07-06 08:00:00,11.2,28.0,1015.6,956.4,0.0,121.8,616.2,2.1,2.7,39.6,38.3,3.4,4.4,3.9,3.4,0.0,0.0,0.0
1,UP_PRCLCDMZRD_1,2021-07-06 08:00:00,13.9,31.9,1014.0,995.9,0.0,124.7,630.9,2.7,3.7,33.9,33.1,5.5,5.8,5.1,4.6,0.0,0.0,0.0
2,UP_PRCLCDPRZZ_1,2021-07-06 08:00:00,9.8,27.6,1015.6,948.7,0.0,120.4,609.1,2.3,3.0,40.9,39.1,5.0,6.0,4.0,3.6,0.0,0.0,0.0
3,UP_PRCLCMINEO_1,2021-07-06 08:00:00,10.1,29.2,1014.0,970.5,0.0,119.6,605.5,1.9,2.3,77.3,72.5,3.3,3.9,3.9,3.3,0.0,0.0,0.0
4,UP_PEPIZZA_1,2021-07-06 08:00:00,19.8,28.0,1014.3,990.1,0.0,114.9,486.4,5.4,7.7,272.9,273.4,8.3,9.9,9.9,8.3,48.5,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498,UP_PEPIZZA_1,2021-07-13 06:00:00,15.7,28.9,1013.1,989.2,0.0,88.2,264.8,0.8,1.8,67.0,63.1,2.8,3.8,3.8,2.8,0.0,0.0,0.0
1499,UP_MPNTLCSMBC_1,2021-07-13 06:00:00,3.8,25.6,1013.3,915.4,0.0,89.0,264.1,2.8,3.4,157.1,146.3,5.1,5.6,5.6,5.1,0.0,0.0,0.0
1500,UP_MPNTLCDMRN_1,2021-07-13 06:00:00,7.3,29.5,1012.7,951.9,0.0,88.3,223.0,3.4,6.5,208.0,203.5,6.6,8.5,8.1,6.4,0.0,0.0,3.7
1501,ESAPRO_P263,2021-07-13 06:00:00,14.7,29.4,1011.7,994.7,0.0,88.6,259.1,2.9,5.0,174.1,177.6,5.7,7.3,5.7,5.4,0.0,0.0,0.0


In [23]:
# Forecast rows whose forecast time and query time both equal `end`; they
# provide the `end`-hour row of the observed inputs.
query2: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc = '{}' and timestamp_query_utc = '{}'"
observed_df2: DataFrame = (
    pd.read_sql_query(query2.format(end, end), con=db_connection())
    .drop(columns=['id', 'timestamp_query_utc'])
    .rename(columns={'forecast_timestamp_utc': 'time'})
)

In [24]:
observed_df2

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_PRCLCDPLRM_1,2021-07-13 07:00:00,7.9,31.2,1013.5,954.8,0.0,74.0,375.4,3.7,5.7,202.1,199.0,6.5,8.3,6.4,6.0,0.0,0.0,0.0
1,UP_PRCLCMINEO_1,2021-07-13 07:00:00,9.9,30.0,1013.4,970.1,0.0,78.9,399.1,3.1,4.3,228.3,226.1,5.6,5.8,5.3,4.9,0.0,0.0,0.0
2,UP_PEPIZZA_1,2021-07-13 07:00:00,11.4,32.5,1013.0,988.9,0.0,81.8,413.9,1.7,3.0,88.2,90.2,4.3,4.7,4.7,4.3,0.0,0.0,0.0
3,ESAPRO_P263,2021-07-13 07:00:00,14.1,32.9,1011.2,994.2,0.0,83.8,423.9,2.8,4.0,170.0,168.4,5.4,5.7,4.9,4.6,0.0,0.0,0.0
4,ESAPRO_P259,2021-07-13 07:00:00,21.8,27.9,1007.3,1007.0,0.0,121.9,228.9,3.8,5.3,58.8,55.4,7.4,8.4,5.2,4.6,0.2,9.8,46.7
5,UP_PRCLCDMZRD_1,2021-07-13 07:00:00,17.6,28.0,1013.0,994.9,0.0,87.1,310.3,5.0,7.9,151.9,154.2,11.2,12.0,11.4,10.6,2.5,0.0,0.0
6,UP_PRCLCDPRZZ_1,2021-07-13 07:00:00,7.7,29.5,1014.1,947.5,0.0,78.6,397.8,3.4,5.2,198.6,196.2,6.4,7.4,6.7,6.4,0.0,0.0,0.0
7,UP_MPNTLCSMBC_1,2021-07-13 07:00:00,5.1,29.2,1013.1,915.3,0.0,84.8,429.3,4.2,4.6,157.2,148.8,5.9,6.3,6.3,5.9,0.0,0.0,0.0
8,UP_MPNTLCDMRN_1,2021-07-13 07:00:00,7.3,32.3,1013.1,952.1,0.0,87.8,318.9,3.3,5.2,213.3,213.4,6.4,8.1,6.4,6.0,0.0,0.0,0.0


In [25]:
observed_df2.dtypes

plant_code                            object
time                          datetime64[ns]
dew_point_2m_C                       float64
temperature_2m_C                     float64
msl_pressure_hPa                     float64
sfc_pressure_hPa                     float64
precipitation_1h_mm                  float64
diffuse_rad_w                        float64
direct_rad_w                         float64
wind_speed_mean_10m_1h_ms            float64
wind_speed_mean_100m_1h_ms           float64
wind_dir_mean_100m_1h_d              float64
wind_dir_mean_10m_1h_d               float64
wind_gusts_10m_1h_ms                 float64
wind_gusts_100m_1h_ms                float64
wind_gusts_100m_ms                   float64
wind_gusts_10m_ms                    float64
low_cloud_cover_p                    float64
medium_cloud_cover_p                 float64
high_cloud_cover_p                   float64
dtype: object

CONCAT THE TWO OBSERVED DF

In [26]:
# Plant codes kept for the analysis; rows belonging to any other plant are
# filtered out of the frames built below.
farm_list: List[str] = [
    'UP_PRCLCDPLRM_1',
    'UP_PRCLCDMZRD_1',
    'UP_PRCLCDPRZZ_1',
    'UP_PRCLCMINEO_1',
    'UP_PEPIZZA_1',
    'UP_MPNTLCSMBC_1',
    'UP_MPNTLCDMRN_1',
]

In [27]:
# Stack the window observations on top of the `end`-hour rows, keep only the
# plants listed in farm_list, then order chronologically within each plant.
observed_df: DataFrame = (
    pd.concat([observed_df1, observed_df2], axis=0, ignore_index=True)
    .loc[lambda frame: frame['plant_code'].isin(farm_list)]
    .sort_values(by=['plant_code', 'time'], ascending=True, ignore_index=True)
)

In [28]:
observed_df

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,12.2,27.8,1015.6,953.8,0.0,126.1,638.2,2.4,2.9,50.0,48.9,4.4,5.4,4.3,3.8,0.0,0.0,0.0
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,11.5,29.8,1015.8,954.1,0.0,152.7,773.0,2.6,3.2,47.1,46.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,11.6,31.4,1015.8,954.1,0.0,170.5,863.0,2.8,3.4,49.0,48.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,12.2,32.4,1015.4,953.8,0.0,176.3,891.9,2.8,3.4,50.9,50.0,3.9,4.6,4.3,3.7,0.0,0.0,0.0
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,12.7,33.0,1015.1,953.6,0.0,172.2,871.2,2.8,3.2,51.3,50.4,3.8,4.3,4.3,3.8,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,UP_PRCLCMINEO_1,2021-07-13 03:00:00,11.4,21.9,1013.0,969.7,0.0,0.0,0.0,2.1,3.8,246.0,228.2,4.3,5.1,4.9,4.1,0.0,0.0,0.0
1172,UP_PRCLCMINEO_1,2021-07-13 04:00:00,10.8,22.0,1012.9,969.5,0.0,1.1,0.3,2.1,3.6,233.2,217.9,4.1,4.9,4.7,3.6,0.0,0.0,0.0
1173,UP_PRCLCMINEO_1,2021-07-13 05:00:00,11.0,24.3,1013.4,970.1,0.0,58.7,75.2,2.8,4.7,230.7,222.6,5.0,6.4,6.4,5.0,0.0,0.0,0.0
1174,UP_PRCLCMINEO_1,2021-07-13 06:00:00,10.6,27.5,1013.7,970.3,0.0,90.8,232.3,3.1,4.7,229.4,225.4,5.6,6.4,5.8,5.6,0.0,0.0,0.0


# QUERY TARGETS

where are they?

table: "sorgenia_energy"

In [29]:
# Energy readings that lie fully inside [start, end], passed through
# group_hourly (imported from etl.ETL; presumably aggregates to one row per
# plant and hour — confirm) and restricted to the plants in farm_list.
query_tar: str = "SELECT * FROM sorgenia_energy WHERE start_date_utc >= '{}' and end_date_utc <= '{}'"
past_targets: DataFrame = (
    pd.read_sql_query(query_tar.format(start, end), con=db_connection())
    .pipe(group_hourly)
    .loc[lambda frame: frame['plant_name_up'].isin(farm_list)]
)

In [31]:
past_targets

Unnamed: 0,plant_name_up,time,kwh
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,0.00000
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,0.00000
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,0.06175
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,0.26850
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,0.16875
...,...,...,...
1302,UP_PRCLCMINEO_1,2021-07-12 23:00:00,0.98975
1303,UP_PRCLCMINEO_1,2021-07-13 00:00:00,3.38850
1304,UP_PRCLCMINEO_1,2021-07-13 01:00:00,0.13075
1305,UP_PRCLCMINEO_1,2021-07-13 02:00:00,0.00000


MERGE KWH TO observed_df ON time and plant_name

In [32]:
# Left join: every weather row is kept, and hours without a matching energy
# reading end up with NaN in `plant_name_up` and `kwh`.
observed_df = pd.merge(
    observed_df,
    past_targets,
    how='left',
    left_on=['plant_code', 'time'],
    right_on=['plant_name_up', 'time'],
)

In [33]:
observed_df

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,...,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p,plant_name_up,kwh
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,12.2,27.8,1015.6,953.8,0.0,126.1,638.2,2.4,...,48.9,4.4,5.4,4.3,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.00000
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,11.5,29.8,1015.8,954.1,0.0,152.7,773.0,2.6,...,46.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.00000
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,11.6,31.4,1015.8,954.1,0.0,170.5,863.0,2.8,...,48.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.06175
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,12.2,32.4,1015.4,953.8,0.0,176.3,891.9,2.8,...,50.0,3.9,4.6,4.3,3.7,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.26850
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,12.7,33.0,1015.1,953.6,0.0,172.2,871.2,2.8,...,50.4,3.8,4.3,4.3,3.8,0.0,0.0,0.0,UP_MPNTLCDMRN_1,0.16875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,UP_PRCLCMINEO_1,2021-07-13 03:00:00,11.4,21.9,1013.0,969.7,0.0,0.0,0.0,2.1,...,228.2,4.3,5.1,4.9,4.1,0.0,0.0,0.0,UP_PRCLCMINEO_1,0.13800
1172,UP_PRCLCMINEO_1,2021-07-13 04:00:00,10.8,22.0,1012.9,969.5,0.0,1.1,0.3,2.1,...,217.9,4.1,4.9,4.7,3.6,0.0,0.0,0.0,,
1173,UP_PRCLCMINEO_1,2021-07-13 05:00:00,11.0,24.3,1013.4,970.1,0.0,58.7,75.2,2.8,...,222.6,5.0,6.4,6.4,5.0,0.0,0.0,0.0,,
1174,UP_PRCLCMINEO_1,2021-07-13 06:00:00,10.6,27.5,1013.7,970.3,0.0,90.8,232.3,3.1,...,225.4,5.6,6.4,5.8,5.6,0.0,0.0,0.0,,


In [34]:
# Carry the last known energy reading forward into hours that have weather data
# but no energy reading yet. The fill is done per plant: the frame is ordered
# by (plant_code, time), so a frame-wide forward fill would let the last value
# of one plant leak into any leading gap of the next plant.
observed_df['kwh'] = observed_df.groupby('plant_code')['kwh'].ffill()
# `plant_name_up` duplicates `plant_code` after the merge; errors='ignore'
# keeps the cell re-runnable once the column is already gone.
observed_df = observed_df.drop(columns=['plant_name_up'], errors='ignore')

In [35]:
observed_df

Unnamed: 0,plant_code,time,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,diffuse_rad_w,direct_rad_w,wind_speed_mean_10m_1h_ms,...,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms,low_cloud_cover_p,medium_cloud_cover_p,high_cloud_cover_p,kwh
0,UP_MPNTLCDMRN_1,2021-07-06 08:00:00,12.2,27.8,1015.6,953.8,0.0,126.1,638.2,2.4,...,50.0,48.9,4.4,5.4,4.3,3.8,0.0,0.0,0.0,0.00000
1,UP_MPNTLCDMRN_1,2021-07-06 09:00:00,11.5,29.8,1015.8,954.1,0.0,152.7,773.0,2.6,...,47.1,46.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0,0.00000
2,UP_MPNTLCDMRN_1,2021-07-06 10:00:00,11.6,31.4,1015.8,954.1,0.0,170.5,863.0,2.8,...,49.0,48.3,3.8,4.5,4.5,3.8,0.0,0.0,0.0,0.06175
3,UP_MPNTLCDMRN_1,2021-07-06 11:00:00,12.2,32.4,1015.4,953.8,0.0,176.3,891.9,2.8,...,50.9,50.0,3.9,4.6,4.3,3.7,0.0,0.0,0.0,0.26850
4,UP_MPNTLCDMRN_1,2021-07-06 12:00:00,12.7,33.0,1015.1,953.6,0.0,172.2,871.2,2.8,...,51.3,50.4,3.8,4.3,4.3,3.8,0.0,0.0,0.0,0.16875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,UP_PRCLCMINEO_1,2021-07-13 03:00:00,11.4,21.9,1013.0,969.7,0.0,0.0,0.0,2.1,...,246.0,228.2,4.3,5.1,4.9,4.1,0.0,0.0,0.0,0.13800
1172,UP_PRCLCMINEO_1,2021-07-13 04:00:00,10.8,22.0,1012.9,969.5,0.0,1.1,0.3,2.1,...,233.2,217.9,4.1,4.9,4.7,3.6,0.0,0.0,0.0,0.13800
1173,UP_PRCLCMINEO_1,2021-07-13 05:00:00,11.0,24.3,1013.4,970.1,0.0,58.7,75.2,2.8,...,230.7,222.6,5.0,6.4,6.4,5.0,0.0,0.0,0.0,0.13800
1174,UP_PRCLCMINEO_1,2021-07-13 06:00:00,10.6,27.5,1013.7,970.3,0.0,90.8,232.3,3.1,...,229.4,225.4,5.6,6.4,5.8,5.6,0.0,0.0,0.0,0.13800


# QUERY KNOWN INPUTS (FORECASTS)

where are they?

table: "meteomatics_forecast_weather" 

In [None]:
# query_fore: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc between '{}' and '{}'"
# known_df: DataFrame = pd.read_sql_query(query_fore.format(targets[0], targets[-1]), con=db_connection())
# known_df.drop(['id', 'timestamp_query_utc'], axis=1, inplace=True)
# # known_df.drop(['id'], axis=1, inplace=True)
# known_df: DataFrame = known_df.sort_values(by=['forecast_timestamp_utc','plant_code'], ascending=True, ignore_index=True)
# known_df: DataFrame = known_df[known_df['plant_code'].isin(farm_list)]

In [None]:
# Forecast weather for the target hours. Several forecasts may exist for the
# same (plant, forecast hour), issued at different query times; only the one
# with the smallest lead time (forecast time minus query time) is kept.
query_fore: str = "SELECT * FROM meteomatics_forecast_weather WHERE forecast_timestamp_utc between '{}' and '{}'"
known_df: DataFrame = (
    pd.read_sql_query(query_fore.format(targets[0], targets[-1]), con=db_connection())
    .drop(columns=['id'])
    .sort_values(by=['forecast_timestamp_utc', 'plant_code'], ascending=True, ignore_index=True)
    # Lead time of each forecast row.
    .assign(diff=lambda frame: frame['forecast_timestamp_utc'] - frame['timestamp_query_utc'])
    # Stable sort so ties on lead time keep a deterministic order before
    # de-duplication picks the first row of every (plant, hour) pair.
    .sort_values('diff', ascending=True, kind='mergesort')
    .drop_duplicates(subset=['plant_code', 'forecast_timestamp_utc'], keep='first')
    .drop(columns=['timestamp_query_utc'])
    .loc[lambda frame: frame['plant_code'].isin(farm_list)]
    .rename(columns={'forecast_timestamp_utc': 'time'})
    # Energy is unknown for future hours. Built in one chain so nothing is
    # assigned onto a filtered slice (which triggers SettingWithCopyWarning).
    .assign(kwh=np.nan)
)

# TEST observed_df

In [None]:
# Observed rows of a single plant, used as a spot check.
maserio_ob = observed_df.loc[observed_df['plant_code'] == 'UP_MPNTLCDMRN_1']

In [None]:
# 168 = the 167 hourly steps of the look-back window plus the `end` hour.
assert maserio_ob.shape[0] == 168

In [None]:
# Forecast (known) rows of the same plant, for visual inspection.
maserio_kn = known_df.loc[known_df['plant_code'] == 'UP_MPNTLCDMRN_1']

In [None]:
maserio_kn

# CONCATENATE OBSERVED AND KNOWN

In [None]:
# Columns shared by the observed and the known frame, in the order used for the
# combined dataset.
columns = [
    'plant_code',
    'time',
    'kwh',
    'dew_point_2m_C',
    'temperature_2m_C',
    'msl_pressure_hPa',
    'sfc_pressure_hPa',
    'precipitation_1h_mm',
    'wind_speed_mean_10m_1h_ms',
    'wind_speed_mean_100m_1h_ms',
    'wind_dir_mean_100m_1h_d',
    'wind_dir_mean_10m_1h_d',
    'wind_gusts_10m_1h_ms',
    'wind_gusts_10m_ms',
]

In [None]:
# Project both frames onto the same ordered column set so they can be stacked.
observed_df = observed_df.loc[:, columns]
known_df = known_df.loc[:, columns]

In [None]:
# Observed history followed by the forecast horizon, ordered chronologically
# within each plant.
df: DataFrame = (
    pd.concat([observed_df, known_df], axis=0, ignore_index=True)
    .sort_values(['plant_code', 'time'], ascending=True, ignore_index=True)
)

In [None]:
# add other engineered features
# NOTE(review): datetime.timestamp presumably interprets the naive `time`
# values in the machine's local timezone, while the column comes from *_utc
# fields — confirm this matches how the model's training features were built.
timestamp_s: Series = df['time'].map(datetime.timestamp)

day: int = 24 * 60 * 60
year: float = 365.2425 * day

# Cyclical encodings of the position within the day and within the year.
day_phase: Series = timestamp_s * (2 * np.pi / day)
year_phase: Series = timestamp_s * (2 * np.pi / year)
df['Day sin'] = np.sin(day_phase)
df['Day cos'] = np.cos(day_phase)
df['Year sin'] = np.sin(year_phase)
df['Year cos'] = np.cos(year_phase)

# Time elapsed since the earliest row of the whole frame, in hours and in days.
earliest_time: Timestamp = df['time'].min()
elapsed: Series = df['time'] - earliest_time
df['t'] = elapsed.dt.seconds / 60 / 60 + elapsed.dt.days * 24
df['days_from_start'] = elapsed.dt.days

# Series identifier plus plain calendar fields.
df['id'] = df['plant_code']
df['hour'] = df['time'].dt.hour
df['day'] = df['time'].dt.day
df['day_of_week'] = df['time'].dt.dayofweek
df['month'] = df['time'].dt.month

# Copies of the columns above under the additional names below.
df['categorical_id'] = df['id'].copy()
df['hours_from_start'] = df['t']
df['categorical_day_of_week'] = df['day_of_week'].copy()
df['categorical_hour'] = df['hour'].copy()

In [None]:
# The forecast rows carry NaN energy; fill them with the last known reading of
# the SAME plant. Assigning the result back avoids the in-place fill on a
# column selection (chained assignment, which may leave `df` untouched), and
# grouping by plant prevents one plant's value from bleeding into the next.
df['kwh'] = df.groupby('plant_code')['kwh'].ffill()

In [None]:
df.columns

In [None]:
# Spot check: number of kwh values in the combined frame for a single plant.
df_maserio = df.loc[df['id'] == 'UP_MPNTLCDMRN_1']
df_maserio.loc[:, 'kwh'].shape