In [1]:
import sys
sys.path.insert(0, '..')

In [2]:
from typing import Optional
from datetime import datetime
from etl.ETL import db_connection, group_hourly, extract_weather, etl_plant, etl_weather
import numpy as np
import pandas as pd
from expt_settings.configs import ExperimentConfig
import os
from pandas import DataFrame, Series, Timestamp, Index
from numpy import ndarray
from sqlalchemy.engine import Engine, Connection

In [27]:
def merge_df(energy: DataFrame, weather: DataFrame) -> DataFrame:
    df: DataFrame = energy.merge(weather, left_on=['time', 'plant_name_up'],
                                 right_on=['timestamp_utc', 'plant_name_up'])
    df.drop(['timestamp_utc'], axis=1, inplace=True)
    df = df.sort_values(by=['plant_name_up', 'time'], ascending=True, ignore_index=True)

    return df

In [3]:
# extract and transform energy
engine: Engine = db_connection()
sql_energy: str = "SELECT * FROM sorgenia_energy"
energy_df: DataFrame = pd.read_sql_query(sql_energy, con=engine)
energy_grouped: DataFrame = group_hourly(energy_df, 'start_date_utc', 'plant_name_up')

In [4]:
# extract weather mm
mm_query: str = "SELECT * FROM sorgenia_weather"
weather_df: DataFrame = extract_weather(mm_query, engine)

In [5]:
weather_df.tail(1)

Unnamed: 0,id,plant_name_up,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
122807,122808,UP_PRCLCDMZRD_1,2020-12-31 23:00:00,5.8,6.6,1016.6,998.2,0.0,1.9,2.3,146.2,95.4,4.2,4.8,4.8,4.2


In [6]:
# INFER THE DATES GAP BETWEEN Energy and Weather dfs
upper: str = weather_df['timestamp_utc'].min().strftime('%Y-%m-%d %H:%M:%S')
lower: str = energy_grouped['time'].min().strftime('%Y-%m-%d %H:%M:%S')

In [7]:
# extract weather copernicus
cop_sql: str = f"SELECT * FROM sorgenia_weather_copernicus WHERE timestamp_utc >= '{lower}' and timestamp_utc < '{upper}'"
weather_remain: DataFrame = extract_weather(cop_sql, engine)

In [8]:
weather_remain.tail(1)

Unnamed: 0,id,plant_name_up,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
122639,184128,UP_PRCLCDMZRD_1,2018-12-31 23:00:00,5.741654,9.729697,1022.787138,1012.276857,0.24183,4.862552,7.136083,2.533652,2.95239,11.067398,,,8.255812


In [9]:
#  STACK WEATHER df together
weather: DataFrame = pd.concat([weather_df, weather_remain], axis=0)
weather.sort_values(by='timestamp_utc', ascending=True, inplace=True)

In [15]:
weather.head()

Unnamed: 0,id,plant_name_up,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
6,114049,UP_PRCLCDMZRD_1,2017-01-01,1.062549,6.522103,1028.505569,1017.973297,-1.734723e-15,1.235498,1.718433,28.86852,43.432328,3.39328,,,3.045235
5,70249,UP_PRCLCDPRZZ_1,2017-01-01,-0.942863,2.579172,1028.722515,977.206491,-1.734723e-15,1.430844,2.388356,24.042896,36.907208,4.223982,,,3.939624
4,79009,UP_MPNTLCDMRN_1,2017-01-01,-0.783022,4.890773,1028.693079,983.524042,-1.734723e-15,1.278206,2.224647,22.408971,34.256697,4.027452,,,3.707296
0,61489,UP_PRCLCDPLRM_1,2017-01-01,-1.070487,3.36548,1028.758172,976.032434,-1.734723e-15,1.501277,2.466166,23.136555,35.285347,4.306699,,,4.008301
2,87769,UP_PRCLCMINEO_1,2017-01-01,-0.988801,1.177998,1028.52609,985.832957,-1.734723e-15,0.652449,1.808484,345.206979,338.635278,2.72171,,,2.678065


In [11]:
# INFER THE DATES GAP BETWEEN Energy and Weather dfs
lower: str = weather['timestamp_utc'].max().strftime('%Y-%m-%d %H:%M:%S')
upper: str = energy_grouped['time'].max().strftime('%Y-%m-%d %H:%M:%S')

In [12]:
upper, lower

('2021-10-07 04:00:00', '2020-12-31 23:00:00')

In [13]:
# extract weather copernicus
mm_sql: str = f"SELECT * FROM meteomatics_weather WHERE timestamp_utc >= '{lower}' and timestamp_utc < '{upper}'"
weather_remain2: DataFrame = extract_weather(mm_sql, engine)

In [17]:
weather_remain2.rename(columns={'plant_code': 'plant_name_up'}, inplace=True)
weather_remain2 = weather_remain2[list(weather.columns)]

In [20]:
weather_remain2.head()

Unnamed: 0,id,plant_name_up,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
0,1,UP_PRCLCDPLRM_1,2021-05-31 12:00:00,12.8,24.5,1016.4,956.8,0.0,3.0,3.6,348.9,354.9,4.3,5.0,5.0,4.3
1,2,UP_PRCLCDMZRD_1,2021-05-31 12:00:00,14.9,27.1,1015.7,997.4,0.0,3.1,4.1,271.9,268.4,4.4,5.6,5.6,4.4
2,3,UP_PRCLCDPRZZ_1,2021-05-31 12:00:00,11.4,24.6,1016.1,948.7,0.0,3.2,4.0,315.4,318.9,4.4,5.3,5.2,4.2
3,4,UP_PRCLCMINEO_1,2021-05-31 12:00:00,12.6,24.5,1015.2,971.4,0.0,1.3,1.7,279.3,282.4,3.4,3.8,1.0,0.8
4,5,UP_PEPIZZA_1,2021-05-31 12:00:00,11.1,23.8,1016.1,991.7,0.0,2.3,3.6,280.4,282.0,5.1,5.8,4.5,3.9


In [21]:
weather_remain2.drop(['id'], axis=1, inplace=True)
weather.drop(['id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [22]:
weather: DataFrame = pd.concat([weather, weather_remain2], axis=0)
weather.sort_values(by='timestamp_utc', ascending=True, inplace=True)

In [23]:
weather

Unnamed: 0,plant_name_up,timestamp_utc,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
6,UP_PRCLCDMZRD_1,2017-01-01 00:00:00,1.062549,6.522103,1028.505569,1017.973297,-1.734723e-15,1.235498,1.718433,28.868520,43.432328,3.393280,,,3.045235
5,UP_PRCLCDPRZZ_1,2017-01-01 00:00:00,-0.942863,2.579172,1028.722515,977.206491,-1.734723e-15,1.430844,2.388356,24.042896,36.907208,4.223982,,,3.939624
4,UP_MPNTLCDMRN_1,2017-01-01 00:00:00,-0.783022,4.890773,1028.693079,983.524042,-1.734723e-15,1.278206,2.224647,22.408971,34.256697,4.027452,,,3.707296
0,UP_PRCLCDPLRM_1,2017-01-01 00:00:00,-1.070487,3.365480,1028.758172,976.032434,-1.734723e-15,1.501277,2.466166,23.136555,35.285347,4.306699,,,4.008301
2,UP_PRCLCMINEO_1,2017-01-01 00:00:00,-0.988801,1.177998,1028.526090,985.832957,-1.734723e-15,0.652449,1.808484,345.206979,338.635278,2.721710,,,2.678065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26760,UP_PRCLCDPRZZ_1,2021-10-07 03:00:00,10.900000,12.600000,1013.000000,945.700000,0.000000e+00,3.800000,7.900000,279.000000,275.900000,10.500000,12.6,12.6,10.500000
26761,UP_PRCLCMINEO_1,2021-10-07 03:00:00,12.500000,14.500000,1012.000000,967.900000,0.000000e+00,2.900000,6.100000,259.800000,241.100000,9.200000,10.6,10.6,9.000000
26762,UP_PEPIZZA_1,2021-10-07 03:00:00,16.400000,16.700000,1009.400000,985.700000,4.200000e-01,4.600000,8.500000,263.700000,266.400000,12.200000,14.3,14.3,12.200000
26763,UP_MPNTLCSMBC_1,2021-10-07 03:00:00,11.000000,11.900000,1009.700000,909.900000,6.000000e-02,13.700000,14.600000,267.300000,266.000000,17.300000,18.5,18.5,17.300000


In [28]:
df: DataFrame = merge_df(energy_grouped, weather)

In [29]:
df

Unnamed: 0,plant_name_up,time,kwh,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
0,UP_MPNTLCDMRN_1,2017-01-01 00:00:00,900.0,-0.783022,4.890773,1028.693079,983.524042,-1.734723e-15,1.278206,2.224647,22.408971,34.256697,4.027452,,,3.707296
1,UP_MPNTLCDMRN_1,2017-01-01 01:00:00,262.5,-0.565645,4.807044,1028.156679,983.034144,-1.734723e-15,1.211802,2.130597,17.880837,28.905779,3.713940,,,3.515824
2,UP_MPNTLCDMRN_1,2017-01-01 02:00:00,127.5,-0.331075,4.911959,1028.162779,983.062126,-1.734723e-15,1.017035,1.846413,15.523460,28.402692,3.547375,,,3.310938
3,UP_MPNTLCDMRN_1,2017-01-01 03:00:00,0.0,-0.668040,4.464566,1027.616920,982.551146,-1.734723e-15,0.715094,1.450024,0.706032,13.915324,3.362168,,,3.166429
4,UP_MPNTLCDMRN_1,2017-01-01 04:00:00,0.0,-0.617584,4.235597,1026.998009,981.962876,6.438134e-05,0.621238,1.329532,350.632351,7.442800,3.240527,,,2.989778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266261,UP_PRCLCMINEO_1,2021-10-06 23:00:00,4705.0,13.800000,15.500000,1014.100000,970.100000,0.000000e+00,2.100000,4.100000,277.000000,245.400000,6.300000,7.3,7.3,6.300000
266262,UP_PRCLCMINEO_1,2021-10-07 00:00:00,5454.0,13.500000,15.300000,1013.400000,969.400000,0.000000e+00,2.400000,4.600000,269.100000,241.200000,7.700000,8.8,8.8,7.700000
266263,UP_PRCLCMINEO_1,2021-10-07 01:00:00,8956.0,12.900000,15.000000,1012.800000,968.800000,0.000000e+00,2.600000,5.200000,271.400000,243.900000,8.900000,10.0,10.0,8.900000
266264,UP_PRCLCMINEO_1,2021-10-07 02:00:00,11172.0,12.600000,14.900000,1012.200000,968.100000,0.000000e+00,2.800000,5.900000,269.900000,246.500000,9.200000,10.5,10.5,9.200000
