In [59]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
from numpy import ndarray

In [2]:
import sys
# Put the parent directory first on the module search path, so modules located
# one level above the notebook's working directory are found first on import.
sys.path.insert(0, '..')

In [3]:
# python-dotenv is an optional dependency: when it is available, the variables
# from a local ".env" file are loaded into the process environment.
dotenv_loaded: bool = False
try:
    from dotenv import load_dotenv
except ImportError:
    # Only a missing package is tolerated; any other error should surface
    # instead of being hidden by a bare ``except``.
    pass
else:
    # ``load_dotenv`` does not raise when no ".env" file exists; it reports
    # through its boolean return value whether any variable was set.
    dotenv_loaded = bool(load_dotenv())

if not dotenv_loaded:
    print('No ".env" file or python-dotenv not installed... Using default env variables...')

In [4]:
def db_connection() -> Engine:
    """Create a SQLAlchemy engine for the PostgreSQL database described by the environment.

    The connection settings are read from the ``POSTGRES_DB_NAME``,
    ``POSTGRES_HOST``, ``POSTGRES_USERNAME``, ``POSTGRES_PASSWORD`` and
    ``POSTGRES_PORT`` environment variables.

    Returns:
        Engine: engine created from the assembled ``postgresql://`` URL.

    Raises:
        EnvironmentError: if one or more of the variables above is not set.
    """
    # Imported locally so the function only relies on the standard library
    # in addition to the names the notebook already imports.
    from urllib.parse import quote_plus

    variable_names = (
        'POSTGRES_DB_NAME',
        'POSTGRES_HOST',
        'POSTGRES_USERNAME',
        'POSTGRES_PASSWORD',
        'POSTGRES_PORT',
    )
    settings: Dict[str, Optional[str]] = {name: getenv(name) for name in variable_names}

    # Without this check an unset variable would be formatted into the URL as
    # the literal text "None" and fail later with an unrelated error.
    missing: List[str] = [name for name, value in settings.items() if value is None]
    if missing:
        raise EnvironmentError(
            'Missing database environment variable(s): ' + ', '.join(missing)
        )

    # Credentials may contain characters such as "@", ":" or "/" that have a
    # special meaning inside a URL, so they are percent-encoded first.
    user: str = quote_plus(settings['POSTGRES_USERNAME'])
    password: str = quote_plus(settings['POSTGRES_PASSWORD'])
    host: str = settings['POSTGRES_HOST']
    port: str = settings['POSTGRES_PORT']
    dbname: str = settings['POSTGRES_DB_NAME']

    postgres_str: str = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'

    return create_engine(postgres_str)

In [5]:
# Query selecting every column and row of the `sorgenia_energy` table.
sql_energy: str = "SELECT * FROM sorgenia_energy"
# Engine built by `db_connection` from the POSTGRES_* environment variables;
# the later queries in this notebook reuse it.
engine: Engine = db_connection()

In [6]:
# Run the energy query through the engine and load the result into a DataFrame.
energy_df: DataFrame = pd.read_sql_query(sql_energy, con=engine)

In [7]:
# Preview the first rows of the raw energy readings.
energy_df.head()

Unnamed: 0,id,plant_name_up,start_date_utc,end_date_utc,kwh
0,1,UP_PRCLCDPRZZ_1,2016-12-31 23:00:00,2016-12-31 23:15:00,269.4
1,2,UP_PRCLCDPRZZ_1,2016-12-31 23:15:00,2016-12-31 23:30:00,260.1
2,3,UP_PRCLCDPRZZ_1,2016-12-31 23:30:00,2016-12-31 23:45:00,131.1
3,4,UP_PRCLCDPRZZ_1,2016-12-31 23:45:00,2017-01-01 00:00:00,210.9
4,5,UP_PRCLCDPRZZ_1,2017-01-01 00:00:00,2017-01-01 00:15:00,351.3


In [48]:
def group_hourly(df: DataFrame) -> DataFrame:
    """Average the energy readings per plant and per hour.

    Args:
        df: frame with at least the columns ``plant_name_up``,
            ``start_date_utc`` (datetime) and ``kwh``. It is not modified.

    Returns:
        DataFrame with the columns ``plant_name_up``, ``time`` and ``kwh``,
        where ``time`` is the start of the hour (``datetime64[ns]`` without
        timezone) and ``kwh`` is the mean of the readings that start within
        that hour. Rows are sorted by plant and time with a fresh 0..n-1 index.
    """
    start: Series = df['start_date_utc']
    # The result is timezone-naive; for timezone-aware input the wall-clock
    # time is kept and only the timezone information is dropped.
    if start.dt.tz is not None:
        start = start.dt.tz_localize(None)

    # Truncate every reading to the start of its hour. This replaces building
    # "year-month-day hour:00:00" strings and parsing them back, which was slow
    # and relied on the deprecated ``infer_datetime_format`` option.
    hour_start: Series = start.dt.floor('h').astype('datetime64[ns]')

    # Only the needed columns are carried over, so the input frame never has
    # to be copied in full.
    hourly: DataFrame = df[['plant_name_up', 'kwh']].assign(time=hour_start)
    grouped: DataFrame = hourly.groupby(['plant_name_up', 'time'], as_index=False).agg(
        {'kwh': 'mean'})

    return grouped.sort_values(by=['plant_name_up', 'time'], ascending=True, ignore_index=True)

In [49]:
# Reduce the raw readings to one mean kWh value per plant and hour.
energy_grouped = group_hourly(energy_df)

In [50]:
# Preview the first rows of the hourly energy table.
energy_grouped.head()

Unnamed: 0,plant_name_up,time,kwh
0,UP_MPNTLCDMRN_1,2016-12-31 23:00:00,225.0
1,UP_MPNTLCDMRN_1,2017-01-01 00:00:00,65.625
2,UP_MPNTLCDMRN_1,2017-01-01 01:00:00,31.875
3,UP_MPNTLCDMRN_1,2017-01-01 02:00:00,0.0
4,UP_MPNTLCDMRN_1,2017-01-01 03:00:00,0.0


In [51]:
# Show the dtype of every column of the hourly energy table.
energy_grouped.dtypes

plant_name_up            object
time             datetime64[ns]
kwh                     float64
dtype: object

In [12]:
# Number of rows and columns of the hourly energy table.
energy_grouped.shape

(254203, 3)

In [13]:
# List the distinct plant identifiers present in the hourly energy table.
energy_grouped.plant_name_up.unique()

array(['UP_MPNTLCDMRN_1', 'UP_MPNTLCSMBC_1', 'UP_PEPIZZA_1',
       'UP_PRCLCDMZRD_1', 'UP_PRCLCDMZRD_2', 'UP_PRCLCDPLRM_1',
       'UP_PRCLCDPRZZ_1', 'UP_PRCLCMINEO_1'], dtype=object)

EXTRACT WEATHER COPERNICUS

In [14]:
# Query selecting every column and row of the `sorgenia_weather` table.
mm_query: str = "SELECT * FROM sorgenia_weather"

In [15]:
def extract_weather(weather_sql: str, engine: Engine) -> DataFrame:
    """Load weather rows with the given query and return them in chronological order.

    Args:
        weather_sql: SQL text passed as-is to ``pandas.read_sql_query``.
        engine: connection object used to run the query.

    Returns:
        DataFrame holding the query result, with ``wind_gusts_100m_1h_ms`` and
        ``wind_gusts_100m_ms`` cast to ``float64``, sorted ascending by
        ``timestamp_utc`` and carrying a fresh 0..n-1 index.
    """
    gust_columns = ('wind_gusts_100m_1h_ms', 'wind_gusts_100m_ms')

    loaded: DataFrame = pd.read_sql_query(weather_sql, con=engine)
    # Force both 100 m gust columns to a float dtype.
    for column in gust_columns:
        loaded[column] = loaded[column].astype('float64')

    return loaded.sort_values(by=['timestamp_utc'], ascending=True, ignore_index=True)

In [16]:
# Load the `sorgenia_weather` rows (gust columns cast to float, sorted by timestamp).
weather_df: DataFrame = extract_weather(mm_query, engine)

INFER THE DATE GAP BETWEEN THE ENERGY AND WEATHER DATAFRAMES

In [28]:
# Earliest timestamp available in `weather_df`, formatted as text; the query
# below uses it as the exclusive upper bound.
upper: str = weather_df['timestamp_utc'].min().strftime('%Y-%m-%d %H:%M:%S')
# Earliest hourly energy timestamp, formatted as text; the query below uses it
# as the inclusive lower bound.
lower: str = energy_grouped['time'].min().strftime('%Y-%m-%d %H:%M:%S')

In [33]:
# Select the `sorgenia_weather_copernicus` rows with lower <= timestamp_utc < upper,
# i.e. from the first energy timestamp up to (not including) the first
# `weather_df` timestamp.
# NOTE(review): the bounds are formatted into the SQL text; both values are
# derived from the DataFrames above rather than from external input.
cop_sql: str = f"SELECT * FROM sorgenia_weather_copernicus WHERE timestamp_utc >= '{lower}' and timestamp_utc < '{upper}'"

In [34]:
# Load the Copernicus weather rows that cover the gap selected by `cop_sql`.
weather_remain: DataFrame = extract_weather(cop_sql, engine)

In [37]:
# Check the first and last timestamp returned for the gap period.
weather_remain['timestamp_utc'].min(), weather_remain['timestamp_utc'].max()

(Timestamp('2016-12-31 23:00:00'), Timestamp('2018-12-31 23:00:00'))

STACK THE WEATHER DATAFRAMES TOGETHER

In [43]:
# Stack both weather sources row-wise and order the result chronologically.
# The row labels of the two inputs are kept as they are (no index reset).
weather: DataFrame = (
    pd.concat([weather_df, weather_remain], axis=0)
    .sort_values(by='timestamp_utc', ascending=True)
)

In [47]:
# Show the dtype of every column of the stacked weather table.
weather.dtypes

id                                     int64
plant_name_up                         object
timestamp_utc                 datetime64[ns]
dew_point_2m_C                       float64
temperature_2m_C                     float64
msl_pressure_hPa                     float64
sfc_pressure_hPa                     float64
precipitation_1h_mm                  float64
wind_speed_mean_10m_1h_ms            float64
wind_speed_mean_100m_1h_ms           float64
wind_dir_mean_100m_1h_d              float64
wind_dir_mean_10m_1h_d               float64
wind_gusts_10m_1h_ms                 float64
wind_gusts_100m_1h_ms                float64
wind_gusts_100m_ms                   float64
wind_gusts_10m_ms                    float64
dtype: object

In [None]:
def add_time_features(df: DataFrame) -> DataFrame:
    """Return a copy of ``df`` extended with time-derived feature columns.

    Args:
        df: frame with a datetime column ``time`` and a column
            ``plant_name_up``. It is not modified.

    Returns:
        A copy of ``df`` with these columns appended, in this order:
        ``Day sin``, ``Day cos``, ``Year sin``, ``Year cos`` (sine/cosine of
        the position within a 24 h day and a 365.2425-day year), ``t`` (hours
        since the earliest ``time``), ``days_from_start``, ``id``, ``hour``,
        ``day``, ``day_of_week``, ``month``, ``categorical_id``,
        ``hours_from_start``, ``categorical_day_of_week`` and
        ``categorical_hour``.
    """
    df = df.copy()

    # Seconds since the Unix epoch. Timezone-naive values are taken as UTC;
    # ``datetime.timestamp`` would instead assume the machine's local timezone
    # and make the features depend on where the notebook runs.
    time_utc: Series = df['time']
    if time_utc.dt.tz is not None:
        time_utc = time_utc.dt.tz_convert('UTC').dt.tz_localize(None)
    timestamp_s: Series = (time_utc - Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

    day: int = 24 * 60 * 60
    year: float = 365.2425 * day

    # Cyclical encoding of the position within the day and within the year.
    df['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
    df['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
    df['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
    df['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))

    # Elapsed time relative to the first timestamp in the frame.
    elapsed: Series = df['time'] - df['time'].min()
    df['t'] = elapsed.dt.total_seconds() / 3600
    df['days_from_start'] = elapsed.dt.days

    # The plant identifier is reused as the series id.
    df['id'] = df['plant_name_up']

    # Calendar components of the timestamp.
    df['hour'] = df['time'].dt.hour
    df['day'] = df['time'].dt.day
    df['day_of_week'] = df['time'].dt.dayofweek
    df['month'] = df['time'].dt.month

    # Duplicates of existing columns under the additional names.
    df['categorical_id'] = df['id'].copy()
    df['hours_from_start'] = df['t']
    df['categorical_day_of_week'] = df['day_of_week'].copy()
    df['categorical_hour'] = df['hour'].copy()

    return df

In [None]:
# weather_df['wind_gusts_100m_1h_ms'] = weather_df['wind_gusts_100m_1h_ms'].astype('float64')
# weather_df['wind_gusts_100m_ms'] = weather_df['wind_gusts_100m_ms'].astype('float64')

In [None]:
# weather_df: DataFrame = weather_df.sort_values(by=['timestamp_utc'], ascending=True, ignore_index=True)

In [None]:
# Preview the first rows of the `sorgenia_weather` data.
weather_df.head()

In [None]:
# Show the dtype of every weather column.
weather_df.dtypes

In [None]:
# Count how often each distinct value occurs in the 100 m gust column.
weather_df.wind_gusts_100m_ms.value_counts()

ANALYZE DISTRIBUTIONS

In [None]:
 # Let's go ahead and analyse the distributions of these variables
def analyse_continous(df: DataFrame, var: str) -> None:
    """Show a 20-bin histogram of one column.

    Args:
        df: frame containing the column to plot. It is only read, so no copy
            of the frame is made.
        var: name of the column; also used as x-axis label and title.
    """
    # A dedicated figure per call keeps the plots independent when the
    # function is called in a loop.
    fig, ax = plt.subplots()
    df[var].hist(bins=20, ax=ax)
    # The vertical axis of a histogram is the number of observations per bin.
    ax.set_ylabel('Count')
    ax.set_xlabel(var)
    ax.set_title(var)
    plt.show()
    # Release the figure so that looping over many columns does not
    # accumulate open figures in memory.
    plt.close(fig)

In [None]:
# Draw one histogram per weather column, skipping the first three columns.
for var in weather_df.columns[3:]:
    analyse_continous(weather_df, var)

In [None]:
# Both sources must describe exactly the same plants before they are merged.
# The plant names are compared as sets: ``list.sort()`` returns ``None``, so
# comparing the results of two ``.sort()`` calls is always true, and
# ``value_counts().tolist()`` yields row counts rather than plant names.
weather_plants = set(weather_df['plant_name_up'].unique())
energy_plants = set(energy_grouped['plant_name_up'].unique())
assert weather_plants == energy_plants, (
    'Plant mismatch between weather and energy data: '
    f'only in weather={sorted(weather_plants - energy_plants)}, '
    f'only in energy={sorted(energy_plants - weather_plants)}'
)

MERGE THE TWO DATAFRAMES

In [45]:
def merge_df(energy: DataFrame, weather: DataFrame) -> DataFrame:
    """Attach the weather measurements to the hourly energy rows.

    Rows are matched on plant and hour (``energy.time`` equals
    ``weather.timestamp_utc`` for the same ``plant_name_up``); rows without a
    match on either side are left out.

    Args:
        energy: frame with ``plant_name_up``, ``time`` and the energy columns.
        weather: frame with ``plant_name_up``, ``timestamp_utc``, ``id`` and
            the weather columns.

    Returns:
        The joined frame without the ``timestamp_utc`` and ``id`` columns,
        sorted by plant and time with a fresh 0..n-1 index.
    """
    joined: DataFrame = pd.merge(
        energy,
        weather,
        left_on=['time', 'plant_name_up'],
        right_on=['timestamp_utc', 'plant_name_up'],
    )
    # `timestamp_utc` duplicates `time` after the join; `id` is dropped as well.
    trimmed: DataFrame = joined.drop(columns=['timestamp_utc', 'id'])

    return trimmed.sort_values(by=['plant_name_up', 'time'], ascending=True, ignore_index=True)

In [52]:
# Join the hourly energy table with the stacked weather table (per plant and hour).
df = merge_df(energy_grouped, weather)

In [53]:
# Preview the first rows of the merged energy/weather table.
df.head()

Unnamed: 0,plant_name_up,time,kwh,dew_point_2m_C,temperature_2m_C,msl_pressure_hPa,sfc_pressure_hPa,precipitation_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
0,UP_MPNTLCDMRN_1,2016-12-31 23:00:00,225.0,-0.902831,4.980909,1028.935552,983.718765,0.0,1.381208,2.254668,26.531339,38.664892,4.240202,,,4.026465
1,UP_MPNTLCDMRN_1,2017-01-01 00:00:00,65.625,-0.783022,4.890773,1028.693079,983.524042,-1.734723e-15,1.278206,2.224647,22.408971,34.256697,4.027452,,,3.707296
2,UP_MPNTLCDMRN_1,2017-01-01 01:00:00,31.875,-0.565645,4.807044,1028.156679,983.034144,-1.734723e-15,1.211802,2.130597,17.880837,28.905779,3.71394,,,3.515824
3,UP_MPNTLCDMRN_1,2017-01-01 02:00:00,0.0,-0.331075,4.911959,1028.162779,983.062126,-1.734723e-15,1.017035,1.846413,15.52346,28.402692,3.547375,,,3.310938
4,UP_MPNTLCDMRN_1,2017-01-01 03:00:00,0.0,-0.66804,4.464566,1027.61692,982.551146,-1.734723e-15,0.715094,1.450024,0.706032,13.915324,3.362168,,,3.166429


In [54]:
# Number of rows and columns of the merged table.
df.shape

(245448, 16)

In [None]:
# Draw one histogram per column of the merged table, skipping the first three columns.
for var in df.columns[3:]:
    analyse_continous(df, var)

CHECK IF ALL UP HAVE FULL TIME RANGE

In [None]:
# For every plant, print the earliest and the latest timestamp found in the
# merged table so that the covered time ranges can be compared.
for up in df.plant_name_up.unique():
    # Rows belonging to the current plant only.
    df1 = df[df['plant_name_up']==up]
    print(df1.time.min(), df1.time.max())

In [60]:
# Seconds since the Unix epoch for every row, computed in one vectorised step.
# The timezone-naive values in `time` are taken as UTC; mapping
# `datetime.timestamp` over them would assume the machine's local timezone
# instead, making the features depend on where the notebook is executed.
timestamp_s: Series = (df['time'] - Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
day: int = 24 * 60 * 60
year: float = 365.2425 * day

# Cyclical encoding of the position within the day and within the year.
df['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
df['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
df['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
df['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))

In [61]:
# First timestamp of the merged table; reference point for the elapsed-time features.
earliest_time: Timestamp = df['time'].min()
# Time elapsed since that reference, expressed in hours: the whole days are
# converted to hours and the remaining seconds of the last day are added.
elapsed: Series = df['time'] - earliest_time
df['t'] = elapsed.dt.days * 24 + elapsed.dt.seconds / 60 / 60

In [62]:
# Whole days elapsed since the first timestamp of the merged table.
df['days_from_start'] = (df['time'] - earliest_time).dt.days
# The plant identifier is reused as the series id.
df['id'] = df['plant_name_up']

# Calendar components of the timestamp.
time_parts = df['time'].dt
df['hour'] = time_parts.hour
df['day'] = time_parts.day
df['day_of_week'] = time_parts.dayofweek
df['month'] = time_parts.month

# Duplicates of existing columns under the additional names.
df['categorical_id'] = df['id'].copy()
df['hours_from_start'] = df['t']
df['categorical_day_of_week'] = df['day_of_week'].copy()
df['categorical_hour'] = df['hour'].copy()

In [63]:
# Destination of the exported dataset. The location can be overridden with the
# SORGENIA_OUTPUT_CSV environment variable so the notebook also runs on other
# machines; when the variable is not set, the original path is used.
output_csv: str = getenv(
    'SORGENIA_OUTPUT_CSV',
    r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\data\sorgenia_wind\data\sorgenia_wind\data\sorgenia_wind_mm_cop.csv',
)
# Write the merged and feature-enriched table without the row index.
df.to_csv(output_csv, index=False)

In [None]:
# Preview the first rows of the exported table, including the added feature columns.
df.head()

In [None]:
# List all column names of the exported table.
df.columns