In [None]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import matplotlib.pyplot as plt

In [None]:
# Put the parent directory ('..') first on the module search path, so modules
# located one level above the working directory can be imported.
import sys
sys.path.insert(0, '..')

In [None]:
# Load variables from a local ".env" file when python-dotenv is available.
# Only a missing package is tolerated here; any other failure is allowed to
# surface instead of being hidden by a bare ``except``.
try:
    from dotenv import load_dotenv
except ImportError:
    dotenv_loaded = False
else:
    # load_dotenv() does not raise when there is nothing to load, it returns
    # False, so the fallback message must be driven by its return value too.
    dotenv_loaded = load_dotenv()

if not dotenv_loaded:
    print('No ".env" file or python-dotenv not installed... Using default env variables...')

In [None]:
def db_connection() -> Engine:
    """Create a SQLAlchemy engine for the PostgreSQL database named by env variables.

    Reads ``POSTGRES_DB_NAME``, ``POSTGRES_HOST``, ``POSTGRES_USERNAME``,
    ``POSTGRES_PASSWORD`` and ``POSTGRES_PORT`` from the environment and
    assembles a ``postgresql://`` URL from them.

    Returns:
        The engine returned by ``create_engine`` for the assembled URL.

    Raises:
        ValueError: if any of the variables above is not set. Without this
            check the literal text "None" would end up inside the URL.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import quote

    names = (
        'POSTGRES_DB_NAME',
        'POSTGRES_HOST',
        'POSTGRES_USERNAME',
        'POSTGRES_PASSWORD',
        'POSTGRES_PORT',
    )
    settings: Dict[str, Optional[str]] = {name: getenv(name) for name in names}

    missing: List[str] = [name for name, value in settings.items() if value is None]
    if missing:
        raise ValueError(f'Missing environment variable(s): {", ".join(missing)}')

    # Percent-encode the credentials: characters such as "@", ":" or "/" would
    # otherwise be read as URL delimiters. safe='' makes "/" get encoded too.
    user: str = quote(settings['POSTGRES_USERNAME'], safe='')
    password: str = quote(settings['POSTGRES_PASSWORD'], safe='')
    host: str = settings['POSTGRES_HOST']
    port: str = settings['POSTGRES_PORT']
    dbname: str = settings['POSTGRES_DB_NAME']

    postgres_str: str = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'

    return create_engine(postgres_str)

In [None]:
# Query selecting every column and row of the `sorgenia_energy` table.
sql_energy: str = "SELECT * FROM sorgenia_energy"
# Engine built from the POSTGRES_* environment variables; it is reused by the
# read_sql_query calls further down.
engine: Engine = db_connection()

In [None]:
# Run the energy query and load its result set into a DataFrame.
energy_df: DataFrame = pd.read_sql_query(sql_energy, con=engine)

In [None]:
# Preview the first rows of the raw energy frame.
energy_df.head()

In [None]:
def group_hourly(df: DataFrame) -> DataFrame:
    """Average ``kwh`` per plant and per UTC hour.

    Every ``start_date_utc`` value is truncated to the start of its hour, then
    the ``kwh`` readings sharing the same ``plant_name_up`` and hour are
    averaged.

    Args:
        df: frame with at least the columns ``plant_name_up``,
            ``start_date_utc`` (datetime dtype) and ``kwh``. It is not modified.

    Returns:
        A new frame with the columns ``plant_name_up``, ``time`` and ``kwh``,
        sorted by ``plant_name_up`` then ``time`` and re-indexed from 0.
        ``time`` has dtype ``datetime64[ns, UTC]``.
    """
    stamps: Series = df['start_date_utc']
    # Naive values are labelled as UTC (the previous implementation did the
    # same); aware values are converted, so the bucket is always a UTC hour
    # rather than a local wall-clock hour relabelled as UTC.
    if stamps.dt.tz is None:
        stamps = stamps.dt.tz_localize('UTC')
    else:
        stamps = stamps.dt.tz_convert('UTC')

    # Flooring replaces the former year-month-day string round trip, which
    # depended on the deprecated ``infer_datetime_format`` option.
    hourly: Series = stamps.dt.floor('h').astype('datetime64[ns, UTC]')

    grouped: DataFrame = (
        df.assign(time=hourly)
        .groupby(['plant_name_up', 'time'], as_index=False)
        .agg({'kwh': 'mean'})
    )
    # Explicit selection pins the column order of the result.
    grouped = grouped[['plant_name_up', 'time', 'kwh']]

    return grouped.sort_values(by=['plant_name_up', 'time'], ascending=True, ignore_index=True)

In [None]:
# Aggregate the raw energy readings into one row per plant and hour.
energy_grouped = group_hourly(energy_df)

In [None]:
# Preview the first rows of the hourly frame.
energy_grouped.head()

In [None]:
# Show the dtype of each column of the hourly frame.
energy_grouped.dtypes

In [None]:
# (rows, columns) of the hourly frame.
energy_grouped.shape

In [None]:
# Distinct values of `plant_name_up` present in the hourly frame.
energy_grouped.plant_name_up.unique()

EXTRACT COPERNICUS WEATHER DATA

In [None]:
def extract_weather(weather_sql: str, engine: Engine) -> DataFrame:
    """Load weather rows with a SQL query and normalise them.

    Args:
        weather_sql: query text handed to ``pd.read_sql_query``.
        engine: connection object passed as ``con`` to ``pd.read_sql_query``.

    Returns:
        The query result with both wind-gust columns cast to ``float64``,
        ordered by ascending ``timestamp_utc`` and re-indexed from 0.
    """
    gust_columns = ('wind_gusts_100m_1h_ms', 'wind_gusts_100m_ms')

    raw = pd.read_sql_query(weather_sql, con=engine)
    # One astype call with a per-column mapping casts both gust columns.
    typed = raw.astype({name: 'float64' for name in gust_columns})

    return typed.sort_values(by=['timestamp_utc'], ascending=True, ignore_index=True)

In [None]:
# Load the whole `sorgenia_weather_copernicus` table through extract_weather,
# which also casts the wind-gust columns and sorts by `timestamp_utc`.
weather_df: DataFrame = extract_weather("SELECT * FROM sorgenia_weather_copernicus", engine)

In [None]:
# weather_df['wind_gusts_100m_1h_ms'] = weather_df['wind_gusts_100m_1h_ms'].astype('float64')
# weather_df['wind_gusts_100m_ms'] = weather_df['wind_gusts_100m_ms'].astype('float64')

In [None]:
# weather_df: DataFrame = weather_df.sort_values(by=['timestamp_utc'], ascending=True, ignore_index=True)

In [None]:
# Preview the first rows of the weather frame.
weather_df.head()

In [None]:
# Show the dtype of each weather column (the gust columns were cast to float64).
weather_df.dtypes

In [None]:
# Number of rows per distinct value of `wind_gusts_100m_ms`.
weather_df.wind_gusts_100m_ms.value_counts()

ANALYZE DISTRIBUTIONS

In [None]:
 # Let's go ahead and analyse the distributions of these variables
def analyse_continous(df: DataFrame, var: str):
    """Show a 20-bin histogram of one column of ``df``.

    Args:
        df: frame holding the column to plot; it is only read, never modified.
        var: label of the column to plot, also used as x-axis label and title.
    """
    # Series.hist returns the Axes it drew on; labelling through that object
    # avoids depending on pyplot's implicit "current axes".
    ax = df[var].hist(bins=20)
    # The y-axis of a histogram is the number of rows falling in each bin.
    ax.set_ylabel('Count')
    ax.set_xlabel(var)
    ax.set_title(var)
    plt.show()

In [None]:
# Draw one histogram per weather column, skipping the first three columns.
# NOTE(review): presumably the skipped columns are identifier / key columns —
# confirm against the table schema, since the slice is purely positional.
for var in weather_df.columns[3:]:
    analyse_continous(weather_df, var)

In [None]:
# Both frames must describe the same set of plants: the merge performed later
# uses the default inner join, which silently drops unmatched plants.
# The previous check compared the results of list.sort(), which are always
# None, so it could never fail.
weather_plants = set(weather_df.plant_name_up.unique())
energy_plants = set(energy_grouped.plant_name_up.unique())
assert weather_plants == energy_plants, (
    f'Plants present in only one of the two frames: {weather_plants ^ energy_plants}'
)

MERGE THE TWO DATAFRAMES

In [None]:
def merge_df(energy: DataFrame, weather: DataFrame) -> DataFrame:
    """Join hourly energy rows with the weather rows of the same plant and hour.

    Args:
        energy: frame with ``time`` and ``plant_name_up`` columns.
        weather: frame with ``timestamp_utc``, ``plant_name_up`` and ``id``
            columns.

    Returns:
        The inner join of both frames on (time, plant), without the
        ``timestamp_utc`` and ``id`` columns, sorted by ``plant_name_up`` then
        ``time`` and re-indexed from 0.
    """
    joined = energy.merge(
        weather,
        left_on=['time', 'plant_name_up'],
        right_on=['timestamp_utc', 'plant_name_up'],
    )
    # `timestamp_utc` duplicates `time` after the join; `id` is dropped with it.
    trimmed = joined.drop(columns=['timestamp_utc', 'id'])

    return trimmed.sort_values(by=['plant_name_up', 'time'], ascending=True, ignore_index=True)

In [None]:
# Combined frame: hourly energy joined with the matching weather rows.
df = merge_df(energy_grouped, weather_df)

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Draw one histogram per column of the merged frame, skipping the first three.
# NOTE(review): the slice is positional — confirm that columns 0-2 are the ones
# that should be left out.
for var in df.columns[3:]:
    analyse_continous(df, var)

CHECK THAT EVERY UP COVERS THE FULL TIME RANGE

In [None]:
# Print the earliest and latest `time` of each plant, one line per plant, so
# the covered ranges can be compared by eye.
for up in df.plant_name_up.unique():
    # Rows belonging to the current plant only.
    df1 = df[df['plant_name_up']==up]
    print(df1.time.min(), df1.time.max())

In [None]:
# Reference instant: the smallest `time` found in the merged frame.
earliest_time: Timestamp = df['time'].min()
# `t` = hours elapsed since the reference instant: whole days expressed in
# hours plus the within-day remainder (.dt.seconds) expressed in hours.
df['t'] = (
    (df['time'] - earliest_time).dt.days * 24
    + (df['time'] - earliest_time).dt.seconds / 60 / 60
)

In [None]:
# Add elapsed-time and calendar columns derived from `time`; `df` is modified in place.
df['days_from_start']: Series = (df['time'] - earliest_time).dt.days  # whole days since the earliest timestamp
df["id"] = df["plant_name_up"]  # `id` holds the plant name (the weather table's own `id` was dropped in merge_df)
df['hour']: Series = df["time"].dt.hour
df['day']: Series = df["time"].dt.day  # day of the month
df['day_of_week']: Series = df["time"].dt.dayofweek  # Monday=0 ... Sunday=6
df['month']: Series = df["time"].dt.month
# The columns below duplicate columns created above under different names.
# NOTE(review): presumably these are the names a downstream model expects — confirm.
df['categorical_id']: Series = df['id'].copy()
df['hours_from_start']: Series = df['t']  # same values as `t`, computed in the previous cell
df['categorical_day_of_week']: Series = df['day_of_week'].copy()
df['categorical_hour']: Series = df['hour'].copy()

In [None]:
# Preview the merged frame with the new feature columns.
df.head()

In [None]:
# List every column label of the final frame.
df.columns