In [24]:
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import create_engine
from os import getenv
from typing import List, Dict, Optional
import pandas as pd
from pandas import DataFrame, Series, Timestamp, DatetimeIndex
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
from numpy import ndarray
import math
import os
import pytz

In [10]:
import sys
sys.path.insert(0, '..')

In [11]:
try:
    from dotenv import load_dotenv

    load_dotenv()
except:
    print('No ".env" file or python-dotenv not installed... Using default env variables...')

In [12]:
def db_connection() -> Engine:
    dbname: Optional[str] = getenv('POSTGRES_DB_NAME')
    host: Optional[str] = getenv('POSTGRES_HOST')
    user: Optional[str] = getenv('POSTGRES_USERNAME')
    password: Optional[str] = getenv('POSTGRES_PASSWORD')
    port: Optional[str] = getenv('POSTGRES_PORT')
        
    postgres_str: str = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'
    
    engine: Engine = create_engine(postgres_str)
    
    return engine

In [13]:
engine: Engine = db_connection()

In [6]:
# sql_energy: str = "SELECT * FROM energy_sotavento"

In [7]:
# energy_df: DataFrame = pd.read_sql_query(sql_energy, con=db_connection())

In [52]:
def overlap(row: Series) -> str:
    if math.isnan(row['speed_ms']) and math.isnan(row['energy_kwh']):
        return 'yes'
    else:
        return 'no'


def clean_row(row: Series) -> float:
    if (row['speed_ms'] < row['cut_in'] or row['speed_ms'] > row['cut_out']) and row['energy_kwh'] != 0:
        return 0.
    elif row['energy_kwh'] < 0 and (row['speed_ms'] > row['cut_in'] or row['speed_ms'] < row['cut_out']):
        return 0.
    else:
        return row['energy_kwh']


def fill_gaps(data: DataFrame) -> DataFrame:
    """fill gaps in energy by interpolating based on speed and direction"""
    data['energy_kwh'] = np.where(data['energy_kwh'] > 80000, np.nan, data['energy_kwh'])
    data.set_index(['speed_ms', 'direction_deg'], inplace=True)
    data.interpolate(method='linear', inplace=True)
    data.reset_index(inplace=True)

    return data


def down_sample(df: DataFrame) -> DataFrame:
    """sub-sampling the data from 10 minute intervals to 1h"""
    df: DataFrame = df[5::6]

    return df

In [53]:
class TsCleaner(object):
    """cleans the timeseries filling gaps in timesteps, missing wind speeds and active powers
    Args: df (pandas DataFrame)"""
    def __init__(self, df: DataFrame):
        self.df = df

    def fill_gaps_turbine(self, date_index: DatetimeIndex) -> DataFrame:
        """takes a dataframe subset by turbine, and fills the ws and ap gaps via timeinterpolation"""
        # df_1: DataFrame = self.df[self.df['Turbine'] == turbine]
        # drop duplicates in time
        self.df.drop_duplicates(subset=['date'], keep='first', inplace=True)
        df_index: DataFrame = pd.DataFrame(data=date_index, columns=['date'])
        # merge df_1 and df_index on time(0)
        self.df: DataFrame = self.df.merge(df_index, how='outer', on='date', suffixes=('_sota', '_ind'))
        self.df.sort_values(by='date', ascending=True, ignore_index=True, inplace=True)

        assert self.df.date.to_list() == date_index.to_list()

        # set time as df index to do the time interpolation
        self.df.set_index('date', inplace=True)
        self.df['speed_ms'].interpolate(method='time', inplace=True)
        self.df['energy_kwh'].interpolate(method='time', inplace=True)
        self.df['direction_deg'].interpolate(method='time', inplace=True)
        # reset index
        self.df: DataFrame = self.df.reset_index()


    def fix_gaps(self) -> DataFrame:
        """established date range and fills self.df one turbine and the time and then reconcatenates into one"""
        # create a date range
        date_index: DatetimeIndex = pd.date_range(start=self.df.date.min(), end=self.df.date.max(),
                                                  freq=pd.offsets.Minute(10))
        self.fill_gaps_turbine(date_index)

    @staticmethod
    def gaps(df: DataFrame, time_col: str) -> None:
        """utils method to check time series contiguity"""
        df: DataFrame = df.sort_values(by=time_col, ascending=True, ignore_index=True)
        df[f'{time_col}+1'] = df[time_col].shift(-1)
        df[f'{time_col}_freq'] = (df[f'{time_col}+1'] - df[time_col]).astype('timedelta64[m]')
        gaps: Union[Series, DataFrame] = df[df[f'{time_col}_freq'] > 1.0]
        print(gaps)
        print(df[f'{time_col}_freq'].unique())
        print(df[df[f'{time_col}_freq'] == df[f'{time_col}_freq'].unique()[1]])
        try:
            print(df[df[f'{time_col}_freq'] == df[f'{time_col}_freq'].unique()[2]])
        except Exception as e:
            print("no other time frequencies")

    @staticmethod
    def find_dupl(df: DataFrame, col: str):
        ids: Series = df[col]
        print(df[ids.isin(ids[ids.duplicated()])].sort_values(col))

In [54]:
def etl_plant(sql_energy: str, engine: Engine) -> DataFrame:
    data: DataFrame = pd.read_sql_query(sql_energy, con=engine)
    data = data.replace('-', np.nan)
    data['speed_ms'] = data['speed_ms'].astype('float')
    data['direction_deg'] = data['direction_deg'].astype('float')
    data['energy_kwh'] = data['energy_kwh'].astype('float')
    data['date']: Series = data['date'].astype('datetime64[ns]')
    data['nan_overlap'] = data.apply(overlap, axis=1)
    # load power curve table
    td: DataFrame = pd.read_sql_query("SELECT * FROM turbine_data_sotavento", con=engine)
    td['wind_speed_ms'] = td['wind_speed_ms'].astype('float')
    td['Total_power_kW'] = td['Total_power_kW'].astype('float')
    td['Energy_kWh_10min'] = td['Total_power_kW'] / 6
    # add cut-in and cut-out based on power curve
    data['cut_in'] = 3.
    data['cut_out'] = 25.
    x: Series = data.apply(clean_row, axis=1)
    data['energy_kwh'] = x
    # interpolate NaNs
    cleaner: TsCleaner = TsCleaner(data)
    cleaner.fix_gaps()
    data: DataFrame = cleaner.df
    data.drop(['nan_overlap', 'cut_in', 'cut_out'], axis=1, inplace=True)
    # reappend cut-in and cut-out
    data['cut_in'] = 3.
    data['cut_out'] = 25.
    x: Series = data.apply(clean_row, axis=1)
    data['energy_kwh'] = x
    # fill 99999 values
    data: DataFrame = fill_gaps(data)
    # downsample
    data_reduced: DataFrame = down_sample(data)

    return data_reduced

In [55]:
# energy_df = etl_plant(sql_energy, engine)

In [56]:
# energy_df.head()

In [57]:
def etl_weather(engine: Engine) -> DataFrame:
    data: DataFrame = pd.read_sql_query("SELECT * FROM pala_spain", con=engine)
    data['time']: Series = data['time'].astype('datetime64[ns]')
    data.sort_values(by='time', ascending=True, ignore_index=True, inplace=True)

    timestamp_s: Series = data['time'].map(datetime.timestamp)
    day: int = 24 * 60 * 60
    year: float = 365.2425 * day

    data['Day sin']: Series = np.sin(timestamp_s * (2 * np.pi / day))
    data['Day cos']: Series = np.cos(timestamp_s * (2 * np.pi / day))
    data['Year sin']: Series = np.sin(timestamp_s * (2 * np.pi / year))
    data['Year cos']: Series = np.cos(timestamp_s * (2 * np.pi / year))

    return data

In [58]:
# weather_df: DataFrame = etl_weather(engine)

In [59]:
def etl_final(engine: Engine) -> DataFrame:
    energy_df = etl_plant("SELECT * FROM energy_sotavento", engine)
    weather_df: DataFrame = etl_weather(engine)
    weather_df.drop(['lat', 'long'], axis=1, inplace=True)
    # merge on time column
    df: DataFrame = energy_df.merge(weather_df, left_on=['date'], right_on=['time'])
    df.drop(['date', 'cut_in', 'cut_out'], axis=1, inplace=True)
    df.sort_values(by='time', ascending=True, ignore_index=True, inplace=True)
    
    return df

In [16]:
df: DataFrame = etl_final(engine)

In [17]:
df.head()

Unnamed: 0,speed_ms,direction_deg,energy_kwh,time,dewpoint_2m_K,temperature_K,dewpoint_2m_C,temperature_C,mean_sealev_pressure_hPa,surface_pressure_hPa,precipitation_m,wind_speed_10_ms,u_wind_10_ms,v_wind_10_ms,instant_wind_gust_ms,post_process_wind_gust_ms,Day sin,Day cos,Year sin,Year cos
0,3.85,12.0,39.9,2019-01-01 01:00:00,275.4310278,276.6976326,2.281027806,3.547632644,1031.444331,984.5231871,0,1.325861318,-1.235493975,0.481105887,3.03949853,3.031219384,-5.030095e-12,1.0,0.002021,0.999998
1,3.59,12.0,0.0,2019-01-01 02:00:00,275.2087099,276.8058609,2.058709861,3.655860895,1031.440628,984.4875137,0,1.353927523,-1.205581364,0.616192595,3.184290411,3.159748813,0.258819,0.965926,0.002738,0.999996
2,2.88,12.0,0.0,2019-01-01 03:00:00,275.785831,276.87786,2.635830988,3.727859972,1031.58613,984.6006387,0,1.312767742,-1.143080019,0.645544122,3.320360785,3.291467542,0.5,0.866025,0.003455,0.999994
3,2.6,12.0,0.0,2019-01-01 04:00:00,275.4393907,276.605341,2.289390745,3.455340972,1031.441781,984.4250386,0,1.305021676,-1.122577303,0.665508657,3.476863216,3.460718787,0.7071068,0.707107,0.004172,0.999991
4,0.35,12.0,0.0,2019-01-01 05:00:00,275.3677069,276.4257723,2.217706911,3.275772307,1031.435287,984.3729871,0,1.268083046,-1.098674945,0.633204531,3.438923003,3.482484961,0.8660254,0.5,0.004888,0.999988


In [18]:
df.dtypes

speed_ms                            float64
direction_deg                       float64
energy_kwh                          float64
time                         datetime64[ns]
dewpoint_2m_K                        object
temperature_K                        object
dewpoint_2m_C                        object
temperature_C                        object
mean_sealev_pressure_hPa             object
surface_pressure_hPa                 object
precipitation_m                      object
wind_speed_10_ms                     object
u_wind_10_ms                         object
v_wind_10_ms                         object
instant_wind_gust_ms                 object
post_process_wind_gust_ms            object
Day sin                             float64
Day cos                             float64
Year sin                            float64
Year cos                            float64
dtype: object

# ADDING METEOMATICS WEATHER DATA

In [63]:
datapath: str = r'C:\Users\Lorenzo\PycharmProjects\TFT\outputs\data\sotavento\data\sotavento\data'

In [64]:
# extract
def extract_mm(engine: Engine) -> DataFrame:
    weather_mm: DataFrame = pd.read_sql_query("SELECT *FROM meteomatics_sotavento", con=engine)
    weather_mm['date'] = weather_mm['date'].astype('datetime64[s]')
    weather_mm.rename(columns={'lon': 'long'}, inplace=True)

    return weather_mm

In [65]:
weather_mm = extract_mm(engine)

In [49]:
weather_mm.dtypes

lat                                   object
lon                                   object
date                          datetime64[ns]
dew_point_2m_c                       float64
t_2m_c                               float64
msl_pressure_hpa                     float64
sfc_pressure_hpa                     float64
precip_1h_mm                         float64
wind_speed_mean_10m_1h_ms            float64
wind_speed_mean_100m_1h_ms           float64
wind_dir_mean_100m_1h_d              float64
wind_dir_mean_10m_1h_d               float64
wind_gusts_10m_1h_ms                 float64
wind_gusts_100m_1h_ms                float64
wind_gusts_100m_ms                   float64
wind_gusts_10m_ms                    float64
dtype: object

In [50]:
weather_mm.head()

Unnamed: 0,lat,lon,date,dew_point_2m_c,t_2m_c,msl_pressure_hpa,sfc_pressure_hpa,precip_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
0,43.354375,-7.881212,2019-01-01 02:00:00,-12.0,3.0,1030.9,955.1,0.0,2.4,3.0,141.7,142.6,3.3,4.0,4.0,3.3
1,43.354375,-7.881212,2019-01-01 03:00:00,-11.7,2.5,1031.0,955.1,0.0,2.4,3.2,149.8,145.8,3.7,4.6,4.6,3.7
2,43.354375,-7.881212,2019-01-01 04:00:00,-10.6,2.3,1031.3,955.3,0.0,2.5,3.4,159.9,147.8,3.7,4.6,4.4,3.6
3,43.354375,-7.881212,2019-01-01 05:00:00,-10.2,2.3,1031.5,955.5,0.0,2.4,3.2,165.7,145.5,3.6,4.4,4.2,3.4
4,43.354375,-7.881212,2019-01-01 06:00:00,-10.4,1.8,1031.2,955.3,0.0,2.2,2.8,168.8,140.0,3.4,4.2,3.4,3.0


In [66]:
energy_df = etl_plant("SELECT * FROM energy_sotavento", engine)
weather_mm = extract_mm(engine)
weather_mm.drop(['lat', 'long'], axis=1, inplace=True)
df: DataFrame = energy_df.merge(weather_mm, on=['date'])

In [67]:
df.head()

Unnamed: 0,speed_ms,direction_deg,date,energy_kwh,cut_in,cut_out,dew_point_2m_c,t_2m_c,msl_pressure_hpa,sfc_pressure_hpa,precip_1h_mm,wind_speed_mean_10m_1h_ms,wind_speed_mean_100m_1h_ms,wind_dir_mean_100m_1h_d,wind_dir_mean_10m_1h_d,wind_gusts_10m_1h_ms,wind_gusts_100m_1h_ms,wind_gusts_100m_ms,wind_gusts_10m_ms
0,3.59,12.0,2019-01-01 02:00:00,0.0,3.0,25.0,-12.0,3.0,1030.9,955.1,0.0,2.4,3.0,141.7,142.6,3.3,4.0,4.0,3.3
1,2.88,12.0,2019-01-01 03:00:00,0.0,3.0,25.0,-11.7,2.5,1031.0,955.1,0.0,2.4,3.2,149.8,145.8,3.7,4.6,4.6,3.7
2,2.6,12.0,2019-01-01 04:00:00,0.0,3.0,25.0,-10.6,2.3,1031.3,955.3,0.0,2.5,3.4,159.9,147.8,3.7,4.6,4.4,3.6
3,0.35,12.0,2019-01-01 05:00:00,0.0,3.0,25.0,-10.2,2.3,1031.5,955.5,0.0,2.4,3.2,165.7,145.5,3.6,4.4,4.2,3.4
4,2.04,12.0,2019-01-01 06:00:00,0.0,3.0,25.0,-10.4,1.8,1031.2,955.3,0.0,2.2,2.8,168.8,140.0,3.4,4.2,3.4,3.0
