In [20]:
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import pytz
from dotenv import load_dotenv
from util.train import train_model
from util.sahkotin import update_spot
from util.fingrid import update_nuclear
from util.entso_e import entso_e_nuclear
from util.fmi import update_wind_speed, update_temperature

In [21]:
# Load configuration and secrets from .env.local into the process environment.
# load_dotenv() does not raise when the file is missing; it reports that no
# variable was set by returning False. The return value is therefore checked in
# addition to catching genuine read/parse errors, so the hint below is actually
# shown when the file has not been created.
try:
    env_file_loaded = load_dotenv('.env.local')  # take environment variables from .env.local
except Exception:
    env_file_loaded = False

if not env_file_loaded:
    print("Error loading .env.local file. Did you create one? See README.md.")

# Fetch mandatory environment variables and raise exceptions if they are missing
def get_mandatory_env_variable(name):
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's value exactly as stored in the environment.

    Raises:
        ValueError: If the variable is unset, or is set to an empty or
            whitespace-only string (for example a bare ``KEY=`` line); a blank
            value is as unusable as a missing one and is rejected up front.
    """
    value = os.getenv(name)
    if value is None or not value.strip():
        raise ValueError(f"Mandatory variable {name} not set in environment")
    return value

try:
    # Configuration and secrets, mandatory:
    fingrid_api_key = get_mandatory_env_variable('FINGRID_API_KEY')
    entso_e_api_key = get_mandatory_env_variable('ENTSO_E_API_KEY')
    fmisid_ws_env = get_mandatory_env_variable('FMISID_WS')
    fmisid_t_env = get_mandatory_env_variable('FMISID_T')

    # Turn the comma-separated station id lists into prefixed names
    # ('ws_<id>' and 't_<id>'). Whitespace around each id is stripped and empty
    # entries (e.g. from a trailing comma) are skipped, so "1, 2," yields the
    # same names as "1,2".
    fmisid_ws = ['ws_' + station_id.strip() for station_id in fmisid_ws_env.split(',') if station_id.strip()]
    fmisid_t = ['t_' + station_id.strip() for station_id in fmisid_t_env.split(',') if station_id.strip()]

except ValueError as e:
    print(f"Error: {e}")
    # exit() is a helper injected for interactive use and is not guaranteed to
    # exist in every execution context; raise SystemExit directly instead.
    raise SystemExit(1)


In [22]:
# Read the stored data set. `tmp` keeps the values exactly as they were read,
# while `df` is an independent copy that the following cells modify.
tmp = pd.read_csv('data/data.csv')
df = tmp.copy()

In [23]:

# Ensure 'timestamp' column is in datetime format and use it as the index
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp')

# We operate from this moment back and forward: the current UTC time rounded up
# to the next full hour. ceil('h') leaves a timestamp that is already on a full
# hour unchanged, so no explicit minute/second check is needed.
# Timestamp.now(tz='UTC') is the replacement for the deprecated Timestamp.utcnow().
now = pd.Timestamp.now(tz='UTC').ceil('h')

# Drop rows that are older than a week, unless we intend to do a retrospective prediction update after a model update
#df = df[df.index > now - pd.Timedelta(days=7)]

# Extend the index with hourly timestamps for 5*24 = 120 hours ahead; hours
# that were not in the data yet become new rows with NaN in every column
start_time = now + pd.Timedelta(hours=1)  # Start from the next hour
end_time = now + pd.Timedelta(hours=120)  # 5 days ahead
new_index = pd.date_range(start=start_time, end=end_time, freq='h')
df = df.reindex(df.index.union(new_index))

# Turn the index back into a column named 'Timestamp' before the update functions.
# The index is named explicitly instead of relying on reset_index() falling
# back to the column name 'index' for an unnamed index.
df = df.rename_axis('Timestamp').reset_index()

# FMI wind speed values for the data frame, past and future
# NOTE: To save on API calls, this won't backfill history beyond 7 days even if asked
df = update_wind_speed(df)

# FMI temperature values for the data frame, past and future
# NOTE: To save on API calls, this won't backfill history beyond 7 days even if asked
df = update_temperature(df)

# Nuclear power data for the data frame; the future is inferred from the last known value
# NOTE: To save on API calls, this won't backfill history beyond 7 days even if asked
df = update_nuclear(df, fingrid_api_key=fingrid_api_key)

# Future nuclear downtime information from ENTSO-E unavailability data, h/t github:@pkautio
df_entso_e = entso_e_nuclear(entso_e_api_key)

# Overwrite the previously inferred nuclear power numbers with the ENTSO-E
# figure in every row whose 'Timestamp' equals an ENTSO-E timestamp
for entso_row in df_entso_e.itertuples(index=False):
    same_timestamp = df['Timestamp'] == entso_row.timestamp
    df.loc[same_timestamp, 'NuclearPowerMW'] = entso_row.NuclearPowerMW

# Where nuclear power is still missing, fall back to the value stored under the
# same row label in the data as originally loaded (`tmp`).
# fillna() aligns both Series on the index, so this is a single vectorized step,
# and labels that `tmp` does not contain (such as newly appended forecast hours)
# simply stay NaN instead of raising KeyError as a per-row tmp.loc[...] lookup would.
# NOTE(review): rows are matched by index label, not by timestamp; this assumes
# the row order has not changed since `tmp` was copied - confirm.
df['NuclearPowerMW'] = df['NuclearPowerMW'].fillna(tmp['NuclearPowerMW'])

# Spot prices for the data frame, past and future if any
# NOTE: To save on API calls, this won't backfill history beyond 7 days even if asked
df = update_spot(df)

# TODO: Decide if including wind power capacity is necessary; it seems to worsen the MSE and R2
# For now we'll drop it
#df = df.drop(columns=['WindPowerCapacityMW'])

# print("Filled-in dataframe before predict:\n", df)
days_covered = len(df) // 24  # whole days, one row per hour
print("→ Days of data coverage (should be 7 back, 5 forward for now): ", days_covered)

# Calendar features for the model, all derived from the (datetime) timestamp
timestamps = pd.to_datetime(df['Timestamp'])
df = df.assign(
    Timestamp=timestamps,
    day_of_week=timestamps.dt.dayofweek + 1,  # pandas counts Monday as 0, so 1 = Monday ... 7 = Sunday
    hour=timestamps.dt.hour,
    month=timestamps.dt.month,
)

* Fetching wind speed forecast and historical data between 2024-03-06 and 2024-03-18
* Fetching temperature forecast and historical data between 2024-03-06 and 2024-03-18
* Fetching nuclear power production data between 2024-03-06 and 2024-03-18 and inferring missing values


  hourly_nuclear_df = nuclear_df.resample('H').mean().reset_index()
  merged_df['NuclearPowerMW'] = merged_df['NuclearPowerMW'].fillna(method='ffill')


* Fingrid: Fetched 332 hours, aggregated to 28 hourly averages spanning from 2024-03-12 to 2024-03-13
→ Fingrid: Using last known nuclear power production value: 2786 MW
* ENTSO-E: Fetching nuclear downtime messages...
                                              start                       end  \
created_doc_time                                                                
2024-03-02 06:29:11+02:00 2024-03-02 00:00:00+02:00 2024-04-08 01:00:00+03:00   
2024-03-13 09:01:20+02:00 2024-03-14 06:00:00+02:00 2024-03-14 14:00:00+02:00   

                          avail_qty  nominal_power production_resource_name  
created_doc_time                                                             
2024-03-02 06:29:11+02:00         0         1600.0              Olkiluoto 3  
2024-03-13 09:01:20+02:00       530          890.0              Olkiluoto 1  
Empty DataFrame
Columns: [start, end, avail_qty, nominal_power, production_resource_name]
Index: []
                                            

In [24]:
# Switch the 'Timestamp' column to lower-case 'timestamp'; other columns are untouched
df = df.rename({'Timestamp': 'timestamp'}, axis='columns')

In [72]:
# Target moment: midnight at the start of tomorrow in Helsinki, expressed in UTC.

# Get the current time in Helsinki timezone
helsinki_tz = pytz.timezone('Europe/Helsinki')
now = datetime.now(tz=helsinki_tz)

# Build the wall-clock midnight from the Helsinki calendar date and let pytz
# attach the matching UTC offset. Starting from a naive datetime.now() would
# make the result follow the time zone of the machine running the notebook.
tomorrow = (now + timedelta(days=1)).date()
specific_timestamp = helsinki_tz.localize(datetime(tomorrow.year, tomorrow.month, tomorrow.day))

# Convert to UTC+0 timezone
utc_tz = pytz.timezone('UTC')
specific_timestamp = specific_timestamp.astimezone(utc_tz)


In [74]:
# Make sure the column holds datetimes before comparing it with specific_timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Keep only the row(s) whose timestamp equals specific_timestamp
is_target_timestamp = df['timestamp'].eq(specific_timestamp)
row = df.loc[is_target_timestamp]

In [31]:
# Exclude every row that has a missing value in any column. Despite its name,
# df_nan is the NaN-free subset of df.
df_nan = df.dropna(axis='index', how='any')

In [34]:
# Train on the complete rows only. train_model returns seven values, unpacked
# here; rf_trained is the object later used for prediction.
(
    mae, mse, r2,
    samples_mae, samples_mse, samples_r2,
    rf_trained,
) = train_model(df_nan, fmisid_ws=fmisid_ws, fmisid_t=fmisid_t)

→ Feature Importance:
       Feature  Importance
NuclearPowerMW    0.164440
     ws_101256    0.154898
      t_100968    0.150701
      t_101339    0.122489
      t_101118    0.096937
      t_101786    0.062205
     ws_101673    0.058043
          hour    0.049524
         month    0.049020
     ws_101846    0.031594
   day_of_week    0.031359
     ws_101267    0.028790
→ Durbin-Watson autocorrelation test: 2.04
→ ACF values for the first 5 lags:
  Lag 1: 1.0000
  Lag 2: -0.0206
  Lag 3: -0.0075
  Lag 4: 0.0154
  Lag 5: -0.0001
  Lag 6: -0.0058


In [69]:
# Index label of the first row in `row` (the rows of df selected by timestamp);
# raises IndexError if `row` is empty
row.index[0]

10510

In [35]:
# Predict a price for every row of df from the calendar, nuclear, wind speed
# and temperature columns.
# NOTE(review): despite its name, price_df is indexed positionally below and is
# presumably a plain array with one value per row of df - confirm.
feature_columns = ['day_of_week', 'hour', 'month', 'NuclearPowerMW'] + fmisid_ws + fmisid_t
price_df = rf_trained.predict(df[feature_columns])

# Drop the 'PricePredict_cpkWh' column if the data still carries it
if 'PricePredict_cpkWh' in df.columns:
    df = df.drop(columns=['PricePredict_cpkWh'])

# Without a matching row there is no starting point for the predictions; fail
# with a clear message rather than an IndexError from row.index[0].
if row.empty:
    raise ValueError(
        f"No row in df matches timestamp {specific_timestamp}; "
        "cannot determine where predictions should start"
    )

# Store predictions for every row after the matched one in a single assignment;
# earlier rows keep whatever 'predicted_spot_price' they already had.
# NOTE(review): like the row-by-row version this replaces, this assumes df has
# the default 0..n-1 index so that labels and array positions coincide - confirm.
first_predicted = row.index[0] + 1
if first_predicted < len(price_df):
    df.loc[first_predicted:, 'predicted_spot_price'] = price_df[first_predicted:]

In [66]:
# Write the updated frame to data/data.csv without the index column; any
# existing file at that path is overwritten
output_path = 'data/data.csv'
df.to_csv(output_path, index=False)