In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import pickle

In [None]:
def split_datetime(data, col="datetime"):
    """Derive integer calendar features from one datetime column.

    The new columns are written into ``data`` itself (the frame is modified
    in place) and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the datetime column ``col``.
    col : str, default "datetime"
        Name of the datetime column the features are derived from.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the columns ``year``, ``month``, ``week``, ``hour``,
        ``day_of_year``, ``day_of_month`` and ``day_of_week`` added
        (or overwritten if they already exist).
    """
    # Diagnostic output: timezone of every column matched by 'datetime64'.
    for name in data.select_dtypes(include='datetime64').columns:
        print(f"Timezone for {name} is {data[name].dt.tz}")

    stamps = data[col].dt

    # Feature name -> values, listed in the order the columns are appended.
    # 'week' is the ISO calendar week; no 'quarter' feature is produced.
    derived = {
        'year': stamps.year,
        'month': stamps.month,
        'week': stamps.isocalendar().week,
        'hour': stamps.hour,
        'day_of_year': stamps.day_of_year,
        'day_of_month': stamps.day,
        'day_of_week': stamps.day_of_week,
    }
    for feature, values in derived.items():
        data[feature] = values

    return data

# Merge the test data with the auxiliary data sets

In [None]:
# Load the example test files; date/time columns are parsed right after reading.

## Client data
client = (
    pd.read_csv('../data/example_test_files/client.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

## Electricity prices
electricity_prices = (
    pd.read_csv('../data/example_test_files/electricity_prices.csv')
    .assign(
        forecast_date=lambda frame: pd.to_datetime(frame['forecast_date']),
        origin_date=lambda frame: pd.to_datetime(frame['origin_date']),
    )
)

## Forecast weather
forecast_weather = (
    pd.read_csv('../data/example_test_files/forecast_weather.csv')
    .assign(
        origin_datetime=lambda frame: pd.to_datetime(frame['origin_datetime']),
        forecast_datetime=lambda frame: pd.to_datetime(frame['forecast_datetime']),
    )
)

## Gas prices
gas_prices = (
    pd.read_csv('../data/example_test_files/gas_prices.csv')
    .assign(
        forecast_date=lambda frame: pd.to_datetime(frame['forecast_date']),
        origin_date=lambda frame: pd.to_datetime(frame['origin_date']),
    )
)

## Historical weather
historical_weather = (
    pd.read_csv('../data/example_test_files/historical_weather.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
)

## Test data: `prediction_datetime` is parsed into a new `datetime` column,
## the original string column is kept.
test = (
    pd.read_csv('../data/example_test_files/test.csv')
    .assign(
        datetime=lambda frame: pd.to_datetime(
            frame['prediction_datetime'], format='%Y-%m-%d %H:%M:%S'
        )
    )
)

## Lookup table used further down to attach a county to weather coordinates
weather_station_to_county_mapping = pd.read_csv('../data/weather_station_to_county_mapping.csv')



## Data merging (everything is left-joined onto the test frame)
### Merge client
# Join keys keep their names; every other client column gets a '_client' suffix.
client_keys = ['data_block_id', 'county', 'is_business', 'product_type']
client = client.rename(
    columns=lambda name: name if name in client_keys else f"{name}_client"
)

# Left join: every test row is kept, client values are attached where keys match.
merged_df = test.merge(client, how='left', on=client_keys)


### Merge gas prices
# Suffix every gas price column except the join key 'data_block_id'.
gas_prices = gas_prices.rename(
    columns=lambda name: name if name == 'data_block_id' else f"{name}_gas_prices"
)

# Attach the gas prices of the matching data block to each row.
merged_df = merged_df.merge(gas_prices, how='left', on=['data_block_id'])



### Merge electricity prices
# Both sides get a 'time_of_day' column (datetime.time objects) so the prices
# can be joined on data block + clock time.
merged_df = merged_df.assign(time_of_day=merged_df['datetime'].dt.time)

electricity_prices = (
    electricity_prices
    .assign(time_of_day=electricity_prices['forecast_date'].dt.time)
    # suffix everything except the two join keys
    .rename(
        columns=lambda name: name
        if name in ('time_of_day', 'data_block_id')
        else f"{name}_electricity_prices"
    )
)

merged_df = merged_df.merge(
    electricity_prices, how='left', on=['data_block_id', 'time_of_day']
)

### Merge historical weather
# Counties come from weather_station_to_county_mapping, joined on coordinates.
# Coordinates are rounded to one decimal on both sides so that differing
# precision does not prevent a match.
historical_weather = historical_weather.assign(
    latitude=historical_weather['latitude'].round(1),
    longitude=historical_weather['longitude'].round(1),
)
weather_station_to_county_mapping = weather_station_to_county_mapping.assign(
    latitude=weather_station_to_county_mapping['latitude'].round(1),
    longitude=weather_station_to_county_mapping['longitude'].round(1),
)

merged_hist_weather = (
    historical_weather
    # attach the county of each coordinate pair
    .merge(weather_station_to_county_mapping, how='left', on=['latitude', 'longitude'])
    # clock time, used as a join key against merged_df
    .assign(time_of_day=lambda frame: frame['datetime'].dt.time)
    # average all numeric measurements of the locations that share a county
    .groupby(['county', 'time_of_day', 'datetime', 'data_block_id'])
    .mean(numeric_only=True)
    .reset_index()
    # suffix everything except the join keys
    .rename(
        columns=lambda name: name
        if name in ('county', 'time_of_day', 'data_block_id')
        else f"{name}_hist_weather"
    )
)

merged_df = merged_df.merge(
    merged_hist_weather, how='left', on=['data_block_id', 'time_of_day', 'county']
)

### Merge forecast weather
# Round the coordinates to one decimal before joining on them.
forecast_weather = forecast_weather.assign(
    latitude=forecast_weather['latitude'].round(1),
    longitude=forecast_weather['longitude'].round(1),
)

merged_forecast_weather = (
    forecast_weather
    # attach the county of each coordinate pair
    .merge(weather_station_to_county_mapping, how='left', on=['latitude', 'longitude'])
    # average the numeric forecasts of locations that share a county
    .groupby(['county', 'forecast_datetime', 'data_block_id'])
    .mean(numeric_only=True)
    .reset_index()
    # suffix everything except the grouping keys
    .rename(
        columns=lambda name: name
        if name in ('county', 'forecast_datetime', 'data_block_id')
        else f"{name}_forecast_weather"
    )
)

# Attach the EET timezone to the test timestamps. ambiguous=True treats
# ambiguous wall-clock times as DST; nonexistent='shift_forward' moves
# non-existent times forward to the next valid instant.
merged_df['datetime_localized'] = merged_df['datetime'].dt.tz_localize(
    'EET', ambiguous=True, nonexistent='shift_forward'
)

# Express the forecast timestamps in EET as well (tz_convert needs tz-aware
# input; presumably these are UTC — verify against the CSV).
merged_forecast_weather['datetime_EET'] = (
    merged_forecast_weather['forecast_datetime'].dt.tz_convert('EET')
)

# Join the forecast for the matching data block, county and EET timestamp.
merged_df = merged_df.merge(
    merged_forecast_weather,
    how='left',
    left_on=['data_block_id', 'datetime_localized', 'county'],
    right_on=['data_block_id', 'datetime_EET', 'county'],
)


# Write the fully merged test frame to a Parquet file next to the example test files.
merged_df.to_parquet('../data/example_test_files/merged_test_df.parquet')


In [None]:
# Inspect which columns the merged test frame contains.
merged_df.columns

In [None]:
# Preview the first rows of the merged test frame.
merged_df.head()

## Prepare the data frame for modelling

In [None]:
# Replace the numeric weekday codes (0 = Monday ... 6 = Sunday) with weekday
# names and store them as a categorical column. Skipped when the frame has no
# 'day_of_week' column.
# NOTE(review): unless one of the CSVs already provides 'day_of_week', the
# column is only created further down by split_datetime, which writes integer
# codes and would overwrite the names mapped here — confirm the intended order.
if 'day_of_week' in merged_df.columns:
    weekday_map = dict(enumerate([
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ]))
    weekday_names = merged_df['day_of_week'].map(weekday_map)
    merged_df['day_of_week'] = weekday_names.astype('category')

In [None]:
# Cast the identifier-like columns to pandas' category dtype in one call.
merged_df = merged_df.astype({
    'county': 'category',
    'product_type': 'category',
})


In [None]:
# Build the modelling frame as a NEW frame so that merged_df stays untouched
# and this cell can be re-run: DataFrame.drop without inplace returns a copy.
# 'time_of_day' holds datetime.time objects (object dtype), which the model
# cannot handle.
model_df = merged_df.drop(columns='time_of_day')

# Split 'datetime' into integer calendar features (adds the columns in place
# on model_df and returns it).
model_df = split_datetime(model_df)

# The model cannot handle datetime columns. 'datetime' selects tz-naive and
# 'datetimetz' selects tz-aware columns of ANY timezone, so timestamps that are
# tz-aware in a zone other than EET are removed as well.
datetime_columns = model_df.select_dtypes(include=['datetime', 'datetimetz']).columns
model_df = model_df.drop(columns=datetime_columns)

# The test data has no 'target' column, so there are no NaN targets to drop here.

## Modelling

In [None]:
# Restore the pickled model (file name suggests the best XGBoost model of the
# grid search on the training data).
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('../models/XGBoost_first_best_model.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict on the test data.
# Features = model_df without the two test-only columns. Columns written by
# this cell or by the evaluation cells ('target_pred', 'target',
# 'target_corrected') are removed too when present, so re-running this cell
# does not feed earlier outputs back into the model.
# NOTE(review): 'row_id' stays in the feature frame — confirm the model was
# trained with it.
feature_df = (
    model_df
    .drop(['prediction_datetime', 'currently_scored'], axis=1)
    .drop(columns=['target_pred', 'target', 'target_corrected'], errors='ignore')
)

model_df['target_pred'] = model.predict(feature_df)

In [None]:
# For evaluation: load the revealed targets shipped with the example test files.

revealed_targets = pd.read_csv('../data/example_test_files/revealed_targets.csv')
# Preview the first rows.
revealed_targets.head()

In [None]:
# Attach the revealed target to each prediction via 'row_id'.
# A 'target' column left over from an earlier run of this cell is dropped
# first; otherwise the merge would create 'target_x'/'target_y' and the
# evaluation cells could no longer find 'target'.
# The merge uses pandas' default inner join, so rows whose 'row_id' has no
# revealed target are removed from model_df.
model_df = (
    model_df
    .drop(columns=['target'], errors='ignore')
    .merge(revealed_targets[['target', 'row_id']], on='row_id')
)

In [None]:
# Preview the modelling frame, now including predictions and revealed targets.
model_df.head()

In [None]:
# Mean Absolute Error of the raw predictions against the revealed targets.
mae = mean_absolute_error(
    y_true=model_df['target'],
    y_pred=model_df['target_pred'],
)

print(f"Mean Absolute Error (MAE): {mae}")

In [None]:
# Mean Absolute Error after correcting negative predictions.

# Negative predictions are sometimes produced -> floor them at zero.
# Series.clip is vectorised, so no Python function is called per row.
# NaN predictions (if any) are left as NaN by clip.
model_df['target_corrected'] = model_df['target_pred'].clip(lower=0)

mae = mean_absolute_error(model_df.target, model_df.target_corrected)

print(f"Mean Absolute Error (MAE): {mae}")

## Try working with the Enefit API

In [None]:
import sys
sys.path.append('../data')  # make the testing utility in ../data importable

import public_timeseries_testing_util as enefit


env = enefit.make_env()
iter_test = env.iter_test()

# Walk through the batches served by the API. Each batch is unpacked into the
# same names the notebook used above (test, client, ...), so those variables
# are rebound on every iteration.
counter = 0
for batch in iter_test:
    (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) = batch

    # Show the first three rows of every frame, for the first batch only.
    if counter == 0:
        for frame in batch:
            print(frame.head(3))

    # Placeholder submission: predict 0 for every row.
    sample_prediction['target'] = 0
    env.predict(sample_prediction)
    counter += 1



    ## continue from here