In [1]:
# crucial for import API interface and loading data
ON_KAGGLE: bool = False

In [2]:
if not ON_KAGGLE:
    import sys
    sys.path.append('../imports')
    from helper_functions import split_datetime
    from actpred_plot import plot_actual_vs_pred
    from data_preprocessing import merge_data, remove_col

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [3]:
if ON_KAGGLE:
    DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
else:
    DATA_DIR = "../data/"

# Read CSVs and parse relevant date columns
train = pd.read_csv(DATA_DIR + "train.csv")
client = pd.read_csv(DATA_DIR + "client.csv")
historical_weather = pd.read_csv(DATA_DIR + "historical_weather.csv")
forecast_weather = pd.read_csv(DATA_DIR + "forecast_weather.csv")
electricity_prices = pd.read_csv(DATA_DIR + "electricity_prices.csv")
gas_prices = pd.read_csv(DATA_DIR + "gas_prices.csv")
weather_station_to_county_mapping = pd.read_csv(DATA_DIR + 'weather_station_to_county_mapping.csv')


In [4]:
# We merge all DataFrames 
merged_df = merge_data(
    train, client, historical_weather, forecast_weather, 
    electricity_prices, gas_prices, weather_station_to_county_mapping
)

# Drop all non needed columns (ids and timestamps)
merged_df = remove_col(merged_df, drop_row_id=False)

## Training & Model Building

In [5]:
drop_columns = [
    'target', 'hours_ahead_forecast_weather',
    'row_id', 'data_block_id', 'prediction_unit_id', 
    'longitude_hist_weather', 'latitude_hist_weather',
    'longitude_forecast_weather', 'latitude_forecast_weather'
]

selected_fields = ['county', 'is_business', 'product_type', 'is_consumption',
       'eic_count_client', 'installed_capacity_client',
       'rain_hist_weather', 'snowfall_hist_weather',
       'cloudcover_total_hist_weather', 'cloudcover_mid_hist_weather',
       'cloudcover_high_hist_weather', 'diffuse_radiation_hist_weather',
       'temperature_forecast_weather', 'dewpoint_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'total_precipitation_forecast_weather', 'year', 'week', 'hour',
       'day_of_year', 'day_of_week'
]

In [6]:
#X_train, X_test, y_train,  y_test = train_test_split(model_df.drop('target', axis=1), model_df['target'], test_size=0.3, random_state=0)



model = XGBRegressor(enable_categorical=True, max_depth=9, learning_rate=0.3)
model.fit(merged_df.drop(['row_id', 'target'], axis=1)[selected_fields], merged_df.target)

# y_pred = bst.predict(X_test)

## main optimisation metric
# print('Mean absolute error test', mean_absolute_error(y_test, y_pred))
# print('Mean absolute error train', mean_absolute_error(y_train, bst.predict(X_train)))

In [7]:
merged_df.sample(10)

Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,eic_count_client,installed_capacity_client,lowest_price_per_mwh_gas_prices,highest_price_per_mwh_gas_prices,...,surface_solar_radiation_downwards_forecast_weather,snowfall_forecast_weather,total_precipitation_forecast_weather,year,month,week,hour,day_of_year,day_of_month,day_of_week
1821953,14,0,1,55.007,1,1821953,48.0,561.5,46.34,51.88,...,0.0,7.947286e-07,2e-06,2023,3,13,0,90,31,Friday
1738360,10,1,1,4.342,0,1738360,19.0,474.0,50.31,54.69,...,75.609444,5.676062e-05,5.5e-05,2023,3,9,16,63,4,Saturday
704559,3,0,3,5.302,1,704559,55.0,798.83,90.01,106.99,...,495.601481,0.0,0.0,2022,4,15,14,107,17,Sunday
189971,8,1,3,194.555,1,189971,17.0,751.7,59.0,87.9,...,0.0,0.0,0.000229,2021,11,44,18,307,3,Wednesday
882382,7,0,3,1015.43,0,882382,182.0,2187.28,92.0,99.9,...,594.568,0.0,1.7e-05,2022,6,23,16,162,11,Saturday
1809497,8,0,1,29.362,1,1809497,31.0,671.5,44.49,60.0,...,0.0,0.0,1.2e-05,2023,3,13,3,86,27,Monday
665366,0,1,0,2.6,0,665366,10.0,603.0,109.0,133.84,...,176.087593,0.0004715238,0.000471,2022,4,14,14,95,5,Tuesday
190297,1,1,3,35.34,1,190297,12.0,295.5,59.0,87.9,...,0.0,0.0,0.000211,2021,11,44,21,307,3,Wednesday
1351403,15,0,3,123.627,1,1351403,104.0,1190.99,99.29,128.0,...,0.0,0.0,1e-06,2022,11,44,20,308,4,Friday
1495028,6,1,3,0.0,0,1495028,18.0,645.2,112.83,148.0,...,0.0,1.395093e-05,0.000295,2022,12,50,18,352,18,Sunday


In [8]:

def data_prep(test, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction, weather_station_to_county_mapping):        

    # Datatype conversion
    client.date = pd.to_datetime(client.date)

    ## Electricity Prices Data
    electricity_prices.forecast_date = pd.to_datetime(electricity_prices.forecast_date)
    electricity_prices.origin_date = pd.to_datetime(electricity_prices.origin_date)

    ## Forecast Weather Data
    forecast_weather.origin_datetime = pd.to_datetime(forecast_weather.origin_datetime)
    forecast_weather.forecast_datetime = pd.to_datetime(forecast_weather.forecast_datetime)

    ## Gas Prices Data
    gas_prices.forecast_date = pd.to_datetime(gas_prices.forecast_date)
    gas_prices.origin_date = pd.to_datetime(gas_prices.origin_date)

    ## Historical Weather Data
    historical_weather.datetime = pd.to_datetime(historical_weather.datetime)

    ## Train Data & Checking for NULL values
    test['datetime'] = pd.to_datetime(test.prediction_datetime, format='%Y-%m-%d %H:%M:%S')

    ## Data Merging (now we merge everything to test)
    ### Merge Client
    # append '_client' to merged columns
    client.columns = [f"{column}_client" if column not in ['county', 'is_business', 'product_type'] else column for column in client.columns]

    # merge train and client
    merged_df = pd.merge(test, client, on=['county', 'is_business', 'product_type'], how='left')

    ### Merge Gas Prices

    # merge gas_prices
    merged_df["lowest_price_per_mwh_gas_prices"] = gas_prices.lowest_price_per_mwh.min()
    merged_df["highest_price_per_mwh_gas_prices"] = gas_prices.highest_price_per_mwh.max()

    ### Merge Electricity Prices
    # add time column for merging with electricity data
    merged_df['time_of_day'] = merged_df['datetime'].dt.time

    # Merge electricity prices
    # the prices are available hourly -> create new column with time 
    electricity_prices['time_of_day'] = electricity_prices.forecast_date.dt.time

    # append electricity_prices to column names
    electricity_prices.columns = [f"{column}_electricity_prices" if column not in ['time_of_day'] else column for column in electricity_prices.columns]

    ### Merge Electricity Prices
    # merge electricity_prices
    merged_df = pd.merge(merged_df, electricity_prices, on = ['time_of_day'], how='left')

    ### Merge Historical Weather
    # get county and county_name from weather_station_to_county_mapping (merge on latitude and longitude)

    # round lat and long to avoid mismatching due to different accuracy
    historical_weather.latitude = historical_weather.latitude.astype("float").round(1)
    historical_weather.longitude = historical_weather.longitude.astype("float").round(1)
    
    weather_station_to_county_mapping.latitude = weather_station_to_county_mapping.latitude.astype("float").round(1)
    weather_station_to_county_mapping.longitude = weather_station_to_county_mapping.longitude.astype("float").round(1)

    # merge historical weather to get counties
    merged_hist_weather = pd.merge(historical_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # get time of day
    merged_hist_weather['time_of_day'] = merged_hist_weather['datetime'].dt.time
    
    # aggregate by county and time (summarize weather stations for same county)
    merged_hist_weather = merged_hist_weather.groupby(['county', 'time_of_day', 'datetime']).mean(numeric_only=True).reset_index()
    
    # append _hist_weather to column names
    merged_hist_weather.columns = [f"{column}_hist_weather" if column not in ['county', 'time_of_day'] else column for column in merged_hist_weather.columns]

    # merge to merged_df
    merged_df = pd.merge(merged_df, merged_hist_weather, on=['time_of_day', 'county'], how='left')

    ### Merge Forecast Weather
    # forecast weather

    #round lat and long
    forecast_weather.latitude = forecast_weather.latitude.astype("float").round(1)
    forecast_weather.longitude = forecast_weather.longitude.astype("float").round(1)

    # merge to get counties
    merged_forecast_weather = pd.merge(forecast_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # merged_forecast_weather['time_of_day'] = merged_forecast_weather.

    # # aggregate for duplicate locations
    merged_forecast_weather = merged_forecast_weather.groupby(['county', 'forecast_datetime']).mean(numeric_only=True).reset_index()

    # append forecast_weather to column names
    merged_forecast_weather.columns = [f"{column}_forecast_weather" if column not in ['county', 'forecast_datetime'] else column for column in merged_forecast_weather.columns]


    # merge forecast_weather
    merged_df = pd.merge(merged_df, merged_forecast_weather, left_on=['datetime', 'county'], right_on=['forecast_datetime', 'county'], how='left')
    
    # split datetime into meaningful features of int types
    merged_df = split_datetime(merged_df)
    
    # mapping days of the week names and converting to categorical variable
    if 'day_of_week' in merged_df.columns:
        weekday_map = {
            0: 'Monday',
            1: 'Tuesday',
            2: 'Wednesday',
            3: 'Thursday',
            4: 'Friday',
            5: 'Saturday',
            6: 'Sunday'
        }
    merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')
    # encode categories to category datetype

    merged_df['county'] = merged_df['county'].astype('category')
    merged_df['product_type'] = merged_df['product_type'].astype('category')
    
    # model is not able to handle object type
    merged_df.drop('time_of_day', axis=1, inplace=True)

    # model is not able to handle datetime
    merged_df = merged_df.drop(merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, EET]', 'object']).columns, axis=1)
    
    drop_columns = [
    'hours_ahead_forecast_weather',
    'row_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather',
    'currently_scored'
    ]
    
    merged_df.drop(drop_columns, axis=1, inplace=True)

    return merged_df

In [9]:

def data_prep_with_row_id(test, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction, weather_station_to_county_mapping):        

    # Datatype conversion
    client.date = pd.to_datetime(client.date)

    ## Electricity Prices Data
    electricity_prices.forecast_date = pd.to_datetime(electricity_prices.forecast_date)
    electricity_prices.origin_date = pd.to_datetime(electricity_prices.origin_date)

    ## Forecast Weather Data
    forecast_weather.origin_datetime = pd.to_datetime(forecast_weather.origin_datetime)
    forecast_weather.forecast_datetime = pd.to_datetime(forecast_weather.forecast_datetime)

    ## Gas Prices Data
    gas_prices.forecast_date = pd.to_datetime(gas_prices.forecast_date)
    gas_prices.origin_date = pd.to_datetime(gas_prices.origin_date)

    ## Historical Weather Data
    historical_weather.datetime = pd.to_datetime(historical_weather.datetime)

    ## Train Data & Checking for NULL values
    test['datetime'] = pd.to_datetime(test.prediction_datetime, format='%Y-%m-%d %H:%M:%S')

    ## Data Merging (now we merge everything to test)
    ### Merge Client
    # append '_client' to merged columns
    client.columns = [f"{column}_client" if column not in ['county', 'is_business', 'product_type'] else column for column in client.columns]

    # merge train and client
    merged_df = pd.merge(test, client, on=['county', 'is_business', 'product_type'], how='left')

    ### Merge Gas Prices

    # merge gas_prices
    merged_df["lowest_price_per_mwh_gas_prices"] = gas_prices.lowest_price_per_mwh.min()
    merged_df["highest_price_per_mwh_gas_prices"] = gas_prices.highest_price_per_mwh.max()

    ### Merge Electricity Prices
    # add time column for merging with electricity data
    merged_df['time_of_day'] = merged_df['datetime'].dt.time

    # Merge electricity prices
    # the prices are available hourly -> create new column with time 
    electricity_prices['time_of_day'] = electricity_prices.forecast_date.dt.time

    # append electricity_prices to column names
    electricity_prices.columns = [f"{column}_electricity_prices" if column not in ['time_of_day'] else column for column in electricity_prices.columns]

    ### Merge Electricity Prices
    # merge electricity_prices
    merged_df = pd.merge(merged_df, electricity_prices, on = ['time_of_day'], how='left')

    ### Merge Historical Weather
    # get county and county_name from weather_station_to_county_mapping (merge on latitude and longitude)

    # round lat and long to avoid mismatching due to different accuracy
    historical_weather.latitude = historical_weather.latitude.astype("float").round(1)
    historical_weather.longitude = historical_weather.longitude.astype("float").round(1)
    
    weather_station_to_county_mapping.latitude = weather_station_to_county_mapping.latitude.astype("float").round(1)
    weather_station_to_county_mapping.longitude = weather_station_to_county_mapping.longitude.astype("float").round(1)

    # merge historical weather to get counties
    merged_hist_weather = pd.merge(historical_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # get time of day
    merged_hist_weather['time_of_day'] = merged_hist_weather['datetime'].dt.time
    
    # aggregate by county and time (summarize weather stations for same county)
    merged_hist_weather = merged_hist_weather.groupby(['county', 'time_of_day', 'datetime']).mean(numeric_only=True).reset_index()
    
    # append _hist_weather to column names
    merged_hist_weather.columns = [f"{column}_hist_weather" if column not in ['county', 'time_of_day'] else column for column in merged_hist_weather.columns]

    # merge to merged_df
    merged_df = pd.merge(merged_df, merged_hist_weather, on=['time_of_day', 'county'], how='left')

    ### Merge Forecast Weather
    # forecast weather

    #round lat and long
    forecast_weather.latitude = forecast_weather.latitude.astype("float").round(1)
    forecast_weather.longitude = forecast_weather.longitude.astype("float").round(1)

    # merge to get counties
    merged_forecast_weather = pd.merge(forecast_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # merged_forecast_weather['time_of_day'] = merged_forecast_weather.

    # # aggregate for duplicate locations
    merged_forecast_weather = merged_forecast_weather.groupby(['county', 'forecast_datetime']).mean(numeric_only=True).reset_index()

    # append forecast_weather to column names
    merged_forecast_weather.columns = [f"{column}_forecast_weather" if column not in ['county', 'forecast_datetime'] else column for column in merged_forecast_weather.columns]


    # merge forecast_weather
    merged_df = pd.merge(merged_df, merged_forecast_weather, left_on=['datetime', 'county'], right_on=['forecast_datetime', 'county'], how='left')
    
    # split datetime into meaningful features of int types
    merged_df = split_datetime(merged_df)
    
    # mapping days of the week names and converting to categorical variable
    if 'day_of_week' in merged_df.columns:
        weekday_map = {
            0: 'Monday',
            1: 'Tuesday',
            2: 'Wednesday',
            3: 'Thursday',
            4: 'Friday',
            5: 'Saturday',
            6: 'Sunday'
        }
        merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')
    merged_df['day_of_week'] = merged_df['day_of_week'].astype('category')
    # encode categories to category datetype

    merged_df['county'] = merged_df['county'].astype('category')
    merged_df['product_type'] = merged_df['product_type'].astype('category')
    
    # model is not able to handle object type
    merged_df.drop('time_of_day', axis=1, inplace=True)

    # model is not able to handle datetime
    merged_df = merged_df.drop(merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, EET]', 'object']).columns, axis=1)
    
    drop_columns = [
        'hours_ahead_forecast_weather',
        'prediction_unit_id',
        'longitude_hist_weather',
        'longitude_forecast_weather',
        'latitude_hist_weather',
        'latitude_forecast_weather',
        'currently_scored'
    ]
    
    merged_df.drop(drop_columns, axis=1, inplace=True)

    return merged_df

In [10]:
if ON_KAGGLE:
    import enefit
else:
    import sys
    sys.path.append('../imports')
    import public_timeseries_testing_util as enefit


# copy of df before new data
merged_df['row_id'] = merged_df['row_id'].astype('int', errors='ignore')

env = enefit.make_env()
iter_test = env.iter_test()

counter = 0
previous_revealed_targets = pd.DataFrame()

# necessary because of row_id overlap in old and new data
if not ON_KAGGLE:
    merged_df = merged_df[merged_df['row_id'] < 2005872]

print('Merged_df before loop \n', merged_df)

for (test, revealed_targets, client, historical_weather,
    forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
 
    # logs
    print('Counter:', counter)
    look_index = 2005872
    try:
        print(f'Merged_df index {look_index} \n', merged_df[merged_df['row_id'] == 2005872])
    except:
        print(f'{look_index} is not there yet')

    # merged_df['day_of_week'] = merged_df['day_of_week'].astype('category')

    try: 
        # drop columns if target is na
        model_df = merged_df.dropna(subset=['target'])
        model_df = merged_df.copy()
    except:
        print('some na targets were dropped')
        # create alias anyway
        model_df = merged_df.copy()

    # model.fit(model_df.drop(drop_columns, axis=1)[selected_fields], model_df.target)

    
    if counter in [0, 1]:
        pass
        # print(f'Test dataframe #{counter} \n', test.head(3))
        # print(f'Revealed targets dataframe #{counter} \n', revealed_targets.head(3))
        # print(f'Client dataframe #{counter} \n', client.head(3))
        # print(f'Historical weather dataframe #{counter} \n', historical_weather.head(3))
        # print(f'Forecast weather dataframe #{counter} \n', forecast_weather.head(3))
        # print(f'Electricity prices dataframe #{counter} \n', electricity_prices.head(3))
        # print(f'Gas prices dataframe #{counter} \n', gas_prices.head(3))
        # print(f'Sample prediction dataframe #{counter} \n', sample_prediction.head(3))
    
    prepped_df = merge_data(
        test, client, historical_weather, forecast_weather, 
        electricity_prices, gas_prices, weather_station_to_county_mapping
    )
    print('Prepped_df \n', prepped_df)

    prepped_df = remove_col(prepped_df, drop_row_id=False)

    # print(merged_df.columns, '\n', prepped_df.columns)

    # bring new data to storage
    merged_df = pd.concat([merged_df, prepped_df], axis=0, ignore_index=True).set_index('row_id')
    
    try:
        # add revealed targets to data
        revealed_targets = pd.concat([previous_revealed_targets, revealed_targets], axis=0, ignore_index=True).set_index('row_id')

        # imputing targets
        merged_df.combine_first(revealed_targets)
        # merged_df[['row_id', 'target']] = merged_df[['row_id', 'target']].apply(lambda x: if x.to_list() in revealed_targets['row_id'])

        # reset variable
        previous_revealed_targets = pd.DataFrame()
    except KeyError as e:
        # store unused revealed targets for the next try
        print('KeyError occurred')
        print(e)
        previous_revealed_targets = revealed_targets.copy()
        
    # make and put prediction
    sample_prediction['target'] = model.predict(prepped_df[selected_fields])
    sample_prediction['target'] = sample_prediction['target'].fillna(0).clip(0)

    env.predict(sample_prediction)
    counter += 1


Merged_df before loop 
         county  is_business product_type   target  is_consumption   row_id  \
0            0            0            1    0.713               0        0   
1            0            0            1   96.590               1        1   
2            0            0            2    0.000               0        2   
3            0            0            2   17.314               1        3   
4            0            0            3    2.904               0        4   
...        ...          ...          ...      ...             ...      ...   
2005867     15            1            0  184.072               1  2005867   
2005868     15            1            1    0.000               0  2005868   
2005869     15            1            1   38.646               1  2005869   
2005870     15            1            3    0.000               0  2005870   
2005871     15            1            3  183.756               1  2005871   

         eic_count_client  installed_ca

In [12]:
# merged_df[merged_df['row_id'] == 2005872]
merged_df.iloc[2005872]

county                                                          0
is_business                                                     1
product_type                                                    1
target                                                        NaN
is_consumption                                                  0
eic_count_client                                             98.0
installed_capacity_client                                  2885.6
lowest_price_per_mwh_gas_prices                              28.3
highest_price_per_mwh_gas_prices                             34.1
euros_per_mwh_electricity_prices                            87.67
temperature_hist_weather                                 7.666667
dewpoint_hist_weather                                    2.383333
rain_hist_weather                                             0.0
snowfall_hist_weather                                         0.0
surface_pressure_hist_weather                         1017.816667
cloudcover