In [1]:
# crucial for import API interface and loading data
ON_KAGGLE: bool = False

In [2]:
if ON_KAGGLE:
    import sys
    sys.path.append('/kaggle/input/imports2')
    from helper_functions import split_datetime
    from actpred_plot import plot_actual_vs_pred
    from data_preprocessing import merge_data, remove_col
    from feature_engineering import * # this is bad practice, call functions explicitly
else:
    import sys
    sys.path.append('../imports')
    from helper_functions import split_datetime
    from actpred_plot import plot_actual_vs_pred
    from data_preprocessing import merge_data, remove_col
    from feature_engineering import * # this is bad practice, call functions explicitly

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [3]:
if ON_KAGGLE:
    DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
else:
    DATA_DIR = "../data/"

# Read CSVs and parse relevant date columns
train = pd.read_csv(DATA_DIR + "train.csv")
client = pd.read_csv(DATA_DIR + "client.csv")
historical_weather = pd.read_csv(DATA_DIR + "historical_weather.csv")
forecast_weather = pd.read_csv(DATA_DIR + "forecast_weather.csv")
electricity_prices = pd.read_csv(DATA_DIR + "electricity_prices.csv")
gas_prices = pd.read_csv(DATA_DIR + "gas_prices.csv")
weather_station_to_county_mapping = pd.read_csv(DATA_DIR + 'weather_station_to_county_mapping.csv')


In [4]:
# We merge all DataFrames 
merged_df = merge_data(
    train, client, historical_weather, forecast_weather, 
    electricity_prices, gas_prices, weather_station_to_county_mapping
)

# Drop all non needed columns (ids and timestamps)
merged_df = remove_col(merged_df, drop_row_id=False)

## Feature engineering

In [5]:
merged_df = add_daylight_col(merged_df)

merged_df = add_capacity_col(merged_df)

merged_df = basic_improvements(merged_df)

# merged_df = add_public_holiday_col(merged_df)

# merged_df = add_school_holiday_col(merged_df)

In [6]:
merged_df.columns

Index(['county', 'is_business', 'product_type', 'target', 'is_consumption',
       'row_id', 'eic_count_client', 'lowest_price_per_mwh_gas_prices',
       'highest_price_per_mwh_gas_prices', 'euros_per_mwh_electricity_prices',
       'temperature_hist_weather', 'dewpoint_hist_weather',
       'surface_pressure_hist_weather', 'cloudcover_low_hist_weather',
       'windspeed_10m_hist_weather', 'winddirection_10m_hist_weather',
       'shortwave_radiation_hist_weather',
       'direct_solar_radiation_hist_weather',
       'cloudcover_high_forecast_weather', 'cloudcover_low_forecast_weather',
       'cloudcover_mid_forecast_weather', 'cloudcover_total_forecast_weather',
       '10_metre_u_wind_component_forecast_weather',
       '10_metre_v_wind_component_forecast_weather',
       'direct_solar_radiation_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'snowfall_forecast_weather', 'total_precipitation_forecast_weather',
       'year', 'month', 'week', 

## Training & Model Building

In [12]:
drop_columns = [
    'target', 'hours_ahead_forecast_weather',
    'row_id', 'data_block_id', 'prediction_unit_id', 
    'longitude_hist_weather', 'latitude_hist_weather',
    'longitude_forecast_weather', 'latitude_forecast_weather'
]

selected_fields = ['county', 'is_business', 'product_type', 'is_consumption',
       'eic_count_client',
       'surface_solar_radiation_downwards_forecast_weather',
       'total_precipitation_forecast_weather', 'year', 'week', 'hour',
       'day_of_year', 'day_of_week','daylight', 'capacity_per_eic',
       'squared_capacity_client'
]

In [13]:
model_df = merged_df.copy()
production = model_df[model_df["is_consumption"] == 0]
consumption = model_df[model_df["is_consumption"] == 1]

X_prod = production.drop('target', axis=1)[selected_fields]
y_prod = production['target']
X_con = consumption.drop('target', axis=1)[selected_fields]
y_con = consumption['target']

# Create two models - for consumption and production
prod_model = XGBRegressor(enable_categorical=True, max_depth=6) 
con_model = XGBRegressor(enable_categorical=True, max_depth=6) 
# Fit the models
prod_model.fit(X_prod, y_prod)
con_model.fit(X_con, y_con)

In [None]:
# model = XGBRegressor(enable_categorical=True, max_depth=6, learning_rate=0.3)
# model.fit(merged_df.drop(['row_id', 'target'], axis=1)[selected_fields], merged_df.target)

# # y_pred = bst.predict(X_test)

# ## main optimisation metric
# # print('Mean absolute error test', mean_absolute_error(y_test, y_pred))
# # print('Mean absolute error train', mean_absolute_error(y_train, bst.predict(X_train)))

In [None]:
# y_pred = model.predict(merged_df.drop(['row_id', 'target'], axis=1)[selected_fields])

# # main optimisation metric
# print('Mean absolute error', mean_absolute_error(merged_df.target, y_pred))


In [14]:
if ON_KAGGLE:
    import enefit
else:
    import sys
    sys.path.append('../imports')
    import public_timeseries_testing_util as enefit


# copy of df before new data
merged_df['row_id'] = merged_df['row_id'].astype('int', errors='ignore')

env = enefit.make_env()
iter_test = env.iter_test()

counter = 0
previous_revealed_targets = pd.DataFrame()
all_revealed_targets = pd.DataFrame()

for (test, revealed_targets, client, historical_weather,
    forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    
    # if counter % 7 == 0:
    #     model.fit(merged_df.drop(['row_id', 'target'], axis=1)[selected_fields], merged_df.target)
        
    print(counter) 
    print(merged_df[merged_df['row_id'] == 2005872])
    merged_df['day_of_week'] = merged_df['day_of_week'].astype('category')

    try: 
        # drop columns if target is na
        model_df = merged_df.dropna(subset=['target'])
    except:
        print('some na targets were dropped')
        # create alias anyway
        model_df = merged_df

    
    if counter in [0, 1]:
        pass
        # print(f'Test dataframe #{counter} \n', test.head(3))
        # print(f'Revealed targets dataframe #{counter} \n', revealed_targets.head(3))
        # print(f'Client dataframe #{counter} \n', client.head(3))
        # print(f'Historical weather dataframe #{counter} \n', historical_weather.head(3))
        # print(f'Forecast weather dataframe #{counter} \n', forecast_weather.head(3))
        # print(f'Electricity prices dataframe #{counter} \n', electricity_prices.head(3))
        # print(f'Gas prices dataframe #{counter} \n', gas_prices.head(3))
        # print(f'Sample prediction dataframe #{counter} \n', sample_prediction.head(3))
    
    prepped_df = merge_data(
        test, client, historical_weather, forecast_weather, 
        electricity_prices, gas_prices, weather_station_to_county_mapping
    )

    prepped_df = remove_col(prepped_df, drop_row_id=False)


    # feature engineering
    prepped_df = add_daylight_col(prepped_df)
    prepped_df = add_capacity_col(prepped_df)
    prepped_df = basic_improvements(prepped_df)
    # prepped_df = add_public_holiday_col(prepped_df)
    # prepped_df = add_school_holiday_col(prepped_df)

    # print(merged_df.columns, '\n', prepped_df.columns)

    # pd.merge([merged_df, revealed_targets[['row_id', 'target']]], on=['row_id'], how='left', suffixes=('', '_revealed'))
    # merged_df['target'] = merged_df[['target', 'target_revealed']].apply(lambda x: x.to_list()[0] if x.to_list()[0] else x.to_list()[1], axis=1)
    # merged_df.drop('target_revealed', axis=1, inplace=True)
    # leave out for now: using revealed targets as additional feature
    # # bring new data to storage
    # merged_df = pd.concat([merged_df, prepped_df], axis=0, ignore_index=True)
    
    # try:
    #     # add revealed targets to data
    #     revealed_targets = pd.concat([previous_revealed_targets, revealed_targets], axis=0, ignore_index=True)
    #     targets_indexes = merged_df['row_id'][revealed_targets['row_id']].index
    #     merged_df['target'].iloc[targets_indexes] = revealed_targets['target']
    #     previous_revealed_targets = pd.DataFrame()
    # except KeyError as e:
    #     # store unused revealed targets for the next try
    #     print('KeyError occurred')
    #     print(e)
    #     previous_revealed_targets = revealed_targets.copy()

    prepped_df = prepped_df.drop('row_id', axis=1)[selected_fields]

    mask = prepped_df['is_consumption'] == 1
    # clip method makes values < 0 equal 0 because our target is nonnegative and models can produce negative values
    sample_prediction.loc[mask.values, "target"] = con_model.predict(prepped_df[mask]).clip(0)
    
    mask = prepped_df['is_consumption'] == 0
    sample_prediction.loc[mask.values, "target"] = prod_model.predict(prepped_df[mask]).clip(0)
    
    # send predictions
    env.predict(sample_prediction)    

    counter += 1


0
        county  is_business product_type  target  is_consumption   row_id   
2005872      0            0            1   3.401               0  2005872  \

         eic_count_client  lowest_price_per_mwh_gas_prices   
2005872             507.0                             28.3  \

         highest_price_per_mwh_gas_prices  euros_per_mwh_electricity_prices   
2005872                              34.1                             87.54  \

         ...  week  hour  day_of_year  day_of_month  day_of_week  daylight   
2005872  ...    21     0          148            28       Sunday     False  \

         capacity_per_eic  squared_capacity_client  sum_column  temp_dew  
2005872              9.78             2.460373e+07         0.0 -5.920572  

[1 rows x 40 columns]
1
        county  is_business product_type  target  is_consumption   row_id   
2005872      0            0            1   3.401               0  2005872  \

         eic_count_client  lowest_price_per_mwh_gas_prices   
2005872   