In [1]:
# Environment switch: True when the notebook runs on Kaggle, False for a local run.
# Later cells read it to pick the helper-module search path, the data directory
# and which competition API module gets imported.
ON_KAGGLE: bool = False

In [2]:
import sys

# The project helper modules sit in a different folder on Kaggle than in the
# local checkout; only the search path differs, the imported names do not.
sys.path.append('/kaggle/input/imports' if ON_KAGGLE else '../imports')

from helper_functions import split_datetime
from actpred_plot import plot_actual_vs_pred
from data_preprocessing import merge_data, remove_col
# NOTE: wildcard import is bad practice; it is kept so that every
# feature-engineering helper used in this notebook stays in scope.
from feature_engineering import *

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [3]:
# Folder holding the competition CSVs: the mounted Kaggle dataset or the local data folder.
DATA_DIR = (
    "/kaggle/input/predict-energy-behavior-of-prosumers/" if ON_KAGGLE else "../data/"
)

# Read every raw table with pandas' default parsing. No date columns are parsed
# at this point; read_csv is called without parse_dates.
(
    train,
    client,
    historical_weather,
    forecast_weather,
    electricity_prices,
    gas_prices,
    weather_station_to_county_mapping,
) = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        "train.csv",
        "client.csv",
        "historical_weather.csv",
        "forecast_weather.csv",
        "electricity_prices.csv",
        "gas_prices.csv",
        'weather_station_to_county_mapping.csv',
    )
)


In [4]:
# Combine all raw tables into one frame and immediately strip the columns that
# are not needed (ids and timestamps). drop_row_id=False is passed because
# row_id is still required later as a merge key.
merged_df = remove_col(
    merge_data(
        train,
        client,
        historical_weather,
        forecast_weather,
        electricity_prices,
        gas_prices,
        weather_station_to_county_mapping,
    ),
    drop_row_id=False,
)

## Feature engineering

In [5]:
# Run the feature-engineering helpers in sequence; each step receives the frame
# returned by the previous one.
merged_df = (
    merged_df
    .pipe(add_daylight_col)
    .pipe(add_capacity_col)
    .pipe(basic_improvements)
    .pipe(add_shifted_target)
)

# Holiday features are currently switched off:
# merged_df = add_public_holiday_col(merged_df)
# merged_df = add_school_holiday_col(merged_df)

In [6]:
# List the columns available after merging and feature engineering.
merged_df.columns

Index(['county', 'is_business', 'product_type', 'target', 'is_consumption',
       'row_id', 'eic_count_client', 'lowest_price_per_mwh_gas_prices',
       'highest_price_per_mwh_gas_prices', 'euros_per_mwh_electricity_prices',
       'temperature_hist_weather', 'dewpoint_hist_weather',
       'surface_pressure_hist_weather', 'cloudcover_low_hist_weather',
       'windspeed_10m_hist_weather', 'winddirection_10m_hist_weather',
       'shortwave_radiation_hist_weather',
       'direct_solar_radiation_hist_weather',
       'cloudcover_high_forecast_weather', 'cloudcover_low_forecast_weather',
       'cloudcover_mid_forecast_weather', 'cloudcover_total_forecast_weather',
       '10_metre_u_wind_component_forecast_weather',
       '10_metre_v_wind_component_forecast_weather',
       'direct_solar_radiation_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'snowfall_forecast_weather', 'total_precipitation_forecast_weather',
       'year', 'month', 'week', 

## Training & Model Building

In [7]:
# Columns earmarked for removal. This list is not referenced again in the cells
# shown here.
drop_columns = [
    'target',
    'hours_ahead_forecast_weather',
    'row_id',
    'data_block_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'latitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_forecast_weather',
]

# Feature columns handed to the model, both for fitting and for prediction.
selected_fields = [
    'county',
    'is_business',
    'product_type',
    'is_consumption',
    'eic_count_client',
    'surface_solar_radiation_downwards_forecast_weather',
    'total_precipitation_forecast_weather',
    'year',
    'week',
    'hour',
    'day_of_year',
    'day_of_week',
    'daylight',
    'capacity_per_eic',
    'squared_capacity_client',
    'sum_column',
    'temp_dew',
    'shifted_target',
]

In [8]:
# Feature matrix and label vector for the initial fit. selected_fields contains
# neither 'row_id' nor 'target', so selecting the columns directly gives the
# same frame without first copying the whole table through drop().
X = merged_df[selected_fields]
y = merged_df.target

# enable_categorical=True is required for XGBoost to accept pandas 'category' columns.
# NOTE(review): the model is fitted on every row, so no hold-out score is
# computed in this cell.
model = XGBRegressor(enable_categorical=True, max_depth=6, learning_rate=0.3)
model.fit(X, y)

In [9]:
# Predict on the same rows the model was fitted on. Selecting the feature
# columns directly avoids an unnecessary full-frame copy through drop().
y_pred = model.predict(merged_df[selected_fields])

# main optimisation metric
# NOTE: this is an in-sample (training) error, not an estimate of how the model
# performs on unseen data.
print('Mean absolute error', mean_absolute_error(merged_df.target, y_pred))


Mean absolute error 43.302886152394706


In [10]:
# Print the row count and per-column dtypes of the merged training frame.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2017824 entries, 0 to 2017823
Data columns (total 41 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   county                                              category
 1   is_business                                         int64   
 2   product_type                                        category
 3   target                                              float64 
 4   is_consumption                                      int64   
 5   row_id                                              int64   
 6   eic_count_client                                    float64 
 7   lowest_price_per_mwh_gas_prices                     float64 
 8   highest_price_per_mwh_gas_prices                    float64 
 9   euros_per_mwh_electricity_prices                    float64 
 10  temperature_hist_weather                            float64 
 11  dewpoint_hist_weather   

In [25]:
# Separate copy of the training frame; the test loop further down extends
# model_df with the rows of every new test batch.
model_df = merged_df.copy()

In [26]:
# Competition API: the `enefit` module on Kaggle, otherwise the local
# public_timeseries_testing_util module imported under the same name so the
# code below is identical in both environments.
if ON_KAGGLE:
    import enefit
else:
    import sys
    sys.path.append('../imports')
    import public_timeseries_testing_util as enefit


env = enefit.make_env()
iter_test = env.iter_test()

# Categorical dtypes of the training features. Every test batch is cast to these
# exact dtypes so that a category value maps to the same integer code at
# prediction time as during fitting. A batch that contains only a subset of the
# values (e.g. a single day_of_week) would otherwise build its own, shorter
# category list.
# NOTE(review): whether differing category lists change predictions depends on
# the installed XGBoost version - confirm; the cast is a no-op when the dtypes
# already agree.
categorical_dtypes = {
    col: merged_df[col].dtype
    for col in selected_fields
    if isinstance(merged_df[col].dtype, pd.CategoricalDtype)
}

# Key columns used to attach the revealed targets to the test rows.
unit_hour_keys = ['county', 'is_business', 'product_type', 'is_consumption', 'hour']

# The model is refitted on every `retrain_every`-th iteration (never on the first).
retrain_every = 2

counter = 0

# NOTE: the loop variables rebind the notebook-level names client,
# historical_weather, forecast_weather, electricity_prices and gas_prices.
for (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    # Same preprocessing as for the training data -------------------------------
    prepped_df = merge_data(
        test, client, historical_weather, forecast_weather,
        electricity_prices, gas_prices, weather_station_to_county_mapping
    )
    prepped_df = remove_col(prepped_df, drop_row_id=False)
    if 'currently_scored' in prepped_df.columns:
        prepped_df = prepped_df.drop('currently_scored', axis=1)

    # Attach the revealed targets as the 'shifted_target' feature ---------------
    # rename() returns a new frame, so revealed_targets itself stays untouched.
    shifted_targets = revealed_targets.rename(columns={'target': 'shifted_target'})
    # split_datetime supplies the 'hour' column that is needed as a merge key.
    shifted_targets['datetime'] = pd.to_datetime(shifted_targets['datetime'])
    shifted_targets = split_datetime(shifted_targets)
    # Cast the key columns to category to keep the data type through the merge.
    shifted_targets['county'] = shifted_targets['county'].astype('category')
    shifted_targets['product_type'] = shifted_targets['product_type'].astype('category')
    shifted_targets = shifted_targets[unit_hour_keys + ['shifted_target']]
    prepped_df = pd.merge(prepped_df, shifted_targets, on=unit_hour_keys, how='left')

    # Feature engineering -------------------------------------------------------
    prepped_df = add_daylight_col(prepped_df)
    prepped_df = add_capacity_col(prepped_df)
    prepped_df = basic_improvements(prepped_df)
    # prepped_df = add_public_holiday_col(prepped_df)
    # prepped_df = add_school_holiday_col(prepped_df)

    # Give the batch the same categorical dtypes as the training frame.
    if categorical_dtypes:
        prepped_df = prepped_df.astype(categorical_dtypes)

    # Append the batch to the accumulated training data -------------------------
    # ignore_index=True already produces a fresh 0..n-1 index.
    # NOTE(review): the frame grows on every iteration, so each concat copies
    # all rows collected so far.
    model_df = pd.concat([model_df, prepped_df], axis=0, ignore_index=True)

    # Fill still-missing targets from the revealed targets ----------------------
    # Look the values up by row_id instead of merging the whole frame; existing
    # (non-NaN) targets are left unchanged. Duplicate row_ids in the revealed
    # data are reduced to their first occurrence so the lookup is unambiguous.
    revealed_by_row = (
        revealed_targets[['row_id', 'target']]
        .drop_duplicates(subset='row_id')
        .set_index('row_id')['target']
    )
    model_df['target'] = model_df['target'].fillna(model_df['row_id'].map(revealed_by_row))

    model_df['day_of_week'] = model_df['day_of_week'].astype('category')

    # Retrain the model ---------------------------------------------------------
    if (counter % retrain_every == 0) and (counter > 0):
        # Only rows with a known target can be used for fitting.
        labelled = model_df.dropna(subset=['target'])
        model.fit(labelled[selected_fields], labelled.target)

    # Prediction ----------------------------------------------------------------
    # NOTE(review): assumes prepped_df rows are in the same order as
    # sample_prediction - verify against merge_data.
    sample_prediction['target'] = model.predict(prepped_df[selected_fields])
    # Missing predictions become 0 and negative values are clipped to 0.
    sample_prediction['target'] = sample_prediction['target'].fillna(0).clip(0)

    # send predictions
    env.predict(sample_prediction)

    counter += 1




model retrained
