In [1]:
# crucial for import API interface and loading data
ON_KAGGLE: bool = False

In [2]:
if ON_KAGGLE:
    import sys
    sys.path.append('/kaggle/input/imports')
    from helper_functions import split_datetime
    from actpred_plot import plot_actual_vs_pred
    from data_preprocessing import merge_data, remove_col
    from feature_engineering import * # this is bad practice, call functions explicitly
else:
    import sys
    sys.path.append('../imports')
    from helper_functions import split_datetime
    from actpred_plot import plot_actual_vs_pred
    from data_preprocessing import merge_data, remove_col
    from feature_engineering import * # this is bad practice, call functions explicitly

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# for NN
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder


In [3]:
if ON_KAGGLE:
    DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
else:
    DATA_DIR = "../data/"

# Read CSVs and parse relevant date columns
train = pd.read_csv(DATA_DIR + "train.csv")
client = pd.read_csv(DATA_DIR + "client.csv")
historical_weather = pd.read_csv(DATA_DIR + "historical_weather.csv")
forecast_weather = pd.read_csv(DATA_DIR + "forecast_weather.csv")
electricity_prices = pd.read_csv(DATA_DIR + "electricity_prices.csv")
gas_prices = pd.read_csv(DATA_DIR + "gas_prices.csv")
weather_station_to_county_mapping = pd.read_csv(DATA_DIR + 'weather_station_to_county_mapping.csv')


In [4]:
# We merge all DataFrames 
merged_df = merge_data(
    train, client, historical_weather, forecast_weather, 
    electricity_prices, gas_prices, weather_station_to_county_mapping
)

# Drop all non needed columns (ids and timestamps)
merged_df = remove_col(merged_df, drop_row_id=False)

## Feature engineering

In [5]:
merged_df = add_daylight_col(merged_df)

merged_df = add_capacity_col(merged_df)

merged_df = basic_improvements(merged_df)

merged_df = add_shifted_target(merged_df)

# merged_df = add_public_holiday_col(merged_df)

# merged_df = add_school_holiday_col(merged_df)

In [6]:
merged_df.columns

Index(['county', 'is_business', 'product_type', 'target', 'is_consumption',
       'row_id', 'eic_count_client', 'lowest_price_per_mwh_gas_prices',
       'highest_price_per_mwh_gas_prices', 'euros_per_mwh_electricity_prices',
       'temperature_hist_weather', 'dewpoint_hist_weather',
       'surface_pressure_hist_weather', 'cloudcover_low_hist_weather',
       'windspeed_10m_hist_weather', 'winddirection_10m_hist_weather',
       'shortwave_radiation_hist_weather',
       'direct_solar_radiation_hist_weather',
       'cloudcover_high_forecast_weather', 'cloudcover_low_forecast_weather',
       'cloudcover_mid_forecast_weather', 'cloudcover_total_forecast_weather',
       '10_metre_u_wind_component_forecast_weather',
       '10_metre_v_wind_component_forecast_weather',
       'direct_solar_radiation_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'snowfall_forecast_weather', 'total_precipitation_forecast_weather',
       'year', 'month', 'week', 

In [7]:
merged_df.tail()

Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,eic_count_client,lowest_price_per_mwh_gas_prices,highest_price_per_mwh_gas_prices,euros_per_mwh_electricity_prices,...,hour,day_of_year,day_of_month,day_of_week,daylight,capacity_per_eic,squared_capacity_client,sum_column,temp_dew,shifted_target
2017819,15,1,0,197.233,1,2018347,15.0,29.0,34.0,-1.29,...,23,151,31,Wednesday,False,41.33,384400.0,0.0,77.500963,188.167
2017820,15,1,1,0.0,0,2018348,20.0,29.0,34.0,-1.29,...,23,151,31,Wednesday,False,31.22,390000.25,0.0,77.500963,0.0
2017821,15,1,1,28.404,1,2018349,20.0,29.0,34.0,-1.29,...,23,151,31,Wednesday,False,31.22,390000.25,0.0,77.500963,31.484
2017822,15,1,3,0.0,0,2018350,55.0,29.0,34.0,-1.29,...,23,151,31,Wednesday,False,39.79,4788219.24,0.0,77.500963,0.0
2017823,15,1,3,196.24,1,2018351,55.0,29.0,34.0,-1.29,...,23,151,31,Wednesday,False,39.79,4788219.24,0.0,77.500963,177.056


## one-hot-encoding and ordinal-encoding

In [8]:
# DATA PREP (Encoding)
# If necessary, encode categorical features here
#encoded_df = pd.get_dummies(df_exp, columns=['county', 'is_business', 'product_type'])
#encoded_df = pd.get_dummies(merged_df, columns=['county', 'is_business', 'product_type', 'day_of_week', 'hour'], drop_first=True)

enc = OneHotEncoder(handle_unknown='ignore', drop='first')
category_list = ['county', 'is_business', 'product_type', 'day_of_week', 'hour']
encoded_categories = enc.fit_transform(merged_df[category_list])

# Convert the encoded sparse matrix to a DataFrame with meaningful column names
encoded_df = pd.DataFrame(encoded_categories.toarray(), columns=enc.get_feature_names_out(category_list))
# The result will be a new DataFrame with binary columns for each category



encoded_df = pd.concat([merged_df, encoded_df], axis=1)
encoded_df

#le = LabelEncoder() # LabelEncoder is used on target, not on features!
#oe = OrdinalEncoder()

# skip for now, they are already ordinal and numeric
# encoded_df['day_of_year'] = oe.fit_transform(encoded_df['day_of_year'])
# encoded_df['year'] = oe.fit_transform(encoded_df['year'])
# encoded_df['week'] = oe.fit_transform(encoded_df['week'])



# encoded_df['hour'] = oe.fit_transform(encoded_df['hour'])

Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,eic_count_client,lowest_price_per_mwh_gas_prices,highest_price_per_mwh_gas_prices,euros_per_mwh_electricity_prices,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,0,0,1,0.713,0,0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,96.590,1,1,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,2,0.000,0,2,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,2,17.314,1,3,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,3,2.904,0,4,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017819,15,1,0,197.233,1,2018347,15.0,29.0,34.0,-1.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2017820,15,1,1,0.000,0,2018348,20.0,29.0,34.0,-1.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2017821,15,1,1,28.404,1,2018349,20.0,29.0,34.0,-1.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2017822,15,1,3,0.000,0,2018350,55.0,29.0,34.0,-1.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
encoded_df.columns

Index(['county', 'is_business', 'product_type', 'target', 'is_consumption',
       'row_id', 'eic_count_client', 'lowest_price_per_mwh_gas_prices',
       'highest_price_per_mwh_gas_prices', 'euros_per_mwh_electricity_prices',
       'temperature_hist_weather', 'dewpoint_hist_weather',
       'surface_pressure_hist_weather', 'cloudcover_low_hist_weather',
       'windspeed_10m_hist_weather', 'winddirection_10m_hist_weather',
       'shortwave_radiation_hist_weather',
       'direct_solar_radiation_hist_weather',
       'cloudcover_high_forecast_weather', 'cloudcover_low_forecast_weather',
       'cloudcover_mid_forecast_weather', 'cloudcover_total_forecast_weather',
       '10_metre_u_wind_component_forecast_weather',
       '10_metre_v_wind_component_forecast_weather',
       'direct_solar_radiation_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'snowfall_forecast_weather', 'total_precipitation_forecast_weather',
       'year', 'month', 'week', 

In [10]:
# Removing NA
mask = encoded_df.isna().any(axis=1)
encoded_df = encoded_df[~mask] # drop empty values


In [11]:
# drop_columns = [
#     'target', 'hours_ahead_forecast_weather',
#     'row_id', 'data_block_id', 'prediction_unit_id', 
#     'longitude_hist_weather', 'latitude_hist_weather',
#     'longitude_forecast_weather', 'latitude_forecast_weather'
# ]

selected_fields = ['county', 'is_business', 'product_type', 'is_consumption',
       'eic_count_client',
       'surface_solar_radiation_downwards_forecast_weather',
       'total_precipitation_forecast_weather', 'year', 'week', 'hour',
       'day_of_year', 'day_of_week','daylight', 'capacity_per_eic',
       'squared_capacity_client', 'sum_column', 'temp_dew', 'shifted_target'
]

# selected_fields = ['is_consumption', 'eic_count_client',
#        'lowest_price_per_mwh_gas_prices', 'highest_price_per_mwh_gas_prices',
#        'euros_per_mwh_electricity_prices', 'temperature_hist_weather',
#        'dewpoint_hist_weather', 'surface_pressure_hist_weather',
#        'cloudcover_low_hist_weather', 'windspeed_10m_hist_weather',
#        'winddirection_10m_hist_weather', 'shortwave_radiation_hist_weather',
#        'direct_solar_radiation_hist_weather',
#        'cloudcover_high_forecast_weather', 'cloudcover_low_forecast_weather',
#        'cloudcover_mid_forecast_weather', 'cloudcover_total_forecast_weather',
#        '10_metre_u_wind_component_forecast_weather',
#        '10_metre_v_wind_component_forecast_weather',
#        'direct_solar_radiation_forecast_weather',
#        'surface_solar_radiation_downwards_forecast_weather',
#        'snowfall_forecast_weather', 'total_precipitation_forecast_weather',
#        'year', 'month', 'week', 'day_of_year', 'day_of_month', 'daylight',
#        'capacity_per_eic', 'squared_capacity_client', 'sum_column', 'temp_dew',
#        'shifted_target', 'county_1', 'county_2', 'county_3', 'county_4',
#        'county_5', 'county_6', 'county_7', 'county_8', 'county_9', 'county_10',
#        'county_11', 'county_12', 'county_13', 'county_14', 'county_15',
#        'is_business_1', 'product_type_1', 'product_type_2', 'product_type_3',
#        'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday',
#        'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday',
#        'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
#        'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
#        'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
#        'hour_20', 'hour_21', 'hour_22', 'hour_23']

# selected_fields = ['is_consumption', 'eic_count_client',
#        'cloudcover_total_forecast_weather',
#        '10_metre_v_wind_component_forecast_weather',
#        'direct_solar_radiation_forecast_weather',
#        'total_precipitation_forecast_weather',
#        'year', 'month', 'week', 'day_of_year', 'day_of_month', 'daylight',
#        'capacity_per_eic', 'squared_capacity_client', 'sum_column', 'temp_dew',
#        'shifted_target', 'county_1', 'county_2', 'county_3', 'county_4',
#        'county_5', 'county_6', 'county_7', 'county_8', 'county_9', 'county_10',
#        'county_11', 'county_12', 'county_13', 'county_14', 'county_15',
#        'is_business_1', 'product_type_1', 'product_type_2', 'product_type_3',
#        'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday',
#        'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday',
#        'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
#        'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
#        'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
#        'hour_20', 'hour_21', 'hour_22', 'hour_23']


selected_fields = [I for I in selected_fields if I not in category_list]
selected_fields = selected_fields + enc.get_feature_names_out(category_list).tolist()
selected_fields



['is_consumption',
 'eic_count_client',
 'surface_solar_radiation_downwards_forecast_weather',
 'total_precipitation_forecast_weather',
 'year',
 'week',
 'day_of_year',
 'daylight',
 'capacity_per_eic',
 'squared_capacity_client',
 'sum_column',
 'temp_dew',
 'shifted_target',
 'county_1',
 'county_2',
 'county_3',
 'county_4',
 'county_5',
 'county_6',
 'county_7',
 'county_8',
 'county_9',
 'county_10',
 'county_11',
 'county_12',
 'county_13',
 'county_14',
 'county_15',
 'is_business_1',
 'product_type_1',
 'product_type_2',
 'product_type_3',
 'day_of_week_Monday',
 'day_of_week_Saturday',
 'day_of_week_Sunday',
 'day_of_week_Thursday',
 'day_of_week_Tuesday',
 'day_of_week_Wednesday',
 'hour_1',
 'hour_2',
 'hour_3',
 'hour_4',
 'hour_5',
 'hour_6',
 'hour_7',
 'hour_8',
 'hour_9',
 'hour_10',
 'hour_11',
 'hour_12',
 'hour_13',
 'hour_14',
 'hour_15',
 'hour_16',
 'hour_17',
 'hour_18',
 'hour_19',
 'hour_20',
 'hour_21',
 'hour_22',
 'hour_23']

In [12]:
# Scaling

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(encoded_df[selected_fields])


In [13]:
# keep_cols = encoded_df.columns.tolist()
# keep_cols.remove('target')

X_scaled_df = pd.DataFrame(X_scaled, columns= selected_fields)

X_scaled_df

Unnamed: 0,is_consumption,eic_count_client,surface_solar_radiation_downwards_forecast_weather,total_precipitation_forecast_weather,year,week,day_of_year,daylight,capacity_per_eic,squared_capacity_client,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,-1.0,,,,-1.63797,0.556816,0.570558,-0.91867,,,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543
1,1.0,,,,-1.63797,0.556816,0.570558,-0.91867,,,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543
2,-1.0,,,,-1.63797,0.556816,0.570558,-0.91867,,,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543
3,1.0,,,,-1.63797,0.556816,0.570558,-0.91867,,,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543
4,-1.0,,,,-1.63797,0.556816,0.570558,-0.91867,,,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017819,1.0,-0.405372,-0.595719,-0.317546,1.46175,-0.259003,-0.260157,-0.91867,0.343491,-0.224491,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,4.795177
2017820,-1.0,-0.370681,-0.595719,-0.317546,1.46175,-0.259003,-0.260157,-0.91867,0.048834,-0.224326,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,4.795177
2017821,1.0,-0.370681,-0.595719,-0.317546,1.46175,-0.259003,-0.260157,-0.91867,0.048834,-0.224326,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,4.795177
2017822,-1.0,-0.127846,-0.595719,-0.317546,1.46175,-0.259003,-0.260157,-0.91867,0.298607,-0.094357,...,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,-0.208543,4.795177


## Training & Model Building

In [14]:
## train model

# X_scaled_df
y = merged_df[~mask].target
# y = merged_df.target


# Split data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X_scaled_df[selected_fields], y, test_size=0.2, random_state=42)


# Create a Neural Network Model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_scaled_df[selected_fields].shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mean_absolute_error') # better use MAE

# Train the Model
# model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
model.fit(X_scaled_df[selected_fields], y, epochs=10, batch_size=32)

# Evaluate the model
# loss = model.evaluate(X_test, y_test)

# evaluation
# y_train_pred = model.predict(X_train)
# y_test_pred = model.predict(X_test)

y_pred = model.predict(X_scaled_df[selected_fields])

#calc_mae_tst_trn(y_test, y_test_pred, y_train, y_train_pred)


# y_pred = bst.predict(X_test)

## main optimisation metric
# print('Mean absolute error test', mean_absolute_error(y_test, y_pred))
# print('Mean absolute error train', mean_absolute_error(y_train, bst.predict(X_train)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# main optimisation metric
# print('Mean absolute error test', mean_absolute_error(y_test, y_test_pred))
# print('Mean absolute error train', mean_absolute_error(y_train, y_train_pred))

print('Mean absolute error', mean_absolute_error(y, y_pred))

ValueError: Input contains NaN.

In [None]:
# y_pred = model.predict(merged_df.drop(['row_id', 'target'], axis=1)[selected_fields])

# # main optimisation metric
# print('Mean absolute error', mean_absolute_error(merged_df.target, y_pred))


In [None]:
if ON_KAGGLE:
    import enefit
else:
    import sys
    sys.path.append('../imports')
    import public_timeseries_testing_util as enefit


# copy of df before new data
merged_df['row_id'] = merged_df['row_id'].astype('int', errors='ignore')

env = enefit.make_env()
iter_test = env.iter_test()

counter = 0
previous_revealed_targets = pd.DataFrame()
all_revealed_targets = pd.DataFrame()

for (test, revealed_targets, client, historical_weather,
    forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    
    # if counter % 7 == 0:
    #     model.fit(merged_df.drop(['row_id', 'target'], axis=1)[selected_fields], merged_df.target)
        
    #print("Iteration #:", counter) 

    try: 
        # drop columns if target is na
        model_df = merged_df.dropna(subset=['target'])
    except:
        print('some na targets were dropped')
        # create alias anyway
        model_df = merged_df

    
    if counter in range(0,5):
        pass
        # print(f'Test dataframe #{counter} \n', test.head(3))
        # print(f'Revealed targets dataframe #{counter} \n', revealed_targets.head(3))
        # print(revealed_targets.columns)
        # print(f'Client dataframe #{counter} \n', client.head(3))
        # print(f'Historical weather dataframe #{counter} \n', historical_weather.head(3))
        # print(f'Forecast weather dataframe #{counter} \n', forecast_weather.head(3))
        # print(f'Electricity prices dataframe #{counter} \n', electricity_prices.head(3))
        # print(f'Gas prices dataframe #{counter} \n', gas_prices.head(3))
        # print(f'Sample prediction dataframe #{counter} \n', sample_prediction.head(3))
    
    # merging
    prepped_df = merge_data(
        test, client, historical_weather, forecast_weather, 
        electricity_prices, gas_prices, weather_station_to_county_mapping
    )


    # feature engineering
    prepped_df = remove_col(prepped_df, drop_row_id=False)

    # rename the target column of the revealed targets for merging
    revealed_targets.rename(columns={'target' : 'shifted_target'}, inplace=True)
    # introduce a hour column to merge on the prepped df
    revealed_targets.datetime = pd.to_datetime(revealed_targets.datetime)
    revealed_targets = split_datetime(revealed_targets)
    # take only needed columns
    sel_revealed_targets = revealed_targets[['county', 'is_business', 'product_type', 'is_consumption','hour','shifted_target']]
    # merge the revealed targets as shifted target to the prepped_df
    prepped_df = pd.merge(prepped_df, sel_revealed_targets, on= ['county', 'is_business', 'product_type', 'is_consumption', 'hour'], how='left')

    # feature engineering
    prepped_df = add_daylight_col(prepped_df)
    prepped_df = add_capacity_col(prepped_df)
    prepped_df = basic_improvements(prepped_df)

    # encoding
    #prepped_encoded_df = pd.get_dummies(prepped_df, columns=['county', 'is_business', 'product_type', 'day_of_week', 'hour'], drop_first=True)
    encoded_categories = enc.transform(prepped_df[category_list])

    # Convert the encoded sparse matrix to a DataFrame with meaningful column names
    prepped_encoded_df = pd.DataFrame(encoded_categories.toarray(), columns=enc.get_feature_names_out(category_list))
    # The result will be a new DataFrame with binary columns for each category

    prepped_encoded_df = pd.concat([prepped_df, prepped_encoded_df], axis=1)

    

    # prepped_encoded_df['day_of_year'] = oe.transform(prepped_encoded_df['day_of_year'])
    # prepped_encoded_df['year'] = oe.transform(prepped_encoded_df['year'])
    # prepped_encoded_df['week'] = oe.transform(prepped_encoded_df['week'])


    # # Removing NA
    # mask = prepped_encoded_df.isna().any(axis=1)
    # prepped_encoded_df = prepped_encoded_df[~mask] # drop empty values

    # scaling
    prepped_X_scaled = scaler.transform(prepped_encoded_df[selected_fields])
    prepped_X_scaled_df = pd.DataFrame(prepped_X_scaled, columns= selected_fields)



    # predicting
    sample_prediction['target'] = model.predict(prepped_X_scaled_df[selected_fields])
    sample_prediction['target'] = sample_prediction['target'].fillna(0).clip(0)
    
    # send predictions
    env.predict(sample_prediction)    

    counter += 1


In [None]:
sample_prediction