In [None]:
# crucial for import API interface and loading data
ON_KAGGLE: bool = False

In [1]:
if not ON_KAGGLE:
    import sys
    sys.path.append('../imports')
    from actpred_plot import plot_actual_vs_pred
    from data_preprocessing import merge_data, remove_col

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
if ON_KAGGLE:
    DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
else:
    DATA_DIR = "../data/"

# Read CSVs and parse relevant date columns
train = pd.read_csv(DATA_DIR + "train.csv")
client = pd.read_csv(DATA_DIR + "client.csv")
historical_weather = pd.read_csv(DATA_DIR + "historical_weather.csv")
forecast_weather = pd.read_csv(DATA_DIR + "forecast_weather.csv")
electricity_prices = pd.read_csv(DATA_DIR + "electricity_prices.csv")
gas_prices = pd.read_csv(DATA_DIR + "gas_prices.csv")
weather_station_to_county_mapping = pd.read_csv(DATA_DIR + 'weather_station_to_county_mapping.csv')


## Datetime conversion


In [3]:
client.date = pd.to_datetime(client.date)

electricity_prices.forecast_date = pd.to_datetime(electricity_prices.forecast_date)
electricity_prices.origin_date = pd.to_datetime(electricity_prices.origin_date)

forecast_weather.origin_datetime = pd.to_datetime(forecast_weather.origin_datetime)
forecast_weather.forecast_datetime = pd.to_datetime(forecast_weather.forecast_datetime)

gas_prices.forecast_date = pd.to_datetime(gas_prices.forecast_date)
gas_prices.origin_date = pd.to_datetime(gas_prices.origin_date)

historical_weather.datetime = pd.to_datetime(historical_weather.datetime)

train.datetime = pd.to_datetime(train.datetime, format='%Y-%m-%d %H:%M:%S')

# Merging historical data

In [4]:
# append '_client' to merged columns
client.columns = [f"{column}_client" if column not in ['data_block_id', 'county', 'is_business', 'product_type'] else column for column in client.columns]

# merge train and client
merged_df = pd.merge(train, client, on=['data_block_id', 'county', 'is_business', 'product_type'], how='left')


# append _gas_prices to columns
gas_prices.columns = [f"{column}_gas_prices" if column != 'data_block_id' else column for column in gas_prices.columns]

# merge gas_prices
merged_df = pd.merge(merged_df, gas_prices, on=['data_block_id'], how='left')


# add time column for merging with electricity data
merged_df['time_of_day'] = merged_df['datetime'].dt.time

# the prices are available hourly -> create new column with time 
electricity_prices['time_of_day'] = electricity_prices.forecast_date.dt.time

# append electricity_prices to column names
electricity_prices.columns = [f"{column}_electricity_prices" if column not in ['time_of_day','data_block_id'] else column for column in electricity_prices.columns]

# merge electricity_prices
merged_df = pd.merge(merged_df, electricity_prices, on = ['data_block_id', 'time_of_day'], how='left')


# get county and county_name from weather_station_to_county_mapping (merge on latitude and longitude)
# round lat and long to avoid mismatching due to different accuracy
historical_weather.latitude = historical_weather.latitude.round(1)
historical_weather.longitude = historical_weather.longitude.round(1)

weather_station_to_county_mapping.latitude = weather_station_to_county_mapping.latitude.round(1)
weather_station_to_county_mapping.longitude = weather_station_to_county_mapping.longitude.round(1)


# merge historical weather to get counties
merged_hist_weather = pd.merge(historical_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')

# get time of day
merged_hist_weather['time_of_day'] = merged_hist_weather['datetime'].dt.time

# aggregate by county and time (summarize weather stations for same county)
merged_hist_weather = merged_hist_weather.groupby(['county', 'time_of_day', 'datetime', 'data_block_id']).mean(numeric_only=True).reset_index()

# append _hist_weather to column names
merged_hist_weather.columns = [f"{column}_hist_weather" if column not in ['county', 'time_of_day','data_block_id'] else column for column in merged_hist_weather.columns]

# merge to merged_df
merged_df = pd.merge(merged_df, merged_hist_weather, on=['data_block_id', 'time_of_day', 'county'], how='left')


#round lat and long
forecast_weather.latitude = forecast_weather.latitude.round(1)
forecast_weather.longitude = forecast_weather.longitude.round(1)

# merge to get counties
merged_forecast_weather = pd.merge(forecast_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
# merged_forecast_weather['time_of_day'] = merged_forecast_weather.

# # aggregate for duplicate locations
merged_forecast_weather = merged_forecast_weather.groupby(['county', 'forecast_datetime', 'data_block_id']).mean(numeric_only=True).reset_index()

# append forecast_weather to column names
merged_forecast_weather.columns = [f"{column}_forecast_weather" if column not in ['county', 'forecast_datetime','data_block_id'] else column for column in merged_forecast_weather.columns]

# merge forecast_weather
merged_df = pd.merge(merged_df, merged_forecast_weather, left_on=['data_block_id', 'datetime', 'county'], right_on=['data_block_id', 'forecast_datetime', 'county'], how='left')



## Data Preparation

In [5]:
def split_datetime(data, col="datetime"):
    # What columns are of type datetime?
    datetime_columns = data.select_dtypes(include='datetime64').columns
    
    for c in datetime_columns:
        # print(f"Timezone for {c} is {data[c].dt.tz}")
        pass

    # Adding columns for date & time
    data['year']    = data[col].dt.year
    # data['quarter'] = data[col].dt.quarter
    data['month']   = data[col].dt.month
    data['week']    = data[col].dt.isocalendar().week
    data['hour']    = data[col].dt.hour 

    data['day_of_year']  = data[col].dt.day_of_year
    data['day_of_month'] = data[col].dt.day
    data['day_of_week']  = data[col].dt.day_of_week

    return data

In [6]:
# encode categories to category datetype

merged_df['county'] = merged_df['county'].astype('category')
merged_df['product_type'] = merged_df['product_type'].astype('category')


In [7]:
# model is not able to handle object type
merged_df.drop('time_of_day', axis=1, inplace=True)

# split datetime into meaningful features of int types
merged_df = split_datetime(merged_df)

# model is not able to handle datetime
merged_df = merged_df.drop(merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, EET]']).columns, axis=1)

# drop na from target
merged_df.dropna(subset=['target'], inplace=True)


In [8]:
# mapping days of the week names and converting to categorical variable
if 'day_of_week' in merged_df.columns:
    weekday_map = {
        0: 'Monday',
        1: 'Tuesday',
        2: 'Wednesday',
        3: 'Thursday',
        4: 'Friday',
        5: 'Saturday',
        6: 'Sunday'
    }
    merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')

## Training & Model Building

In [9]:
#X_train, X_test, y_train,  y_test = train_test_split(model_df.drop('target', axis=1), model_df['target'], test_size=0.3, random_state=0)
drop_columns = [
    'target',
    'hours_ahead_forecast_weather',
    'row_id',
    'data_block_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather'
]


model = XGBRegressor(enable_categorical=True, max_depth=9, learning_rate=0.3)
# model.fit(merged_df.drop(drop_columns, axis=1), merged_df.target)

# y_pred = bst.predict(X_test)

## main optimisation metric
# print('Mean absolute error test', mean_absolute_error(y_test, y_pred))
# print('Mean absolute error train', mean_absolute_error(y_train, bst.predict(X_train)))

In [10]:
merged_df.sample(10)

Unnamed: 0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id,eic_count_client,installed_capacity_client,...,surface_solar_radiation_downwards_forecast_weather,snowfall_forecast_weather,total_precipitation_forecast_weather,year,month,week,hour,day_of_year,day_of_month,day_of_week
258034,11,1,1,0.0,0,86,258034,46,25.0,920.2,...,0.0,8.34465e-07,0.000239,2021,11,47,6,330,26,Friday
369132,11,0,3,0.0,0,122,369132,45,270.0,3233.37,...,0.0,0.0,0.0,2022,1,52,23,1,1,Saturday
1817654,5,1,3,913.946,0,574,1817654,23,90.0,4161.3,...,307.586667,0.0,1e-06,2023,3,13,16,88,29,Wednesday
113625,10,0,1,13.787,1,38,113625,38,24.0,301.2,...,241.605278,0.0,0.0,2021,10,40,12,282,9,Saturday
748519,7,1,1,147.413,1,242,748519,29,11.0,463.6,...,0.0,0.0,0.0,2022,5,17,1,121,1,Sunday
1422172,11,1,3,0.004,0,451,1422172,48,146.0,7195.85,...,0.0,1.491606e-05,1.5e-05,2022,11,47,5,330,26,Saturday
1110904,10,1,3,314.529,0,355,1110904,42,63.0,2174.2,...,598.805,0.0,3e-06,2022,8,34,12,234,22,Monday
706519,10,0,3,175.684,1,229,706519,39,135.0,1573.7,...,0.0,0.0,0.0,2022,4,16,4,108,18,Monday
1173600,15,0,3,2.173,0,375,1173600,58,97.0,1160.1,...,0.0,0.0,0.0,2022,9,36,3,254,11,Sunday
391625,11,1,1,371.645,1,130,391625,46,32.0,1032.2,...,0.0,6.638234e-05,6.5e-05,2022,1,1,4,9,9,Sunday


## Loading Test Data / API

In [11]:

def data_prep(test, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction, weather_station_to_county_mapping):        

    # Datatype conversion
    client.date = pd.to_datetime(client.date)

    ## Electricity Prices Data
    electricity_prices.forecast_date = pd.to_datetime(electricity_prices.forecast_date)
    electricity_prices.origin_date = pd.to_datetime(electricity_prices.origin_date)

    ## Forecast Weather Data
    forecast_weather.origin_datetime = pd.to_datetime(forecast_weather.origin_datetime)
    forecast_weather.forecast_datetime = pd.to_datetime(forecast_weather.forecast_datetime)

    ## Gas Prices Data
    gas_prices.forecast_date = pd.to_datetime(gas_prices.forecast_date)
    gas_prices.origin_date = pd.to_datetime(gas_prices.origin_date)

    ## Historical Weather Data
    historical_weather.datetime = pd.to_datetime(historical_weather.datetime)

    ## Train Data & Checking for NULL values
    test['datetime'] = pd.to_datetime(test.prediction_datetime, format='%Y-%m-%d %H:%M:%S')

    ## Data Merging (now we merge everything to test)
    ### Merge Client
    # append '_client' to merged columns
    client.columns = [f"{column}_client" if column not in ['county', 'is_business', 'product_type'] else column for column in client.columns]

    # merge train and client
    merged_df = pd.merge(test, client, on=['county', 'is_business', 'product_type'], how='left')

    ### Merge Gas Prices

    # merge gas_prices
    merged_df["lowest_price_per_mwh_gas_prices"] = gas_prices.lowest_price_per_mwh.min()
    merged_df["highest_price_per_mwh_gas_prices"] = gas_prices.highest_price_per_mwh.max()

    ### Merge Electricity Prices
    # add time column for merging with electricity data
    merged_df['time_of_day'] = merged_df['datetime'].dt.time

    # Merge electricity prices
    # the prices are available hourly -> create new column with time 
    electricity_prices['time_of_day'] = electricity_prices.forecast_date.dt.time

    # append electricity_prices to column names
    electricity_prices.columns = [f"{column}_electricity_prices" if column not in ['time_of_day'] else column for column in electricity_prices.columns]

    ### Merge Electricity Prices
    # merge electricity_prices
    merged_df = pd.merge(merged_df, electricity_prices, on = ['time_of_day'], how='left')

    ### Merge Historical Weather
    # get county and county_name from weather_station_to_county_mapping (merge on latitude and longitude)

    # round lat and long to avoid mismatching due to different accuracy
    historical_weather.latitude = historical_weather.latitude.astype("float").round(1)
    historical_weather.longitude = historical_weather.longitude.astype("float").round(1)
    
    weather_station_to_county_mapping.latitude = weather_station_to_county_mapping.latitude.astype("float").round(1)
    weather_station_to_county_mapping.longitude = weather_station_to_county_mapping.longitude.astype("float").round(1)

    # merge historical weather to get counties
    merged_hist_weather = pd.merge(historical_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # get time of day
    merged_hist_weather['time_of_day'] = merged_hist_weather['datetime'].dt.time
    
    # aggregate by county and time (summarize weather stations for same county)
    merged_hist_weather = merged_hist_weather.groupby(['county', 'time_of_day', 'datetime']).mean(numeric_only=True).reset_index()
    
    # append _hist_weather to column names
    merged_hist_weather.columns = [f"{column}_hist_weather" if column not in ['county', 'time_of_day'] else column for column in merged_hist_weather.columns]

    # merge to merged_df
    merged_df = pd.merge(merged_df, merged_hist_weather, on=['time_of_day', 'county'], how='left')

    ### Merge Forecast Weather
    # forecast weather

    #round lat and long
    forecast_weather.latitude = forecast_weather.latitude.astype("float").round(1)
    forecast_weather.longitude = forecast_weather.longitude.astype("float").round(1)

    # merge to get counties
    merged_forecast_weather = pd.merge(forecast_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # merged_forecast_weather['time_of_day'] = merged_forecast_weather.

    # # aggregate for duplicate locations
    merged_forecast_weather = merged_forecast_weather.groupby(['county', 'forecast_datetime']).mean(numeric_only=True).reset_index()

    # append forecast_weather to column names
    merged_forecast_weather.columns = [f"{column}_forecast_weather" if column not in ['county', 'forecast_datetime'] else column for column in merged_forecast_weather.columns]


    # merge forecast_weather
    merged_df = pd.merge(merged_df, merged_forecast_weather, left_on=['datetime', 'county'], right_on=['forecast_datetime', 'county'], how='left')
    
    # split datetime into meaningful features of int types
    merged_df = split_datetime(merged_df)
    
    # mapping days of the week names and converting to categorical variable
    if 'day_of_week' in merged_df.columns:
        weekday_map = {
            0: 'Monday',
            1: 'Tuesday',
            2: 'Wednesday',
            3: 'Thursday',
            4: 'Friday',
            5: 'Saturday',
            6: 'Sunday'
        }
    merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')
    # encode categories to category datetype

    merged_df['county'] = merged_df['county'].astype('category')
    merged_df['product_type'] = merged_df['product_type'].astype('category')
    
    # model is not able to handle object type
    merged_df.drop('time_of_day', axis=1, inplace=True)

    # model is not able to handle datetime
    merged_df = merged_df.drop(merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, EET]', 'object']).columns, axis=1)
    
    drop_columns = [
    'hours_ahead_forecast_weather',
    'row_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather',
    'currently_scored'
    ]
    
    merged_df.drop(drop_columns, axis=1, inplace=True)

    return merged_df

In [12]:

def data_prep_with_row_id(test, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction, weather_station_to_county_mapping):        

    # Datatype conversion
    client.date = pd.to_datetime(client.date)

    ## Electricity Prices Data
    electricity_prices.forecast_date = pd.to_datetime(electricity_prices.forecast_date)
    electricity_prices.origin_date = pd.to_datetime(electricity_prices.origin_date)

    ## Forecast Weather Data
    forecast_weather.origin_datetime = pd.to_datetime(forecast_weather.origin_datetime)
    forecast_weather.forecast_datetime = pd.to_datetime(forecast_weather.forecast_datetime)

    ## Gas Prices Data
    gas_prices.forecast_date = pd.to_datetime(gas_prices.forecast_date)
    gas_prices.origin_date = pd.to_datetime(gas_prices.origin_date)

    ## Historical Weather Data
    historical_weather.datetime = pd.to_datetime(historical_weather.datetime)

    ## Train Data & Checking for NULL values
    test['datetime'] = pd.to_datetime(test.prediction_datetime, format='%Y-%m-%d %H:%M:%S')

    ## Data Merging (now we merge everything to test)
    ### Merge Client
    # append '_client' to merged columns
    client.columns = [f"{column}_client" if column not in ['county', 'is_business', 'product_type'] else column for column in client.columns]

    # merge train and client
    merged_df = pd.merge(test, client, on=['county', 'is_business', 'product_type'], how='left')

    ### Merge Gas Prices

    # merge gas_prices
    merged_df["lowest_price_per_mwh_gas_prices"] = gas_prices.lowest_price_per_mwh.min()
    merged_df["highest_price_per_mwh_gas_prices"] = gas_prices.highest_price_per_mwh.max()

    ### Merge Electricity Prices
    # add time column for merging with electricity data
    merged_df['time_of_day'] = merged_df['datetime'].dt.time

    # Merge electricity prices
    # the prices are available hourly -> create new column with time 
    electricity_prices['time_of_day'] = electricity_prices.forecast_date.dt.time

    # append electricity_prices to column names
    electricity_prices.columns = [f"{column}_electricity_prices" if column not in ['time_of_day'] else column for column in electricity_prices.columns]

    ### Merge Electricity Prices
    # merge electricity_prices
    merged_df = pd.merge(merged_df, electricity_prices, on = ['time_of_day'], how='left')

    ### Merge Historical Weather
    # get county and county_name from weather_station_to_county_mapping (merge on latitude and longitude)

    # round lat and long to avoid mismatching due to different accuracy
    historical_weather.latitude = historical_weather.latitude.astype("float").round(1)
    historical_weather.longitude = historical_weather.longitude.astype("float").round(1)
    
    weather_station_to_county_mapping.latitude = weather_station_to_county_mapping.latitude.astype("float").round(1)
    weather_station_to_county_mapping.longitude = weather_station_to_county_mapping.longitude.astype("float").round(1)

    # merge historical weather to get counties
    merged_hist_weather = pd.merge(historical_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # get time of day
    merged_hist_weather['time_of_day'] = merged_hist_weather['datetime'].dt.time
    
    # aggregate by county and time (summarize weather stations for same county)
    merged_hist_weather = merged_hist_weather.groupby(['county', 'time_of_day', 'datetime']).mean(numeric_only=True).reset_index()
    
    # append _hist_weather to column names
    merged_hist_weather.columns = [f"{column}_hist_weather" if column not in ['county', 'time_of_day'] else column for column in merged_hist_weather.columns]

    # merge to merged_df
    merged_df = pd.merge(merged_df, merged_hist_weather, on=['time_of_day', 'county'], how='left')

    ### Merge Forecast Weather
    # forecast weather

    #round lat and long
    forecast_weather.latitude = forecast_weather.latitude.astype("float").round(1)
    forecast_weather.longitude = forecast_weather.longitude.astype("float").round(1)

    # merge to get counties
    merged_forecast_weather = pd.merge(forecast_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # merged_forecast_weather['time_of_day'] = merged_forecast_weather.

    # # aggregate for duplicate locations
    merged_forecast_weather = merged_forecast_weather.groupby(['county', 'forecast_datetime']).mean(numeric_only=True).reset_index()

    # append forecast_weather to column names
    merged_forecast_weather.columns = [f"{column}_forecast_weather" if column not in ['county', 'forecast_datetime'] else column for column in merged_forecast_weather.columns]


    # merge forecast_weather
    merged_df = pd.merge(merged_df, merged_forecast_weather, left_on=['datetime', 'county'], right_on=['forecast_datetime', 'county'], how='left')
    
    # split datetime into meaningful features of int types
    merged_df = split_datetime(merged_df)
    
    # mapping days of the week names and converting to categorical variable
    if 'day_of_week' in merged_df.columns:
        weekday_map = {
            0: 'Monday',
            1: 'Tuesday',
            2: 'Wednesday',
            3: 'Thursday',
            4: 'Friday',
            5: 'Saturday',
            6: 'Sunday'
        }
        merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')
    merged_df['day_of_week'] = merged_df['day_of_week'].astype('category')
    # encode categories to category datetype

    merged_df['county'] = merged_df['county'].astype('category')
    merged_df['product_type'] = merged_df['product_type'].astype('category')
    
    # model is not able to handle object type
    merged_df.drop('time_of_day', axis=1, inplace=True)

    # model is not able to handle datetime
    merged_df = merged_df.drop(merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, EET]', 'object']).columns, axis=1)
    
    drop_columns = [
        'hours_ahead_forecast_weather',
        'prediction_unit_id',
        'longitude_hist_weather',
        'longitude_forecast_weather',
        'latitude_hist_weather',
        'latitude_forecast_weather',
        'currently_scored'
    ]
    
    merged_df.drop(drop_columns, axis=1, inplace=True)

    return merged_df

## prepare data frame

In [13]:
# no row_id after data_prep function

In [None]:
selected_fields = ['county', 'is_business', 'product_type', 'is_consumption',
       'eic_count_client', 'installed_capacity_client',
       'rain_hist_weather', 'snowfall_hist_weather',
       'cloudcover_total_hist_weather', 'cloudcover_mid_hist_weather',
       'cloudcover_high_hist_weather', 'diffuse_radiation_hist_weather',
       'temperature_forecast_weather', 'dewpoint_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'total_precipitation_forecast_weather', 'year', 'week', 'hour',
       'day_of_year', 'day_of_week']

In [14]:
if ON_KAGGLE:
    import enefit
else:
    import sys
    sys.path.append('../imports')
    import public_timeseries_testing_util as enefit


# copy of df before new data
merged_df['row_id'] = merged_df['row_id'].astype('int', errors='ignore')

env = enefit.make_env()
iter_test = env.iter_test()

counter = 0
previous_revealed_targets = pd.DataFrame()
for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    print(counter)
    print(merged_df[merged_df['row_id'] == 2005874])
    merged_df['day_of_week'] = merged_df['day_of_week'].astype('category')

    try: 
        # drop columns if target is na
        model_df = merged_df.dropna(subset=['target'])
    except:
        # create alias
        model_df = merged_df
        print('some na targets were dropped')


    model.fit(model_df.drop(drop_columns, axis=1)[selected_fields], model_df.target)

    
    if counter in [0, 1]:
        pass
        # print(f'Test dataframe #{counter} \n', test.head(3))
        # print(f'Revealed targets dataframe #{counter} \n', revealed_targets.head(3))
        # print(f'Client dataframe #{counter} \n', client.head(3))
        # print(f'Historical weather dataframe #{counter} \n', historical_weather.head(3))
        # print(f'Forecast weather dataframe #{counter} \n', forecast_weather.head(3))
        # print(f'Electricity prices dataframe #{counter} \n', electricity_prices.head(3))
        # print(f'Gas prices dataframe #{counter} \n', gas_prices.head(3))
        # print(f'Sample prediction dataframe #{counter} \n', sample_prediction.head(3))
    
    prepped_df = data_prep_with_row_id(
        test, client, historical_weather, forecast_weather, electricity_prices, 
        gas_prices, sample_prediction, weather_station_to_county_mapping
    )

    # print(merged_df.columns, '\n', prepped_df.columns)

    # bring new data to storage
    merged_df = pd.concat([merged_df, prepped_df], axis=0, ignore_index=True)
    
    try:
        # add revealed targets to data
        revealed_targets = pd.concat([previous_revealed_targets, revealed_targets], axis=0)
        targets_indexes = merged_df['row_id'][revealed_targets['row_id']].index
        merged_df['target'].iloc[targets_indexes[0]: targets_indexes[-1]+1] = revealed_targets['target']
        previous_revealed_targets = pd.DataFrame()
    except KeyError as e:
        # store unused revealed targets for the next try
        print('KeyError occurred')
        print(e)
        previous_revealed_targets = revealed_targets.copy()
        
    # make and put prediction
    sample_prediction['target'] = model.predict(prepped_df.drop('row_id', axis=1)[selected_fields])
    sample_prediction['target'] = sample_prediction['target'].fillna(0).clip(0)

    env.predict(sample_prediction)
    counter += 1


0
<class 'pandas.core.frame.DataFrame'>
Index: 2017824 entries, 0 to 2018351
Data columns (total 51 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   county                                              category
 1   is_business                                         int64   
 2   product_type                                        category
 3   target                                              float64 
 4   is_consumption                                      int64   
 5   data_block_id                                       int64   
 6   row_id                                              int64   
 7   prediction_unit_id                                  int64   
 8   eic_count_client                                    float64 
 9   installed_capacity_client                           float64 
 10  lowest_price_per_mwh_gas_prices                     float64 
 11  highest_price_per_mwh_gas_p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['target'].iloc[targets_indexes[0]: targets_indexes[-1]+1] = revealed_targets['target']


1
<class 'pandas.core.frame.DataFrame'>
Index: 2017824 entries, 0 to 2017823
Data columns (total 51 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   county                                              category
 1   is_business                                         int64   
 2   product_type                                        category
 3   target                                              float64 
 4   is_consumption                                      int64   
 5   data_block_id                                       float64 
 6   row_id                                              int64   
 7   prediction_unit_id                                  float64 
 8   eic_count_client                                    float64 
 9   installed_capacity_client                           float64 
 10  lowest_price_per_mwh_gas_prices                     float64 
 11  highest_price_per_mwh_gas_p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['target'].iloc[targets_indexes[0]: targets_indexes[-1]+1] = revealed_targets['target']


2
<class 'pandas.core.frame.DataFrame'>
Index: 2017824 entries, 0 to 2017823
Data columns (total 51 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   county                                              category
 1   is_business                                         int64   
 2   product_type                                        category
 3   target                                              float64 
 4   is_consumption                                      int64   
 5   data_block_id                                       float64 
 6   row_id                                              int64   
 7   prediction_unit_id                                  float64 
 8   eic_count_client                                    float64 
 9   installed_capacity_client                           float64 
 10  lowest_price_per_mwh_gas_prices                     float64 
 11  highest_price_per_mwh_gas_p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['target'].iloc[targets_indexes[0]: targets_indexes[-1]+1] = revealed_targets['target']


3
<class 'pandas.core.frame.DataFrame'>
Index: 2017824 entries, 0 to 2017823
Data columns (total 51 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   county                                              category
 1   is_business                                         int64   
 2   product_type                                        category
 3   target                                              float64 
 4   is_consumption                                      int64   
 5   data_block_id                                       float64 
 6   row_id                                              int64   
 7   prediction_unit_id                                  float64 
 8   eic_count_client                                    float64 
 9   installed_capacity_client                           float64 
 10  lowest_price_per_mwh_gas_prices                     float64 
 11  highest_price_per_mwh_gas_p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['target'].iloc[targets_indexes[0]: targets_indexes[-1]+1] = revealed_targets['target']


In [20]:
merged_df[merged_df['row_id'] == 2005874]

Unnamed: 0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id,eic_count_client,installed_capacity_client,...,surface_solar_radiation_downwards_forecast_weather,snowfall_forecast_weather,total_precipitation_forecast_weather,year,month,week,hour,day_of_year,day_of_month,day_of_week
2005346,0,0,2,0.469,0,634.0,2005874,1.0,11.0,34.0,...,0.0,0.0,0.0,2023,5,21,0,148,28,Sunday
2017826,0,0,2,,0,,2005874,,11.0,34.0,...,0.0,0.0,0.0,2023,5,21,0,148,28,Sunday
