# Submission Notebook
## Preparing and Merging Train Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
# crucial for import API interface and loading data
ON_KAGGLE: bool = False

if ON_KAGGLE:
    DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
else:
    DATA_DIR = "../data/"

# Read CSVs and parse relevant date columns
train = pd.read_csv(DATA_DIR + "train.csv")
client = pd.read_csv(DATA_DIR + "client.csv")
historical_weather = pd.read_csv(DATA_DIR + "historical_weather.csv")
forecast_weather = pd.read_csv(DATA_DIR + "forecast_weather.csv")
electricity_prices = pd.read_csv(DATA_DIR + "electricity_prices.csv")
gas_prices = pd.read_csv(DATA_DIR + "gas_prices.csv")
weather_station_to_county_mapping = pd.read_csv(DATA_DIR + 'weather_station_to_county_mapping.csv')


## Datetime conversion


In [3]:
client.date = pd.to_datetime(client.date)

electricity_prices.forecast_date = pd.to_datetime(electricity_prices.forecast_date)
electricity_prices.origin_date = pd.to_datetime(electricity_prices.origin_date)

forecast_weather.origin_datetime = pd.to_datetime(forecast_weather.origin_datetime)
forecast_weather.forecast_datetime = pd.to_datetime(forecast_weather.forecast_datetime)

gas_prices.forecast_date = pd.to_datetime(gas_prices.forecast_date)
gas_prices.origin_date = pd.to_datetime(gas_prices.origin_date)

historical_weather.datetime = pd.to_datetime(historical_weather.datetime)

train.datetime = pd.to_datetime(train.datetime, format='%Y-%m-%d %H:%M:%S')

# Merging historical data

In [4]:
# append '_client' to merged columns
client.columns = [f"{column}_client" if column not in ['data_block_id', 'county', 'is_business', 'product_type'] else column for column in client.columns]

# merge train and client
merged_df = pd.merge(train, client, on=['data_block_id', 'county', 'is_business', 'product_type'], how='left')


# append _gas_prices to columns
gas_prices.columns = [f"{column}_gas_prices" if column != 'data_block_id' else column for column in gas_prices.columns]

# merge gas_prices
merged_df = pd.merge(merged_df, gas_prices, on=['data_block_id'], how='left')


# add time column for merging with electricity data
merged_df['time_of_day'] = merged_df['datetime'].dt.time

# the prices are available hourly -> create new column with time 
electricity_prices['time_of_day'] = electricity_prices.forecast_date.dt.time

# append electricity_prices to column names
electricity_prices.columns = [f"{column}_electricity_prices" if column not in ['time_of_day','data_block_id'] else column for column in electricity_prices.columns]

# merge electricity_prices
merged_df = pd.merge(merged_df, electricity_prices, on = ['data_block_id', 'time_of_day'], how='left')


# get county and county_name from weather_station_to_county_mapping (merge on latitude and longitude)
# round lat and long to avoid mismatching due to different accuracy
historical_weather.latitude = historical_weather.latitude.round(1)
historical_weather.longitude = historical_weather.longitude.round(1)

weather_station_to_county_mapping.latitude = weather_station_to_county_mapping.latitude.round(1)
weather_station_to_county_mapping.longitude = weather_station_to_county_mapping.longitude.round(1)


# merge historical weather to get counties
merged_hist_weather = pd.merge(historical_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')

# get time of day
merged_hist_weather['time_of_day'] = merged_hist_weather['datetime'].dt.time

# aggregate by county and time (summarize weather stations for same county)
merged_hist_weather = merged_hist_weather.groupby(['county', 'time_of_day', 'datetime', 'data_block_id']).mean(numeric_only=True).reset_index()

# append _hist_weather to column names
merged_hist_weather.columns = [f"{column}_hist_weather" if column not in ['county', 'time_of_day','data_block_id'] else column for column in merged_hist_weather.columns]

# merge to merged_df
merged_df = pd.merge(merged_df, merged_hist_weather, on=['data_block_id', 'time_of_day', 'county'], how='left')


#round lat and long
forecast_weather.latitude = forecast_weather.latitude.round(1)
forecast_weather.longitude = forecast_weather.longitude.round(1)

# merge to get counties
merged_forecast_weather = pd.merge(forecast_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
# merged_forecast_weather['time_of_day'] = merged_forecast_weather.

# # aggregate for duplicate locations
merged_forecast_weather = merged_forecast_weather.groupby(['county', 'forecast_datetime', 'data_block_id']).mean(numeric_only=True).reset_index()

# append forecast_weather to column names
merged_forecast_weather.columns = [f"{column}_forecast_weather" if column not in ['county', 'forecast_datetime','data_block_id'] else column for column in merged_forecast_weather.columns]

# merge forecast_weather
merged_df = pd.merge(merged_df, merged_forecast_weather, left_on=['data_block_id', 'datetime', 'county'], right_on=['data_block_id', 'forecast_datetime', 'county'], how='left')



## Data Preparation

In [5]:
def split_datetime(data, col="datetime"):
    # What columns are of type datetime?
    datetime_columns = data.select_dtypes(include='datetime64').columns
    
    for c in datetime_columns:
        # print(f"Timezone for {c} is {data[c].dt.tz}")
        pass

    # Adding columns for date & time
    data['year']    = data[col].dt.year
    # data['quarter'] = data[col].dt.quarter
    data['month']   = data[col].dt.month
    data['week']    = data[col].dt.isocalendar().week
    data['hour']    = data[col].dt.hour 

    data['day_of_year']  = data[col].dt.day_of_year
    data['day_of_month'] = data[col].dt.day
    data['day_of_week']  = data[col].dt.day_of_week

    return data

In [6]:
# mapping days of the week names and converting to categorical variable
if 'day_of_week' in merged_df.columns:
    weekday_map = {
        0: 'Monday',
        1: 'Tuesday',
        2: 'Wednesday',
        3: 'Thursday',
        4: 'Friday',
        5: 'Saturday',
        6: 'Sunday'
    }
    merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')

In [7]:
# encode categories to category datetype

merged_df['county'] = merged_df['county'].astype('category')
merged_df['product_type'] = merged_df['product_type'].astype('category')


In [8]:
# model is not able to handle object type
merged_df.drop('time_of_day', axis=1, inplace=True)

# split datetime into meaningful features of int types
merged_df = split_datetime(merged_df)

# model is not able to handle datetime
merged_df = merged_df.drop(merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, EET]']).columns, axis=1)

# drop na from target
merged_df.dropna(subset=['target'], inplace=True)


## Training & Model Building

In [9]:
#X_train, X_test, y_train,  y_test = train_test_split(model_df.drop('target', axis=1), model_df['target'], test_size=0.3, random_state=0)
drop_columns = [
    'target',
    'hours_ahead_forecast_weather',
    'row_id',
    'data_block_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather'
]


model = XGBRegressor(enable_categorical=True, max_depth=9, learning_rate=0.3)
model.fit(merged_df.drop(drop_columns, axis=1), merged_df.target)

# y_pred = bst.predict(X_test)

## main optimisation metric
# print('Mean absolute error test', mean_absolute_error(y_test, y_pred))
# print('Mean absolute error train', mean_absolute_error(y_train, bst.predict(X_train)))

## Loading Test Data / API

In [10]:

def data_prep(test, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction, weather_station_to_county_mapping):        

    # Datatype conversion
    client.date = pd.to_datetime(client.date)

    ## Electricity Prices Data
    electricity_prices.forecast_date = pd.to_datetime(electricity_prices.forecast_date)
    electricity_prices.origin_date = pd.to_datetime(electricity_prices.origin_date)

    ## Forecast Weather Data
    forecast_weather.origin_datetime = pd.to_datetime(forecast_weather.origin_datetime)
    forecast_weather.forecast_datetime = pd.to_datetime(forecast_weather.forecast_datetime)

    ## Gas Prices Data
    gas_prices.forecast_date = pd.to_datetime(gas_prices.forecast_date)
    gas_prices.origin_date = pd.to_datetime(gas_prices.origin_date)

    ## Historical Weather Data
    historical_weather.datetime = pd.to_datetime(historical_weather.datetime)

    ## Train Data & Checking for NULL values
    test['datetime'] = pd.to_datetime(test.prediction_datetime, format='%Y-%m-%d %H:%M:%S')

    ## Data Merging (now we merge everything to test)
    ### Merge Client
    # append '_client' to merged columns
    client.columns = [f"{column}_client" if column not in ['county', 'is_business', 'product_type'] else column for column in client.columns]

    # merge train and client
    merged_df = pd.merge(test, client, on=['county', 'is_business', 'product_type'], how='left')

    ### Merge Gas Prices

    # merge gas_prices
    merged_df["lowest_price_per_mwh_gas_prices"] = gas_prices.lowest_price_per_mwh.min()
    merged_df["highest_price_per_mwh_gas_prices"] = gas_prices.highest_price_per_mwh.max()

    ### Merge Electricity Prices
    # add time column for merging with electricity data
    merged_df['time_of_day'] = merged_df['datetime'].dt.time

    # Merge electricity prices
    # the prices are available hourly -> create new column with time 
    electricity_prices['time_of_day'] = electricity_prices.forecast_date.dt.time

    # append electricity_prices to column names
    electricity_prices.columns = [f"{column}_electricity_prices" if column not in ['time_of_day'] else column for column in electricity_prices.columns]

    ### Merge Electricity Prices
    # merge electricity_prices
    merged_df = pd.merge(merged_df, electricity_prices, on = ['time_of_day'], how='left')

    ### Merge Historical Weather
    # get county and county_name from weather_station_to_county_mapping (merge on latitude and longitude)

    # round lat and long to avoid mismatching due to different accuracy
    historical_weather.latitude = historical_weather.latitude.astype("float").round(1)
    historical_weather.longitude = historical_weather.longitude.astype("float").round(1)
    
    weather_station_to_county_mapping.latitude = weather_station_to_county_mapping.latitude.astype("float").round(1)
    weather_station_to_county_mapping.longitude = weather_station_to_county_mapping.longitude.astype("float").round(1)

    # merge historical weather to get counties
    merged_hist_weather = pd.merge(historical_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # get time of day
    merged_hist_weather['time_of_day'] = merged_hist_weather['datetime'].dt.time
    
    # aggregate by county and time (summarize weather stations for same county)
    merged_hist_weather = merged_hist_weather.groupby(['county', 'time_of_day', 'datetime']).mean(numeric_only=True).reset_index()
    
    # append _hist_weather to column names
    merged_hist_weather.columns = [f"{column}_hist_weather" if column not in ['county', 'time_of_day'] else column for column in merged_hist_weather.columns]

    # merge to merged_df
    merged_df = pd.merge(merged_df, merged_hist_weather, on=['time_of_day', 'county'], how='left')

    ### Merge Forecast Weather
    # forecast weather

    #round lat and long
    forecast_weather.latitude = forecast_weather.latitude.astype("float").round(1)
    forecast_weather.longitude = forecast_weather.longitude.astype("float").round(1)

    # merge to get counties
    merged_forecast_weather = pd.merge(forecast_weather, weather_station_to_county_mapping, on=['latitude', 'longitude'], how='left')
    # merged_forecast_weather['time_of_day'] = merged_forecast_weather.

    # # aggregate for duplicate locations
    merged_forecast_weather = merged_forecast_weather.groupby(['county', 'forecast_datetime']).mean(numeric_only=True).reset_index()

    # append forecast_weather to column names
    merged_forecast_weather.columns = [f"{column}_forecast_weather" if column not in ['county', 'forecast_datetime'] else column for column in merged_forecast_weather.columns]


    # merge forecast_weather
    merged_df = pd.merge(merged_df, merged_forecast_weather, left_on=['datetime', 'county'], right_on=['forecast_datetime', 'county'], how='left')
    
    # split datetime into meaningful features of int types
    merged_df = split_datetime(merged_df)
    
    # mapping days of the week names and converting to categorical variable
    if 'day_of_week' in merged_df.columns:
        weekday_map = {
            0: 'Monday',
            1: 'Tuesday',
            2: 'Wednesday',
            3: 'Thursday',
            4: 'Friday',
            5: 'Saturday',
            6: 'Sunday'
        }
    merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')
    # encode categories to category datetype

    merged_df['county'] = merged_df['county'].astype('category')
    merged_df['product_type'] = merged_df['product_type'].astype('category')
    
    # model is not able to handle object type
    merged_df.drop('time_of_day', axis=1, inplace=True)

    # model is not able to handle datetime
    merged_df = merged_df.drop(merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, EET]', 'object']).columns, axis=1)
    
    drop_columns = [
    'hours_ahead_forecast_weather',
    'row_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather',
    'currently_scored'
    ]
    
    merged_df.drop(drop_columns, axis=1, inplace=True)

    return merged_df

## prepare data frame

In [28]:
if ON_KAGGLE:
    import enefit
else:
    import sys
    sys.path.append('../imports')
    import public_timeseries_testing_util as enefit

# copy of df before new data
merged_df['row_id'] = merged_df['row_id'].astype('int', errors='ignore')
merged_df = merged_df[merged_df['row_id'] < 1999536]
merged_df_before = merged_df.copy()

env = enefit.make_env()
iter_test = env.iter_test()
counter = 0
for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    if counter in [0, 1]:
        print(f'Test dataframe #{counter} \n', test.head(3))
        print(f'Revealed targets dataframe #{counter} \n', revealed_targets.head(3))
        print(f'Client dataframe #{counter} \n', client.head(3))
        print(f'Historical weather dataframe #{counter} \n', historical_weather.head(3))
        print(f'Forecast weather dataframe #{counter} \n', forecast_weather.head(3))
        print(f'Electricity prices dataframe #{counter} \n', electricity_prices.head(3))
        print(f'Gas prices dataframe #{counter} \n', gas_prices.head(3))
        print(f'Sample prediction dataframe #{counter} \n', sample_prediction.head(3))
    
    prepped_df = data_prep(
        test, client, historical_weather, forecast_weather, electricity_prices, 
        gas_prices, sample_prediction, weather_station_to_county_mapping
    )

    # bring new data to storage
    merged_df = pd.concat([merged_df, prepped_df], axis=0, ignore_index=True)
    targets_indexes = merged_df['row_id'][revealed_targets['row_id']].index
    merged_df['target'].iloc[targets_indexes[0]: targets_indexes[-1]+1] = revealed_targets['target']
    
    sample_prediction['target'] = model.predict(prepped_df)
    sample_prediction['target'] = sample_prediction['target'].fillna(0).clip(0)

    env.predict(sample_prediction)
    counter += 1

# copy of df after new data
merged_df_after = merged_df.copy()

Test dataframe #0 
    county  is_business  product_type  is_consumption  prediction_datetime  \
0       0            0             1               0  2023-05-28 00:00:00   
1       0            0             1               1  2023-05-28 00:00:00   
2       0            0             2               0  2023-05-28 00:00:00   

    row_id  prediction_unit_id  currently_scored  
0  2005872                   0             False  
1  2005873                   0             False  
2  2005874                   1             False  
Revealed targets dataframe #0 
    county  is_business  product_type   target  is_consumption  \
0       0            0             1    2.675               0   
1       0            0             1  471.887               1   
2       0            0             2    0.000               0   

              datetime   row_id  prediction_unit_id  
0  2023-05-26 00:00:00  1999536                   0  
1  2023-05-26 00:00:00  1999537                   0  
2  2023-05-2

KeyError: '[2002128, 2002129, 2002130, 2002131, 2002132, 2002133, 2002134, 2002135, 2002136, 2002137, 2002138, 2002139, 2002140, 2002141, 2002142, 2002143, 2002144, 2002145, 2002146, 2002147, 2002148, 2002149, 2002150, 2002151, 2002152, 2002153, 2002154, 2002155, 2002156, 2002157, 2002158, 2002159, 2002160, 2002161, 2002162, 2002163, 2002164, 2002165, 2002166, 2002167, 2002168, 2002169, 2002170, 2002171, 2002172, 2002173, 2002174, 2002175, 2002176, 2002177, 2002178, 2002179, 2002180, 2002181, 2002182, 2002183, 2002184, 2002185, 2002186, 2002187, 2002188, 2002189, 2002190, 2002191, 2002192, 2002193, 2002194, 2002195, 2002196, 2002197, 2002198, 2002199, 2002200, 2002201, 2002202, 2002203, 2002204, 2002205, 2002206, 2002207, 2002208, 2002209, 2002210, 2002211, 2002212, 2002213, 2002214, 2002215, 2002216, 2002217, 2002218, 2002219, 2002220, 2002221, 2002222, 2002223, 2002224, 2002225, 2002226, 2002227, 2002228, 2002229, 2002230, 2002231, 2002232, 2002233, 2002234, 2002235, 2002236, 2002237, 2002238, 2002239, 2002240, 2002241, 2002242, 2002243, 2002244, 2002245, 2002246, 2002247, 2002248, 2002249, 2002250, 2002251, 2002252, 2002253, 2002254, 2002255, 2002256, 2002257, 2002258, 2002259, 2002260, 2002261, 2002262, 2002263, 2002264, 2002265, 2002266, 2002267, 2002268, 2002269, 2002270, 2002271, 2002272, 2002273, 2002274, 2002275, 2002276, 2002277, 2002278, 2002279, 2002280, 2002281, 2002282, 2002283, 2002284, 2002285, 2002286, 2002287, 2002288, 2002289, 2002290, 2002291, 2002292, 2002293, 2002294, 2002295, 2002296, 2002297, 2002298, 2002299, 2002300, 2002301, 2002302, 2002303, 2002304, 2002305, 2002306, 2002307, 2002308, 2002309, 2002310, 2002311, 2002312, 2002313, 2002314, 2002315, 2002316, 2002317, 2002318, 2002319, 2002320, 2002321, 2002322, 2002323, 2002324, 2002325, 2002326, 2002327, 2002328, 2002329, 2002330, 2002331, 2002332, 2002333, 2002334, 2002335, 2002336, 2002337, 2002338, 2002339, 2002340, 2002341, 2002342, 2002343, 2002344, 2002345, 2002346, 2002347, 2002348, 2002349, 2002350, 2002351, 2002352, 2002353, 2002354, 2002355, 2002356, 2002357, 2002358, 2002359, 2002360, 2002361, 2002362, 2002363, 2002364, 2002365, 2002366, 2002367, 2002368, 2002369, 2002370, 2002371, 2002372, 2002373, 2002374, 2002375, 2002376, 2002377, 2002378, 2002379, 2002380, 2002381, 2002382, 2002383, 2002384, 2002385, 2002386, 2002387, 2002388, 2002389, 2002390, 2002391, 2002392, 2002393, 2002394, 2002395, 2002396, 2002397, 2002398, 2002399, 2002400, 2002401, 2002402, 2002403, 2002404, 2002405, 2002406, 2002407, 2002408, 2002409, 2002410, 2002411, 2002412, 2002413, 2002414, 2002415, 2002416, 2002417, 2002418, 2002419, 2002420, 2002421, 2002422, 2002423, 2002424, 2002425, 2002426, 2002427, 2002428, 2002429, 2002430, 2002431, 2002432, 2002433, 2002434, 2002435, 2002436, 2002437, 2002438, 2002439, 2002440, 2002441, 2002442, 2002443, 2002444, 2002445, 2002446, 2002447, 2002448, 2002449, 2002450, 2002451, 2002452, 2002453, 2002454, 2002455, 2002456, 2002457, 2002458, 2002459, 2002460, 2002461, 2002462, 2002463, 2002464, 2002465, 2002466, 2002467, 2002468, 2002469, 2002470, 2002471, 2002472, 2002473, 2002474, 2002475, 2002476, 2002477, 2002478, 2002479, 2002480, 2002481, 2002482, 2002483, 2002484, 2002485, 2002486, 2002487, 2002488, 2002489, 2002490, 2002491, 2002492, 2002493, 2002494, 2002495, 2002496, 2002497, 2002498, 2002499, 2002500, 2002501, 2002502, 2002503, 2002504, 2002505, 2002506, 2002507, 2002508, 2002509, 2002510, 2002511, 2002512, 2002513, 2002514, 2002515, 2002516, 2002517, 2002518, 2002519, 2002520, 2002521, 2002522, 2002523, 2002524, 2002525, 2002526, 2002527, 2002528, 2002529, 2002530, 2002531, 2002532, 2002533, 2002534, 2002535, 2002536, 2002537, 2002538, 2002539, 2002540, 2002541, 2002542, 2002543, 2002544, 2002545, 2002546, 2002547, 2002548, 2002549, 2002550, 2002551, 2002552, 2002553, 2002554, 2002555, 2002556, 2002557, 2002558, 2002559, 2002560, 2002561, 2002562, 2002563, 2002564, 2002565, 2002566, 2002567, 2002568, 2002569, 2002570, 2002571, 2002572, 2002573, 2002574, 2002575, 2002576, 2002577, 2002578, 2002579, 2002580, 2002581, 2002582, 2002583, 2002584, 2002585, 2002586, 2002587, 2002588, 2002589, 2002590, 2002591, 2002592, 2002593, 2002594, 2002595, 2002596, 2002597, 2002598, 2002599, 2002600, 2002601, 2002602, 2002603, 2002604, 2002605, 2002606, 2002607, 2002608, 2002609, 2002610, 2002611, 2002612, 2002613, 2002614, 2002615, 2002616, 2002617, 2002618, 2002619, 2002620, 2002621, 2002622, 2002623, 2002624, 2002625, 2002626, 2002627, 2002628, 2002629, 2002630, 2002631, 2002632, 2002633, 2002634, 2002635, 2002636, 2002637, 2002638, 2002639, 2002640, 2002641, 2002642, 2002643, 2002644, 2002645, 2002646, 2002647, 2002648, 2002649, 2002650, 2002651, 2002652, 2002653, 2002654, 2002655, 2002656, 2002657, 2002658, 2002659, 2002660, 2002661, 2002662, 2002663, 2002664, 2002665, 2002666, 2002667, 2002668, 2002669, 2002670, 2002671, 2002672, 2002673, 2002674, 2002675, 2002676, 2002677, 2002678, 2002679, 2002680, 2002681, 2002682, 2002683, 2002684, 2002685, 2002686, 2002687, 2002688, 2002689, 2002690, 2002691, 2002692, 2002693, 2002694, 2002695, 2002696, 2002697, 2002698, 2002699, 2002700, 2002701, 2002702, 2002703] not in index'

In [30]:
merged_df.tail()

Unnamed: 0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id,eic_count_client,installed_capacity_client,...,surface_solar_radiation_downwards_forecast_weather,snowfall_forecast_weather,total_precipitation_forecast_weather,year,month,week,hour,day_of_year,day_of_month,day_of_week
2002123,15,1,0,,1,,,,15.0,620.0,...,0.0,0.0,0.0,2023,5,21,23,148,28,Sunday
2002124,15,1,1,,0,,,,20.0,624.5,...,0.0,0.0,0.0,2023,5,21,23,148,28,Sunday
2002125,15,1,1,,1,,,,20.0,624.5,...,0.0,0.0,0.0,2023,5,21,23,148,28,Sunday
2002126,15,1,3,,0,,,,55.0,2188.2,...,0.0,0.0,0.0,2023,5,21,23,148,28,Sunday
2002127,15,1,3,,1,,,,55.0,2188.2,...,0.0,0.0,0.0,2023,5,21,23,148,28,Sunday


In [21]:
merged_df.iloc[targets_indexes[0]: targets_indexes[-1]+1]

Unnamed: 0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id,eic_count_client,installed_capacity_client,...,surface_solar_radiation_downwards_forecast_weather,snowfall_forecast_weather,total_precipitation_forecast_weather,year,month,week,hour,day_of_year,day_of_month,day_of_week
2008992,0,1,1,2.073,0,635.0,2009520.0,4.0,97.0,2881.60,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008993,0,1,1,503.735,1,635.0,2009521.0,4.0,97.0,2881.60,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008994,0,1,2,0.000,0,635.0,2009522.0,61.0,10.0,77.00,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008995,0,1,2,4.986,1,635.0,2009523.0,61.0,10.0,77.00,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008996,0,1,3,23.590,0,635.0,2009524.0,5.0,481.0,19303.81,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012107,0,0,2,188.167,1,636.0,2012635.0,1.0,10.0,31.00,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1
2012108,0,0,3,0.000,0,636.0,2012636.0,2.0,1515.0,15963.06,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1
2012109,0,0,3,31.484,1,636.0,2012637.0,2.0,1515.0,15963.06,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1
2012110,0,1,0,0.000,0,636.0,2012638.0,3.0,25.0,1273.20,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1


In [None]:
merged_df_before.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2017824 entries, 0 to 2018351
Data columns (total 51 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   county                                              category
 1   is_business                                         int64   
 2   product_type                                        category
 3   target                                              float64 
 4   is_consumption                                      int64   
 5   data_block_id                                       int64   
 6   row_id                                              int64   
 7   prediction_unit_id                                  int64   
 8   eic_count_client                                    float64 
 9   installed_capacity_client                           float64 
 10  lowest_price_per_mwh_gas_prices                     float64 
 11  highest_price_per_mwh_gas_pri

In [None]:
merged_df_after.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2030304 entries, 0 to 2030303
Data columns (total 51 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   county                                              category
 1   is_business                                         int64   
 2   product_type                                        category
 3   target                                              float64 
 4   is_consumption                                      int64   
 5   data_block_id                                       float64 
 6   row_id                                              float64 
 7   prediction_unit_id                                  float64 
 8   eic_count_client                                    float64 
 9   installed_capacity_client                           float64 
 10  lowest_price_per_mwh_gas_prices                     float64 
 11  highest_price_per_mwh_ga

In [None]:
targets_indexes = merged_df['row_id'][revealed_targets['row_id']].index
merged_df['target'].iloc[min(targets_indexes): max(targets_indexes)+1] = revealed_targets['target']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['target'].iloc[min(targets_indexes): max(targets_indexes)+1] = revealed_targets['target']


In [None]:
targets_indexes

Index([2008992, 2008993, 2008994, 2008995, 2008996, 2008997, 2008998, 2008999,
       2009000, 2009001,
       ...
       2012102, 2012103, 2012104, 2012105, 2012106, 2012107, 2012108, 2012109,
       2012110, 2012111],
      dtype='int64', length=3120)

In [None]:
merged_df.iloc[min(targets_indexes): max(targets_indexes)+1]

Unnamed: 0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id,eic_count_client,installed_capacity_client,...,surface_solar_radiation_downwards_forecast_weather,snowfall_forecast_weather,total_precipitation_forecast_weather,year,month,week,hour,day_of_year,day_of_month,day_of_week
2008992,0,1,1,2.073,0,635.0,2009520.0,4.0,97.0,2881.60,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008993,0,1,1,503.735,1,635.0,2009521.0,4.0,97.0,2881.60,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008994,0,1,2,0.000,0,635.0,2009522.0,61.0,10.0,77.00,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008995,0,1,2,4.986,1,635.0,2009523.0,61.0,10.0,77.00,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
2008996,0,1,3,23.590,0,635.0,2009524.0,5.0,481.0,19303.81,...,4.610370,0.0,0.000000,2023,5,22,4,149,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012107,0,0,2,188.167,1,636.0,2012635.0,1.0,10.0,31.00,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1
2012108,0,0,3,0.000,0,636.0,2012636.0,2.0,1515.0,15963.06,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1
2012109,0,0,3,31.484,1,636.0,2012637.0,2.0,1515.0,15963.06,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1
2012110,0,1,0,0.000,0,636.0,2012638.0,3.0,25.0,1273.20,...,2.862222,0.0,0.000081,2023,5,22,4,150,30,1


In [None]:
revealed_targets

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,row_id,prediction_unit_id
0,0,0,1,2.073,0,2023-05-29 00:00:00,2008992,0
1,0,0,1,503.735,1,2023-05-29 00:00:00,2008993,0
2,0,0,2,0.000,0,2023-05-29 00:00:00,2008994,1
3,0,0,2,4.986,1,2023-05-29 00:00:00,2008995,1
4,0,0,3,23.590,0,2023-05-29 00:00:00,2008996,2
...,...,...,...,...,...,...,...,...
3115,15,1,0,188.167,1,2023-05-29 23:00:00,2012107,64
3116,15,1,1,0.000,0,2023-05-29 23:00:00,2012108,59
3117,15,1,1,31.484,1,2023-05-29 23:00:00,2012109,59
3118,15,1,3,0.000,0,2023-05-29 23:00:00,2012110,60
