## Setup

### Imports and settings

In [23]:
import json
import pandas as pd


### Data loading

In [17]:
# Integer `product_type` code -> human-readable label.
product_type = {
    0: "Combined",
    1: "Fixed",
    2: "General service",
    3: "Spot",
}
# Training table; the path is relative to the notebook's working directory.
train = pd.read_csv("../data/train.csv")

In [18]:
# Gas price table; the path is relative to the notebook's working directory.
gas_prices = pd.read_csv("../data/gas_prices.csv")

In [28]:
# Client table; the path is relative to the notebook's working directory.
client = pd.read_csv("../data/client.csv")
# County id -> county name lookup, stored as JSON in the same data directory.
path = "../data/county_id_to_name_map.json"
with open(path, "r", encoding="utf-8") as f:
    # Parse the whole stream. Reading a single line and feeding it to
    # `json.loads` only works while the document sits on one line and fails
    # on a pretty-printed (multi-line) file.
    county_id_to_name_map = json.load(f)
# NOTE(review): assuming the file holds a JSON object, its keys are decoded as
# `str`; convert them with `int(...)` before mapping onto a numeric `county`
# column -- TODO confirm against the file contents.

In [33]:
# Electricity prices and the two weather tables (forecast and historical);
# paths are relative to the notebook's working directory.
el_prices = pd.read_csv("../data/electricity_prices.csv")
# NOTE(review): the name is spelled `weather_forcast` (sic, missing "e"). The
# preview cell under "Weather forecast" uses the same spelling, so any rename
# has to be applied to both places at once.
weather_forcast = pd.read_csv("../data/forecast_weather.csv")
weather_history = pd.read_csv("../data/historical_weather.csv")

In [35]:
# Example files from the `example_test_files` folder: a sample submission and
# the revealed targets that accompany it.
sample_submission = pd.read_csv("../data/example_test_files/sample_submission.csv")
revealed_targets = pd.read_csv("../data/example_test_files/revealed_targets.csv")

***

## EDA

### Preliminary

#### Train

In [3]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [39]:
# Median of the `target` column over the whole training table.
train["target"].median()

31.133

In [14]:
# Number of distinct values in every column except `target` and `datetime`.
train.drop(columns=["target", "datetime"]).nunique()

county                     16
is_business                 2
product_type                4
is_consumption              2
data_block_id             638
row_id                2018352
prediction_unit_id         69
dtype: int64

#### Gas prices

In [19]:
# Preview the first rows of the gas price table.
gas_prices.head()

Unnamed: 0,forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
0,2021-09-01,45.23,46.32,2021-08-31,1
1,2021-09-02,45.62,46.29,2021-09-01,2
2,2021-09-03,45.85,46.4,2021-09-02,3
3,2021-09-04,46.3,46.8,2021-09-03,4
4,2021-09-05,46.3,46.58,2021-09-04,5


#### Client

In [21]:
# Preview the first rows of the client table.
client.head()

Unnamed: 0,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
0,1,0,108,952.89,0,2021-09-01,2
1,2,0,17,166.4,0,2021-09-01,2
2,3,0,688,7207.88,0,2021-09-01,2
3,0,0,5,400.0,1,2021-09-01,2
4,1,0,43,1411.0,1,2021-09-01,2


#### Electricity prices

In [30]:
# Preview the first rows of the electricity price table.
el_prices.head()

Unnamed: 0,forecast_date,euros_per_mwh,origin_date,data_block_id
0,2021-09-01 00:00:00,92.51,2021-08-31 00:00:00,1
1,2021-09-01 01:00:00,88.9,2021-08-31 01:00:00,1
2,2021-09-01 02:00:00,87.35,2021-08-31 02:00:00,1
3,2021-09-01 03:00:00,86.88,2021-08-31 03:00:00,1
4,2021-09-01 04:00:00,88.43,2021-08-31 04:00:00,1


#### Weather forecast

In [32]:
# Preview the first rows of the weather forecast table
# (the variable is spelled `weather_forcast` where it is loaded).
weather_forcast.head()

Unnamed: 0,latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,data_block_id,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
0,57.6,21.7,2021-09-01 00:00:00+00:00,1,15.655786,11.553613,0.904816,0.019714,0.0,0.905899,-0.411328,-9.106137,1,2021-09-01 01:00:00+00:00,0.0,0.0,0.0,0.0
1,57.6,22.2,2021-09-01 00:00:00+00:00,1,13.003931,10.689844,0.886322,0.004456,0.0,0.886658,0.206347,-5.355405,1,2021-09-01 01:00:00+00:00,0.0,0.0,0.0,0.0
2,57.6,22.7,2021-09-01 00:00:00+00:00,1,14.206567,11.671777,0.729034,0.005615,0.0,0.730499,1.451587,-7.417905,1,2021-09-01 01:00:00+00:00,0.0,0.0,0.0,0.0
3,57.6,23.2,2021-09-01 00:00:00+00:00,1,14.844507,12.264917,0.336304,0.074341,0.000626,0.385468,1.090869,-9.163999,1,2021-09-01 01:00:00+00:00,0.0,0.0,0.0,0.0
4,57.6,23.7,2021-09-01 00:00:00+00:00,1,15.293848,12.458887,0.102875,0.088074,1.5e-05,0.17659,1.268481,-8.975766,1,2021-09-01 01:00:00+00:00,0.0,0.0,0.0,0.0


#### Historical weather

In [34]:
# Preview the first rows of the historical weather table.
weather_history.head()

Unnamed: 0,datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
0,2021-09-01 00:00:00,14.4,12.0,0.0,0.0,1015.8,4,4,0,0,6.694444,3,0.0,0.0,0.0,57.6,21.7,1
1,2021-09-01 00:00:00,14.0,12.0,0.0,0.0,1010.6,7,8,0,0,4.944444,353,0.0,0.0,0.0,57.6,22.2,1
2,2021-09-01 00:00:00,14.4,12.8,0.0,0.0,1014.9,6,7,0,0,5.833333,348,0.0,0.0,0.0,57.6,22.7,1
3,2021-09-01 00:00:00,15.4,13.0,0.0,0.0,1014.4,4,2,4,0,7.111111,349,0.0,0.0,0.0,57.6,23.2,1
4,2021-09-01 00:00:00,15.9,12.6,0.0,0.0,1013.8,12,7,0,20,8.388889,360,0.0,0.0,0.0,57.6,23.7,1


#### Submission samples

In [36]:
# Preview the first rows of the sample submission file.
sample_submission.head()

Unnamed: 0,row_id,data_block_id,target
0,2005872,634,0
1,2005873,634,0
2,2005874,634,0
3,2005875,634,0
4,2005876,634,0


In [37]:
# Preview the first rows of the revealed-targets example file.
revealed_targets.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,2.675,0,2023-05-26 00:00:00,634,1999536,0
1,0,0,1,471.887,1,2023-05-26 00:00:00,634,1999537,0
2,0,0,2,0.0,0,2023-05-26 00:00:00,634,1999538,1
3,0,0,2,5.414,1,2023-05-26 00:00:00,634,1999539,1
4,0,0,3,13.899,0,2023-05-26 00:00:00,634,1999540,2


In [38]:
# Inference loop against the competition's time-series API.
# `enefit` is imported in this cell rather than with the other imports because
# it was not installed in the environment of the last recorded run
# (ModuleNotFoundError); presumably it is only provided by the competition
# runtime -- TODO confirm.
import enefit

env = enefit.make_env()
iter_test = env.iter_test()

# NOTE(review): `features`, `lgb_model` and `X` are not defined in any cell
# shown before this one; they must already exist in the kernel when this cell
# runs. `X` is presumably the training feature frame -- TODO confirm.
counter = 0
# Three batch frames carry a `_batch` suffix so that they do not overwrite the
# `revealed_targets`, `client` and `gas_prices` frames loaded earlier.
for (test, revealed_targets_batch, client_batch, historical_weather,
        forecast_weather, electricity_prices, gas_prices_batch, sample_prediction) in iter_test:
    is_first_batch = counter == 0
    if is_first_batch:
        # Preview every frame of the first batch only.
        print(test.head(3))
        print(revealed_targets_batch.head(3))
        print(client_batch.head(3))
        print(historical_weather.head(3))
        print(forecast_weather.head(3))
        print(electricity_prices.head(3))
        print(gas_prices_batch.head(3))
        print(sample_prediction.head(3))

    # Calendar features derived from the prediction timestamp.
    test['datetime'] = pd.to_datetime(test['prediction_datetime'])
    test['hour'] = test['datetime'].dt.hour
    test['day'] = test['datetime'].dt.day
    test['month'] = test['datetime'].dt.month
    test['year'] = test['datetime'].dt.year

    # One-hot encode the categorical codes; the raw batch stays in `test`.
    test_encoded = pd.get_dummies(
        test[features].astype(int),
        columns=['product_type', 'is_business', 'is_consumption'],
    ).astype(float)
    if is_first_batch:
        # Printed once; printing on every batch floods the cell output.
        print(test_encoded.head())

    # A batch that lacks one of the categories has no dummy column for it, and
    # plain `[X.columns]` indexing would raise KeyError. Align to the model's
    # columns (same order) and fill any missing dummy with 0.
    test_encoded = test_encoded.reindex(columns=X.columns, fill_value=0.0)

    sample_prediction['target'] = lgb_model.predict(test_encoded)
    # Hand the predictions back to the API: the competition environment
    # expects one `env.predict` call per batch served by `iter_test`.
    env.predict(sample_prediction)
    counter += 1

ModuleNotFoundError: No module named 'enefit'