In [72]:
import pandas as pd

# Import data

In [86]:
# Load the raw tables used to build the training features.
df_client = pd.read_csv("./raw_data/client.csv")
df_electricity = pd.read_csv("./raw_data/electricity_prices.csv")
df_gas = pd.read_csv("./raw_data/gas_prices.csv")
# Only a 10,000-row random subset of the forecast table is kept. The sample is
# seeded so that Restart & Run All yields the same rows (and the same
# downstream results) every time.
# NOTE(review): this down-sampling is presumably why the weather merge further
# down leaves most train rows without a temperature -- consider loading the
# full table if memory allows.
df_forecast_weather = pd.read_csv("./raw_data/forecast_weather.csv").sample(10000, random_state=42)
# Maps weather-station coordinates (latitude/longitude) to a county.
df_weather_to_country = pd.read_csv("./raw_data/weather_station_to_county_mapping.csv")

# Changing data format

### Manipulating dates
Check the number of unique values in the date column of the electricity and gas dataframes.
Working with only the unique dates reduces the number of comparisons needed when we match dates between the electricity df and the train df.

In [74]:
# Report how many distinct origin dates each price table contains.
for label, prices in (
    ("Number of unique values in elec :", df_electricity),
    ("Number of unique values in gas :", df_gas),
):
    print(label, prices["origin_date"].nunique())

Number of unique values in elec : 15286
Number of unique values in gas : 637


# Feature Engineering

### Adding gas and electricity prices in df_train
First of all, we can see that the gas dataframe does not use the same date format as the electricity and train dataframes.
So we can merge the info from the electricity dataframe directly, based on the datetime from the train dataframe.
But for the gas dataframe, we need to add a formatted column to the train df: a column with the same format as the date in the gas df. We keep the date and just remove the time.
After this manipulation, we can merge the gas prices too.

In [87]:
# Import train data.
# The 10,000-row sample is seeded so the notebook gives the same result on
# every Restart & Run All.
df_train = pd.read_csv("./raw_data/train.csv").sample(10000, random_state=42)

## Merge elec prices
# Copy the train timestamp into a temporary "origin_date" column so it can be
# used as the join key against the electricity table.
# NOTE(review): confirm that "origin_date" is the intended key for the price
# tables (rather than another date column they may carry).
df_train["origin_date"] = df_train["datetime"]
# Keep only the join key and the price column from the electricity df.
df_elec_prices = df_electricity.filter(items=["euros_per_mwh", "origin_date"], axis=1)
# Left join: every train row is kept, unmatched rows get NaN prices.
df_train = df_train.merge(df_elec_prices, how="left", on="origin_date")

## Merge gas prices
# The gas table is keyed by day only, so strip the time part from the join key.
# Vectorised conversion instead of a per-row Python lambda.
df_train["origin_date"] = pd.to_datetime(df_train["origin_date"]).dt.strftime("%Y-%m-%d")
# Keep only the join key and the two price columns from the gas df.
df_gas_prices = df_gas.filter(items=["lowest_price_per_mwh", "highest_price_per_mwh", "origin_date"])
# Left join on the day-level key.
df_train = df_train.merge(df_gas_prices, how="left", on="origin_date")

# Remove the temporary join key (it is just a compacted version of "datetime").
# Reassigning instead of inplace=True keeps the cell free of hidden mutation.
df_train = df_train.drop(columns=["origin_date"])
df_train

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,euros_per_mwh,lowest_price_per_mwh,highest_price_per_mwh
0,5,1,0,1262.220,1,2021-12-18 15:00:00,108,325649,21,99.99,86.17,90.40
1,15,1,0,201.394,1,2022-08-28 15:00:00,361,1130347,64,449.95,245.01,282.00
2,7,1,1,69.961,0,2022-10-19 16:00:00,413,1298386,29,231.79,108.29,125.00
3,3,0,3,77.710,1,2022-11-09 23:00:00,434,1368277,12,107.12,104.29,116.00
4,14,1,2,0.002,0,2022-11-16 08:00:00,441,1389484,68,425.86,108.00,123.82
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,7,1,1,91.649,0,2023-03-19 11:00:00,564,1785198,29,104.39,42.25,47.40
9996,0,0,3,2.656,0,2022-03-23 20:00:00,203,624284,2,225.02,82.10,84.50
9997,4,0,1,29.377,1,2023-02-04 21:00:00,521,1649347,15,119.92,62.50,66.92
9998,8,1,3,331.005,1,2023-03-19 13:00:00,564,1785471,33,95.23,42.25,47.40


## Adding clients

In [88]:
# Display the client table loaded from client.csv (rendered as the cell output).
df_client

Unnamed: 0,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
0,1,0,108,952.89,0,2021-09-01,2
1,2,0,17,166.40,0,2021-09-01,2
2,3,0,688,7207.88,0,2021-09-01,2
3,0,0,5,400.00,1,2021-09-01,2
4,1,0,43,1411.00,1,2021-09-01,2
...,...,...,...,...,...,...,...
41914,1,15,51,415.60,0,2023-05-29,637
41915,3,15,161,2035.75,0,2023-05-29,637
41916,0,15,15,620.00,1,2023-05-29,637
41917,1,15,20,624.50,1,2023-05-29,637


## Add weather to final df

In [89]:
# Attach a county to every forecast point through its (latitude, longitude)
# pair, and expose the forecast origin timestamp under the name "datetime" so
# it can be used as a join key against the train table.
df_forecast_weather_county = df_forecast_weather.merge(
    df_weather_to_country,
    on=["latitude", "longitude"],
    how="left",
).rename(columns={"origin_datetime": "datetime"})

# Left join keeps every train row; rows with no forecast for the same county
# and timestamp get NaN in the weather columns.
df_train = df_train.merge(
    df_forecast_weather_county,
    on=["county", "datetime"],
    how="left",
)

In [91]:
# Count the train rows left without a temperature after the weather merge.
df_train["temperature"].isnull().sum()

9953

## Split datetime into multiple columns
Split the datetime into 4 columns (year, month, day, hour)

In [79]:
# Break the "datetime" string (laid out as "YYYY-MM-DD HH:MM:SS") into its
# fixed-position pieces. The vectorised .str accessor replaces the per-row
# lambdas and leaves missing values as NaN instead of raising.
# The new columns hold the sliced text (e.g. "2021", "09"), not integers.
df_train["year"] = df_train["datetime"].str[0:4]
df_train["month"] = df_train["datetime"].str[5:7]
df_train["day"] = df_train["datetime"].str[8:10]
df_train["hour"] = df_train["datetime"].str[11:13]
# The original column is now redundant; reassign rather than mutate in place.
df_train = df_train.drop(columns="datetime")

In [80]:
# Persist the engineered training table; the DataFrame index is not written.
output_path = "./formated_data/train.csv"
df_train.to_csv(output_path, index=False)