# Preparing Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the client table from disk and preview its first five rows
client = pd.read_csv(filepath_or_buffer='../data/client.csv')
client.head(n=5)

## Client Data

In [None]:
# Parse the `date` column into pandas datetimes
client['date'] = pd.to_datetime(client['date'])

## Electricity Prices Data

In [None]:
# Load the electricity price table from disk
electricity_prices = pd.read_csv(filepath_or_buffer='../data/electricity_prices.csv')

In [None]:
# Parse both date columns into pandas datetimes
electricity_prices = electricity_prices.assign(
    forecast_date=pd.to_datetime(electricity_prices['forecast_date']),
    origin_date=pd.to_datetime(electricity_prices['origin_date']),
)

## Forecast Weather Data

In [None]:
# Load the weather forecast table from disk
forecast_weather = pd.read_csv(filepath_or_buffer='../data/forecast_weather.csv')

In [None]:
# Parse the forecast origin and target timestamps into pandas datetimes
forecast_weather = forecast_weather.assign(
    origin_datetime=pd.to_datetime(forecast_weather['origin_datetime']),
    forecast_datetime=pd.to_datetime(forecast_weather['forecast_datetime']),
)

## Gas Prices Data

In [None]:
# Load the gas price table from disk
gas_prices = pd.read_csv(filepath_or_buffer='../data/gas_prices.csv')

In [None]:
# Parse both date columns into pandas datetimes
gas_prices = gas_prices.assign(
    forecast_date=pd.to_datetime(gas_prices['forecast_date']),
    origin_date=pd.to_datetime(gas_prices['origin_date']),
)

## Historical Weather Data

In [None]:
# Load the historical weather table from disk
historical_weather = pd.read_csv(filepath_or_buffer='../data/historical_weather.csv')

In [None]:
# Parse the `datetime` column into pandas datetimes
historical_weather['datetime'] = pd.to_datetime(historical_weather['datetime'])

## Train Data & Checking for NULL values

In [None]:
# Load the training table from disk
train = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [None]:
# Parse `datetime` with an explicit format (no inference needed)
train['datetime'] = pd.to_datetime(
    train['datetime'],
    format='%Y-%m-%d %H:%M:%S',
)

In [None]:
# Load the lookup table that maps weather-station coordinates to counties
weather_station_to_county_mapping = pd.read_csv(
    filepath_or_buffer='../data/weather_station_to_county_mapping.csv'
)

County codes are taken from https://www.kaggle.com/code/fabiendaniel/mapping-locations-and-county-codes/notebook.
That notebook strips the 'maa' suffix from the county names. **TODO:** check whether this step is actually needed here.

## Data Merging (now we merge everything to train)

### Merge Client

In [None]:
# Tag every client column with '_client', except the merge keys, which keep their names
client = client.rename(
    columns=lambda column: (
        column
        if column in ['data_block_id', 'county', 'is_business', 'product_type']
        else f"{column}_client"
    )
)

In [None]:
# Left-join the client columns onto the training rows
merged_df = train.merge(
    client,
    how='left',
    on=['data_block_id', 'county', 'is_business', 'product_type'],
)



### Merge Gas Prices

In [None]:
# Tag every gas price column with '_gas_prices', except the merge key
gas_prices = gas_prices.rename(
    columns=lambda column: (
        column if column == 'data_block_id' else f"{column}_gas_prices"
    )
)

In [None]:
# Left-join the gas prices onto the merged frame via the data block id
merged_df = merged_df.merge(
    gas_prices,
    how='left',
    on=['data_block_id'],
)


### Merge Electricity Prices

In [None]:
# Extract the clock time of each row; used as a join key for the electricity prices
merged_df = merged_df.assign(time_of_day=merged_df['datetime'].dt.time)


In [None]:
# Prepare the electricity prices for merging:
#   1. the prices are hourly, so derive the clock time of each forecast as a join key
#   2. tag every column with '_electricity_prices', except the two join keys
electricity_prices = (
    electricity_prices
    .assign(time_of_day=lambda frame: frame['forecast_date'].dt.time)
    .rename(
        columns=lambda column: (
            column
            if column in ['time_of_day','data_block_id']
            else f"{column}_electricity_prices"
        )
    )
)



#### Join Electricity Prices onto the Merged Frame

In [None]:
# Left-join the electricity prices on data block id and clock time
merged_df = merged_df.merge(
    electricity_prices,
    how='left',
    on=['data_block_id', 'time_of_day'],
)


### Merge Historical Weather

In [None]:
# The county of a weather observation comes from weather_station_to_county_mapping,
# joined on latitude and longitude. Round both coordinates to one decimal in both
# tables so that differing precision does not prevent a match.
historical_weather = historical_weather.assign(
    latitude=historical_weather['latitude'].round(1),
    longitude=historical_weather['longitude'].round(1),
)

weather_station_to_county_mapping = weather_station_to_county_mapping.assign(
    latitude=weather_station_to_county_mapping['latitude'].round(1),
    longitude=weather_station_to_county_mapping['longitude'].round(1),
)


In [None]:
# Build the county-level historical weather table in one pipeline:
#   1. look up the county of each observation via its coordinates
#   2. add the clock time of the observation
#   3. average the numeric columns over all stations sharing county and timestamp
#   4. tag every column with '_hist_weather', except the join keys
merged_hist_weather = (
    historical_weather
    .merge(weather_station_to_county_mapping, how='left', on=['latitude', 'longitude'])
    .assign(time_of_day=lambda frame: frame['datetime'].dt.time)
    .groupby(['county', 'time_of_day', 'datetime', 'data_block_id'])
    .mean(numeric_only=True)
    .reset_index()
    .rename(
        columns=lambda column: (
            column
            if column in ['county', 'time_of_day','data_block_id']
            else f"{column}_hist_weather"
        )
    )
)




In [None]:
# Left-join the county-level historical weather onto the merged frame
merged_df = merged_df.merge(
    merged_hist_weather,
    how='left',
    on=['data_block_id', 'time_of_day', 'county'],
)

### Merge Forecast Weather

In [None]:
# Round the forecast coordinates to one decimal before joining on them
forecast_weather = forecast_weather.assign(
    latitude=forecast_weather['latitude'].round(1),
    longitude=forecast_weather['longitude'].round(1),
)

# Build the county-level forecast table in one pipeline:
#   1. look up the county of each forecast location via its coordinates
#   2. average the numeric columns over locations sharing county, forecast time and data block
#   3. tag every column with '_forecast_weather', except the join keys
merged_forecast_weather = (
    forecast_weather
    .merge(weather_station_to_county_mapping, how='left', on=['latitude', 'longitude'])
    .groupby(['county', 'forecast_datetime', 'data_block_id'])
    .mean(numeric_only=True)
    .reset_index()
    .rename(
        columns=lambda column: (
            column
            if column in ['county', 'forecast_datetime','data_block_id']
            else f"{column}_forecast_weather"
        )
    )
)



In [None]:
# Attach the EET timezone to the training timestamps. Daylight-saving handling:
# ambiguous=True flags repeated clock times as DST, and clock times that do not
# exist are shifted forward to the next valid time.
merged_df = merged_df.assign(
    datetime_localized=merged_df['datetime'].dt.tz_localize(
        'EET',
        ambiguous=True,
        nonexistent='shift_forward',
    )
)

# Express the forecast timestamps in EET as well
# NOTE(review): tz_convert requires timezone-aware input, presumably UTC - confirm
merged_forecast_weather = merged_forecast_weather.assign(
    datetime_EET=merged_forecast_weather['forecast_datetime'].dt.tz_convert('EET')
)


In [None]:
# Left-join the county-level forecasts; the timestamp key is named differently on each side
merged_df = merged_df.merge(
    merged_forecast_weather,
    how='left',
    left_on=['data_block_id', 'datetime_localized', 'county'],
    right_on=['data_block_id', 'datetime_EET', 'county'],
)


In [None]:
!pip install pyarrow
!pip fastparquet

In [None]:
# Persist the fully merged frame as a parquet file
merged_df.to_parquet(path='../data/merged_df.parquet')