# First look at the data

This notebook gives a first look at the original data provided by the organising committee and shows how to use them, starting from the `load_original_data()` function of the `data_loader` module.

The environment variable `DATA_FOLDER` must be defined in your shell (e.g. bash). This variable points to the folder where your projects' data are saved.

For the Battle of the Water Demand Forecasting (Battle of the Network 2024), the project name, and hence the folder name, is `BoN2024_data`.  

The original data are saved in the `original` folder. 

## Load the data

In [None]:
# Load variables from load_original_data
import data_loader
# The loader returns four objects. The cells below use `dmas_h_cons` (a
# datetime-indexed table with one consumption column per DMA) and
# `dmas_characteristics` (one row per DMA, including an `h_mean` column);
# `raw_weather` and `calendar` are loaded here but not used in the cells below.
(
    dmas_h_cons,
    raw_weather,
    calendar,
    dmas_characteristics,
) = data_loader.load_original_data()

## Plot a typical day 
### By DMA
Let's see what a random day looks like, chosen among the days for which all the DMAs have a complete 24 hours of data.

In [None]:
%matplotlib widget
import matplotlib.pyplot as plt
import pandas as pd 

# Count the non-missing readings of every DMA on each calendar day; a DMA has
# a complete day when all 24 hourly values are present.
full_days = dmas_h_cons.groupby(dmas_h_cons.index.date).count() == 24
# Keep only the days that are complete for every DMA. `all(axis=1)` replaces
# the comparison of the row sum with a hard-coded 10, so the filter stays
# correct whatever the number of DMA columns.
full_days = full_days[full_days.all(axis=1)]
# Select a random day from the full days (`sample` raises ValueError if there
# is no full day at all). The groupby keys are plain dates, so convert the
# chosen one to a Timestamp at midnight for the datetime slicing below.
random_day = pd.Timestamp(full_days.sample().index[0])

# Slice the selected day once: label slicing is inclusive on both ends, so
# this spans 00:00 up to and including 23:00.
day_cons = dmas_h_cons.loc[random_day:random_day + pd.Timedelta(hours=23)]

# Plot the data for that day, one line per DMA
plt.figure(figsize=(10, 10))
plt.title(f"Consumption of all DMAs for {random_day}")
for dma in day_cons.columns:
    plt.plot(day_cons[dma], label=dma)

plt.legend()
plt.grid(visible=True)

### Total consumption 
When we aggregate all the 10 DMAs, the total consumption looks like this:

In [None]:
# Network-wide consumption on the selected day: add up the DMAs row by row.
# With skipna=False the total is NaN whenever any DMA reading is missing,
# instead of silently summing only the available ones.
day_end = random_day + pd.Timedelta(hours=23)
network_day_cons = dmas_h_cons.loc[random_day:day_end, :].sum(axis=1, skipna=False)

plt.figure(figsize=(12, 8))
plt.title(f"Total network consumption for {random_day}")
plt.plot(network_day_cons, label="Network", linewidth=3, color="black")
plt.legend()
plt.grid(visible=True)

And this is what it looks like when consumptions are expressed relative to the average value of each DMA.

In [None]:
# Scale every DMA by its own `h_mean` so that DMAs of different size can be
# compared on the same axis. `div` already returns a new DataFrame, so no
# explicit copy of `dmas_h_cons` is needed.
# NOTE(review): `.values` makes the division positional, i.e. it assumes the
# columns of `dmas_h_cons` are in the same order as the rows of
# `dmas_characteristics` - confirm against the loader.
scaled_consumption = dmas_h_cons.div(dmas_characteristics['h_mean'].values, axis=1)

# Selected day, 00:00 up to and including 23:00 (label slicing is inclusive)
day_end = random_day + pd.Timedelta(hours=23)
scaled_day = scaled_consumption.loc[random_day:day_end]

# Network total relative to the sum of the DMA averages. skipna=False matches
# the other network totals in this notebook: a missing DMA reading yields NaN
# rather than a silently underestimated total.
network_day = (
    dmas_h_cons.loc[random_day:day_end, :].sum(axis=1, skipna=False)
    / dmas_characteristics.loc[:, 'h_mean'].sum()
)

plt.figure(figsize=(12, 8))
plt.title(f"Scaled consumption of network and DMAs for {random_day}")
for dma in scaled_day.columns:
    plt.plot(scaled_day[dma], label=dma)

plt.plot(network_day, label="Network", linewidth=3, linestyle='--', color='black')

plt.grid(True)
plt.legend()

## Complete time series
### By DMA

In [None]:
# One stacked panel per DMA plus a last one for the whole network (11 rows).
fig, axs = plt.subplots(11, 1, figsize=(12, 44))
# zip stops at the shorter sequence, so the axes left over after the DMAs
# listed in dmas_characteristics.index stay empty for the network plot below.
for ax, dma in zip(axs.flatten(), dmas_characteristics.index):
    #ax.title(f"Consumption of {dma} from {dmas_h_cons.index.min()} to {dmas_h_cons.index.max()}")
    # Daily and weekly averages of this DMA's series over the whole period
    dma_d_cons = dmas_h_cons.loc[:,dma].resample('D').mean()
    dma_w_cons = dmas_h_cons.loc[:,dma].resample('W').mean()
    ax.plot(dma_d_cons, label=dma)
    ax.plot(dma_w_cons, label=f"{dma} weekly", linestyle='--')

# Last panel: daily average of the network total. skipna=False makes the total
# NaN for any timestamp where at least one DMA reading is missing.
plt.subplot(11,1,11)
plt.plot(dmas_h_cons.sum(axis=1, skipna=False).resample('D').mean(), label="Network")