# Understanding the gaps

Let's start by looking at how many gaps there are in each DMA and in the weather data, how long they are, and how to deal with them.


In [None]:
# Load variables from load_original_data
import sys
sys.path.append("..")  # Adds higher directory to python modules path.

# NOTE(review): data_loader presumably lives in the parent directory added to
# sys.path above, which is why it is imported only after the append - confirm.
import data_loader
# load_original_data() returns four objects. Further down in this notebook,
# dmas_h_q is plotted as hourly consumption per DMA, wea_h is concatenated as
# the weather variables, and dmas_characteristics.index provides the DMA labels.
# NOTE(review): calendar is not used in this section - check data_loader for its content.
dmas_h_q, wea_h, calendar, dmas_characteristics = data_loader.load_original_data()

In [None]:
%matplotlib widget
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## First days
The data starts on the 1st of January 2021, which is a Friday and a holiday in Italy. The whole forecasting problem works week by week starting on a Monday, thus it makes sense to start from the first Monday (4th of January). 
Let's look at the trajectories in the first 3 days to see if there are gaps and if we are losing some data.

In [None]:
# Number of leading hourly samples to inspect: 3 days x 24 hours, i.e. the
# days before the first Monday that would be discarded.
n_first_hours = 72
first_hours = dmas_h_q.iloc[:n_first_hours]

# Create a figure with two stacked subplots: trajectories on top (4/5 of the
# height) and the NaN counts below (1/5 of the height)
fig = plt.figure(figsize=(12, 8))
gspec = plt.GridSpec(5, 1)

ax1 = plt.subplot(gspec[:4, 0])
# Plot the first 3 days of hourly consumption, one line per DMA
ax1.plot(first_hours)
ax1.set_xlabel('Time')
ax1.set_ylabel('Hourly Consumption [L/s]')
ax1.set_title('First 3 Days of Hourly Consumption by DMA')
ax1.legend(dmas_characteristics.index, loc='upper right')

ax2 = plt.subplot(gspec[4, 0])
# Count the NaN values of each DMA inside the inspected window
nan_counts = first_hours.isna().sum(axis=0)
bar_positions = range(len(nan_counts))
ax2.bar(bar_positions, nan_counts)
# Reference line at the window length: a bar touching it means the DMA has no
# data at all in the window. It spans from the first to the last bar, however
# many DMAs there are.
ax2.plot([0, len(nan_counts) - 1], [n_first_hours, n_first_hours], 'r--', label='Max')
ax2.set_xlabel('DMA')
ax2.set_xticks(bar_positions)
ax2.set_xticklabels(dmas_characteristics.index, rotation=-45)
ax2.set_ylabel('NaN Counts')
ax2.set_title('NaN Counts in the First 3 Days')
# Without a legend the 'Max' label of the reference line is never displayed
ax2.legend(loc='upper right')

# Adjust the layout of the subplots
plt.tight_layout()

# Show the figure
plt.show()


## Histogram by variable
A histogram counting how many gaps there are for each variable (DMA or weather data). 
10 DMAs + the complete network + 4 weather variables = 15 variables -> figure with a 3x5 grid of subplots 

In [None]:
# Network-wide series: row-wise sum over all the DMA columns. skipna=False
# makes the total NaN as soon as one DMA is missing in that row.
net_h_q = dmas_h_q.sum(axis=1, skipna=False).to_frame(name='Network')

# All variables side by side (DMAs, network total, weather), skipping the
# first 72 rows and cutting the weather at the row count of the DMA data.
vars_h = pd.concat(
    [
        dmas_h_q.iloc[72:],
        net_h_q[72:],
        wea_h.iloc[72:len(dmas_h_q)],
    ],
    axis=1,
)

# Measure how long each gap (run of consecutive NaNs) of a variable is
def count_consecutive_nans(array):
    """Return the length of every run of consecutive NaNs in *array*.

    Args:
        array: One-dimensional numeric data (pandas Series, numpy array or
            plain sequence) that ``np.isnan`` can process.

    Returns:
        A list of ints, one per gap, in order of appearance. A gap that
        reaches the end of the data is included. The list is empty when
        there is no NaN.
    """
    from itertools import groupby

    nans = np.asarray(np.isnan(array), dtype=bool)

    # groupby splits the mask into maximal runs of equal values; only the
    # True runs are gaps. Unlike a "flush on the next valid value" loop,
    # this also reports a gap that is still open at the end of the data.
    return [sum(1 for _ in run) for is_nan, run in groupby(nans) if is_nan]

# Count the total number of NaN samples for each variable (one value per column)
nans_count = vars_h.isna().sum()

# Collect the gap lengths of each variable: count_consecutive_nans is applied
# column by column (axis=0) and returns one list of run lengths per column
nans_count_consecutive = vars_h.apply(count_consecutive_nans, axis=0)


In [None]:

# Plot one histogram of the gap lengths per variable
# (x: gap length in samples, y: number of gaps with that length).
# The grid is sized from the number of variables actually present, so the cell
# still runs when nans_count_consecutive holds fewer (or more) than 15 entries.
n_cols = 5
n_vars = len(nans_count_consecutive)
n_rows = max(1, -(-n_vars // n_cols))  # ceiling division

fig = plt.figure(figsize=(12, 8))
gspec = plt.GridSpec(n_rows, n_cols)

for i, (var_name, gap_lengths) in enumerate(nans_count_consecutive.items()):
    ax = plt.subplot(gspec[i])
    # Distinct gap lengths and how many times each one occurs
    lengths, occurrences = np.unique(gap_lengths, return_counts=True)
    ax.bar(lengths, occurrences)
    ax.set_title(var_name)

# Keep the subplot titles and tick labels from overlapping
plt.tight_layout()
plt.show()

##  Violin plot by variable
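A violin plot showing the distribution of the gap lengths for each variable that has at least one gap.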

In [None]:
# Drop the variables without any gap (their list of gap lengths is empty)
gapless_vars = [
    var
    for var in nans_count_consecutive.index
    if len(nans_count_consecutive.loc[var]) == 0
]
nans_count_consecutive = nans_count_consecutive.drop(gapless_vars)

# One violin per remaining variable; ticks are placed at 1..N and labelled
# with the variable names
n_violins = nans_count_consecutive.shape[0]
tick_positions = range(1, n_violins + 1)

fig = plt.figure(figsize=(12, 8))
plt.violinplot(nans_count_consecutive, showmeans=True)
plt.xticks(tick_positions, nans_count_consecutive.index, rotation=-45)
plt.ylabel('Distribution of the Gaps')
plt.title('Distribution of the Gaps for each Variable')
plt.show()

In [None]:
# Total number of missing hourly samples per variable, i.e. the sum of the
# lengths of all its gaps
missing_per_var = [sum(gap_lengths) for gap_lengths in nans_count_consecutive]
positions = range(len(missing_per_var))

fig = plt.figure(figsize=(12, 8))
plt.bar(positions, missing_per_var)
plt.xticks(positions, nans_count_consecutive.index, rotation=-45)
# The bars are totals, not distributions: label them accordingly
plt.ylabel('Total Missing Hours')
plt.title('Total Missing Hours for each Variable')
plt.show()

# Try to fill all the data where **only one hour is missing**

In [None]:

def fill_one_hour_gaps(a_column: pd.Series) -> pd.Series:
    """Fill isolated single-sample gaps with the mean of their two neighbours.

    A sample is filled only when it is NaN and both the previous and the next
    sample are valid. Gaps of two or more consecutive samples, and NaNs at the
    first or last position, are left untouched.

    Args:
        a_column: Numeric series to process. It is not modified.

    Returns:
        A new series with the same index, where every isolated NaN is replaced
        by the average of the values just before and just after it.
    """
    values = a_column.to_numpy()
    missing = a_column.isna().to_numpy()

    # Interior positions that are NaN while both neighbours are valid. The
    # first and last positions stay False because they lack one neighbour.
    isolated = np.zeros(missing.shape, dtype=bool)
    isolated[1:-1] = missing[1:-1] & ~missing[:-2] & ~missing[2:]
    idx = np.flatnonzero(isolated)

    # Work on a copy so the caller's data (e.g. a DataFrame column passed in
    # by DataFrame.apply) is never altered as a side effect.
    filled = a_column.copy()
    if idx.size:
        filled.iloc[idx] = (values[idx - 1] + values[idx + 1]) / 2
    return filled

# Fill the isolated one-hour gaps with the custom helper, column by column
myfilled__vars_h = vars_h.apply(fill_one_hour_gaps, axis=0)

# Same attempt with pandas' built-in linear interpolation along the rows,
# limited to one consecutive NaN (limit=1) in both directions.
# NOTE(review): limit=1 caps how many NaNs are filled per gap, but it does not
# skip longer gaps - presumably this also fills the edges of multi-hour gaps and
# NaNs at the very start/end; confirm against the pandas interpolate docs.
filled__vars_h = vars_h.interpolate(method='linear', axis=0, limit_direction='both', limit=1)
# Per-variable difference in remaining NaNs (custom helper minus pandas):
# 0 means both approaches left the same number of NaNs in that variable
print(myfilled__vars_h.isna().sum()-filled__vars_h.isna().sum())