## Importing the time series and setting the datetime index

In [2]:
import pandas as pd # pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
# from sklearn import impute
from  scipy.stats import skew, kurtosis, shapiro


# Notebook-wide matplotlib defaults: thicker axes frame, larger fonts, 10x8 inch figures.
plt.rc('axes', linewidth=2, labelsize=16, titlesize=18)
plt.rc('legend', fontsize=14)
plt.rc(('xtick', 'ytick'), labelsize=16)
plt.rc('figure', figsize=(10, 8))

# Raw string literal: the Windows path is full of backslashes, and sequences such as
# \O or \d are invalid escapes that newer Python versions warn about.
consumptionLoad = pd.read_csv(r"D:\Onedrive\Leuven\Final project\data\consumption.csv", index_col="meter_id")

# Relabel the rows "meter_1" ... "meter_N" in file order, replacing the original meter ids.
new_index = pd.Series(["meter_" + str(i) for i in range(1, len(consumptionLoad.index) + 1)])
consumptionLoad.set_index(new_index, inplace=True)

# Transpose so that the former column labels, parsed as timestamps, become the row index
# and every meter becomes a column.
consumptionLoad_T = consumptionLoad.transpose()
dates = pd.to_datetime(consumptionLoad_T.index)
consumptionLoad_T = consumptionLoad_T.set_index(dates)

# From here on both names refer to the same transposed frame.
consumptionLoad = consumptionLoad_T

def figure_layout(figsize=(10,8),titel="",xlabel="",ylabel="",fontsize_titel=18,fontsize_axis=16,fontsize_legend=14,fontsize_ticks=16):
    """Open a new figure and return its axes with title, labels, grid and font sizes applied.

    Parameters
    ----------
    figsize : tuple
        Figure size handed to ``plt.figure``.
    titel, xlabel, ylabel : str
        Texts for the (bold) title and the two axis labels.
    fontsize_titel, fontsize_axis, fontsize_legend, fontsize_ticks : int
        Font sizes for the title, the axis labels, the legend and the major
        tick labels. The legend size is written to the global rc settings.

    Returns
    -------
    The current axes of the newly created figure.
    """
    plt.figure(figsize=figsize)
    axes = plt.gca()
    # Legend font size is set through the global rc, not on this figure only.
    plt.rc('legend', fontsize=fontsize_legend)

    axes.set_title(titel, fontsize=fontsize_titel, fontweight='bold')
    axes.grid(True)
    axes.set_xlabel(xlabel, fontsize=fontsize_axis)
    axes.set_ylabel(ylabel, fontsize=fontsize_axis)

    # Same tick-label size on both axes (x first, then y).
    for axis in (axes.xaxis, axes.yaxis):
        for tick in axis.get_major_ticks():
            tick.label1.set_fontsize(fontsize_ticks)

    return axes

# Apply the matplotlib defaults (axes frame width, font sizes, figure size).
# These are the same values already set at the top of this cell.
plt.rcParams.update({
    'axes.linewidth': 2,
    'axes.labelsize': 16,
    'axes.titlesize': 18,
    'legend.fontsize': 14,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'figure.figsize': (10, 8),
})



In [28]:
def visualization_NaN(months_show,meter_show,rows = 4, columns = 3):
    """Draw one stacked bar chart per month with available vs. missing readings per day.

    For the meter ``meter_show`` (a column of the notebook-level
    ``consumptionLoad_T``), every panel shows, for each day of the month, how
    many readings are present ("full") and how many are NaN ("lost").

    Parameters
    ----------
    months_show : list of str
        Month names as produced by ``DatetimeIndex.month_name()``; one panel
        is drawn per entry, filling the grid row by row.
    meter_show : str
        Column label of the meter to inspect.
    rows, columns : int
        Layout of the subplot grid; ``rows * columns`` must equal
        ``len(months_show)``.

    Raises
    ------
    Exception
        If the grid size does not match the number of months.
    """
    if rows*columns != len(months_show):
        raise Exception("The number of months doesn't match with the amount of provided figures")

    originalMeter = pd.DataFrame(data=consumptionLoad_T[meter_show])
    originalMeter["month"] = originalMeter.index.month_name()
    originalMeter["day"] = originalMeter.index.day

    # squeeze=False keeps `axes` two-dimensional, so single-row / single-column grids work too.
    _, axes = plt.subplots(rows, columns, figsize=(30, 24), squeeze=False)

    # axes.flat walks the grid row by row, matching the order of months_show.
    for ax, m in zip(axes.flat, months_show):
        originalMeter_month = originalMeter[originalMeter["month"] == m]
        if originalMeter_month.empty:
            # No readings at all for this month: leave an empty, titled panel.
            ax.set_title(m)
            continue

        # .iloc is required here: plain [0] / [-1] on a datetime-indexed Series
        # relies on the deprecated positional fallback of Series.__getitem__.
        first_day = originalMeter_month["day"].iloc[0]
        last_day = originalMeter_month["day"].iloc[-1]
        days = range(first_day, last_day + 1)

        # Per day: number of readings ("size") and number of NaN readings ("sum").
        is_missing = originalMeter_month[meter_show].isnull().astype(int)
        per_day = is_missing.groupby(originalMeter_month["day"]).agg(["size", "sum"])
        # Days inside the range without any row still get a (zero-height) bar.
        per_day = per_day.reindex(days, fill_value=0)

        df = pd.DataFrame(
            {
                "full": (per_day["size"] - per_day["sum"]).to_numpy(),
                "lost": per_day["sum"].to_numpy(),
            },
            index=[str(d) for d in days],
        )
        df.plot.bar(stacked=True, ax=ax, title=m)

    plt.show()

## Selecting the meters that have observations in all months.

In [3]:
# Sum of the readings per calendar month ('MS' = month start) for every meter.
# resample works along the rows by default; the deprecated `axis` keyword is omitted.
data_monthly = consumptionLoad_T.resample('MS').sum()
# Label the rows by month name instead of by month-start timestamp.
new_index_months = data_monthly.index.month_name()
data_monthly.set_index(new_index_months,inplace=True)
### Create an empty dataframe to store the first index of non zero value
first_non_zero_ind = pd.DataFrame(index = ['first_non_zero_row'])

In [4]:
### Find, for every meter, the position of the first month whose total is not zero.
# Done in one vectorised pass and rebuilt from scratch, so re-running this cell
# always yields the same first_non_zero_ind and ID_NaN.
is_nonzero = data_monthly.ne(0).to_numpy()
has_data = is_nonzero.any(axis=0)
# argmax gives the position of the first True per column; it is 0 for columns
# without any True, which is why those are masked with NaN below.
first_position = is_nonzero.argmax(axis=0)

### One row, one column per meter: where the data starts, NaN for an 'empty' TS
first_non_zero_ind = pd.DataFrame(
    [np.where(has_data, first_position, np.nan)],
    index=['first_non_zero_row'],
    columns=data_monthly.columns,
)
### IDs of TS to remove (because their monthly totals are all zero)
ID_NaN = list(data_monthly.columns[~has_data])

In [5]:
# Report the meters collected in ID_NaN.
print(f"Meter without data: {ID_NaN}.")

Meter without data: ['meter_258'].


In [6]:
# Keep the meters whose first non-zero month is at position 0, i.e. the first month of the data.
starts_at_first_month = first_non_zero_ind.iloc[0].eq(0)
meters_with_full_data = first_non_zero_ind.columns[starts_at_first_month]
fullYeardata = consumptionLoad_T.loc[:, meters_with_full_data]

In [7]:
# Number of columns of fullYeardata, i.e. the number of selected meters.
amount_measurements = len(fullYeardata.columns)

## Investigating the best way to substitute the missing values

In [8]:
# Share of NaN readings per meter, as a percentage of the number of rows, sorted ascending.
amount_measurements = len(fullYeardata.index)
nan_counts = fullYeardata.isnull().sum().sort_values()
pct_NaN = nan_counts.div(amount_measurements).mul(100)

lowest_pct = np.around(pct_NaN.min(), 2)
highest_pct = np.around(pct_NaN.max(), 2)
print(f"The amount of missing data per smart meter ranges between {lowest_pct} % and {highest_pct} %.")

The amount of missing data per smart meter ranges between 1.1 % and 35.34 %.


In [63]:
# Restrict the selected meters to the rows whose timestamp falls in March.
in_march = fullYeardata.index.month_name() == "March"
data_MV = fullYeardata.loc[in_march].copy(deep=True)
# visualization_NaN(["January","February","March","April","May","June","July","August","September","October","November","December"],"meter_3002")

# Meters without a single NaN in the March rows.
complete_in_march = data_MV.isnull().sum() == 0
col_no_missing = list(data_MV.columns[complete_in_march])
print("Amount of meters with no missing days in the month March: {}.".format(len(col_no_missing)))


Amount of meters with no missing days in the month March: 181.


In [80]:
# March rows of the meters listed in col_no_missing.
reference = data_MV.loc[:, col_no_missing].copy()
# Independent deep copy of `reference`.
test = reference.copy()

In [83]:
# Display the index of `test`.
test.index

DatetimeIndex(['2017-03-01 00:00:00', '2017-03-01 00:30:00',
               '2017-03-01 01:00:00', '2017-03-01 01:30:00',
               '2017-03-01 02:00:00', '2017-03-01 02:30:00',
               '2017-03-01 03:00:00', '2017-03-01 03:30:00',
               '2017-03-01 04:00:00', '2017-03-01 04:30:00',
               ...
               '2017-03-31 19:00:00', '2017-03-31 19:30:00',
               '2017-03-31 20:00:00', '2017-03-31 20:30:00',
               '2017-03-31 21:00:00', '2017-03-31 21:30:00',
               '2017-03-31 22:00:00', '2017-03-31 22:30:00',
               '2017-03-31 23:00:00', '2017-03-31 23:30:00'],
              dtype='datetime64[ns]', length=1488, freq=None)

In [84]:
# Select randomly 7 days that will be removed.

# Day of month of every timestamp in `test`; identical for all meters, so built once.
temporal_df = pd.DataFrame(index = test.index)
temporal_df["day"] = test.index.day

for col_ind, ID in enumerate(test.columns):
    # 7 distinct days out of 2..30 (day 1 and day 31 are never drawn).
    # replace=False matters: sampling with replacement can draw the same day
    # twice and so yield fewer than 7 different days.
    rand_days = np.random.choice(np.arange(2,31),size=7,replace=False)
    # TODO: rand_days is not used yet -- the selected days are not removed from `test`.










# Test the error that is made on the estimated days





NameError: name 'temporal' is not defined