# DLR Dataset Timeseries Description

In [None]:
import pandas as pd
import numpy as np
import datetime as dt

import plotly.plotly as py
import plotly.offline as po
import plotly.graph_objs as go
import plotly.tools as tls
import colorlover as cl
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()

from features.feature_ts import *
from observations.obs_processing import *
from experiment.algorithms.cluster_prep import preprocessX
from evaluation.eval_clusters import *

### Retrieve Profiles

In [None]:
# First and last year of observations to load.
# NOTE(review): presumably both endpoints are inclusive — confirm in genX.
year_range = [1994,2014]
# genX is not defined in this notebook; presumably it comes from one of the project star-imports above.
# The cells below use X as a DataFrame of daily load profiles: one row per profile ID and day
# (index level 0 = profile, last index level = date) with one column per hour, labelled '0', '1', ...
X = genX(year_range)

### Visualise Daily Load Profile

In [None]:
# Draw one daily load profile at random and plot its hourly readings.
dlp = X.sample(1)
data = [go.Scatter(x = dlp.columns, y = dlp.iloc[0], mode = 'lines', line = dict(width = 2, color = 'red'))]
layout = dict(title= 'Sample Daily Load Profile', titlefont=dict(size=20),
        xaxis = dict(title='time of day', titlefont=dict(size=16), tickfont=dict(size=18)),
        yaxis=dict(title='Mean hourly electricity demand (A)', titlefont=dict(size=16), tickfont=dict(size=18),
            # dlp is a one-row DataFrame, so dlp.max() would return one value per column.
            # The axis limit needs a single number: the peak of the sampled row plus 20% headroom.
            range=[0, dlp.iloc[0].max()*1.2]),
        margin=dict(l=60, b=50, t=40),
        showlegend=False, width=750, height=375)

po.iplot(dict(data=data, layout=layout))

## Household Count

In [None]:
# Profiles recorded per calendar day: group on the last index level at daily frequency
# and count the non-missing readings in the first hourly column ('0').
dailycount = X.groupby(pd.Grouper(level=-1, freq='D'))['0'].count()

# Median of the daily counts within each month.
monthlycount = dailycount.resample('M').agg(np.median)

# Median, maximum and minimum of the daily counts within each year.
med_annualcount, max_annualcount, min_annualcount = (
    dailycount.resample('A').agg(statistic)
    for statistic in (np.median, np.max, np.min)
)

# Number of daily profiles recorded for each value of the first index level (the profile ID).
profilecount = X.groupby(pd.Grouper(level=0))['0'].count()

In [None]:
# Headline figures for the observation counts, collected as (label, value) pairs
# and printed one per line in the order listed.
count_summary = [
    ('median daily household count: ', dailycount.median()),
    ('median monthly household count: ', monthlycount.median()),
    ('median annual household count: ', med_annualcount.median()),
    ('total daily load profiles from 1994 - 2014: ', dailycount.sum()),
    ('total number of households observed: ', len(profilecount)),
    # "half a year or more" = observed on more than 0.5 * 365 days
    ('households observed for half a year or more: ', len(profilecount[profilecount/365 > 0.5])),
    ('mean days observed per household: ', int(profilecount.mean())),
]
for label, value in count_summary:
    print(label, value)

In [None]:
# Show the day(s) on which exactly 1245 households were observed.
# NOTE(review): 1245 is hard-coded — presumably the peak daily count from an earlier run;
# confirm, or compare against dailycount.max() instead.
dailycount[dailycount == 1245]

### Count of Households Observed / Year

In [None]:
# Combine the three annual series side by side into one frame with columns 'max', 'median' and 'min'
# (one row per year-end timestamp produced by the annual resampling).
annualcount = pd.concat([max_annualcount, med_annualcount, min_annualcount], axis=1, keys=['max','median','min'])
# Preview the first rows.
annualcount.head()

In [None]:
# Bar chart of the yearly maximum, median and minimum of the daily household count.
trace1 = go.Bar(dict(
    x=annualcount.index,
    y=annualcount['max'],
    name='max',
    marker = dict(color='purple')
))

trace2 = go.Bar(dict(
    x=annualcount.index,
    y=annualcount['median'],
    name='median',
    marker = dict(color='red')
))

trace3 = go.Bar(dict(
    x=annualcount.index,
    y=annualcount['min'],
    name='min',
    marker = dict(color='orange')
))

layout = go.Layout(dict(
    title='Min, median and max household count observed per day per year',
    xaxis = dict(title='Year'),
    yaxis = dict(title='number of households'),
    # Overlay rather than stack: stacking would add min + median + max into one bar whose
    # height is not a household count. This matches the barmode of the earlier cufflinks call.
    barmode='overlay'
))

# Largest series first: bars are drawn in list order, and max >= median >= min for every year,
# so each smaller bar sits in front of the taller one and all three stay visible.
data = [trace1, trace2, trace3]

# Pass the layout together with the traces; plotting the trace list on its own discards
# the axis titles and barmode defined above.
po.plot(dict(data=data, layout=layout))

# Earlier cufflinks version of the chart above (no longer works):
#annualcount.iplot(kind='bar', xTitle='Year', yTitle='number of households', colors=['blue','red','orange'],
#                  title='Min, median and max household count observed per day per year',
#                barmode='overlay')

### Count of Households Observed / Day

In [None]:
# One bar per calendar day showing how many households were observed on that day.
trace1 = go.Bar(
    x=dailycount.index,
    y=dailycount.values,
    marker=dict(color='darkblue'),
)

# Both axes share the same title and tick font sizes.
daily_axis_fonts = dict(titlefont=dict(size=20), tickfont=dict(size=18))
layout = go.Layout(
    title='Number of households observed for each day from 1994 - 2014',
    font=dict(size=22),
    xaxis=dict(title='Year', **daily_axis_fonts),
    yaxis=dict(title='number of households', **daily_axis_fonts),
    barmode='stack',
)

data = [trace1]

# po.plot (rather than po.iplot) renders the figure outside the notebook output cell.
po.plot(dict(data=data, layout=layout))

#dailycount.iplot(kind='bar', xTitle='Year', yTitle='number of households', 
#                 title='Number of households observed for each day from 1994 - 2014')

### Count of Days Observed 

In [None]:
# Histogram of the number of days each profile ID was observed.
# The vspan shades the x-range from 182 days (about half of 365) up to the largest
# observed count, i.e. the IDs observed for more than half a year.
profilecount.iplot(kind='histogram', xTitle='number of days observed (shaded area indicates more than half a year observed)', 
                   yTitle='number of IDs', colors='red',
                   title = 'Count of Profile IDs observed for n days',
                   vspan={'x0':182,'x1':profilecount.max(),'color':'grey','fill':True,'opacity':.05})

## Average Demand

### Daily Demand (kWh)

In [None]:
# Total of each daily profile: sum across the hourly columns of every row.
# The plot labels in this notebook give the hourly values in amperes (A), hence the _A suffix.
Xdd_A = X.sum(axis=1)
# Retired histogram of the daily totals, kept for reference:
#Xdd_A.iplot(kind='histogram', bins=100, histnorm='percent', title='Histogram of daily demand of all profiles',
#          yTitle='% of profiles in bin',xTitle='binned daily demand (A)',color='blue')

In [None]:
# Convert the daily totals from A to kWh: multiply by 230 and divide by 1000.
# NOTE(review): 230 is presumably the nominal supply voltage in volts, and the conversion
# presumably assumes hourly mean currents at unity power factor — confirm.
Xdd = Xdd_A*230/1000

### Average Annual Daily Demand (kWh)

In [None]:
# Average daily demand per household: resample the daily kWh series to annual means
# (resampleProfiles is a project helper), then average the annual values of each ProfileID.
Xadd = resampleProfiles(Xdd,'A','mean').reset_index().groupby('ProfileID').mean()
# The single-element assignment below requires the result to have exactly one column.
Xadd.columns = ['ADD_kWh']
# Retired histogram of the averages, kept for reference:
#Xadd.iplot(kind='histogram', bins=100, histnorm='percent', title='Histogram of average annual daily demand of all profiles',
#          yTitle='% of profiles in bin',xTitle='bins of average daily consumption (kWh)')

In [None]:
# Preview the first rows of the per-household average daily demand.
Xadd.head()

In [None]:
# Split the households into 100 equal-sized quantile bins by average daily demand.
# With retbins=True, pd.qcut returns a tuple: centiles[0] is the bin assigned to each household,
# centiles[1] is the array of bin edges.
centiles = pd.qcut(x=Xadd.iloc[:,0], q=100, retbins=True)

In [None]:
# Line plot of the running total of the percentile bin edges (centiles[1]) against
# their position 0, 1, 2, ... in the edge array.
percentile_edges = centiles[1]
data = [
    go.Scatter(
        x=pd.RangeIndex(len(percentile_edges)),
        y=percentile_edges.cumsum(),
        mode='lines',
        line=dict(width=3, color='purple'),
    )
]

percentile_axis_fonts = dict(titlefont=dict(size=16), tickfont=dict(size=14))
layout = dict(
    title='Cumulative Daily Consumption for Household Percentiles',
    titlefont=dict(size=18),
    xaxis=dict(title='household percentile', **percentile_axis_fonts),
    # An explicit y-range was tried and left disabled:
    #range=[0, centiles[1].cumsum().max()*1.1]
    yaxis=dict(title='cum. daily consumption (kWh)', **percentile_axis_fonts),
    margin=dict(l=80, b=50, t=60),
    showlegend=False,
    width=700,
    height=400,
)

po.iplot(dict(data=data, layout=layout))

In [None]:
# Percentage histogram of the households' average daily demand,
# using fixed 5 kWh wide bins from 0 to 100 kWh.
hist_fill = cl.scales['5']['qual']['Pastel1'][3]
trace1 = go.Histogram(
    x=Xadd['ADD_kWh'],
    xbins=dict(start=0, end=100, size=5),
    # nbinsx=200 was tried instead of explicit xbins and left disabled
    histnorm='percent',
    marker=dict(color=hist_fill, line=dict(color='#000000', width=0.5)),
)

hist_axis_fonts = dict(titlefont=dict(size=16), tickfont=dict(size=14))
layout = go.Layout(
    title='Histogram of annualised average daily energy consumption of all households',
    # a global font=dict(size=18) was tried and left disabled
    xaxis=dict(title='average daily consumption bins (kWh)', dtick=10, **hist_axis_fonts),
    yaxis=dict(title='percent (%) of households in bin', **hist_axis_fonts),
    margin=dict(l=80, b=50, t=60),
    width=700,
    height=400,
)

data = [trace1]

po.iplot(dict(data=data, layout=layout))

### Average Monthly Demand (kWh)

In [None]:
# Monthly energy per household: sum the daily kWh values within each month.
XmonthlyPower = resampleProfiles(Xdd, 'M', 'sum')

# Average monthly demand per household: annual mean of the monthly totals,
# then the mean of those annual values for each ProfileID.
Xamd = (
    resampleProfiles(XmonthlyPower, 'A', 'mean')
    .reset_index()
    .groupby('ProfileID')
    .mean()
)
Xamd.columns = ['AMD_kWh']

Xamd.iplot(
    kind='histogram',
    bins=100,
    histnorm='percent',
    title='Histogram of average monthly energy consumption for households',
    yTitle='% of households in bin',
    xTitle='bins of average monthly consumption (kWh)',
)

### Representative Load Profiles

In [None]:
# Representative load profile per household: annual mean of the hourly profiles
# (via the project helper resampleProfiles), then the mean of those annual profiles for each ProfileID.
Xrlp = resampleProfiles(X,'A','mean').reset_index().groupby('ProfileID').mean()
# Preview the first rows.
Xrlp.head()

In [None]:
# Mean daily load profile per calendar month across all households, with one month plotted as an example.
rdlps = X.reset_index(level=1)
# Average only the hourly columns of X within each month. Selecting them explicitly keeps the
# helper 'date' column out of the result, so rdlp.columns is exactly the time-of-day axis.
rdlp = rdlps.groupby(rdlps['date'].dt.month)[list(X.columns)].mean()

# Row 11 is the 12th month group (December when every calendar month is present in the data).
rdlp_sample = rdlp.iloc[11]

data = [go.Scatter(x = rdlp.columns, y = rdlp_sample, mode = 'lines', line = dict(width = 2, color = 'blue'))]
layout = dict(title= 'Sample Representative Daily Load Profile', titlefont=dict(size=20),
        xaxis = dict(title='time of day', titlefont=dict(size=16), tickfont=dict(size=18)),
        yaxis=dict(title='Mean hourly electricity demand (A)', titlefont=dict(size=16), tickfont=dict(size=18),
            # Scale the axis to the profile actually plotted (peak plus 20% headroom), as a scalar,
            # instead of to a randomly sampled profile defined in another cell.
            range=[0, rdlp_sample.max()*1.2]),
        margin=dict(l=60, b=50, t=40),
        showlegend=False, width=750, height=375)

po.iplot(dict(data=data, layout=layout))

In [None]:
# Group households into 100 kWh wide bins of average monthly demand and summarise the
# representative load profiles within each bin.

# Upper edge of the last bin: the largest average monthly demand rounded up to the next multiple of 100.
# Taking the max of the single demand column gives a scalar (DataFrame.max() would give a Series).
amd_upper = 100 * int(np.ceil(Xamd.iloc[:, 0].max() / 100))

# Each bin (lower, upper] is labelled by its upper edge: 100, 200, ...
# The result is renamed to 'AMD (kWh)' because that is the column name the grouping below and
# plotRLP look up; without the rename the joined column would keep the name of the Xamd column.
bins = pd.cut(
    Xamd.iloc[:, 0],
    range(0, amd_upper + 1, 100),
    labels=range(100, amd_upper + 1, 100),
).rename('AMD (kWh)')

# Attach each household's demand bin to its representative load profile.
amdrlp = Xrlp.join(bins)

# Mean and standard deviation of the hourly values per demand bin; bins without values are filled with 0.
MeanRLP = amdrlp.groupby('AMD (kWh)').mean().fillna(0)
StdRLP = amdrlp.groupby('AMD (kWh)').std().fillna(0)

def plotRLP(maxMonthlyDemand, color='red'):
    """Draw box plots of the hourly values for one average-monthly-demand bin.

    Reads the module-level ``amdrlp`` frame, keeps the rows whose 'AMD (kWh)'
    value equals ``maxMonthlyDemand`` and plots every other column as a box.

    Parameters
    ----------
    maxMonthlyDemand : number
        Label of the demand bin to plot; the title describes the bin as
        ``maxMonthlyDemand - 100`` to ``maxMonthlyDemand`` kWh.
    color : str, optional
        Colour passed through to the cufflinks ``iplot`` call.

    NOTE(review): the y-axis label says kWh, while the hourly profiles in this
    notebook are labelled in A elsewhere — confirm the unit.
    """
    in_bin = amdrlp['AMD (kWh)'] == maxMonthlyDemand
    hourly_columns = amdrlp.columns != 'AMD (kWh)'
    heading = (
        'Variance of mean hourly demand for households with an average monthly demand between '
        + str(maxMonthlyDemand - 100)
        + ' and '
        + str(maxMonthlyDemand)
        + ' kWh'
    )
    amdrlp.loc[in_bin, hourly_columns].iplot(
        kind='box',
        title=heading,
        xTitle='time of day',
        yTitle='mean hourly demand (kWh)',
        color=color,
        legend=False,
    )
                                                                                           
# Box plots for the bin labelled 100 (described in the plot title as 0 to 100 kWh).
plotRLP(100, 'blue')

# One line per demand bin: MeanRLP is transposed so that its columns become the x-axis and each bin a trace.
# NOTE(review): the y-axis label says kWh, but MeanRLP is derived from X, whose plots elsewhere
# in this notebook are labelled in A — confirm the unit.
MeanRLP.T.iplot(kind='scatter', 
            title='Representative load profiles for households in the same range of average monthly consumption', 
            xTitle = 'time of day',
            yTitle = 'mean hourly demand (kWh)',
            width = 3,)


### Exploring the Effect of Normalisation on Load Profiles

In [None]:
# Load the centroids of the experiment named 'exp8_kmeans_unit_norm'
# (realCentroids is not defined in this notebook; presumably it comes from a project star-import).
cent8 = realCentroids('exp8_kmeans_unit_norm')
# Keep the rows labelled 5, 43, 58 and 59 and the columns '0' through '23' (label slices are inclusive).
# This rebinds rdlps, replacing the frame of the same name built in an earlier cell.
rdlps = cent8.loc[[5, 43, 58, 59],'0':'23']
# Display the selected rows.
rdlps

In [None]:
def plotNorm(rdlps, norm):
    """Plot a set of load profiles after applying one normalisation.

    Parameters
    ----------
    rdlps : DataFrame
        One load profile per row; the column labels are used as the x-axis
        (time of day).
    norm : str or None
        Normalisation name passed through to ``preprocessX``. ``None`` gives
        the plot title 'Unnormalised RDLPs'; any other value is used as the
        title prefix.

    Returns
    -------
    Whatever ``po.iplot`` returns for the assembled figure.
    """
    # Coerce to an ndarray so that iterating yields one profile per row and .max() is a single
    # number, whether preprocessX hands back an array or a DataFrame.
    norm_rdlps = np.asarray(preprocessX(rdlps, norm))

    # colorlover ships each scale in 3- to 11-colour versions only: clamp the lookup key to
    # that range and cycle through the colours if there are more profiles than colours.
    n_colours = min(max(len(norm_rdlps), 3), 11)
    colours = cl.scales[str(n_colours)]['div']['Spectral']

    traces = [
        go.Scatter(
            x = rdlps.columns,
            y = profile,
            mode = 'lines',
            line = dict(width = 2, color = colours[i % len(colours)]),
        )
        for i, profile in enumerate(norm_rdlps)
    ]

    # 'Un' + 'normalised RDLPs' reads "Unnormalised RDLPs" when no normalisation is applied.
    titlenorm = 'Un' if norm is None else norm + ' '
    layout = dict(title= titlenorm +'normalised RDLPs', titlefont=dict(size=20),
            xaxis = dict(
                title='time of day',
                titlefont=dict(size=16),
                tickfont=dict(size=18)),
            yaxis=dict(
                title='normalised electricity demand',
                titlefont=dict(size=16),
                tickfont=dict(size=18),
                # overall peak across all profiles plus 10% headroom
                range=[0, norm_rdlps.max()*1.1]),
            margin=dict(l=60, b=50, t=40),
            showlegend=False, width=750, height=375)
    fig=dict(data=traces, layout=layout)

    return po.iplot(fig)

In [None]:
# Plot the selected centroids once per normalisation option, starting with no normalisation.
# The norm names are passed through to preprocessX by plotNorm.
plotNorm(rdlps, None)

In [None]:
# 'unit_norm' normalisation
plotNorm(rdlps, 'unit_norm')

In [None]:
# 'zero-one' normalisation
plotNorm(rdlps, 'zero-one')

In [None]:
# 'demin' normalisation
plotNorm(rdlps, norm='demin')

In [None]:
# 'sa_norm' normalisation
plotNorm(rdlps, norm='sa_norm')

## Experimenting with Self Organising Maps & kmeans

In [None]:
# NOTE(review): `som` is not defined anywhere in the visible notebook — presumably a trained
# self-organising map object created elsewhere; confirm before running these cells.
# U-matrix view with a colour bar and the best matching units marked.
som.view_umatrix(colorbar=True, bestmatches=True)

In [None]:
# Component plane view for dimension index 6.
som.view_component_planes([6])

In [None]:
# Heatmap of the cluster label stored for each map node.
pd.DataFrame(som.clusters).iplot(kind='heatmap')

In [None]:
# Cluster label of every profile: look up som.clusters at each best matching unit,
# using the unit's second entry as the first index and its first entry as the second.
k = [som.clusters[bmu[1], bmu[0]] for bmu in som.bmus]

# Histogram of how many profiles fall into each cluster.
pd.DataFrame(k).iplot(kind='histogram', title='count of profiles per cluster kmeans + SOM')

In [None]:
# NOTE(review): nrow and ncol are not defined in the visible notebook — presumably the map dimensions; confirm.
len(som.codebook)
#som.n_dim
#pd.DataFrame(som.codebook).groupby

# Number every map node 0 .. nrow*ncol - 1, laid out as an nrow x ncol grid.
node_count = nrow * ncol
m = np.arange(node_count).reshape(nrow, ncol)

# Node number of each profile's best matching unit (second entry indexes the rows of m, first entry the columns).
k = [m[bmu[1], bmu[0]] for bmu in som.bmus]

# Codebook flattened to one row per node and one column per input dimension.
c = som.codebook.reshape(node_count, som.n_dim)

In [None]:
# Histogram of the values in clustX, titled as profiles per cluster for kmeans without the SOM step.
# NOTE(review): clustX is not defined in the visible notebook — presumably the kmeans cluster
# label of each profile; confirm where it is created.
pd.DataFrame(clustX).iplot(kind='histogram', title='count of profiles per cluster kmeans only', color='blue')