# DLR Dataset Timeseries Exploration

In [None]:
import pandas as pd
import numpy as np
import datetime as dt

import plotly.plotly as py
import plotly.offline as po
import plotly.graph_objs as go
import plotly.tools as tls
import colorlover as cl
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()

from sklearn.preprocessing import normalize
from sklearn.cluster import MiniBatchKMeans
import somoclu

from features.feature_ts import *
from experiment.algorithms.clusters import *
from experiment.algorithms.cluster_metrics import *
from observations.obs_processing import *

## Retrieve Profiles

In [None]:
# Restrict the analysis to a single year: both ends of the range are 1994.
year_range = [1994,1994]
# genX comes from one of the project star-imports; later cells treat X as a
# frame with a MultiIndex that includes 'ProfileID' and 'date', with the
# readings as columns - presumably one column per hour of the day (TODO confirm).
X = genX(year_range)

In [None]:
# Peek at the first four rows and first six columns of the profile matrix.
print(X.iloc[0:4, 0:6])
# Unique values of the outermost level of X's MultiIndex.
X.index.levels[0]

### Average Daily Demand (A)

In [None]:
# Sum each row of X, resample the totals with interval 'A' and aggregation
# 'mean', then average what is left per ProfileID.
Xadd = (
    resampleProfiles(X.sum(axis=1), 'A', 'mean')
    .reset_index()
    .groupby('ProfileID')
    .mean()
)
Xadd.columns = ['ADD (A)']
Xadd.head()

### Average Monthly Demand (kWh)

In [None]:
# Monthly totals per row, scaled by 230/1000
# (presumably a 230 V supply converting A to kWh - TODO confirm).
XmonthlyPower = (
    resampleProfiles(X, interval='M', aggfunc='sum').sum(axis=1) * (230/1000)
)
# Resample the monthly figures with interval 'A' / 'mean', then average per ProfileID.
Xamd = (
    resampleProfiles(XmonthlyPower, interval='A', aggfunc='mean')
    .reset_index()
    .groupby('ProfileID')
    .mean()
)
Xamd.columns = ['AMD (kWh)']
Xamd.head()

### Representative Load Profiles

In [None]:
# One averaged profile per ProfileID: resample X with interval 'A' / 'mean',
# then take the mean of every column within each ProfileID.
Xrlp = (
    resampleProfiles(X, 'A', 'mean')
    .reset_index()
    .groupby('ProfileID')
    .mean()
)
Xrlp.head()

## Visualising the Data

In [None]:
# Distribution of the per-profile average monthly demand.
Xamd.iplot(
    kind='histogram',
    title='Histogram of average monthly energy consumption for households',
    yTitle='household count',
    xTitle='average monthly consumption (kWh)',
)

In [None]:
# Group households into 100 kWh wide bands of average monthly demand. Each band
# is labelled with its upper edge (100, 200, ...).
amd = Xamd.iloc[:, 0]
# Take the maximum of the column itself so a scalar (not a one-element Series)
# is passed to int(); the upper edge is rounded up to the next multiple of 100.
top_edge = 100 * int(np.ceil(amd.max() / 100))
bins = pd.cut(amd, range(0, top_edge + 1, 100), labels=range(100, top_edge + 1, 100))
# Attach the band label to every representative load profile.
amdrlp = Xrlp.join(bins)
# Per-band mean and standard deviation of each profile column; bands that
# produce NaN are reported as 0.
MeanRLP = amdrlp.groupby('AMD (kWh)').mean().fillna(0)
StdRLP = amdrlp.groupby('AMD (kWh)').std().fillna(0)

def plotRLP(maxMonthlyDemand, color='red'):
    """Box-plot the profile columns of one average-monthly-demand band.

    Reads the notebook-level ``amdrlp`` frame, keeps the rows whose
    'AMD (kWh)' label equals ``maxMonthlyDemand`` and draws every other
    column as a box.

    Parameters
    ----------
    maxMonthlyDemand : label of the band to plot; the title describes the band
        as ``maxMonthlyDemand - 100`` to ``maxMonthlyDemand`` kWh.
    color : colour handed through to ``iplot``.
    """
    in_band = amdrlp['AMD (kWh)'] == maxMonthlyDemand
    profile_cols = amdrlp.columns != 'AMD (kWh)'
    band_title = (
        'Variance of mean hourly demand for households with an average monthly demand between '
        + str(maxMonthlyDemand-100) + ' and ' + str(maxMonthlyDemand) + ' kWh'
    )
    amdrlp.loc[in_band, profile_cols].iplot(
        kind='box',
        title=band_title,
        xTitle='time of day',
        yTitle='mean hourly demand (kWh)',
        color=color,
        legend=False,
    )
                                                                                           
# Box plot for the band labelled 600 (title reads "between 500 and 600 kWh").
plotRLP(600, 'blue')

# One line per demand band: MeanRLP is transposed so that its columns (the
# profile columns) run along the x-axis.
# NOTE(review): the y-axis label says kWh, but the profile values come straight
# from X, whose row sums are labelled 'ADD (A)' elsewhere - confirm the unit.
MeanRLP.T.iplot(kind='scatter', 
            title='Representative load profiles for households in the same range of average monthly consumption', 
            xTitle = 'time of day',
            yTitle = 'mean hourly demand (kWh)',
            width = 3,)


In [None]:
# Pairwise correlation between the columns of X, drawn as a heatmap.
corr = X.corr()
corr.iplot(
    kind='heatmap',
    title='Correlation Matrix for hourly household consumption',
    xTitle='time of day',
    yTitle='time of day',
    colorscale='-spectral',
)

In [None]:
# Scatter the first 10000 rows, plotting the column at position 6 against the
# column at position 20 (x=6 / y=20 then select them by label).
# NOTE(review): the axis titles say 5h00 and 18h00 - confirm they match
# columns 6 and 20, and that the unit really is kWh.
X.iloc[:10000,[6,20]].iplot(kind='scatter', mode='markers', size=2, x=6, y=20,  title='scatter', 
                                   xTitle='consumption at 5h00 (kWh)', yTitle='consumption at 18h00 (kWh)')

## Explore Profiles

In [None]:
# Load daily profiles via the project helper dailyProfiles.
# NOTE(review): `year`, `unit` and `directory` are not assigned in any visible
# cell; they must be defined before this cell runs (or come from a star-import).
data = dailyProfiles(year, unit, directory)

In [None]:
# Work only with rows that have no missing values.
dna = data.dropna()
# Row-wise mean across all columns of dna, kept as a one-column frame.
dna_mean = dna.mean(axis=1).to_frame('mean_daily')
# Normalise the first 24 columns of each row with sklearn's normalize and
# carry over the original row labels.
norm = pd.DataFrame(
    normalize(dna.iloc[:, 0:24], return_norm=False),
    index=dna_mean.index,
)
# Normalised columns side by side with the row mean.
normdata = pd.concat([norm, dna_mean], axis=1)
normdata.head()

In [None]:
# 9-step sequential yellow-orange-brown palette from colorlover.
ylorbr = cl.scales['9']['seq']['YlOrBr']
# Interpolate the palette and drop its first 150 entries; presumably this leaves
# 1350 colours, matching the 1350 quantile bins requested below - TODO confirm.
colorscale = cl.interp(ylorbr, 1500 )[150:]

# Assign every row a colour according to the quantile of its mean_daily value.
# duplicates='raise' makes qcut fail if the bin edges are not unique.
cols = pd.qcut(normdata.mean_daily, 1350, labels=colorscale, retbins=False, precision=2, duplicates='raise')

In [None]:
# Plot the last 500 normalised profiles (first 24 columns), one line per row,
# coloured through the row-label -> colour mapping built from `cols`.
normdata.iloc[-500:,0:24].T.iplot(kind='scatter', colors = cols.to_dict(), filename='cufflinks/cf-simple-line')

In [None]:
# Summary statistics for every column of the daily profiles.
data.describe()

In [None]:
# Report the number of rows, then plot the column-wise mean taken from describe().
print('number of daily profiles: {}'.format(data.shape[0]))
data.describe().loc['mean'].iplot(
    kind='scatter',
    filename='cufflinks/cf-simple-line',
)

## Cluster Profiles

In [None]:
# Re-use the k-means results cached under log/ when they can be read;
# otherwise cluster X from scratch.
try:
    eval_results = pd.read_csv('log/3-117-3_eval_kmeans.csv')
    cluster_results = pd.read_csv('log/3-117-3_centroids_kmeans.csv')
    cluster_labels = pd.read_csv('log/3-117-3_labels_kmeans.csv')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Only a missing, unreadable or malformed cache triggers re-clustering.
    # Anything else (KeyboardInterrupt, a typo, ...) now propagates instead of
    # silently starting a long k-means run.
    # NOTE(review): the cache file names suggest a 3-117-3 sweep, while this
    # fallback only covers range(3, 36, 3) - confirm which one is intended.
    range_n_clusters = range(3, 36, 3)
    cluster_stats, cluster_centroids, cluster_labels = kmeans(X, range_n_clusters, normalise = False)
    eval_results, cluster_results = kmeansResults(cluster_stats, cluster_centroids)

## Evaluation

### High Level Evaluation

In [None]:
# Profile/date key of every row of X, in row order.
Xind = X.reset_index()[['ProfileID', 'date']]
lbls = pd.DataFrame(cluster_labels)
# Pair each key with its cluster labels by matching the two frames on their index.
l = Xind.merge(lbls, left_index=True, right_index=True)

def profileMode(n_clusters, Xlabel):
    """Bar-plot how many profiles have each cluster as their most frequent label.

    Parameters
    ----------
    n_clusters : selects the label column named ``str(n_clusters)`` in ``Xlabel``.
    Xlabel : frame with 'ProfileID' and 'date' columns plus that label column.
    """
    label_col = str(n_clusters)
    # Number of dated rows for every (profile, cluster label) pair.
    day_counts = Xlabel.groupby(['ProfileID', label_col])['date'].count().reset_index()
    # Row holding the largest count within each profile. reset_index() above
    # yields a 0..n-1 index, so these labels are also valid positions for iloc.
    top_rows = day_counts.groupby('ProfileID')['date'].idxmax()
    dominant = day_counts.iloc[top_rows]
    # Count the profiles per dominant cluster and draw the bars.
    dominant.groupby(label_col)['ProfileID'].count().iplot(kind='bar')
    
# Dominant-cluster counts for the label column '48'.
# NOTE(review): the fallback clustering only covers range(3, 36, 3), which has
# no 48; this call presumably relies on the cached labels CSV - confirm.
profileMode(48, l)

In [None]:
# Numbers of clusters that were evaluated, in ascending order. The label
# frame's column names are parsed as integers.
cluster_it = sorted(int(col) for col in cluster_labels.columns)
# Kept under its own name so the daily-profile frame `data` used by the
# "Explore Profiles" cells is not overwritten when cells are re-run.
cluster_sizes = cluster_results[['k','n_clust','cluster_size']].set_index(['n_clust','k'])

n_plots = len(cluster_it)
fig = tls.make_subplots(rows=int(np.ceil(n_plots/2)), cols=2,
                        subplot_titles=[str(x) + ' clusters' for x in cluster_it],
                        shared_xaxes=False, print_grid=False)

# Fill the two-column grid row by row; subplot positions are 1-based.
for position, n_clust in enumerate(cluster_it):
    grid_row, grid_col = divmod(position, 2)
    # Sizes of the clusters for this n_clust, indexed by cluster id k.
    sizes = cluster_sizes.loc[n_clust, 'cluster_size']
    fig.append_trace({'x': sizes.index, 'y': sizes,
                      'type': 'bar', 'name': str(n_clust)+' clusters'},
                     grid_row + 1, grid_col + 1)

# Figure height grows with the number of subplots.
fig['layout'].update(height=80*n_plots,
                     title='Count of profiles per cluster for different numbers of clusters',
                     showlegend=False)

po.iplot(fig)

### Davies-Bouldin Index

In [None]:
# 'dbi' column of the evaluation results plotted against the number of clusters.
eval_results[['n_clust','dbi']].iplot(kind='scatter', x='n_clust', 
                                   title='Davies-Bouldin Index as a function of numbers of clusters', 
                                   xTitle='number of clusters',
                                   yTitle='DBI value')

### Mean Index Adequacy

In [None]:
# 'mia' column of the evaluation results plotted against the number of clusters.
eval_results[['mia','n_clust']].iplot(kind='scatter', x='n_clust', 
                                   title='Mean Index Adequacy as a function of numbers of clusters', 
                                   xTitle='number of clusters',
                                   yTitle='MIA index value',
                                   color='blue')

### Silhouette Score

The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

In [None]:
# 'silhouette' column of the evaluation results plotted against the number of clusters.
eval_results[['silhouette','n_clust']].iplot(kind='scatter', x='n_clust', 
                                   title='Silhouette Score as a function of numbers of clusters', 
                                   xTitle='number of clusters',
                                   yTitle='Silhouette Score',
                                   color='green')

### Experimenting with Self Organising Maps & kmeans

In [None]:
# Both grid dimensions handed to the self-organising map below are 30.
nrow = ncol = 30

In [None]:
# Build a planar self-organising map with compact support enabled and train it
# on X converted to a plain NumPy array.
# NOTE(review): nrow is passed as the first positional argument and ncol as the
# second - confirm the order Somoclu expects (harmless while both are 30).
som = somoclu.Somoclu(nrow, ncol, compactsupport=True, maptype='planar')#, initialization='pca')
som.train(np.array(X))

In [None]:
# Cluster the trained map with mini-batch k-means (33 clusters, fixed seed).
som.cluster(algorithm=MiniBatchKMeans(n_clusters=33, random_state=10))

In [None]:
# Show the U-matrix with a colour bar and the best-matching units overlaid.
som.view_umatrix(colorbar=True, bestmatches=True)

In [None]:
# Show the component plane for input dimension 6 only.
som.view_component_planes([6])

In [None]:
# Heatmap of the cluster label stored in som.clusters for every map node.
pd.DataFrame(som.clusters).iplot(kind='heatmap')

In [None]:
# Cluster label of every sample: look up its best-matching unit in the node
# cluster grid. Each bmus entry is used as (second index, first index) of
# som.clusters, i.e. the two coordinates are swapped for the lookup.
k = [som.clusters[second, first] for first, second in som.bmus]
pd.DataFrame(k).iplot(
    kind='histogram',
    title='count of profiles per cluster kmeans + SOM',
)

In [None]:
len(som.codebook)

# Give every map node a running number 0 .. nrow*ncol-1, laid out as a grid.
m = np.arange(nrow * ncol).reshape(nrow, ncol)
# Node number of each sample's best-matching unit (coordinates swapped for the lookup).
# NOTE(review): this rebinds `k`, which another cell sets to the per-sample
# cluster labels; cells that read `k` see whichever assignment ran last.
k = [m[second, first] for first, second in som.bmus]
# Codebook flattened to one row per node and one column per input dimension.
c = som.codebook.reshape(nrow * ncol, som.n_dim)
c

In [None]:
# Score the assignment held in `k` against X with the three cluster metrics.
# NOTE(review): `k` is assigned in two different cells (cluster labels vs. SOM
# node numbers); confirm which one is meant to be evaluated here.
dbi = davies_bouldin_score(X, k)
mia = mean_index_adequacy(X, k)
ss = silhouette_score(X, k, sample_size=10000) #unreliable because of random sample
print(dbi)
print(mia)
print(ss)
# Combined figure: product of DBI and MIA divided by the silhouette score.
(dbi*mia)/(ss)

In [None]:
# Baseline without the SOM: mini-batch k-means straight on X, using the same
# number of clusters (33) and seed (10) as the SOM clustering cell.
clusterer = MiniBatchKMeans(n_clusters=33, random_state=10)
clustX = clusterer.fit_predict(X)

In [None]:
# Same three metrics for the k-means-only labels.
dbik = davies_bouldin_score(X, clustX)
miak = mean_index_adequacy(X, clustX)
# The silhouette score is computed on a sample of 10000 rows.
ssk = silhouette_score(X, clustX, sample_size=10000)

print(dbik)
print(miak)
print(ssk)

In [None]:
# Histogram of the k-means-only label assigned to each row of X.
pd.DataFrame(clustX).iplot(kind='histogram', title='count of profiles per cluster kmeans only', color='blue')