# Daily Load Profile Timeseries Clustering Evaluation

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os
from math import ceil

import plotly.plotly as py
import plotly.offline as po
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tools
import colorlover as cl
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()

import matplotlib. pyplot as plt
from matplotlib import colors
from matplotlib.colors import LinearSegmentedColormap

import evaluation.eval_clusters as ec
import evaluation.eval_cluster_plot as pc
from support import data_dir
# Folder (inside the project data directory) from which the evaluation CSV files below are read.
eval_dir = os.path.join(data_dir,'cluster_evaluation')

In [None]:
# Query ec.getExperiments once per experiment identifier: 'exp' first,
# then one call for each of exp2 ... exp6, in that order.
experiments = ec.getExperiments('exp')
exp2, exp3, exp4, exp5, exp6 = (
    ec.getExperiments(identifier)
    for identifier in ('exp2', 'exp3', 'exp4', 'exp5', 'exp6')
)

## Analyse Cluster Scores

### Davies-Bouldin Index

In [None]:
# Plot the 'dbi' score column for `experiments`, grouped by algorithm.
pc.plotClusterIndex('dbi', 'Davies-Bouldin Index', experiments, groupby='algorithm')

### Mean Index Adequacy

In [None]:
# Plot the 'mia' score column for `experiments`; no groupby is passed, so the helper's default applies.
pc.plotClusterIndex('mia','Mean Index Adequacy', experiments)

### Silhouette Score

The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

In [None]:
# Plot the 'silhouette' score column for `experiments`, grouped by algorithm.
pc.plotClusterIndex('silhouette', 'Silhouette Score', experiments, groupby='algorithm')

### Combined Cluster Score

In [None]:
# Plot the combined 'score' column for `experiments`, grouped by algorithm.
# ylog=True presumably switches the y-axis to a log scale — confirm in pc.plotClusterIndex.
pc.plotClusterIndex('score','Combined Cluster Score', experiments, groupby='algorithm', ylog=True)

## Explore Cluster Labels, Centroids and Sizes

### Select best clusters for different algorithms

In [None]:
# Read all clustering results once, then build one selection per experiment
# (second argument 5, threshold 1200) plus an overall selection stored under
# 'best' (second argument 10, same threshold, no experiment filter).
cluster_results = ec.readResults()

selected_clusters = {
    exp_name: ec.selectClusters(cluster_results, 5, threshold=1200, experiment=exp_name)
    for exp_name in experiments
}
selected_clusters['best'] = ec.selectClusters(cluster_results, 10, threshold=1200)

In [None]:
# Display the selection stored under the 'best' key (made without an experiment filter).
selected_clusters['best']

### Get Cluster Labels

In [None]:
# Shortlist of experiment names; the cells below load labels, centroids and
# evaluation metrics for the entries of this list.
best_exp = ['exp2_kmeans_unit_norm', 'exp4_kmeans_unit_norm', 'exp5_kmeans_unit_norm', 'exp6_kmeans_unit_norm',
            'exp6_kmeans_demin']
            #'exp4_kmeans_zero-one', 'exp5_kmeans_zero-one']

In [None]:
# Load the cluster labels of every shortlisted experiment, keyed by experiment
# name. All entries of best_exp are loaded (not just the last one) because
# later cells pick experiments by position, reading indices 1 and 2 of
# list(cluster_labels.values()); a single-entry dict would raise IndexError.
cluster_labels = {e: ec.getLabels(e) for e in best_exp}

### Get denormalised (real) cluster centroids

In [None]:
# Read the stored centroid table of each shortlisted experiment from
# <eval_dir>/best_centroids/<experiment>_centroids.csv, using column 'k' as index.
centroid_dir = os.path.join(eval_dir, 'best_centroids')
real_cluster_centroids = {
    exp_name: pd.read_csv(os.path.join(centroid_dir, exp_name + '_centroids.csv'), index_col='k')
    for exp_name in best_exp
}

### Get normalised cluster centroids 
#### only for exp2 & exp3

In [None]:
# For every cluster selection, fetch the three values returned by
# ec.getCentroids and store them under fixed keys, keyed by selection name.
norm_cluster_centroids = {}
for selection_name, selection in selected_clusters.items():
    centroid_table, sizes, description = ec.getCentroids(selection)
    norm_cluster_centroids[selection_name] = {
        'centroids': centroid_table,
        'cluster_size': sizes,
        'description': description,
    }

In [None]:
# Display the centroid table read from file for experiment 'exp2_kmeans_unit_norm'.
real_cluster_centroids['exp2_kmeans_unit_norm']

### Visualise Centroids

In [None]:
# Run ec.exploreAMDBins on the full results for experiment 'exp5_kmeans_unit_norm'.
ec.exploreAMDBins(cluster_results, 'exp5_kmeans_unit_norm')

In [None]:
# Run ec.exploreAMDBins on the full results for experiment 'exp6_kmeans_unit_norm'.
ec.exploreAMDBins(cluster_results, 'exp6_kmeans_unit_norm')

In [None]:
# Position in best_exp of the experiment whose centroids are plotted; edit to view another one.
i = 0
pc.plotClusterCentroids(real_cluster_centroids[best_exp[i]])

## Explore Patterns in Cluster Labels

### Visualise Cluster Label Assignment

In [None]:
# Plot the labels of the third experiment stored in cluster_labels (insertion order);
# 2014 is presumably the year to display — confirm in pc.plotClusterLabels.
# NOTE(review): position 2 exists only when cluster_labels holds at least three experiments.
pc.plotClusterLabels(list(cluster_labels.values())[2], 2014)

### Visualise TEMPORAL Cluster Specificity

In [None]:
# Plot specificity of the second experiment stored in cluster_labels against the listed temporal groupings.
# NOTE(review): position 1 exists only when cluster_labels holds at least two experiments.
pc.plotClusterSpecificity(list(cluster_labels.values())[1], corr_list=['daytype','weekday','monthly','season','yearly'])

### Visualise CONTEXTUAL Cluster Specificity (Daily Demand Assignment)

In [None]:
# Compute the two likelihood tables returned by ec.demandCorr (compare='total') for the
# second experiment stored in cluster_labels; both are plotted in the next cell.
# NOTE(review): position 1 exists only when cluster_labels holds at least two experiments.
int100_likelihood, q100_likelihood = ec.demandCorr(list(cluster_labels.values())[1], compare='total')

In [None]:
# --- Heatmap 1: equally spaced daily demand intervals ---
# Reshape the likelihood table to long format (one row per bin/cluster pair)
# so that cufflinks can map the three columns onto x, y and z.
i = int100_likelihood.stack().reset_index()
i.columns = ['int100_bins', 'cluster', 'values']
fig = i.iplot(kind='heatmap', x = 'int100_bins', y='cluster', z='values', colorscale='Reds', 
              title= 'Heatmap of relative likelihood of Cluster k being used in consumption bin', asFigure=True)
# Tick positions 0, 10, ..., 90 are relabelled as 0, 100, ..., 900.
fig['layout']['xaxis'].update(dict(title = 'total daily demand bins (Amps)', 
                                   tickmode='array', tickvals=list(range(0,100,10)), ticktext = list(range(0,1000,100))))
fig['layout']['yaxis'].update(dict(title='Cluster k'))
po.iplot(fig)

# --- Heatmap 2: equally sized daily demand intervals (quantiles) ---
# Dividing by 0.01 rescales the likelihoods by a factor of 100; presumably 0.01 is the
# uniform share of one of 100 quantiles, so that 1.0 means "as likely as chance" — confirm.
# NOTE(review): 'Cluster 33' is hard-coded; DataFrame.drop raises KeyError if that column is absent.
rel_q100 = q100_likelihood.drop(columns='Cluster 33')/0.01

# Three-colour palette used to build a colour scale whose reference point is set to 1.0.
slatered=['#232c2e', '#ffffff','#c34513']
label_cmap, label_cs = pc.colorscale_from_list(slatered, 'label_cmap') 
colorscl= pc.asymmetric_colorscale(rel_q100, label_cmap, ref_point=1.0)

# The table is transposed so that its columns (clusters) run along y and its index along x.
heatmap = go.Heatmap(z = rel_q100.T.values, x = rel_q100.index, y = rel_q100.columns, name = 'corr', 
                          colorscale=colorscl)
layout = go.Layout(
        title= 'Heatmap of relative likelihood of Cluster k being used in consumption quantile',
        xaxis=dict(title = 'total daily demand quantiles (Amps) - log scale', type='log'),
        yaxis=dict(title ='Cluster k'))
fig = {'data':[heatmap], 'layout':layout }
po.iplot(fig)

## Analyse Cluster Representativity and Specificity

### Consumption Error - total

In [None]:
# Load the consumption error metrics (indexed by 'k'), keeping only the columns
# used in this notebook and dropping duplicated rows.
cepath = os.path.join(eval_dir, 'consumption_error.csv')
consumption_error = pd.read_csv(
    cepath,
    index_col='k',
    usecols=['k', 'experiment', 'compare', 'mape', 'mdape', 'mdlq', 'mdsyma'],
).drop_duplicates()
# The column deliberately keeps the name 'experiment': every filter on this
# frame selects on consumption_error.experiment. (A DataFrame.rename call whose
# result was discarded used to sit here; it had no effect and was removed.)

# Collect, per shortlisted experiment, the error metric series of the rows
# whose 'compare' value is 'total'.
total_consE = dict()
for e in best_exp:
    consE = consumption_error.loc[(consumption_error.experiment==e)&(consumption_error.compare=='total'),:]
    total_consE[e] = {'mape':consE.mape,'mdape':consE.mdape,'mdlq':consE.mdlq,'mdsyma':consE.mdsyma}

In [None]:
# Plot the collected 'total' consumption error metrics for the shortlisted experiments.
pc.plotClusterMetrics(total_consE, 'TOTAL consumption error evaluation metrics')

### Consumption Error - max

In [None]:
# Collect, per shortlisted experiment, the error metric series of the rows
# whose 'compare' value is 'peak'.
is_peak = consumption_error.compare == 'peak'
peak_consE = {}
for exp_name in best_exp:
    peak_rows = consumption_error[(consumption_error.experiment == exp_name) & is_peak]
    peak_consE[exp_name] = {
        metric: peak_rows[metric] for metric in ('mape', 'mdape', 'mdlq', 'mdsyma')
    }

In [None]:
# Plot the collected 'peak' consumption error metrics for the shortlisted experiments.
pc.plotClusterMetrics(peak_consE, 'PEAK consumption error evaluation metrics')

### Peak Coincidence Ratio

In [None]:
# Read the peak coincidence table (indexed by 'k', duplicates removed) and keep,
# per shortlisted experiment, its 'coincidence_ratio' series.
pcrpath = os.path.join(eval_dir, 'peak_coincidence.csv')
peak_eval = pd.read_csv(pcrpath, index_col='k').drop_duplicates()

peak_coincR = {
    exp_name: {
        'coincidence_ratio': peak_eval.loc[peak_eval['experiment'] == exp_name, 'coincidence_ratio']
    }
    for exp_name in best_exp
}

In [None]:
# Plot the 'coincidence_ratio' metric of the shortlisted experiments with make_area_plot enabled.
pc.plotClusterMetrics(peak_coincR, 'daily peak coincidence ratios', metric='coincidence_ratio', make_area_plot=True)

### Cluster Entropy - TEMPORAL
#### weekday, month

In [None]:
# Per labelled experiment, compute cluster entropy over weekday and monthly
# likelihoods. ec.clusterEntropy returns two values: the first is stored in
# temporal_entropy (with its index reset), the second in max_entropy.
temporal_entropy = {}
max_entropy = {}

for exp_name, labels in cluster_labels.items():
    # Only the first element of each correlation result is used afterwards.
    weekday_likelihood, unused_relative = ec.weekdayCorr(labels)
    monthly_likelihood, unused_relative = ec.monthlyCorr(labels)

    weekday_ce, weekday_max = ec.clusterEntropy(weekday_likelihood, random_likelihood=None)
    monthly_ce, monthly_max = ec.clusterEntropy(monthly_likelihood, random_likelihood=None)

    temporal_entropy[exp_name] = {
        'weekday_entropy': weekday_ce.reset_index(drop=True),
        'monthly_entropy': monthly_ce.reset_index(drop=True),
    }
    max_entropy[exp_name] = {
        'weekday_entropy': weekday_max,
        'monthly_entropy': monthly_max,
    }

In [None]:
# Plot the weekday/monthly entropy series collected in temporal_entropy.
# The trailing comment keeps optional arguments that are currently switched off.
pc.plotClusterMetrics(temporal_entropy, 'time-derived cluster entropy')#, metric='weekday_entropy', make_area_plot=False )

### Cluster Entropy - CONTEXTUAL
#### total daily demand, max daily demand

In [None]:
# Per labelled experiment, compute cluster entropy over the first table returned
# by ec.demandCorr for compare='total' and compare='peak'. Only the first value
# returned by ec.clusterEntropy is kept (with its index reset).
context_entropy = {}

for exp_name, labels in cluster_labels.items():
    total_int, total_q = ec.demandCorr(labels, compare='total')
    peak_int, peak_q = ec.demandCorr(labels, compare='peak')

    total_ce, total_max = ec.clusterEntropy(total_int, random_likelihood=None)
    peak_ce, peak_max = ec.clusterEntropy(peak_int, random_likelihood=None)

    context_entropy[exp_name] = {
        'total_entropy': total_ce.reset_index(drop=True),
        'peak_entropy': peak_ce.reset_index(drop=True),
    }

In [None]:
# Plot the total/peak demand entropy series collected in context_entropy.
pc.plotClusterMetrics(context_entropy, 'context-derived cluster entropy')

In [None]:
# Explicit name -> data mapping for the evaluation measures computed above.
# Looking the dictionaries up here (instead of eval() on the variable name)
# keeps the dependency on those variables visible and avoids evaluating strings.
measure_data = {
    'total_consE': total_consE,
    'peak_consE': peak_consE,
    'peak_coincR': peak_coincR,
    'temporal_entropy': temporal_entropy,
    'context_entropy': context_entropy,
}
measures = ['total_consE', 'peak_consE', 'peak_coincR', 'temporal_entropy', 'context_entropy']

# One row per (measure, experiment, metric): [measure, experiment, metric, ec.meanError(series)].
mean_measures = list()
for measure_name in measures:
    for experiment_name, metrics in measure_data[measure_name].items():
        for metric_name, series in metrics.items():
            mean_measures.append(
                [measure_name, experiment_name, metric_name, ec.meanError(series)]
            )

In [None]:
# Tabulate the mean measures and prepend an 'evaluation_criteria' column holding
# the part of each measure name after its first underscore
# (e.g. 'total_consE' -> 'consE').
evaluation_table = pd.DataFrame(
    mean_measures,
    columns=['measure', 'experiment', 'metric', 'value'],
)
criteria = [name.split('_', 1)[1] for name in evaluation_table['measure']]
evaluation_table.insert(0, 'evaluation_criteria', criteria)
evaluation_table.head()

In [None]:
# Convert the evaluation table into a list of {column name: value} dicts, one per row.
column_names = list(evaluation_table.columns)
mean_dict = [
    dict(zip(column_names, row.values))
    for _, row in evaluation_table.iterrows()
]

In [None]:
# Pivot the evaluation table so that each experiment becomes a column, with
# (evaluation_criteria, measure, metric) left as the row index.
# The commented-out line below is an alternative layout that unstacks the other levels instead.
#et = evaluation_table.set_index(['evaluation_criteria','experiment','measure','metric']).unstack(level=['evaluation_criteria','measure','metric'])
et = evaluation_table.set_index(['evaluation_criteria','experiment','measure','metric']).unstack(level=['experiment'])
et