# Daily Load Profile Timeseries Clustering Evaluation

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os
from math import ceil, log

import plotly.plotly as py
import plotly.offline as po
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tools
import colorlover as cl
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()

import matplotlib. pyplot as plt
from matplotlib import colors
from matplotlib.colors import LinearSegmentedColormap

import evaluation.eval_clusters as ec
import evaluation.eval_cluster_plot as pc
from support import data_dir, image_dir, results_dir
# Directory holding the cluster evaluation outputs that later cells read
# (best centroid CSVs and cluster_entropy.csv).
eval_dir = os.path.join(data_dir,'cluster_evaluation')

In [None]:
# Experiment collection passed to every pc.plotClusterIndex call below.
experiments = ec.getExperiments()

In [None]:
# Experiments examined in detail in the rest of the notebook. Several cells
# address entries of this list by position, so keep the order stable.
best_exp = [
    'exp2_kmeans_unit_norm',
    'exp4_kmeans_zero-one',
    'exp5_kmeans_unit_norm',
    'exp5_kmeans_zero-one',
    'exp6_kmeans_unit_norm',
    'exp7_kmeans_unit_norm',
    'exp8_kmeans_unit_norm',
]

## Analyse Cluster Scores

### Davies-Bouldin Index

In [None]:
# Plot the 'dbi' index for all experiments with groupby='algorithm'
# (second argument is presumably the plot title - TODO confirm).
pc.plotClusterIndex('dbi', 'Davies-Bouldin Index', experiments, groupby='algorithm')

### Mean Index Adequacy

In [None]:
# Plot the 'mia' index for all experiments, using the helper's default grouping.
pc.plotClusterIndex('mia','Mean Index Adequacy', experiments)

### Silhouette Score

The silhouette score ranges from -1 (worst) to 1 (best). Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, because a different cluster is more similar to it.

In [None]:
# Plot the 'silhouette' index for all experiments with groupby='experiment'.
pc.plotClusterIndex('silhouette', 'Silhouette Score', experiments, groupby='experiment')

### Combined Cluster Score

In [None]:
# Plot the combined 'score' index with groupby='algorithm';
# ylog=True presumably switches the y axis to a log scale - TODO confirm.
pc.plotClusterIndex('score','Ix Score for all Experiments', experiments, groupby='algorithm', ylog=True)

In [None]:
# Plot the same combined 'score' index, this time with groupby='experiment'.
pc.plotClusterIndex('score','Combined Index Score', experiments, groupby='experiment')

## Explore Cluster Labels, Centroids and Sizes

### Select best clusters for different algorithms

In [None]:
# Read the stored clustering results and select from them.
# Passing len(cluster_results) presumably requests every result rather than
# only the top few - TODO confirm against ec.selectClusters.
cluster_results = ec.readResults()
selected_clusters = ec.selectClusters(cluster_results, len(cluster_results))

# Presentation-friendly column names; later cells rely on these labels.
selected_clusters.rename(columns={'experiment':'Experiment','algorithm':'Algorithm','preprocessing':'Norm', 
                                  'SOM dimensions':'SOM dim','clusters':'Clusters','dbi':'DBI', 'mia':'MIA', 
                                  'silhouette':'Silhouette','score':'CI score','run time':'Run time',
                                  'experiment_name':'Experiment name'}, inplace=True)

# First ten rows with the metrics rounded for display.
top10 = selected_clusters.round({'DBI':4, 'MIA':4, 'Silhouette':4, 'CI score': 6, 'Run time':2}).head(10)
# Label the rows 1..n. The range is sized from the data so fewer than ten
# results does not raise, and it avoids set_axis(inplace=...), which newer
# pandas releases no longer accept.
top10.index = range(1, len(top10) + 1)
top10.reset_index().rename(columns={'index':'Rank'})

In [None]:
#Fraction (0-1, not percent) of experiments with CI score below 4
ci4 = selected_clusters.loc[selected_clusters['CI score']<4,'CI score'].count()/len(selected_clusters) 

#Fraction (0-1, not percent) of experiments with CI score below 6.5
ci65 = selected_clusters.loc[selected_clusters['CI score']<6.5,'CI score'].count()/len(selected_clusters) 

#Max CI score
cimax = selected_clusters['CI score'].max()

#Score difference between best and tenth best experiment, relative to the best.
#The score is addressed by its 'CI score' label instead of a hard-coded column
#position, so the result does not silently change if columns are reordered.
#Assumes top10 has ten rows ordered best-first - TODO confirm.
top10_scores = top10['CI score']
(top10_scores.iloc[9] - top10_scores.iloc[0])/top10_scores.iloc[0]

### Histograms of algorithm performance

In [None]:
# Histogram of CI scores over all selected clustering runs, normalised to percent.
ci_histogram = go.Histogram(x=selected_clusters['CI score'], nbinsx=200, histnorm='percent')
data = [ci_histogram]
layout = {
    'title': 'Distribution of CI Scores across Clustering Algorithms',
    'titlefont': {'size': 20},
    'xaxis': {'title': 'CI score bins', 'titlefont': {'size': 16}, 'tickfont': {'size': 16}},
    'yaxis': {'title': 'Percent', 'titlefont': {'size': 16}, 'tickfont': {'size': 16}},
    'margin': {'t': 30, 'l': 40, 'b': 40},
    'height': 350,
    'width': 1000,
}
# Plot!
fig0 = go.Figure(data=data, layout=layout)
po.iplot(fig0)
#po.plot(fig0, filename=data_dir+'/cluster_evaluation/plots/clustering_evaluation/DistplotQuantScoresAll'+'.html')

In [None]:
# CI scores split by normalisation; rows with a missing Norm value form the
# 'no norm' group.
ci_scores = selected_clusters['CI score']
norm = selected_clusters.Norm
x0 = ci_scores[norm.isna()]
x1 = ci_scores[norm == 'unit_norm']
x2 = ci_scores[norm == 'demin']
x3 = ci_scores[norm == 'zero-one']
x4 = ci_scores[norm == 'sa_norm']

# Group data together (order must match group_labels)
hist_data = [x0, x1, x2, x3, x4]
group_labels = ['no norm', 'unit norm', 'demin', 'zero-one', 'SA norm']

# Create distplot with custom bin_size; histogram bars only (no curve, no rug)
fig = ff.create_distplot(
    hist_data,
    group_labels,
    histnorm='percent',
    bin_size=0.05,
    show_curve=False,
    show_rug=False,
)
layout_opts = {
    'title': 'Distribution of Quantitative Scores across Normalisation Algorithms',
    'titlefont': {'size': 16},
    'xaxis': {'title': 'CI score bins'},
    'yaxis': {'title': 'Percent'},
    'margin': {'t': 30, 'l': 30, 'b': 30},
    'height': 250,
    'width': 600,
}
fig['layout'].update(**layout_opts)

# Plot inline, then write the same figure to an HTML file under data_dir
po.iplot(fig)
po.plot(fig, filename=data_dir+'/cluster_evaluation/plots/clustering_evaluation/DistplotQuantScoresNormalisation'+'.html')

In [None]:
# CI scores split by experiment number, matched with a regex on the experiment name.
ci_scores = selected_clusters['CI score']
exp_names = selected_clusters['Experiment name']
y0 = ci_scores[exp_names.str.contains('exp1|exp2|exp3')]
y1 = ci_scores[exp_names.str.contains('exp4|exp5|exp6')]
y2 = ci_scores[exp_names.str.contains('exp7|exp8')]

# Group data together (order must match group_labels2)
hist_data2 = [y0, y1, y2]
group_labels2 = ['no pre-binning', 'AMC', 'integral kmeans']

fig2 = ff.create_distplot(
    hist_data2,
    group_labels2,
    histnorm='percent',
    bin_size=0.05,
    show_curve=False,
    show_rug=False,
    colors=['#393E46', '#2BCDC1', '#F66095'],
)
layout_opts2 = {
    'title': 'Distribution of Quantitative Scores across Pre-binning Algorithms',
    'titlefont': {'size': 16},
    'xaxis': {'title': 'CI score bins'},
    'yaxis': {'title': 'Percent'},
    'margin': {'t': 30, 'l': 30, 'b': 30},
    'height': 250,
    'width': 600,
}
fig2['layout'].update(**layout_opts2)

# Plot inline, then write the same figure to an HTML file under data_dir
po.iplot(fig2)
po.plot(fig2, filename=data_dir+'/cluster_evaluation/plots/clustering_evaluation/DistplotQuantScoresPrebinning'+'.html')

In [None]:
# CI scores split by clustering algorithm.
ci_scores = selected_clusters['CI score']
algorithm = selected_clusters.Algorithm
z0 = ci_scores[algorithm == 'kmeans']
z1 = ci_scores[algorithm == 'som']
z2 = ci_scores[algorithm == 'som+kmeans']

# Group data together (order must match group_labels3)
hist_data3 = [z0, z1, z2]
group_labels3 = ['kmeans', 'som', 'som+kmeans']

fig3 = ff.create_distplot(
    hist_data3,
    group_labels3,
    histnorm='percent',
    bin_size=0.05,
    show_curve=False,
    show_rug=False,
    colors=['#1E90FF','#DC143C', '#800080'],
)
layout_opts3 = {
    'title': 'Distribution of Quantitative Scores across Clustering Algorithms',
    'titlefont': {'size': 16},
    'xaxis': {'title': 'CI score bins'},
    'yaxis': {'title': 'Percent'},
    'margin': {'t': 30, 'l': 30, 'b': 30},
    'height': 250,
    'width': 600,
}
fig3['layout'].update(**layout_opts3)

# Plot inline, then write the same figure to an HTML file under data_dir
po.iplot(fig3)
po.plot(fig3, filename=data_dir+'/cluster_evaluation/plots/clustering_evaluation/DistplotQuantScoresClustering'+'.html')

### Analyse algorithm run times

In [None]:
# Mean CI score and mean run time per algorithm, restricted to unit_norm runs.
unit_norm_runs = selected_clusters.loc[selected_clusters.Norm == 'unit_norm']
runtimes = unit_norm_runs.groupby('Algorithm')[['CI score', 'Run time']].mean()
runtimes = runtimes.rename(columns={'Run time':'Mean run time (s)','CI score':'Mean CI score'})
runtimes.round(2)

In [None]:
def _mean_run_time(algorithm, group_col):
    """Return the mean 'Run time' of one algorithm's rows, grouped by group_col."""
    rows = selected_clusters.loc[selected_clusters.Algorithm == algorithm]
    return rows.groupby(group_col)['Run time'].mean()


kmeansruntimes = _mean_run_time('kmeans', 'Clusters')
# Computed for reference only; not used in the traces below.
somkmeansruntimes = _mean_run_time('som+kmeans', 'SOM dim')
somruntimes = _mean_run_time('som', 'SOM dim')

# The SOM x values are the squared 'SOM dim' values, plotted on the same axis
# as the k-means cluster counts.
som_trace = go.Scatter(x=somruntimes.index**2, y=somruntimes.values, name='som', mode='lines')
kmeans_trace = go.Scatter(x=kmeansruntimes.index, y=kmeansruntimes.values, name='k-means', mode='lines')
data = [som_trace, kmeans_trace]

layout = {
    'title': 'Run times for som and k-means algorithms',
    'titlefont': {'size': 18},
    'xaxis': {'title': 'number of SOM dimensions or clusters',
              'titlefont': {'size': 16}, 'tickfont': {'size': 16}},
    'yaxis': {'title': 'run time (s)', 'titlefont': {'size': 16}, 'tickfont': {'size': 16}},
    'margin': {'t': 30},
    'height': 350,
    'width': 600,
}
# Plot!
fig0 = go.Figure(data=data, layout=layout)
po.iplot(fig0)

### Visualise Centroids

#### Get denormalised (real) cluster centroids

In [None]:
# Load one centroid table per selected experiment, keyed by experiment name.
# Each CSV is indexed by its 'k' column.
centroid_dir = os.path.join(eval_dir, 'best_centroids')
real_cluster_centroids = {}

for exp_name in best_exp:
    csv_path = os.path.join(centroid_dir, exp_name +'BEST1_centroids.csv')
    real_cluster_centroids[exp_name] = pd.read_csv(csv_path, index_col='k')

In [None]:
# Position in best_exp of the experiment to summarise.
i = 6
ex = ec.exploreAMDBins(best_exp[i]).reset_index()
mapper = ec.mapBins(real_cluster_centroids[best_exp[i]])

# Join the bin statistics with the bin labels and order the rows by 'mean_dd'.
out = pd.merge(ex, mapper, on='elec_bin').sort_values(by='mean_dd')
out.rename(columns={'total_sample':'Members','score':'Ix','n_clust':'Clusters','bin_labels':'Mean daily demand bin'}, inplace=True)
out = out.round({'Ix':3})
o = out.reset_index().drop(columns=['som_dim','elec_bin','mean_dd','index'],axis=0)

# Named `pivoted` rather than `po`: `po` is this notebook's alias for
# plotly.offline, and rebinding it to a DataFrame breaks every later
# po.iplot(...) / po.plot(...) call.
pivoted = o.pivot(index=o.index, columns='experiment_name').swaplevel(axis=1)
pivoted.set_index((best_exp[i], 'Mean daily demand bin'), inplace=True)
pivoted.index.rename('Mean daily demand bin', inplace=True)
pivoted

In [None]:
# Plot the denormalised centroids of every selected experiment. Iterating over
# best_exp itself keeps the loop in step with the list instead of relying on a
# hard-coded length of 7.
for exp_name in best_exp:
    pc.plotClusterCentroids(real_cluster_centroids[exp_name])#, threshold=10490, groupby=None) 

### Visualise Centroid and Member Profiles

In [None]:
# Display the experiment names; the next cell picks one of them by position.
best_exp

In [None]:
# Position in best_exp of the experiment whose clusters are inspected below.
i = 3
centroids = ec.realCentroids(best_exp[i])
# Bar chart of the 'cluster_size' column. `kind` is passed by keyword because
# newer pandas versions no longer accept positional arguments to Series.plot.
centroids['cluster_size'].plot(kind='bar', figsize=(14,4))

In [None]:
# Index labels of the 15 rows with the largest 'cluster_size', in ascending label order.
largest_15 = centroids.nlargest(15, 'cluster_size')
clusters = largest_15.sort_index().index.values
clusters

In [None]:
# Plot member profiles for the experiment selected by `i`;
# largest=15 presumably limits the plot to the 15 largest clusters - TODO confirm.
pc.plotMembersSample(best_exp[i], largest=15)

## Explore Patterns in Cluster Labels

### Visualise TEMPORAL Cluster Specificity

In [None]:
# Temporal specificity plots for every selected experiment: daytype/weekday,
# season/monthly and yearly. Iterating over best_exp itself avoids the
# hard-coded length of 7.
# NOTE(review): `relative` presumably sets relative subplot sizes - confirm
# against pc.plotClusterSpecificity.
for exp_name in best_exp:
    pc.plotClusterSpecificity(exp_name, corr_list=['daytype','weekday'], threshold=10490, relative=[[5,1,1],1])
    pc.plotClusterSpecificity(exp_name, corr_list=['season','monthly'], threshold=10490, relative=[[8, 4],1])
    pc.plotClusterSpecificity(exp_name, corr_list=['yearly'], threshold=10490)

### Visualise CONTEXTUAL Cluster Specificity (Daily Demand Assignment)

In [None]:
experiment = 'exp8_kmeans_unit_norm'

corr_path = os.path.join(data_dir, 'cluster_evaluation', 'k_correlations')


def _load_total_corr(csv_name):
    """Read a correlation CSV and pull out this experiment's 'total' rows.

    Returns the de-duplicated frame, the same frame with its last two index
    levels moved into columns, and the rows of that frame where `experiment`
    is the selected experiment plus 'BEST1' and `compare` is 'total'
    (with those two columns dropped).
    """
    corr = pd.read_csv(os.path.join(corr_path, csv_name), index_col=[0,1,2], header=[0]).drop_duplicates()
    flat = corr.reset_index(level=[-2,-1])
    wanted = (flat.experiment == experiment+'BEST1') & (flat.compare == 'total')
    return corr, flat, flat[wanted].drop(['experiment','compare'], axis=1)


dif, dif_temp, int100_total = _load_total_corr('demandi_corr.csv')
dqf, dqf_temp, q100_total = _load_total_corr('demandq_corr.csv')

In [None]:
#Equally spaced daily demand intervals
# Long format with one row per (bin, cluster) pair. Named `int100_long` rather
# than `i`, because other cells use `i` as the integer position into best_exp
# and rebinding it to a DataFrame breaks them when re-run.
int100_long = int100_total.T.stack().reset_index()
int100_long.columns = ['int100_bins', 'cluster', 'values']

heatmap = go.Heatmap(z = int100_long['values'], x = int100_long['int100_bins'], y = int100_long['cluster'], 
                          colorscale='Reds')
layout = go.Layout(
        title= 'Relative likelihood that cluster k is used in particular consumption bin',
        # Ticks at every 10th bin position, labelled 0, 100, ..., 900.
        xaxis=dict(title = 'total daily demand bins (Amps)', 
                   tickmode='array', tickvals=list(range(0,100,10)), ticktext = list(range(0,1000,100))),
        yaxis=dict(title ='k clusters for '+experiment)
        )
fig = {'data':[heatmap], 'layout':layout }
po.iplot(fig)

#Equally sized daily demand intervals (quantiles)
# Transpose and skip the first row of the transposed frame.
rel_q100 = q100_total.T[1::]#.drop(columns=37)/0.01

slatered=['#232c2e', '#ffffe0','#c34513']
label_cmap, label_cs = pc.colorscale_from_list(slatered, 'label_cmap') 
# NOTE(review): ref_point=1/49 is presumably the likelihood expected if all
# clusters were equally likely - confirm where the 49 comes from.
colorscl= pc.asymmetric_colorscale(rel_q100, label_cmap, ref_point=1/49)

heatmap = go.Heatmap(z = rel_q100.T.values, x = rel_q100.index, y = rel_q100.columns, name = 'corr', 
                          colorscale=colorscl)
layout = go.Layout(
        title= 'Heatmap of relative likelihood of Cluster k being used in consumption quantile',
        xaxis=dict(title = 'total daily demand quantiles (Amps) - log scale', type='log'),
        yaxis=dict(title ='Cluster k'))
fig = {'data':[heatmap], 'layout':layout }
po.iplot(fig)

## Analyse Cluster Representativity and Homogeneity

In [None]:
# Evaluation measures for the selected experiments, computed with
# threshold=10490 and weighted=False; the call yields six results.
measures = ec.getMeasures(best_exp, threshold=10490, weighted=False)
(total_consE,
 peak_consE,
 peak_coincR,
 temporal_entropy,
 demand_entropy,
 good_clusters) = measures

### Consumption Error - total

In [None]:
# Subplots of the total consumption error measures returned by ec.getMeasures.
pc.subplotClusterMetrics(total_consE, 'TOTAL consumption error evaluation metrics')

### Consumption Error - max

In [None]:
# Subplots of the peak consumption error measures returned by ec.getMeasures.
pc.subplotClusterMetrics(peak_consE, 'PEAK consumption error evaluation metrics')

### Peak Coincidence Ratio

In [None]:
# Plot the 'coincidence_ratio' metric of the peak coincidence measures,
# with the helper's area-plot option switched on.
pc.plotClusterMetrics(peak_coincR, 'daily peak coincidence ratios', metric='coincidence_ratio', make_area_plot=True)

### Cluster Entropy - TEMPORAL
#### weekday, month

In [None]:
# Plot the 'weekday_entropy' metric of the temporal entropy measures.
pc.plotClusterMetrics(temporal_entropy, 'weekday cluster entropy', metric='weekday_entropy')#, make_area_plot=False )

In [None]:
# Plot the 'monthly_entropy' metric of the temporal entropy measures.
pc.plotClusterMetrics(temporal_entropy, 'monthly cluster entropy', metric='monthly_entropy')

### Cluster Entropy - ENERGY DEMAND
#### total daily demand, max daily demand

In [None]:
# Plot the 'total_entropy' metric of the demand entropy measures.
pc.plotClusterMetrics(demand_entropy, 'total demand cluster entropy', metric='total_entropy')

In [None]:
# Plot the 'peak_entropy' metric of the demand entropy measures.
pc.plotClusterMetrics(demand_entropy, 'peak demand cluster entropy', metric='peak_entropy')

## Cluster Scoring Matrix

In [None]:
# Save the weighted measures, then read cluster_entropy.csv from eval_dir
# (presumably written by the saveMeasures call - TODO confirm).
ec.saveMeasures(best_exp, 10490, weighted=True)
entropy_csv = os.path.join(eval_dir, 'cluster_entropy.csv')
data = pd.read_csv(entropy_csv, index_col=[0,1], header=[0,1,2])

# Keep only the second index level, and turn the first two underscores of
# each row label into spaces.
data = data.reset_index(level=0, drop=True)
data.rename(index=lambda label: label.replace('_', ' ', 2), inplace=True)

# Every column except the last one.
df = data.iloc[:, :-1]

### Unweighted Mean Peak Coincidence Ratio

In [None]:
# Unweighted mean of 'coincidence_ratio' per experiment. Experiment names get
# their first two underscores replaced by spaces.
# The rows are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and re-copied the whole frame on every iteration.
#set threshold value to same as data - 10490
mean_pcr_rows = [
    {'experiment': exp_name.replace('_',' ', 2),
     'mean peak coincidence ratio': exp_measures['coincidence_ratio'].mean()}
    for exp_name, exp_measures in peak_coincR.items()
]
# Explicit columns keep the frame mergeable on 'experiment' even if
# peak_coincR is empty.
myd = pd.DataFrame(mean_pcr_rows, columns=['experiment', 'mean peak coincidence ratio'])
#myd = myd.set_index('experiment')

In [None]:
# Select the columns whose third header level is 'coincidence_ratio'.
rrr = df.loc(axis=1)[:,:,'coincidence_ratio']
# Drop the first two header levels, leaving a flat 'coincidence_ratio' column.
rrr.columns = rrr.columns.droplevel().droplevel()
# Move the row labels into an 'index' column so they can be merged on.
rrr.reset_index(inplace=True)
# Join the unweighted means (myd) with the values read from the scoring matrix.
pcr = pd.merge(myd, rrr, left_on='experiment', right_on='index')
# NOTE(review): 'Weighted pcr' presumably reflects the weighted=True measures
# saved before df was read - confirm.
pcr.rename(columns={'mean peak coincidence ratio':'Mean pcr','coincidence_ratio':'Weighted pcr',
                    'experiment':'Experiment'},inplace=True)
pcr.set_index('Experiment',inplace=True)
pcr.drop(columns=['index'],inplace=True)
pcr.round(3).sort_index()

### Ranked Scores

In [None]:
# Descending ranks (largest value gets rank 1, ties share the lowest rank),
# averaged over columns that share the same 'measure' and 'metric' header
# levels, then transposed.
rank_coincR = df[['coincR']].rank(ascending=False, method='min').groupby(level=['measure','metric'],axis=1).mean().T
rank_clusters = df[['clusters']].rank(ascending=False, method='min').groupby(level=['measure','metric'],axis=1).mean().T

# Ascending ranks (smallest value gets rank 1), averaged per 'measure' only.
rank_consE = df[['consE']].rank(method='min').groupby(level=['measure'],axis=1).mean().T
# Add a constant 'metric' level so the index has the same levels as the
# tables above.
rank_consE.insert(loc=0, column='metric', value='mean_error')
rank_consE.set_index('metric',append=True,inplace=True)

# Ascending ranks of the 'entropy' columns, transposed.
rank_entropy = df['entropy'].rank(method='min').T

In [None]:
# Ascending ranks of every 'consE' column, transposed so that the row labels
# of df become the columns.
conse = df[['consE']].rank(method='min').T
# NOTE(review): this rename map is the one applied to selected_clusters; after
# the transpose the columns here are df's row labels, so it may match nothing
# and be a no-op - confirm or remove.
conse.rename(columns={'experiment':'Experiment','algorithm':'Algorithm','preprocessing':'Norm', 
                                  'SOM dimensions':'SOM dim','clusters':'Clusters','dbi':'DBI', 'mia':'MIA', 
                                  'silhouette':'Silhouette','score':'CI score','run time':'Run time',
                                  'experiment_name':'Experiment name'}, inplace=True)



In [None]:
# Stack the four rank tables row-wise.
# NOTE(review): `levels` appears to have no effect when `keys` is not given -
# confirm whether `names` was intended.
ranked_results = pd.concat([rank_clusters, rank_coincR, rank_consE, rank_entropy], levels=['measure','metric'])
# One weight per row; the list has 8 entries, so the stacked table must have
# exactly 8 rows.
ranked_results.insert(loc=0, column='weights', value= [2, 3, 6 ,6, 5, 5, 4, 4])#, 2])

# Weighted sum of ranks per experiment column (all columns after 'weights').
score_results = ranked_results.loc[:,ranked_results.columns[1::]].multiply(ranked_results['weights'], axis='index').sum()
score = pd.DataFrame(score_results, columns=['score']).T
score.index = pd.MultiIndex.from_tuples([('', '', 'SCORE')])

# Move the weights into the index, then add the SCORE row at the bottom.
ranked_results.set_index('weights',append=True,inplace=True)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
score_results = pd.concat([ranked_results, score])

In [None]:
#only run this cell if you want information about additional parameters for experiments
# Each column label is split on spaces and extended with a pre-binning entry
# and a drop-zeros entry, taken by position from the two lists below.
algs = [col.split(' ') for col in score_results.columns]
preb = ['','AMC','AMC','AMC','AMC','integral k-means','integral k-means']
dropz = ['','','','','True','','True']
multic = [algs[a] + [preb[a], dropz[a]] for a in range(len(algs))]
column_levels = ['Experiment', 'Algorithm','Normalisation','Pre-binning','Drop Zeros']
score_results.columns = pd.MultiIndex.from_tuples(multic, names=column_levels)

In [None]:
# Name the third index level 'weight' and display the scoring matrix.
score_results.index.set_names('weight', level=2, inplace=True)
score_results

## Archetypes

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [33, 39, 40, 41, 44, 45, 46, 47, 48, 49, 50, 51]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='Mpumalanga Rural Newly Electrified',
    threshold=10490,
)

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [39, 44, 45, 46, 49, 50]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='Mpumalanga Informal Settlement Newly Electrified',
    threshold=10490,
)

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [39, 45, 46, 49, 50, 53]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='Eastern Cape Informal Settlement Newly Electrified',
    threshold=10490,
)

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [9, 11, 44]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='Limpopo Informal Settlement Medium-term Electrified',
    threshold=10490,
)

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [3, 4, 6, 7, 24]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='Gauteng Township Longterm Electrified',
    threshold=10490,
)

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [1, 3, 4, 5, 35, 36, 38]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='KwaZulu Natal Lower Middle Class Long-term Electrified',
    threshold=10490,
)

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [2, 4, 35, 36, 38, 57]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='KwaZulu Natal Upper Middle Class Long-term Electrified',
    threshold=10490,
)

In [None]:
# Centroids of the exp8 clusters selected for this archetype.
archetype_clusters = [6, 7, 37, 54, 57]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(
    archetype_centroids,
    groupby=None,
    title='Western Cape Upper Middle Class Medium-term Electrified',
    threshold=10490,
)