# Daily Load Profile Timeseries Clustering Evaluation

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os
from math import ceil

import plotly.plotly as py
import plotly.offline as po
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tools
import colorlover as cl
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()

import matplotlib. pyplot as plt
from matplotlib import colors
from matplotlib.colors import LinearSegmentedColormap

import evaluation.eval_clusters as ec
import evaluation.eval_cluster_plot as pc
from support import data_dir, image_dir
# Folder with the cluster evaluation files (sub-folder of the project data directory).
eval_dir = os.path.join(data_dir,'cluster_evaluation')

In [None]:
# Load the experiments to evaluate. 'exp' is the argument handed to getExperiments
# (presumably a name prefix/filter -- confirm in evaluation.eval_clusters).
experiments = ec.getExperiments('exp')

In [None]:
# Shortlist of experiment runs examined in detail in the cells below.
# Several cells select an entry by its position in this list.
best_exp = [
    'exp2_kmeans_unit_norm',
    'exp4_kmeans_zero-one',
    'exp5_kmeans_unit_norm',
    'exp5_kmeans_zero-one',
    'exp6_kmeans_unit_norm',
    'exp7_kmeans_unit_norm',
    'exp8_kmeans_unit_norm',
]

## Analyse Cluster Scores

### Davies-Bouldin Index

In [None]:
# Plot the Davies-Bouldin Index ('dbi') for the loaded experiments, passing groupby='algorithm'.
pc.plotClusterIndex('dbi', 'Davies-Bouldin Index', experiments, groupby='algorithm')

### Mean Index Adequacy

In [None]:
# Plot the Mean Index Adequacy ('mia'); no groupby is passed, so the helper's default is used.
pc.plotClusterIndex('mia','Mean Index Adequacy', experiments)

### Silhouette Score

The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

In [None]:
# Plot the Silhouette Score for the loaded experiments, passing groupby='experiment'.
pc.plotClusterIndex('silhouette', 'Silhouette Score', experiments, groupby='experiment')

### Combined Cluster Score

In [None]:
# Plot the combined score with groupby='algorithm'; ylog=True presumably requests a
# logarithmic y-axis -- confirm in evaluation.eval_cluster_plot.
pc.plotClusterIndex('score','Combined Cluster Score', experiments, groupby='algorithm', ylog=True)

In [None]:
# Same combined score plot, this time with groupby='experiment'.
pc.plotClusterIndex('score','Combined Cluster Score', experiments, groupby='experiment', ylog=True)

## Explore Cluster Labels, Centroids and Sizes

### Select best clusters for different algorithms

In [None]:
# Load every clustering result and run the selection over all of them: the second
# argument equals the number of results, so presumably nothing is cut off here
# (confirm against ec.selectClusters).
cluster_results = ec.readResults()
selected_clusters = ec.selectClusters(cluster_results, len(cluster_results))

# Alternative kept for reference (unused): select per experiment with
# ec.selectClusters(cluster_results, 5, experiment=e), or the overall best with
# ec.selectClusters(cluster_results, 10).

# Preview the first ten rows with a 1-based index. The index is assigned directly
# instead of via set_axis(..., inplace=False): the `inplace` argument does not
# exist in every pandas version, and sizing the labels from the preview itself
# also works when fewer than ten rows are available.
top_clusters = selected_clusters.head(10).copy()
top_clusters.index = range(1, len(top_clusters) + 1)
top_clusters

In [None]:
# Scores of the selected clusters, split by the pre-processing (normalisation)
# value. Rows with a missing pre_processing value form the 'no norm' group.
x0 = selected_clusters[selected_clusters.pre_processing.isna()]['score']
x1 = selected_clusters[selected_clusters.pre_processing == 'unit_norm']['score']
x2 = selected_clusters[selected_clusters.pre_processing == 'demin']['score']
x3 = selected_clusters[selected_clusters.pre_processing == 'zero-one']['score']
x4 = selected_clusters[selected_clusters.pre_processing == 'sa_norm']['score']

# Group data together; labels are listed in the same order as the series
hist_data = [x0, x1, x2, x3, x4]
group_labels = ['no norm', 'unit norm', 'demin', 'zero-one', 'SA norm']

# Create distplot with custom bin_size (histogram only: no curve, no rug)
fig = ff.create_distplot(hist_data, group_labels, histnorm='percent', bin_size=1,
                         show_curve=False, show_rug=False)
fig['layout'].update(title='Distribution of Quantitative Scores across Normalisation Algorithms',
                     xaxis=dict(title='bins for score range 0-100', range=[0, 100]),
                     yaxis=dict(title='Percent'),
                     margin=dict(t=30),
                     height=250, width=600)

# Show inline, then save a standalone HTML copy. The output path is assembled
# with os.path.join instead of string concatenation with hard-coded separators.
po.iplot(fig)
plot_dir = os.path.join(data_dir, 'cluster_evaluation', 'plots', 'clustering_evaluation')
po.plot(fig, filename=os.path.join(plot_dir, 'DistplotQuantScoresNormalisation.html'))

In [None]:
# Scores split by experiment group. str.contains treats the pattern as a regular
# expression, so 'exp1|exp2|exp3' matches any name containing one of the three
# substrings. NOTE(review): 'exp1' would also match e.g. 'exp10' if such
# experiments exist.
y0 = selected_clusters[selected_clusters.experiment_name.str.contains('exp1|exp2|exp3')]['score']
y1 = selected_clusters[selected_clusters.experiment_name.str.contains('exp4|exp5|exp6')]['score']
y2 = selected_clusters[selected_clusters.experiment_name.str.contains('exp7|exp8')]['score']

# Group data together; labels are listed in the same order as the series
hist_data2 = [y0, y1, y2]
group_labels2 = ['no pre-binning', 'AMC', 'integral kmeans']

# Histogram only (no curve, no rug) with one colour per group
fig2 = ff.create_distplot(hist_data2, group_labels2, histnorm='percent', bin_size=1,
                          show_curve=False, show_rug=False, colors=['#393E46', '#2BCDC1', '#F66095'])
fig2['layout'].update(title='Distribution of Quantitative Scores across Pre-binning Algorithms',
                      xaxis=dict(title='bins for score range 0-100', range=[0, 100]),
                      yaxis=dict(title='Percent'),
                      margin=dict(t=30),
                      height=250, width=600)

# Show inline, then save a standalone HTML copy. The output path is assembled
# with os.path.join instead of string concatenation with hard-coded separators.
po.iplot(fig2)
plot_dir = os.path.join(data_dir, 'cluster_evaluation', 'plots', 'clustering_evaluation')
po.plot(fig2, filename=os.path.join(plot_dir, 'DistplotQuantScoresPrebinning.html'))

In [None]:
# Scores split by the clustering algorithm that produced them.
z0 = selected_clusters[selected_clusters.algorithm == 'kmeans']['score']
z1 = selected_clusters[selected_clusters.algorithm == 'som']['score']
z2 = selected_clusters[selected_clusters.algorithm == 'som+kmeans']['score']

# Group data together; labels are listed in the same order as the series
hist_data3 = [z0, z1, z2]
group_labels3 = ['kmeans', 'som', 'som+kmeans']

# Histogram only (no curve, no rug) with one colour per algorithm
fig3 = ff.create_distplot(hist_data3, group_labels3, histnorm='percent', bin_size=1,
                          show_curve=False, show_rug=False, colors=['#1E90FF', '#DC143C', '#800080'])
fig3['layout'].update(title='Distribution of Quantitative Scores across Clustering Algorithms',
                      xaxis=dict(title='bins for score range 0-100', range=[0, 100]),
                      yaxis=dict(title='Percent'),
                      margin=dict(t=30),
                      height=250, width=600)

# Show inline, then save a standalone HTML copy. The output path is assembled
# with os.path.join instead of string concatenation with hard-coded separators.
po.iplot(fig3)
plot_dir = os.path.join(data_dir, 'cluster_evaluation', 'plots', 'clustering_evaluation')
po.plot(fig3, filename=os.path.join(plot_dir, 'DistplotQuantScoresClustering.html'))

### Visualise Centroids

#### Get denormalised (real) cluster centroids

In [None]:
# Read the saved '<experiment>BEST1_centroids.csv' file of every shortlisted
# experiment into a dict keyed by experiment name; column 'k' becomes the index.
real_cluster_centroids = {
    e: pd.read_csv(os.path.join(eval_dir, 'best_centroids', e + 'BEST1_centroids.csv'),
                   index_col='k')
    for e in best_exp
}

In [None]:
# Position in best_exp of the experiment to inspect.
i = 6

# Join the experiment's bin exploration table with the bin mapping derived from
# its centroids on 'elec_bin', ordered by 'mean_dd'.
ex = ec.exploreAMDBins(best_exp[i]).reset_index()
mapper = ec.mapBins(real_cluster_centroids[best_exp[i]])
out = pd.merge(ex, mapper, on='elec_bin').sort_values(by='mean_dd')

# Display view: drop the helper columns and index by experiment and bin label.
# `columns=` alone selects what to drop; the former extra `axis=0` contradicted
# it and was ignored by pandas.
bin_overview = out.reset_index().drop(columns=['som_dim', 'elec_bin', 'mean_dd', 'index'])
bin_overview.set_index(['experiment_name', 'bin_labels'])

In [None]:
# Plot the centroids of every shortlisted experiment. Iterating over best_exp
# directly keeps the loop in step with the list instead of a hard-coded count.
for exp_name in best_exp:
    pc.plotClusterCentroids(real_cluster_centroids[exp_name])  # optional: threshold=10490, groupby=None

### Visualise Centroid and Member Profiles

In [None]:
# Show the shortlisted experiments; the following cells pick one by list position `i`.
best_exp

In [None]:
# Position in best_exp of the experiment whose clusters are inspected below.
i = 3
centroids = ec.realCentroids(best_exp[i])

# Bar chart of the 'cluster_size' column. `kind` is passed by keyword because
# Series.plot rejects positional arguments in current pandas versions.
centroids['cluster_size'].plot(kind='bar', figsize=(14, 4))

In [None]:
# Index labels of the 15 rows with the largest 'cluster_size', in ascending label order.
largest_clusters = centroids.nlargest(15, 'cluster_size')
clusters = largest_clusters.sort_index().index.values
clusters

In [None]:
# Plot a sample of member profiles for the experiment selected via `i`; largest=15
# presumably restricts the plot to the 15 largest clusters -- confirm in pc.plotMembersSample.
pc.plotMembersSample(best_exp[i], largest=15)

## Explore Patterns in Cluster Labels

### Visualise TEMPORAL Cluster Specificity

In [None]:
# For every shortlisted experiment draw three specificity plots: day type/weekday,
# season/month and year. Iterating over best_exp directly keeps the loop in step
# with the list instead of a hard-coded count.
for exp_name in best_exp:
    pc.plotClusterSpecificity(exp_name, corr_list=['daytype', 'weekday'], threshold=10490,
                              relative=[[5, 1, 1], 1])
    pc.plotClusterSpecificity(exp_name, corr_list=['season', 'monthly'], threshold=10490,
                              relative=[[8, 4], 1])
    pc.plotClusterSpecificity(exp_name, corr_list=['yearly'], threshold=10490)

### Visualise CONTEXTUAL Cluster Specificity (Daily Demand Assignment)

In [None]:
experiment = 'exp8_kmeans_unit_norm'

corr_path = os.path.join(data_dir, 'cluster_evaluation', 'k_correlations')


def _load_total_corr(csv_name, experiment_name, folder):
    """Return the 'total' correlation rows of one experiment's BEST1 run.

    Reads ``csv_name`` from ``folder`` with its first three columns as index,
    removes duplicate rows, moves the last two index levels back into columns
    and keeps the rows whose 'experiment' equals ``experiment_name + 'BEST1'``
    and whose 'compare' equals 'total'. Both filter columns are dropped from
    the result.
    """
    corr = pd.read_csv(os.path.join(folder, csv_name), index_col=[0, 1, 2], header=[0]).drop_duplicates()
    corr = corr.reset_index(level=[-2, -1])
    # Bracket access is required for 'compare': attribute access would return the
    # DataFrame.compare method on newer pandas and the filter would match nothing.
    is_best_run = corr['experiment'] == experiment_name + 'BEST1'
    is_total = corr['compare'] == 'total'
    return corr[is_best_run & is_total].drop(['experiment', 'compare'], axis=1)


# 'demandi' is plotted below as equally spaced demand intervals,
# 'demandq' as demand quantiles.
int100_total = _load_total_corr('demandi_corr.csv', experiment, corr_path)
q100_total = _load_total_corr('demandq_corr.csv', experiment, corr_path)

In [None]:
#Equally spaced daily demand intervals
# Equally spaced daily demand intervals
# Reshape to long format, one row per (bin, cluster) pair. The frame gets its own
# name rather than `i`, which other cells use as the integer position into best_exp.
int100_long = int100_total.T.stack().reset_index()
int100_long.columns = ['int100_bins', 'cluster', 'values']

heatmap = go.Heatmap(z=int100_long['values'], x=int100_long['int100_bins'], y=int100_long['cluster'],
                     colorscale='Reds')
layout = go.Layout(
    title='Relative likelihood that cluster k is used in particular consumption bin',
    xaxis=dict(title='total daily demand bins (Amps)',
               tickmode='array', tickvals=list(range(0, 100, 10)), ticktext=list(range(0, 1000, 100))),
    yaxis=dict(title='k clusters for ' + experiment)
)
fig = {'data': [heatmap], 'layout': layout}
po.iplot(fig)

# Equally sized daily demand intervals (quantiles)
# Transposed so the quantile bins are rows; the first row is skipped.
rel_q100 = q100_total.T[1::]

# Three-colour scale, made asymmetric around ref_point.
# NOTE(review): 1/49 looks like a uniform share over 49 clusters -- confirm.
slatered = ['#232c2e', '#ffffe0', '#c34513']
label_cmap, label_cs = pc.colorscale_from_list(slatered, 'label_cmap')
colorscl = pc.asymmetric_colorscale(rel_q100, label_cmap, ref_point=1/49)

heatmap = go.Heatmap(z=rel_q100.T.values, x=rel_q100.index, y=rel_q100.columns, name='corr',
                     colorscale=colorscl)
layout = go.Layout(
    title='Heatmap of relative likelihood of Cluster k being used in consumption quantile',
    xaxis=dict(title='total daily demand quantiles (Amps) - log scale', type='log'),
    yaxis=dict(title='Cluster k'))
fig = {'data': [heatmap], 'layout': layout}
po.iplot(fig)

## Analyse Cluster Representativity and Homogeneity

In [None]:
# Fetch the evaluation measures for the shortlisted experiments in one call and unpack the
# six returned objects. 10490 is the same value passed as `threshold` to the plotting
# helpers in this notebook (its meaning is defined in ec.getMeasures -- TODO confirm).
total_consE, peak_consE, peak_coincR, temporal_entropy, demand_entropy, good_clusters = ec.getMeasures(best_exp, 10490)

### Consumption Error - total

In [None]:
# Subplots of the total consumption error measures returned by ec.getMeasures.
pc.subplotClusterMetrics(total_consE, 'TOTAL consumption error evaluation metrics')

### Consumption Error - peak (max)

In [None]:
# Subplots of the peak consumption error measures returned by ec.getMeasures.
pc.subplotClusterMetrics(peak_consE, 'PEAK consumption error evaluation metrics')

### Peak Coincidence Ratio

In [None]:
# Plot the 'coincidence_ratio' metric of the peak coincidence measures; make_area_plot=True
# is passed to the helper (presumably switches to an area chart -- confirm in pc.plotClusterMetrics).
pc.plotClusterMetrics(peak_coincR, 'daily peak coincidence ratios', metric='coincidence_ratio', make_area_plot=True)

### Cluster Entropy - TEMPORAL
#### weekday, month

In [None]:
# Plot the 'weekday_entropy' metric of the temporal entropy measures.
pc.plotClusterMetrics(temporal_entropy, 'weekday cluster entropy', metric='weekday_entropy')#, make_area_plot=False )

In [None]:
# Plot the 'monthly_entropy' metric of the temporal entropy measures.
pc.plotClusterMetrics(temporal_entropy, 'monthly cluster entropy', metric='monthly_entropy')#

### Cluster Entropy - ENERGY DEMAND
#### total daily demand, max daily demand

In [None]:
# Plot the 'total_entropy' metric of the demand entropy measures.
pc.plotClusterMetrics(demand_entropy, 'total demand cluster entropy', metric='total_entropy')

In [None]:
# Plot the 'peak_entropy' metric of the demand entropy measures.
pc.plotClusterMetrics(demand_entropy, 'peak demand cluster entropy', metric='peak_entropy')

## Cluster Scoring Matrix

In [None]:
# Persist the measures for the shortlisted experiments, then read back
# 'cluster_entropy.csv' from the evaluation folder (presumably written by
# saveMeasures -- confirm). The file has a two-level row index and a
# three-level column header.
ec.saveMeasures(best_exp, 10490)
entropy_csv = os.path.join(eval_dir, 'cluster_entropy.csv')
data = pd.read_csv(entropy_csv, index_col=[0, 1], header=[0, 1, 2])

# Keep only the second index level and turn the first two underscores of each
# row label into spaces.
data = data.reset_index(level=0, drop=True)
data = data.rename(index=lambda label: label.replace('_', ' ', 2))

In [None]:
# Keep every column except the last one for the ranking below.
df = data.iloc[:,:-1]

In [None]:
# Rank the experiments (rows of df) within every column, then average the ranks of
# columns that share the same header levels. The frame is transposed before grouping,
# which gives the same result as grouping along axis=1 and transposing afterwards
# but does not rely on the deprecated `axis=1` argument of groupby.

# 'coincR' and 'clusters' are ranked in descending order (largest value -> rank 1).
rank_coincR = df[['coincR']].rank(ascending=False).T.groupby(level=['measure', 'metric']).mean()
rank_clusters = df[['clusters']].rank(ascending=False).T.groupby(level=['measure', 'metric']).mean()

# 'consE' is ranked in ascending order and averaged per measure only; a constant
# 'metric' level is added so the index lines up with the other rank tables.
rank_consE = df[['consE']].rank().T.groupby(level=['measure']).mean()
rank_consE.insert(loc=0, column='metric', value='mean_error')
rank_consE.set_index('metric', append=True, inplace=True)

# 'entropy' columns are ranked in ascending order and kept individually.
rank_entropy = df['entropy'].rank().T

In [None]:
# Inspect the un-averaged ranks of the 'consE' columns (transposed for display).
df[['consE']].rank().T

In [None]:
# Stack the four rank tables into one matrix (rows: measures, columns: experiments).
# NOTE(review): `levels` is passed without `keys`; confirm whether `names` was intended.
ranked_results = pd.concat([rank_clusters, rank_coincR, rank_consE, rank_entropy], levels=['measure','metric'])

# One weight per row of the stacked table; the list length must equal the row count.
ranked_results.insert(loc=0, column='weights', value=[2, 3, 6, 6, 5, 5, 4, 4])

# Weighted sum of the ranks per experiment (every column after 'weights').
experiment_columns = ranked_results.columns[1:]
score_results = ranked_results.loc[:, experiment_columns].multiply(ranked_results['weights'], axis='index').sum()
score = pd.DataFrame(score_results, columns=['score']).T
score.index = pd.MultiIndex.from_tuples([('', '', 'SCORE')])

# Move the weights into the row index and add the SCORE row at the bottom.
# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide.
ranked_results.set_index('weights', append=True, inplace=True)
score_results = pd.concat([ranked_results, score])

In [None]:
# Display the ranked results together with the SCORE row.
score_results

## Archetypes

In [None]:
# Centroids of the clusters plotted for the 'Rural Free State' archetype.
archetype_clusters = [40, 41, 43, 44, 45, 48, 49, 50, 53]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(archetype_centroids, groupby=None,
                        title='Customer Archetype: Rural Free State', threshold=10490)

In [None]:
# Centroids of the clusters plotted for the 'Informal Settlement Newly Electrified' archetype.
archetype_clusters = [34, 40, 43, 44, 48, 49, 50, 53]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(archetype_centroids, groupby=None,
                        title='Customer Archetype: Informal Settlement Newly Electrified', threshold=10490)

In [None]:
# Centroids of the clusters plotted for the 'Township KZN Longterm Electrified' archetype.
archetype_clusters = [3, 4, 23, 25, 27, 28, 29]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(archetype_centroids, groupby=None,
                        title='Customer Archetype: Township KZN Longterm Electrified', threshold=10490)

In [None]:
# Centroids of the clusters plotted for the 'Upper Middle Class Longterm Electrified' archetype.
archetype_clusters = [2, 4, 6, 7, 31, 36, 37, 38]
archetype_centroids = real_cluster_centroids['exp8_kmeans_unit_norm'].loc[archetype_clusters]
pc.plotClusterCentroids(archetype_centroids, groupby=None,
                        title='Customer Archetype: Upper Middle Class Longterm Electrified', threshold=10490)