In [None]:
# Mount Google Drive into the Colab filesystem; every pickle read or written
# by this notebook lives under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from scipy.ndimage.filters import gaussian_filter1d

#model
from sklearn.cluster import AgglomerativeClustering

#visualization
from scipy.cluster.hierarchy import dendrogram
# packages for cluster determination 
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted agglomerative clustering model.

    SciPy's ``dendrogram`` expects a linkage matrix with one row per merge:
    ``[child_a, child_b, merge_distance, samples_in_merged_node]``. The first
    three values are read from ``model.children_`` and ``model.distances_``;
    the sample counts are computed here.

    Parameters
    ----------
    model : fitted clustering estimator
        Must expose ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples below each merged node. A child id smaller
    # than n_leaves is a single sample; any other id refers to the node created
    # by merge number (id - n_leaves), whose size has already been computed.
    node_sizes = np.zeros(merges.shape[0])
    for row, children in enumerate(merges):
        node_sizes[row] = sum(
            1 if child < n_leaves else node_sizes[child - n_leaves]
            for child in children
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, node_sizes]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


# 1. Clustering on average baseline consumption

In [None]:
# Read the pickled consumption matrix for the 'SD' dataset from Drive.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
consumption = pd.read_pickle('/content/drive/MyDrive/Stanford-TUBerlin/CodePaper/Consumption_Matrices/{}_accounts_actual_usage'.format('SD'))

# Baseline window: every second month start from Jan 2002 through Nov 2007.
# Selecting by these labels requires each timestamp to be a column of
# `consumption`.
time_base = pd.date_range(start='1/1/2002',  end='11/1/2007', freq='2MS')
consumption_base = consumption[time_base]

# One value per row of `consumption`: its mean over the baseline window.
consumption_avg = consumption_base.mean(axis=1).to_frame(name='avg_consumption')

In [None]:
# Ward-linkage agglomerative clustering into four groups, based solely on the
# average baseline consumption.
model = AgglomerativeClustering( n_clusters=4, linkage='ward')
# Fit on the consumption column explicitly: a later cell adds a 'cluster'
# column to consumption_avg, and re-running this cell must not feed those
# labels back in as a second feature.
model = model.fit(consumption_avg[['avg_consumption']])

# Cluster assignments, indexed like `consumption`.
clusters = pd.DataFrame(index=consumption.index)
clusters['avg_consump'] = model.labels_
# Placeholder for the pattern cluster assigned further down. It starts as
# float NaN, so labels written into it later are stored as floats.
clusters['pattern']=  np.nan

In [None]:
# Attach the baseline-consumption cluster label to each row (aligned on index).
consumption_avg = consumption_avg.assign(cluster=clusters['avg_consump'])

In [None]:
# Mean baseline consumption per cluster. The label values are arbitrary ids:
# in the recorded output they are not ordered by consumption level.
consumption_avg.groupby('cluster').agg('mean')

Unnamed: 0_level_0,avg_consumption
cluster,Unnamed: 1_level_1
0,117.335324
1,321.297422
2,199.469235
3,634.901359


# 2. Clustering on pattern

In [None]:
# Load the pickled residual patterns for the 'SD' dataset from Drive.
# NOTE(review): later cells call pattern.loc[<index>].pattern.apply(pd.Series),
# so this is presumably a DataFrame that shares its index with `clusters` and
# has a 'pattern' column holding one sequence per row - confirm.
pattern = pd.read_pickle('/content/drive/MyDrive/Stanford-TUBerlin/CodePaper/Residuals/pattern_SD')

In [None]:
# Silhouette analysis: for every baseline-consumption group, cluster the
# patterns with complete linkage for a range of K and plot the average
# silhouette coefficient against K.
cluster_groups = clusters.groupby('avg_consump')

for name, group in cluster_groups:
  # Expand the 'pattern' entries of this group into one column per element.
  pattern_group = pattern.loc[group.index].pattern.apply(pd.Series)

  # silhouette_score is only defined for 2 <= K <= n_samples - 1, so cap the
  # upper end for small groups (K runs from 2 to 9 whenever the group has at
  # least 10 rows).
  range_n_clusters = range(2, min(10, len(pattern_group)))
  if len(range_n_clusters) == 0:
    print('Group {}: only {} sample(s), skipping silhouette analysis'.format(
        name, len(pattern_group)))
    continue

  silhouettes = []
  for n_clusters in range_n_clusters:
    # Agglomerative clustering is deterministic, so no random seed is needed.
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    cluster_labels = clusterer.fit_predict(pattern_group)
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(pattern_group, cluster_labels)
    silhouettes.append(silhouette_avg)

  fig, ax = plt.subplots(1, figsize=(10,7))
  ax.plot(range_n_clusters, silhouettes, 'o-')
  ax.set_ylabel('Average silhouette coefficient')
  ax.set_xlabel('K')
  ax.grid()
  fig.suptitle('Silhouette coefficient for group {}'.format(name))
  plt.show()

In [None]:
# Dendrograms: one complete-linkage merge tree of the patterns per
# baseline-consumption group.
cluster_groups = clusters.groupby('avg_consump')

for name, group in cluster_groups:
  # Expand the 'pattern' entries of this group into one column per element.
  pattern_group = pattern.loc[group.index].pattern.apply(pd.Series)

  fig, ax = plt.subplots(1, figsize=(10,7))
  # distance_threshold=0 with n_clusters=None builds the full merge tree, so
  # the fitted model exposes the distances_ that plot_dendrogram reads.
  model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='complete')
  model = model.fit(pattern_group)
  # Draw on this figure's axes explicitly instead of relying on the implicit
  # current axes; the tree is truncated to p=3 levels.
  plot_dendrogram(model, truncate_mode='level', p=3, ax=ax)
  # The height of a dendrogram node is the linkage distance at which the two
  # children merge, i.e. a dissimilarity, not a similarity.
  ax.set_ylabel('distance')
  fig.suptitle('Dendrogram for group {}'.format(name))
  plt.show()

In [None]:
# Every second month start from Jan 2008 through Jul 2020.
# Not referenced by any cell visible in this section - presumably the time
# axis for plots elsewhere in the notebook.
xtimeline = pd.date_range(start='1/1/2008',  end='7/1/2020', freq='2MS')

In [None]:
# Final pattern clustering: within every baseline-consumption group, cluster
# the patterns with complete linkage and store the label in clusters['pattern'].
cluster_groups = clusters.groupby('avg_consump')
# Number of pattern clusters per baseline group, listed in the order the
# groupby yields the groups (ascending 'avg_consump' label). Presumably chosen
# from the silhouette and dendrogram plots - confirm.
n_clusters = [4, 3, 5, 2]
# Fail before any label is written if the list does not match the groups;
# otherwise extra entries would be ignored silently and missing ones would
# raise an IndexError halfway through the loop.
if len(n_clusters) != len(cluster_groups):
  raise ValueError(
      'n_clusters has {} entries but there are {} baseline groups'.format(
          len(n_clusters), len(cluster_groups)))
for i, (name, group) in enumerate(cluster_groups):
  # Expand the 'pattern' entries of this group into one column per element.
  pattern_group = pattern.loc[group.index].pattern.apply(pd.Series)
  model = AgglomerativeClustering( n_clusters=n_clusters[i], linkage='complete')
  labels = model.fit_predict(pattern_group)
  # Pattern labels restart at 0 in every baseline group, so a 'pattern' value
  # only identifies a cluster together with its 'avg_consump' value.
  clusters.loc[group.index, 'pattern'] = labels.astype(int)




In [None]:
# Persist the cluster assignments (columns 'avg_consump' and 'pattern') to
# Drive as a pickle.
clusters.to_pickle('/content/drive/MyDrive/Stanford-TUBerlin/CodePaper/Clustering/SD_clusters')