<a href="https://colab.research.google.com/github/Sankarlalr23/Automotive_Telematics_Analytics/blob/main/Hierarchial_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Applying Hierarchical Clustering to Time Series



Source article this notebook follows: https://towardsdatascience.com/how-to-apply-hierarchical-clustering-to-time-series-a5fe2a7d8447

In [None]:
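# Install sktime first if it is not already available (e.g. on Colab):
# %pip install sktime
# NOTE(review): the `sktime.distances.elastic_cython` import used below may not
# exist in newer sktime releases - confirm and pin a compatible version.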

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [None]:
# Load the Italy Power Demand dataset that ships with sktime.
# As the rendered frame shows, it has a nested `dim_0` column (one pandas
# Series per row) and a `class_val` label column.
from sktime.datasets.base import load_italy_power_demand
data = load_italy_power_demand()

In [None]:
# Show the loaded frame; the rendered output abbreviates the middle rows.
data

Unnamed: 0,dim_0,class_val
0,0 -0.710520 1 -1.183300 2 -1.372400 3...,1
1,0 -0.993010 1 -1.426800 2 -1.579900 3...,1
2,0 1.319100 1 0.569770 2 0.195130 3...,2
3,0 -0.812440 1 -1.157600 2 -1.416400 3...,2
4,0 -0.972840 1 -1.390500 2 -1.536700 3...,1
...,...,...
1024,0 0.431020 1 -0.518480 2 -1.118200 3...,2
1025,0 -0.61222 1 -1.04990 2 -1.36250 3 ...,1
1026,0 -1.26270 1 -1.44980 2 -1.51990 3 ...,2
1027,0 0.714320 1 0.629030 2 0.117280 3...,2


Step 1: Compute a Distance Matrix

In [None]:
from sktime.distances.elastic_cython import dtw_distance

# Each cell of data['dim_0'] holds one time series as a pd.Series.
# dtw_distance expects every series as a (l, m) array, where
# l = length of the series and m = number of dimensions.
# `.values` gives a 1-D array whose elements are those Series objects; they are
# reshaped to (l, 1) in a later cell.
series_list = data['dim_0'].values
series_list

array([0    -0.710520
1    -1.183300
2    -1.372400
3    -1.593100
4    -1.467000
5    -1.372400
6    -1.088800
7     0.045967
8     0.928530
9     1.086100
10    1.275300
11    0.960050
12    0.613330
13    0.014447
14   -0.647480
15   -0.269230
16   -0.206190
17    0.613330
18    1.369800
19    1.464400
20    1.054600
21    0.581810
22    0.172050
23   -0.269230
dtype: float64,
       0    -0.993010
1    -1.426800
2    -1.579900
3    -1.605400
4    -1.630900
5    -1.375800
6    -1.018500
7    -0.355100
8     0.716580
9     1.201400
10    1.124800
11    1.048300
12    0.793130
13    0.461420
14    0.486940
15    0.563480
16    0.614520
17    0.308320
18    0.257290
19    1.099300
20    1.048300
21    0.691070
22   -0.048906
23   -0.380620
dtype: float64,
       0     1.319100
1     0.569770
2     0.195130
3    -0.085856
4    -0.179520
5    -0.273180
6    -0.085856
7    -1.397100
8    -1.116100
9    -0.741490
10    0.007805
11   -0.085856
12    0.007805
13   -0.460500
14   -0.554160
15

In [None]:
# The array is 1-D: one element per time series.
series_list.shape

(1096,)

In [None]:
# Reshape every series to a (length, 1) column array, i.e. the (l, m) layout
# that dtw_distance expects.
#
# A fresh object array is built instead of assigning back into `series_list`:
# that array comes straight from `data['dim_0'].values`, so in-place writes can
# also overwrite the series stored in `data`, and the cell would fail on a
# second run because the elements would no longer have a `.values` attribute.
# `np.asarray` accepts both a pd.Series and an already-reshaped ndarray, which
# makes this cell safe to re-run.
series_columns = np.empty(len(series_list), dtype=object)
for i, series in enumerate(series_list):
    series_columns[i] = np.asarray(series).reshape(-1, 1)
series_list = series_columns

In [None]:
# Inspect the reshaped data: each element is now a 2-D single-column array.
series_list

array([array([[-0.71052 ],
       [-1.1833  ],
       [-1.3724  ],
       [-1.5931  ],
       [-1.467   ],
       [-1.3724  ],
       [-1.0888  ],
       [ 0.045967],
       [ 0.92853 ],
       [ 1.0861  ],
       [ 1.2753  ],
       [ 0.96005 ],
       [ 0.61333 ],
       [ 0.014447],
       [-0.64748 ],
       [-0.26923 ],
       [-0.20619 ],
       [ 0.61333 ],
       [ 1.3698  ],
       [ 1.4644  ],
       [ 1.0546  ],
       [ 0.58181 ],
       [ 0.17205 ],
       [-0.26923 ]]),
       array([[-0.99301 ],
       [-1.4268  ],
       [-1.5799  ],
       [-1.6054  ],
       [-1.6309  ],
       [-1.3758  ],
       [-1.0185  ],
       [-0.3551  ],
       [ 0.71658 ],
       [ 1.2014  ],
       [ 1.1248  ],
       [ 1.0483  ],
       [ 0.79313 ],
       [ 0.46142 ],
       [ 0.48694 ],
       [ 0.56348 ],
       [ 0.61452 ],
       [ 0.30832 ],
       [ 0.25729 ],
       [ 1.0993  ],
       [ 1.0483  ],
       [ 0.69107 ],
       [-0.048906],
       [-0.38062 ]]),
       array([[ 1.3191

In [None]:
# Allocate an all-zero square matrix with one row and one column per series;
# the diagonal (distance of a series to itself) simply stays at zero.
n_series = len(series_list)
matrix_shape = (n_series, n_series)
distance_matrix = np.zeros(matrix_shape, dtype=float)

In [None]:
# Build the pairwise DTW distance matrix.
# The DTW distance of (x, y) equals that of (y, x), so each unordered pair is
# evaluated once (upper triangle only) and mirrored into the lower triangle.
# This halves the number of expensive dtw_distance calls. The diagonal is left
# at the zeros the matrix was initialised with.
for i in range(n_series):
    x = series_list[i]
    for j in range(i + 1, n_series):
        dist = dtw_distance(x, series_list[j])
        distance_matrix[i, j] = dist
        distance_matrix[j, i] = dist

Step 2: Build a Linkage Matrix

In [None]:
from scipy.cluster.hierarchy import single, complete, average, ward, dendrogram

def hierarchical_clustering(dist_mat, method='complete'):
    """Cluster a precomputed distance matrix and plot the dendrogram.

    Parameters
    ----------
    dist_mat : array-like
        Pairwise distances, either as a square (n, n) matrix that is symmetric
        with a zero diagonal, or as an already-condensed 1-D distance vector.
    method : {'complete', 'single', 'average', 'ward'}, default 'complete'
        Linkage criterion used to merge clusters.

    Returns
    -------
    numpy.ndarray
        The SciPy linkage matrix describing the merge tree.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, or if ``dist_mat`` is
        not a valid square / condensed distance matrix.
    """
    # Imported locally so the function does not depend on a separate import cell.
    from scipy.spatial.distance import squareform

    linkage_funcs = {
        'complete': complete,
        'single': single,
        'average': average,
        # NOTE: SciPy documents Ward linkage as correctly defined only for
        # Euclidean distances; interpret results with care for other metrics.
        'ward': ward,
    }
    if method not in linkage_funcs:
        raise ValueError(
            f"Unknown linkage method {method!r}; "
            f"expected one of {sorted(linkage_funcs)}"
        )

    # Use the argument that was passed in, not a notebook-level global.
    dist_arr = np.asarray(dist_mat, dtype=float)
    if dist_arr.ndim == 2:
        if dist_arr.shape[0] != dist_arr.shape[1]:
            raise ValueError("dist_mat must be a square distance matrix")
        if not np.allclose(dist_arr, dist_arr.T):
            raise ValueError("dist_mat must be symmetric")
        if not np.allclose(np.diagonal(dist_arr), 0.0):
            raise ValueError("dist_mat must have a zero diagonal")
        # SciPy's linkage functions treat a 2-D input as raw observation
        # vectors. A precomputed distance matrix has to be passed in condensed
        # (upper-triangle) form, otherwise the rows are clustered by Euclidean
        # distance instead of by the supplied distances.
        condensed = squareform(dist_arr, checks=False)
    elif dist_arr.ndim == 1:
        condensed = dist_arr
    else:
        raise ValueError("dist_mat must be a square matrix or a condensed vector")

    Z = linkage_funcs[method](condensed)

    plt.figure(figsize=(16, 8))
    dendrogram(Z)
    # The title no longer claims a specific metric: the function works on
    # whatever precomputed distances it is given.
    plt.title(f"Dendrogram for {method}-linkage on the supplied distance matrix")
    plt.show()

    return Z

# Run the clustering on the DTW distance matrix with complete linkage
# (the function's default, spelled out here for the reader).
linkage_matrix = hierarchical_clustering(distance_matrix, method='complete')

Step 3: Create Clusters

In [None]:
from scipy.cluster.hierarchy import fcluster

# Cut the tree by capping the number of flat clusters at four
max_clusters = 4
cluster_labels = fcluster(linkage_matrix, t=max_clusters, criterion='maxclust')
print(np.unique(cluster_labels))
# result recorded by the author: 4 unique clusters

In [None]:

# Same cut, but allowing up to ten flat clusters
max_clusters = 10
cluster_labels = fcluster(linkage_matrix, t=max_clusters, criterion='maxclust')
print(np.unique(cluster_labels))
# result recorded by the author: 10 unique clusters


In [None]:
# Alternative: cut the tree at a height read off the dendrogram by eye.
# The threshold is specific to the dendrogram it was chosen from, so re-check
# it whenever the linkage matrix changes.
cut_height = 600
cluster_labels = fcluster(linkage_matrix, t=cut_height, criterion='distance')
print(np.unique(cluster_labels))
# result recorded by the author in the original run: 3 unique clusters


In [None]:
# A higher cut merges more branches and therefore yields fewer clusters.
# As with any hand-picked height, re-check it if the linkage matrix changes.
cut_height = 800
cluster_labels = fcluster(linkage_matrix, t=cut_height, criterion='distance')
print(np.unique(cluster_labels))
# result recorded by the author in the original run: 2 unique clusters