In [1]:
from matplotlib import pyplot as plt
from utils.utils import *
import utils.promethee_functions as pf
import utils.clustering_functions as cf

# Load the data set; read_data() is provided by the utils star-import above.
data = read_data()

# Country codes of the three hand-picked groups studied in this notebook.
group0 = ["PAK", "SDN", "BDI", "HTI"]
group1 = ["EST", "CZE", "MLT", "SGP", "IRL"]
group2 = ["CHE", "ISL", "NZL", "SWE"]

all_groups = [*group0, *group1, *group2]

# Restrict the data to the selected countries (rows follow the order of
# all_groups) and scale it in one step.
data = scale_data(data.loc[all_groups])

print("\nData scaled")
# NOTE(review): the meaning of the second positional argument (False) is
# defined in utils and is not visible from this notebook.
get_min_max_criteria(data, False)

# Length of the time series, read from the first row's "co2prod" cell.
first_series = data.iloc[0]["co2prod"]
L = first_series.shape[0]

utils.py Loading
Reading HDI dataset
co2prod: min=0.0, max=33.3863
hdi: min=0.257, max=0.967
le: min=37.105, max=85.473
gdi: min=0.383, max=1.041
eys: min=3.5751, max=23.2477
mys: min=1.4606, max=14.2559

Data scaled
co2prod: min=0.0, max=1.0
hdi: min=0.0, max=1.0
le: min=0.0, max=1.0
gdi: min=0.0, max=1.0
eys: min=0.0, max=1.0
mys: min=0.0, max=1.0


In [2]:
n_samples, n_features = data.shape

# Every cell of `data` holds one time series. For each column, stack the
# series of all rows into one array; then join the per-column arrays along a
# new trailing axis. Assuming 1-D series of equal length, the result has shape
# (n_samples, series_length, n_features).
per_feature = [np.stack(data.iloc[:, col].values) for col in range(n_features)]
formatted_data = np.stack(per_feature, axis=-1)

# Row labels of the data set, kept both as the pandas index and as a plain list.
names = data.index
names_formatted = list(names)


In [10]:
from tslearn.clustering import TimeSeriesKMeans

n_clusters = 3
# k-means starts from a random initialisation; a fixed seed makes the cluster
# assignments (and everything computed from them) identical on every run.
RANDOM_SEED = 42

# NOTE(review): max_iter=5 is a small iteration budget - confirm that the
# algorithm converges on this data before relying on the result.
km = TimeSeriesKMeans(
    n_clusters=n_clusters,
    metric="euclidean",
    max_iter=5,
    random_state=RANDOM_SEED,
).fit(formatted_data)

# Group the series names by the cluster label assigned to each of them.
# names_formatted and km.labels_ are aligned: both follow the row order of
# formatted_data.
clusters = [[] for _ in range(n_clusters)]
for name, label in zip(names_formatted, km.labels_):
    clusters[label].append(name)

for i, members in enumerate(clusters):
    print(f"Cluster {i}: {members}")

Cluster 0: ['MLT', 'SGP', 'IRL', 'CHE', 'ISL', 'NZL', 'SWE']
Cluster 1: ['PAK', 'SDN', 'BDI', 'HTI']
Cluster 2: ['EST', 'CZE']


In [11]:
def dunn_index_multivariate(clusters, data):
    """ 
        Compute the Dunn index for a clustering of multivariate time series data,
        separately for every criterion (column) of the data set
        - clusters: list of lists of ids (index labels of data) of the time series in each cluster;
          empty clusters are ignored
        - data: the data set (dataframe with index as id of the time series), each cell is a np.array (time series)

        Returns a dict mapping each column name of data to the Dunn index computed on that column alone.
        Raises ValueError if fewer than two clusters are non-empty.
    """
    def dunn_index_univariate(clusters, data):
        """
            Compute the Dunn index for a clustering of univariate time series data
            - clusters: list of lists of ids of the time series in each cluster
            - data: one column of the data set (series with index as id of the time series),
              where each cell is a np.array (time series)

            Dunn index = (smallest distance between two cluster centroids)
                         / (largest cluster diameter)
        """
        # An empty cluster has no centroid (0/0 would give NaN and poison the
        # result), so only populated clusters take part in the computation.
        populated = [cluster for cluster in clusters if len(cluster) > 0]
        if len(populated) < 2:
            raise ValueError(
                "The Dunn index needs at least two non-empty clusters, "
                f"got {len(populated)}"
            )

        # One 2-D array per cluster, one row per member series. Casting to
        # float keeps the centroid computation valid for integer-valued series.
        members = [
            np.stack([np.asarray(data.loc[country], dtype=float) for country in cluster])
            for cluster in populated
        ]

        # Centroid = element-wise mean of the series of the cluster
        centroids = [series.mean(axis=0) for series in members]

        # Smallest distance between the centroids of two different clusters
        min_separation = min(
            np.linalg.norm(centroids[i] - centroids[j])
            for i in range(len(centroids))
            for j in range(i + 1, len(centroids))
        )

        # Diameter = max distance between two countries in the cluster
        # (0 for a cluster with a single member, which has no pair)
        max_diameter = max(
            max(
                (
                    np.linalg.norm(series[i] - series[j])
                    for i in range(len(series))
                    for j in range(i + 1, len(series))
                ),
                default=0.0,
            )
            for series in members
        )

        if max_diameter == 0:
            # Every cluster is a single point: the ratio is unbounded, or
            # undefined when the centroids coincide as well. Returned
            # explicitly instead of dividing by zero.
            return np.inf if min_separation > 0 else np.nan

        return min_separation / max_diameter

    return {
        criteria: dunn_index_univariate(clusters, data[criteria])
        for criteria in data.columns
    }

# Dunn index of the clustering, one value per criterion (column of data).
dunn_indexes = dunn_index_multivariate(clusters, data)

# Average the per-criterion values into a single score.
per_criterion_values = list(dunn_indexes.values())
mean_dunn_index = np.mean(per_criterion_values)
print(f"Mean Dunn index: {mean_dunn_index}")

Mean Dunn index: 0.4080525707838137
