In [2]:
from matplotlib import pyplot as plt
from utils.utils import *
import utils.promethee_functions as pf
import utils.clustering_functions as cf

# Load the data set: one row per country (ISO3 code), one column per criterion,
# each cell holding the time series of that criterion for that country.
data = read_data()
K = len(data.columns)  # Number of criteria
L = len(data.iloc[0]["co2prod"])  # Length of the time series

# Hand-picked groups of countries used as the reference clusters
group0 = ["PAK", "SDN", "BDI", "HTI"]
group1 = ["EST", "CZE", "MLT", "SGP", "IRL"]
group2 = ["CHE", "ISL", "NZL", "SWE"]
all_groups = [*group0, *group1, *group2]

# Keep only the selected countries, then scale the remaining data
data = scale_data(data.loc[all_groups])

print("\nData scaled")
get_min_max_criteria(data, False)

utils.py Loading
Reading HDI dataset
co2prod: min=0.0, max=33.3863
hdi: min=0.257, max=0.967
le: min=37.105, max=85.473
gdi: min=0.383, max=1.041
eys: min=3.5751, max=23.2477
mys: min=1.4606, max=14.2559

Data scaled
co2prod: min=0.0, max=1.0
hdi: min=0.0, max=1.0
le: min=0.0, max=1.0
gdi: min=0.0, max=1.0
eys: min=0.0, max=1.0
mys: min=0.0, max=1.0


In [5]:
# Reference clustering: one list of country codes per cluster
clusters = [group0, group1, group2]
clusters

[['PAK', 'SDN', 'BDI', 'HTI'],
 ['EST', 'CZE', 'MLT', 'SGP', 'IRL'],
 ['CHE', 'ISL', 'NZL', 'SWE']]

In [6]:
# Start with a single criterion: one scaled `co2prod` time series per country
co2prod = data["co2prod"]
co2prod

iso3
PAK    [0.9758013116603509, 0.976721908513296, 0.9757...
SDN    [0.9911965293383521, 0.9919595069996899, 0.993...
BDI    [0.9992023677291763, 0.9990114927479534, 0.999...
HTI    [0.9947065203015965, 0.9949419973396827, 0.995...
EST    [0.0, 0.0721707642540932, 0.32579554710505815,...
CZE    [0.32192410394855164, 0.38557088790790495, 0.3...
MLT    [0.7179655598003531, 0.7384510061757973, 0.738...
SGP    [0.3874761727105134, 0.3972690098937967, 0.375...
IRL    [0.5983674281281074, 0.5917314934679975, 0.596...
CHE    [0.7206720525440102, 0.7117422603965367, 0.715...
ISL    [0.6297037413466369, 0.6536514155842347, 0.634...
NZL    [0.6812110990476761, 0.680625580248843, 0.6613...
SWE    [0.7143769854590014, 0.7147643462845307, 0.717...
Name: co2prod, dtype: object

In [13]:
# Centroid of a cluster = element-wise mean of the time series of its members.
# Each sum starts from a zero series shaped like the first time series.
centroids = np.array([
    sum(
        (co2prod.loc[country] for country in cluster),
        np.zeros_like(co2prod.iloc[0]),
    ) / len(cluster)
    for cluster in clusters
])
centroids.shape

(3, 33)

In [None]:
# Separation between clusters: Euclidean distance between every pair of centroids
inter_cluster_distances = [
    np.linalg.norm(centroids[first] - centroids[second])
    for first in range(len(clusters))
    for second in range(first + 1, len(clusters))
]

inter_cluster_distances

[np.float64(2.4344475392631133),
 np.float64(1.712223600243771),
 np.float64(0.7605762037892209)]

In [15]:
# Diameter of a cluster = largest Euclidean distance between two of its countries.
# The leading 0 keeps the diameter at 0 for clusters with fewer than two members.
cluster_diameters = [
    max([0] + [
        np.linalg.norm(co2prod.loc[cluster[first]] - co2prod.loc[cluster[second]])
        for first in range(len(cluster))
        for second in range(first + 1, len(cluster))
    ])
    for cluster in clusters
]

cluster_diameters

[np.float64(0.17868947685737727),
 np.float64(1.9341718961490786),
 np.float64(1.1755711350624918)]

In [17]:
# Dunn index = smallest distance between two cluster centroids
#              divided by the largest cluster diameter
dunn_index = min(inter_cluster_distances) / max(cluster_diameters)
dunn_index

np.float64(0.3932309249780344)

In [19]:
def _dunn_index_univariate(clusters, data):
    """
        Compute the Dunn index for a clustering of univariate time series data
        - clusters: list of lists of indexes of the time series in each cluster
        - data: series (index = id of the time series) where each cell is a time series
          (np.array or any sequence of numbers; all series must have the same length)

        The index is the smallest Euclidean distance between two cluster centroids
        divided by the largest cluster diameter, the diameter of a cluster being the
        largest Euclidean distance between two of its members.

        Returns np.inf when the largest diameter is 0 but the centroids are distinct,
        and np.nan when both the smallest separation and the largest diameter are 0.
        Raises ValueError if there are fewer than two clusters or if a cluster is empty.
    """
    if len(clusters) < 2:
        raise ValueError("The Dunn index needs at least two clusters")

    # Stack the members of each cluster once, as floats: integer-valued series
    # would otherwise break the averaging, and the pairwise loop below no longer
    # has to look every series up again.
    members = []
    for cluster in clusters:
        if len(cluster) == 0:
            raise ValueError("The Dunn index is not defined for an empty cluster")
        members.append(np.array([np.asarray(data.loc[item], dtype=float) for item in cluster]))

    # Centroid of a cluster = element-wise mean of its members
    centroids = [series.mean(axis=0) for series in members]

    # Distances between every pair of centroids
    separations = [
        np.linalg.norm(centroids[i] - centroids[j])
        for i in range(len(centroids))
        for j in range(i + 1, len(centroids))
    ]

    # Diameter = max distance between two members of the cluster
    # (0 for a cluster with a single member)
    diameters = [
        max(
            (
                np.linalg.norm(series[i] - series[j])
                for i in range(len(series))
                for j in range(i + 1, len(series))
            ),
            default=0.0,
        )
        for series in members
    ]

    min_separation = min(separations)
    max_diameter = max(diameters)

    if max_diameter == 0:
        # Every cluster is a single point: avoid the division by zero
        return np.float64(np.inf) if min_separation > 0 else np.float64(np.nan)

    return min_separation / max_diameter


def dunn_index_multivariate(clusters, data):
    """
        Compute the Dunn index for a clustering of multivariate time series data
        - clusters: list of lists of indexes of the time series in each cluster
        - data: the data set (dataframe with index as id of the time series), each cell is a np.array (time series)

        The index is computed independently for every column (criterion) of the data set.

        Returns a dict {column name: Dunn index of the clustering on that column}.
        Raises ValueError if there are fewer than two clusters or if a cluster is empty.
    """
    return {
        criteria: _dunn_index_univariate(clusters, data[criteria])
        for criteria in data.columns
    }

# Dunn index of the reference clustering, one value per criterion
dunn_index_multivariate(clusters, data)

{'co2prod': np.float64(0.3932309249780344),
 'hdi': np.float64(0.39579279031086606),
 'le': np.float64(0.26273938806215114),
 'gdi': np.float64(0.05137623166662448),
 'eys': np.float64(0.3902749917533445),
 'mys': np.float64(0.4789228091604594)}