## Clustering de clientes dentro de una instancia
Esto es para computar las features de clustering en cada instancia

In [1]:
import pyvrp as p
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import OPTICS
import os
from sklearn.neighbors import NearestNeighbors

# Collect the instance files and keep one representative file per family prefix.
# os.listdir returns entries in arbitrary order, so the listing is sorted to make
# the chosen representative reproducible across runs and machines.
instances = sorted(name for name in os.listdir("Homberger") if name.endswith(".txt"))
dictionary = {}
for inst in instances:
    # The prefix is the first two "_"-separated fields glued together,
    # e.g. a name like "C1_2_1.txt" yields "C12".
    prefix = "".join(inst.split("_")[:2])
    # Only the first file seen for a prefix is kept.
    dictionary.setdefault(prefix, inst)

In [2]:
def plotClusteredInstance(instance_name,clustering_labels,min_samples,labeled = False):
    """Plot the clients of an instance coloured by cluster label and save the figure.

    The figure is written to "imgs/<instance_name>_<min_samples>.png" and then shown.

    Args:
        instance_name: File name of the instance inside the "Homberger" directory.
        clustering_labels: One label per client, in client order; -1 marks an outlier.
        min_samples: Value displayed on the figure and used in the output file name.
        labeled: When True, write each client's label on top of its marker
            ('X' for outliers).
    """
    instance = p.read("Homberger/" + instance_name, round_func="round", instance_format="solomon")
    model = p.Model.from_data(instance)
    clients = model._clients

    # Pair every client with its label once, then split into clustered points and outliers.
    labeled_clients = list(zip(clients, clustering_labels))
    clustered = [(client, label) for client, label in labeled_clients if label != -1]
    outliers = [client for client, label in labeled_clients if label == -1]

    x_coords = np.array([client.x for client, _ in clustered])
    y_coords = np.array([client.y for client, _ in clustered])
    labels = [label for _, label in clustered]

    x_outliers = np.array([client.x for client in outliers])
    y_outliers = np.array([client.y for client in outliers])

    fig, ax = plt.subplots()
    # This is the depot
    ax.scatter(model._depots[0].x,model._depots[0].y, label="Depot", marker="*", color='blue', s=100)
    ax.scatter(x_coords, y_coords, s=40, label="Clients", c=labels, cmap="tab20")
    ax.scatter(x_outliers, y_outliers, s=20, label="Outliers", c="black")

    if labeled:
        # Annotate each client at its own coordinates. Indexing the filtered
        # coordinate arrays with the position in the full label list would
        # misplace the text and run past the end of the arrays.
        for client, label in labeled_clients:
            text = f'{label}' if label != -1 else 'X'
            ax.text(client.x, client.y, text, fontsize=6, ha='center', va='center', color="white", weight="bold")

    ax.grid(color="grey", linestyle="solid", linewidth=0.)
    ax.set_title("Coordinates")
    ax.set_aspect("equal", "datalim")
    ax.legend(frameon=False, ncol=2)
    plt.text(0.98, 1.05, "min_samples: " + str(min_samples), ha='right', va='top', transform=plt.gca().transAxes)
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs("imgs", exist_ok=True)
    plt.savefig("imgs/" + instance_name + "_" + str(min_samples) + '.png')
    plt.show()

In [3]:
def clusteringQuality(clients, labels):
    """Score a clustering by comparing intra-cluster and inter-cluster distances.

    The intra-cluster distance of a cluster is the mean distance of its points
    to the cluster centroid. The inter-cluster distance of a cluster is the
    distance from its centroid to the nearest other centroid. Both averages are
    divided by sqrt(max_x**2 + max_y**2) so the scale of the instance does not
    affect the score, and the fraction of outliers (label -1) is penalised:

        quality = -2 * intra + 1 * inter - 1 * outlier_ratio

    NOTE(review): points labelled -1 are grouped like any other label, so they
    also contribute a centroid and an intra-cluster distance. Confirm whether
    outliers should be excluded from those averages.

    Args:
        clients: Sequence of objects exposing numeric ``x`` and ``y`` attributes.
        labels: Cluster label of each client, indexed like ``clients``; -1 marks
            an outlier.

    Returns:
        The quality score; higher is better.
    """
    # Store the client coordinates in separate lists for each cluster.
    cluster_points = dict()
    for i in range(len(clients)):
        cluster_points.setdefault(labels[i], []).append((clients[i].x, clients[i].y))

    # Centroid and average distance to the centroid of every cluster. Working in
    # floating point avoids the casting error an integer accumulator raises when
    # coordinates are floats.
    centroids = dict()
    intra_cluster_distances = dict()
    for label, coords in cluster_points.items():
        points = np.asarray(coords, dtype=float)
        centroid = points.mean(axis=0)
        offsets = points - centroid
        centroids[label] = centroid
        intra_cluster_distances[label] = np.hypot(offsets[:, 0], offsets[:, 1]).mean()
    average_intra_cluster_distance = np.mean(list(intra_cluster_distances.values()))

    # Distance from each cluster to its nearest neighbouring cluster.
    centroid_locations = list(centroids.values())
    if len(centroid_locations) > 1:
        neighbors = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(centroid_locations)
        distances, _ = neighbors.kneighbors(centroid_locations)
        # Querying the fitted points returns each centroid itself first (distance 0),
        # so only the second column holds the distance to another cluster.
        average_inter_cluster_distance = np.mean(distances[:, 1])
    else:
        average_inter_cluster_distance = 0

    # Normalise so the scale of the instance does not affect the score.
    max_x = max([0] + [client.x for client in clients])
    max_y = max([0] + [client.y for client in clients])
    max_possible_distance = np.sqrt(max_x**2 + max_y**2)
    # If every coordinate is 0 all distances are already 0; skip the division
    # instead of producing NaN.
    if max_possible_distance > 0:
        average_intra_cluster_distance /= max_possible_distance
        average_inter_cluster_distance /= max_possible_distance

    # We want to minimise the intra-cluster distance while maximising the
    # inter-cluster distance, and penalise having too many outliers.
    outlier_ratio = sum(1 for label in labels if label == -1) / len(labels)
    quality = -2*average_intra_cluster_distance + 1*average_inter_cluster_distance + -1*outlier_ratio
    return quality
        