## Clustering Evaluation

### All imports here

In [1]:
import json
from sklearn import metrics

In [2]:
import matplotlib.pyplot as plt
%pylab inline
plt.xkcd();

Populating the interactive namespace from numpy and matplotlib


### Functions

In [3]:
def parse_distance_matrix(pic_matrix_json):
    """
    parse distance matrix output from pic clustering algorithm
    @arg pic_matrix_json json file containing pic clustering distance
    @return distance dictionary and set of pois
    """
    distance_dict, pois = {}, set()
    for item in pic_matrix_json['distances']:
        pois.add(item['poi1'])
        pois.add(item['poi2'])
        if item['poi1'] not in distance_dict.keys():
            distance_dict[item['poi1']] = {item['poi2']: item['distance']}
        else:
            distance_dict[item['poi1']][item['poi2']] = item['distance']
    return distance_dict, pois

def build_distance_matrix(distance_dict, pois):
    """
    build distance matrix from distance dictionary and set of pois, key error should never be thrown here
    """
    matrix = []
    for poi_i in pois:  # each row
        row = []
        for poi_j in pois:  # each column
            if poi_i == poi_j:  # diagonal
                row.append(0.0)
            elif poi_i in distance_dict.keys():  # if there is relation between poi_i and poi_j
                if poi_j in distance_dict[poi_i].keys():
                    row.append(distance_dict[poi_i][poi_j])
                else:
                    row.append(distance_dict[poi_j][poi_i])
            else:  # if there is relation between poi_i and poi_j
                row.append(1.0)
        matrix.append(np.array(row))
    return np.array(matrix)

def parse_clustering_result(clusters_json):
    """
    parse clustering result to get cluster id and corresponding pois in cluster
    """
    clusters = {}
    for cluster in clusters_json['clusters']:
        if cluster['cluster_id'] not in clusters:
            clusters[cluster['cluster_id']] = []
        for poi in cluster['poi_in_cluster']:
            clusters[cluster['cluster_id']].append(poi['poi_id'])
    return clusters

def build_label(clusters, pois):
    """
    get cluster label for each poi from clusters and pois
    """
    poi_cluster_label = {}
    for key, value in clusters.items():  # for each cluster_i and pois_in_cluster_i
        for poi in value:  # for each poi in cluster_i
            assert poi not in poi_cluster_label.keys()  # poi shouldn't be labeled, poi assigned to one cluster only
            poi_cluster_label[poi] = key
    
    labels = []
    for poi in pois:  # get cluster label for each poi from poi_cluster dict
        labels.append(poi_cluster_label[poi])
    return np.array(labels)

### PIC

In [4]:
with open('../../results/pic_matrix.json') as pic_matrix:
    pic_matrix_json = json.load(pic_matrix)

In [5]:
distance_dict, pois = parse_distance_matrix(pic_matrix_json)

In [6]:
matrix = build_distance_matrix(distance_dict, pois)

In [7]:
with open('../../results/pic_clusters.json') as pic_clusters:
    pic_clusters_json = json.load(pic_clusters)

In [8]:
clusters = parse_clustering_result(pic_clusters_json)

In [9]:
labels = build_label(clusters, pois)

In [10]:
metrics.silhouette_score(matrix, labels, metric='precomputed')

0.45147020556856621

In [None]:
metrics.calinski_harabaz_score(matrix, labels)