# Distance to desired clustering
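1. Build the contingency matrix of the predicted labels against the desired labels.  
2. Solve the linear sum assignment problem on it to find the one-to-one matching of predicted clusters to desired clusters with the largest total overlap.  
3. Sum the counts that fall outside the matched cells (the off-diagonal entries once the clusters are reordered by the matching).  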

In [1]:
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
import numpy as np

In [2]:
def distance_to_clustering(ypred, ytrue, verbose=False):
    """Count the samples that fall outside the best cluster-to-cluster matching.

    A contingency table of ``ypred`` against ``ytrue`` is built, then
    ``linear_sum_assignment`` picks the one-to-one pairing of its rows and
    columns whose matched cells hold the largest total count. The distance
    is the number of samples left outside those matched cells.

    Args:
        ypred: Predicted cluster label of each sample.
        ytrue: Desired cluster label of each sample.
        verbose: When true, print the contingency table followed by the
            matched row and column indices.

    Returns:
        The total count in the table minus the count in the matched cells.
    """
    table = contingency_matrix(ypred, ytrue)

    # linear_sum_assignment minimises cost, so the counts are negated to make
    # it select the pairing with the largest overlap instead.
    rows, cols = linear_sum_assignment(-table)

    if verbose:
        print(table)
        print(rows, cols)

    matched = table[rows, cols].sum()
    return table.sum() - matched

In [3]:
# Same grouping as the target, with the labels 'c' and 'd' swapped in the
# prediction: the matching absorbs the renaming.
ypred = ['a', 'b', 'd', 'c', 'a']
ytrue = ['a', 'b', 'c', 'd', 'a']
distance_to_clustering(ypred, ytrue, verbose=True)

[[2 0 0 0]
 [0 1 0 0]
 [0 0 0 1]
 [0 0 1 0]]
[0 1 2 3] [0 1 3 2]


0

In [4]:
# The second sample is predicted as 'b' but belongs to 'a'.
ypred = ['a', 'b', 'b', 'c']
ytrue = ['a', 'a', 'b', 'c']
distance_to_clustering(ypred, ytrue, verbose=True)

[[1 0 0]
 [1 1 0]
 [0 0 1]]
[0 1 2] [0 1 2]


1

In [5]:
# Predicted cluster 'b' takes one sample each from the desired 'a' and 'c'.
ypred = ['a', 'b', 'b', 'b', 'c']
ytrue = ['a', 'a', 'b', 'c', 'c']
distance_to_clustering(ypred, ytrue, verbose=True)

[[1 0 0]
 [1 1 1]
 [0 0 1]]
[0 1 2] [0 1 2]


2

In [6]:
# Same prediction as the previous example, but with the labels 'a' and 'c'
# exchanged in ypred.
ypred = ['c', 'b', 'b', 'b', 'a']
ytrue = ['a', 'a', 'b', 'c', 'c']
distance_to_clustering(ypred, ytrue, verbose=True)

[[0 0 1]
 [1 1 1]
 [1 0 0]]
[0 1 2] [2 1 0]


2

In [7]:
# The second and fourth samples each land in the wrong predicted cluster.
ypred = ['a', 'a', 'b', 'b', 'c']
ytrue = ['a', 'b', 'b', 'c', 'c']
distance_to_clustering(ypred, ytrue, verbose=True)

[[1 1 0]
 [0 1 1]
 [0 0 1]]
[0 1 2] [0 1 2]


2