Hierarchical Clustering for Description Dataset
==========================

In [1]:
import numpy as np
from sklearn.cluster import spectral_clustering
import csv
from collections import defaultdict
import scipy.io as sio
from glob import glob

In [2]:
# Load the ground-truth matrix and replace it with 1 - x.
# NOTE(review): presumably this turns a similarity into a dissimilarity
# (the variable is called largeSimMatrix) -- confirm against the .mat file.
groundTruthFile = 'data/largeSimMatrix.mat'
groundTruth = sio.loadmat(groundTruthFile)['largeSimMatrix']
groundTruth = 1.0 - groundTruth
# A single pre-formatted argument makes print(...) emit the same text under
# both Python 2 and Python 3.
print('ground truth %s' % (groundTruth.shape,))
print('')

# Load every model similarity matrix, keyed by its path minus the extension
# (e.g. 'model/cosine_sim').
simMtxFileLst = sorted(glob('model/*_sim.csv'))
# rsplit removes only the trailing '.csv'. Splitting on the first dot would
# truncate any stem that itself contains a dot and could silently map two
# different files onto the same key.
simMtx = {f.rsplit('.', 1)[0]: np.loadtxt(f, delimiter=',') for f in simMtxFileLst}
simMtxTypes = sorted(simMtx.keys())
print(simMtxTypes)
for name in simMtxTypes:
    print('%s %s' % (name, simMtx[name].shape))

# Load category names: first CSV column is the integer id, second the name.
catNameFile = 'data/categoryKey.csv'
catName = {}
with open(catNameFile) as fin:
    r = csv.reader(fin)
    for row in r:
        catName[int(row[0])] = row[1]
# Sorted ids give the category order used when pairing with cluster labels.
catList = sorted(catName.keys())

ground truth (1055, 1055)

['model/angular_sim', 'model/cosine_sim', 'model/innerProd_sim', 'model/jsd_sim', 'model/l1_sim', 'model/l2_sim']
model/angular_sim (1055, 1055)
model/cosine_sim (1055, 1055)
model/innerProd_sim (1055, 1055)
model/jsd_sim (1055, 1055)
model/l1_sim (1055, 1055)
model/l2_sim (1055, 1055)


In [3]:
# Build the category id -> name lookup from the key file
# (column 0: integer id, column 1: name), then keep the ids in sorted order.
catNameFile = 'data/categoryKey.csv'
with open(catNameFile) as fin:
    catName = {int(fields[0]): fields[1] for fields in csv.reader(fin)}
catList = sorted(catName)

In [4]:
def printClustering(labels, size=10):
    """Print a per-cluster summary of a label assignment.

    Labels are paired positionally with the module-level ``catList`` and each
    category id is translated to its name through ``catName``. For every
    distinct label, in sorted order, one line is printed containing the label,
    the number of categories carrying it, and the first ``size`` names.

    Args:
        labels: iterable of cluster labels, one per entry of ``catList``.
            ``zip`` stops at the shorter input, so surplus items on either
            side are dropped without warning.
        size: maximum number of category names shown per cluster.
    """
    members = defaultdict(list)
    for label, cat in zip(labels, catList):
        members[label].append(catName[cat])
    for label in sorted(members):
        names = members[label]
        # One pre-formatted argument keeps print(...) valid, with identical
        # output, on both Python 2 and Python 3.
        print('%s %s %s' % (label, len(names), names[:size]))

In [5]:
# Split the categories into two spectral clusters for each similarity matrix
# and print a summary of every split.
for name in simMtxTypes:
    print(name)
    # Fix the seed: without random_state the randomized steps inside
    # spectral_clustering can yield different labels on each run, so the
    # printed clusters would not be reproducible.
    labels = spectral_clustering(simMtx[name], n_clusters=2,
                                 eigen_solver='arpack', random_state=0)
    printClustering(labels)
    print('')

model/angular_sim
0 1054 ['abbey', 'abutment arch', 'access road', 'acropolis', 'africa', 'agora', 'agriculture', 'air field', 'airlock', 'airplane']
1 1 ['guest room']

model/cosine_sim
0 1027 ['abbey', 'abutment arch', 'access road', 'acropolis', 'africa', 'agora', 'agriculture', 'air field', 'airlock', 'airplane']
1 28 ['aisle', 'assembly line', 'bistro', 'cascade', 'cirque', 'crag', 'dale', 'escarpment', 'exterior', 'glen']

model/innerProd_sim
0 1054 ['abbey', 'abutment arch', 'access road', 'acropolis', 'africa', 'agora', 'agriculture', 'air field', 'airlock', 'airplane']
1 1 ['hotel room']

model/jsd_sim
0 1053 ['abbey', 'abutment arch', 'access road', 'acropolis', 'africa', 'agora', 'agriculture', 'air field', 'airlock', 'airplane']
1 2 ['motel room', 'waterfall (cataract)']

model/l1_sim
0 1054 ['abbey', 'abutment arch', 'access road', 'acropolis', 'africa', 'agora', 'agriculture', 'air field', 'airlock', 'airplane']
1 1 ['motel room']

model/l2_sim
0 1038 ['abbey', 'abutment 