Clustering Visual Categories Using Descriptions
===========================

In [1]:
from heapq import *
import numpy as np
import csv
from collections import Counter, defaultdict
import multiprocessing as mp
from math import log
from heapq import *
from functools import partial
import pickle
import gzip

In [2]:
# Load files
catDescFile = 'data/cat_desc.csv'
catDescHeaders = None
catDescCnt = defaultdict(Counter)
descCnt = Counter()
with open(catDescFile) as fin:
    r = csv.reader(fin)
    for row in r:
        if catDescHeaders is None:
            catDescHeaders = row
            print catDescHeaders
        else:
            catDescCnt[int(row[0]) - 1][row[1]] += int(row[-1])
            descCnt[row[1]] += int(row[-1])

catNameFile = 'data/categoryKey.csv'
catName = {}
with open(catNameFile) as fin:
    r = csv.reader(fin)
    for row in r:
        catName[int(row[0]) - 1] = row[1]

print sorted(catDescCnt.keys()) == sorted(catName.keys()) and sorted(catDescCnt.keys()) == range(len(catName))

['category_id', 'description', 'count']
True


In [3]:
# Drop descriptions that occur fewer than `threshold` times overall, replace each
# surviving description with a dense integer id, and normalize every category's
# remaining counts into a distribution that sums to one.
threshold = 10
descList = [d for d, c in descCnt.most_common() if c >= threshold]
descId = {d: i for i, d in enumerate(descList)}
catDist = defaultdict(Counter)
for cat, dist in catDescCnt.iteritems():
    # Touch the entry up front so that a category whose descriptions were all
    # filtered out is still registered (with an empty distribution). Otherwise
    # len(catDist) would undercount the categories and the 0 .. N-1 leaf
    # indexing used by distHC1 would no longer line up with the category ids.
    catProb = catDist[cat]
    # Filter once and reuse the result for both the total and the normalization.
    kept = [(descId[d], v) for d, v in dist.iteritems() if d in descId]
    s = float(sum(v for i, v in kept))
    for i, v in kept:
        # `kept` is empty whenever s is 0.0, so this never divides by zero.
        catProb[i] += v / s

In [4]:
def computeEntropy(itr):
    """Return the Shannon entropy (natural log) of the weights in `itr`.

    `itr` is any iterable of non-negative numbers; it is consumed in a single
    pass, so one-shot iterators are accepted. The weights do not have to sum to
    one: with total s the result is log(s) - sum(x * log(x)) / s, i.e. the
    entropy of the normalized weights x / s.

    Zero weights contribute nothing (x * log(x) tends to 0 as x tends to 0), and
    an empty or all-zero input yields 0.0 rather than failing inside log().

    Raises ValueError (from math.log) if any weight is negative.
    """
    s = 0.0
    ans = 0.0
    for x in itr:
        if x == 0:
            # log(0) is undefined; by convention 0 * log(0) counts as 0.
            continue
        s += x
        ans += x * log(x)
    if s == 0.0:
        # Nothing to normalize: define the entropy of an empty distribution as 0.
        return 0.0
    return log(s) - ans / s

def mergeCandidiate(nodeInfo, s1, s2):
    """Score the hypothetical merge of nodes `s1` and `s2`.

    Reads the first three fields of each node's entry in `nodeInfo`: field 0 and
    field 2 are added as numbers, field 1 is added as a distribution whose
    values are then fed to computeEntropy.

    Returns the tuple (score, s1, s2), where score is the entropy of the
    combined distribution minus the summed field-2 values divided by the
    summed field-0 values. Leading with the score lets the tuples be ordered
    by it directly.
    """
    left, right = nodeInfo[s1], nodeInfo[s2]
    mergedSize = left[0] + right[0]
    mergedDist = left[1] + right[1]
    leafEntropyTotal = left[2] + right[2]
    score = computeEntropy(mergedDist.itervalues()) - leafEntropyTotal / mergedSize
    return (score, s1, s2)
    
def addCandidate(nodeInfo, newS, activeNodes, candidatePool, mpPool):
    """Pair node `newS` with every active node, then activate it.

    mergeCandidiate is evaluated for (newS, other) over all members of
    `activeNodes` through `mpPool.map`; each resulting tuple is pushed onto the
    heap `candidatePool`. Only afterwards is `newS` added to `activeNodes`, so
    it is never paired with itself. Both `candidatePool` and `activeNodes` are
    modified in place; nothing is returned.
    """
    scoreAgainstNew = partial(mergeCandidiate, nodeInfo, newS)
    for candidate in mpPool.map(scoreAgainstNew, activeNodes):
        heappush(candidatePool, candidate)
    activeNodes.add(newS)

def distHC1(catDist):
    mpPool = mp.Pool(processes=2)
    leafNum = len(catDist)
    activeNodes = set()
    nodeInfo = []
    candidatePool = []
    heapify(candidatePool)
    nextId = 0
    
    for leaf in xrange(leafNum):
        nodeInfo.append((1, catDist[leaf], computeEntropy(catDist[leaf].itervalues()), None, None, 0.0))
        addCandidate(nodeInfo, nextId, activeNodes, candidatePool, mpPool)
        nextId += 1
        if leaf % 10 == 0:
            print 'Adding leaf #', leaf
    
    while len(activeNodes) > 1:
        jsd, s1, s2 = heappop(candidatePool)
        while s1 not in activeNodes or s2 not in activeNodes:
            jsd, s1, s2 = heappop(candidatePool)
        nodeInfo.append((nodeInfo[s1][0] + nodeInfo[s2][0], 
                        nodeInfo[s1][1] + nodeInfo[s2][1], 
                        nodeInfo[s1][2] + nodeInfo[s2][2], 
                        s1, s2, jsd))
        activeNodes.remove(s1)
        activeNodes.remove(s2)
        addCandidate(nodeInfo, nextId, activeNodes, candidatePool, mpPool)
        nextId += 1
        if len(activeNodes) % 10 == 0:
            print 'Remaining active nodes #', len(activeNodes)
    
    mpPool.close()
    rootId = activeNodes.pop()
    return rootId, nodeInfo

In [5]:
# Build the tree, then persist it in compressed form.
rootId, nodeInfo = distHC1(catDist)

# Keep every field of each node except field 1 (the summed distribution).
toStore = []
for node in nodeInfo:
    toStore.append((node[0],) + node[2:])

outputFileName = 'model/hcTree1_original.pickle.gz'
with gzip.open(outputFileName, 'wb') as fout:
    pickle.dump(toStore, fout)

Adding leaf # 0
Adding leaf # 10
Adding leaf # 20
Adding leaf # 30
Adding leaf # 40
Adding leaf # 50
Adding leaf # 60
Adding leaf # 70
Adding leaf # 80
Adding leaf # 90
Adding leaf # 100
Adding leaf # 110
Adding leaf # 120
Adding leaf # 130
Adding leaf # 140
Adding leaf # 150
Adding leaf # 160
Adding leaf # 170
Adding leaf # 180
Adding leaf # 190
Adding leaf # 200
Adding leaf # 210
Adding leaf # 220
Adding leaf # 230
Adding leaf # 240
Adding leaf # 250
Adding leaf # 260
Adding leaf # 270
Adding leaf # 280
Adding leaf # 290
Adding leaf # 300
Adding leaf # 310
Adding leaf # 320
Adding leaf # 330
Adding leaf # 340
Adding leaf # 350
Adding leaf # 360
Adding leaf # 370
Adding leaf # 380
Adding leaf # 390
Adding leaf # 400
Adding leaf # 410
Adding leaf # 420
Adding leaf # 430
Adding leaf # 440
Adding leaf # 450
Adding leaf # 460
Adding leaf # 470
Adding leaf # 480
Adding leaf # 490
Adding leaf # 500
Adding leaf # 510
Adding leaf # 520
Adding leaf # 530
Adding leaf # 540
Adding leaf # 550
Add

In [6]:
# def parseTree(nodeInfo, headNode):
#     tree = {}
#     if nodeInfo[headNode][3] is None or nodeInfo[headNode][4] is None:
#         tree['name'] = catName[headNode]
#         tree['children'] = []
#     else:
#         s1, s2 = nodeInfo[headNode][3], nodeInfo[headNode][4]
#         tree['name'] = str(nodeInfo[headNode][-1])[:6] + ' - ' \
#             + str(nodeInfo[headNode][-1] - nodeInfo[s1][-1] - nodeInfo[s2][-1])[:6]
#         tree['children'] = [parseTree(nodeInfo, s1), parseTree(nodeInfo, s2)]
#     return tree

In [7]:
# import json
# catTree = parseTree(nodeInfo, rootId)
# with open('vis/hcTree1.json', 'w') as fout:
#     json.dump(catTree, fout)