Exploring various thresholds for flattening the taxonomy tree
=================================

In [1]:
from collections import Counter, defaultdict, deque
import pickle
import gzip
import csv
import json

In [2]:
# Load files
# Category key: each CSV row is (category id, category name). The id is
# shifted down by one before being used as the key (presumably the file is
# 1-based and the tree uses 0-based leaf indices -- confirm against the data).
catNameFile = 'data/categoryKey.csv'
catName = {}
with open(catNameFile) as fin:
    r = csv.reader(fin)
    for row in r:
        catName[int(row[0]) - 1] = row[1]

# Load taxonomy tree
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
taxonomyFile = 'model/hcTree1.pickle.gz'
with gzip.open(taxonomyFile, 'rb') as fin:
    taxoTreeNodes = pickle.load(fin)

# Load label info
# The pickle holds a pair: the node-vs-description matching matrix and the
# list of descriptions.
treeLabelFile = 'model/hcTree1_matching.pickle.gz'
with gzip.open(treeLabelFile, 'rb') as fin:
    matchingMtx, descList = pickle.load(fin)
# Single-argument call form prints identically on Python 2 and Python 3.
print(len(descList))

# Load web page format
# The page text is used later as a template with ##...## placeholders.
webpageFormat = 'vis/sample.html'
with open(webpageFormat) as fin:
    webpageString = fin.read()

8313


In [3]:
def assignMatching(topDesc=0):
    """Assign a description to every node that is its mutual best match.

    Reads the module-level ``matchingMtx`` (one row per node, one column per
    description; a smaller value is a better match) and ``descList``.

    topDesc -- how many leading descriptions (columns) to consider; a value
               of zero or less means all of ``descList``.

    Returns a list with one entry per row of ``matchingMtx``: the index of
    the node's best description when that description's best node is this
    very node, otherwise None.
    """
    descCount = topDesc if topDesc > 0 else len(descList)

    # For every description column, remember the row holding the smallest
    # score seen so far (the earliest row wins ties because of the strict <).
    bestNodeOfDesc = [0] * descCount
    for nodeIdx, scores in enumerate(matchingMtx):
        for descIdx, score in enumerate(scores[:descCount]):
            currentBest = matchingMtx[bestNodeOfDesc[descIdx]][descIdx]
            if score < currentBest:
                bestNodeOfDesc[descIdx] = nodeIdx

    # For every node pick its favourite description, and keep it only when
    # the preference is reciprocated.
    assignment = []
    for nodeIdx, scores in enumerate(matchingMtx):
        favourite = scores.index(min(scores[:descCount]))
        if bestNodeOfDesc[favourite] == nodeIdx:
            assignment.append(favourite)
        else:
            assignment.append(None)
    return assignment

In [4]:
def buildFlattenedTree_recur(nodeDesc, topNode, sizeLimit=0):
    """Flatten the subtree rooted at ``topNode`` into a tuple of dict trees.

    Reads the module-level ``taxoTreeNodes``, ``catName`` and ``descList``.
    Each node record is indexed as: [0] compared against ``sizeLimit``,
    [-3] and [-2] the two child indices (both None for a leaf).

    nodeDesc  -- per-internal-node description index or None, looked up at
                 position ``topNode - len(catName)``.
    topNode   -- index into ``taxoTreeNodes`` of the subtree root.
    sizeLimit -- subtrees whose record[0] is below this are dropped.

    Returns a tuple of ``{'name': ..., 'children': ...}`` dicts: empty when
    the subtree is pruned, a single dict for a leaf or a described internal
    node, and the concatenated child results for an undescribed internal
    node (which is thereby removed from the hierarchy).
    """
    record = taxoTreeNodes[topNode]
    if record[0] < sizeLimit:
        return ()

    left, right = record[-3], record[-2]
    if left is None and right is None:
        leaf = {'name':catName[topNode], 'children':()}
        return (leaf,)

    # Both children are always expanded before the node itself is examined.
    leftForest = buildFlattenedTree_recur(nodeDesc, left, sizeLimit)
    rightForest = buildFlattenedTree_recur(nodeDesc, right, sizeLimit)
    merged = leftForest + rightForest

    descIdx = nodeDesc[topNode - len(catName)]
    if descIdx is None:
        # No description: splice the children into the parent's level.
        return merged
    return ({'name':descList[descIdx], 'children':merged},)
    
def buildFlattenedTree(nodeDesc, sizeLimit=0):
    """Build the flattened taxonomy as a single dict tree.

    The last entry of the module-level ``taxoTreeNodes`` is used as the
    root. When flattening leaves exactly one top-level tree, that tree is
    returned as is; otherwise (zero or several trees) they are gathered
    under a synthetic node named 'ROOT'.

    nodeDesc  -- per-node description assignment, passed through to
                 ``buildFlattenedTree_recur``.
    sizeLimit -- pruning threshold, passed through likewise.
    """
    rootIdx = len(taxoTreeNodes) - 1
    forest = buildFlattenedTree_recur(nodeDesc, rootIdx, sizeLimit)
    if len(forest) != 1:
        return {'name':'ROOT', 'children':forest}
    return forest[0]

def treeSize(tree):
    """Return the number of nodes in ``tree``, counting the root itself.

    ``tree`` is a dict whose 'children' entry is an iterable of dicts of
    the same shape.
    """
    total = 0
    pending = [tree]
    while pending:
        node = pending.pop()
        total += 1
        pending.extend(node['children'])
    return total

In [5]:
def listNodes(tree):
    """Serialise ``tree`` into a flat list in breadth-first order.

    Node ids are positions in the returned list; the root is id 0 and
    children receive consecutive ids in the order they are discovered.

    Returns a list of ``(name, parentId, childIds)`` tuples, where
    ``parentId`` is None for the root and ``childIds`` is a tuple of ids.
    """
    records = []
    pending = deque([(tree, None)])
    idsIssued = 1  # id 0 is already taken by the root
    while pending:
        node, parentId = pending.popleft()
        ownId = len(records)
        kids = list(node['children'])
        firstKidId = idsIssued
        idsIssued += len(kids)
        for kid in kids:
            pending.append((kid, ownId))
        childIds = tuple(range(firstKidId, idsIssued))
        records.append((node['name'], parentId, childIds))
    return records

In [6]:
# Generate one flattened tree per (topDesc, sizeLimit) combination and write
# it out three ways: JSON for the visualisation, an HTML page pointing at that
# JSON, and a gzipped pickle of the breadth-first node list.
topDescLst = [0, 5000, 3000, 2000]
sizeLimitLst = [0, 5]
for topDesc in topDescLst:
    # The node/description assignment depends only on topDesc, so compute it
    # once here rather than once per sizeLimit.
    nodeDesc = assignMatching(topDesc)
    for sizeLimit in sizeLimitLst:
        name = 'taxoTree_' + str(topDesc) + '_' + str(sizeLimit)
        taxoTree = buildFlattenedTree(nodeDesc, sizeLimit)
        with open('vis/data/' + name + '.json', 'w') as fout:
            json.dump(taxoTree, fout)
        with open('vis/' + name + '.html', 'w') as fout:
            # Fill in the template placeholders; the page height is 10 units
            # per tree node. The trailing newline matches what the former
            # ``print >>fout`` statement emitted.
            fout.write(webpageString.replace('##PATH_TO_TREE_JSON##', 'data/' + name + '.json')
                                    .replace('##HEIGHT##', str(10 * treeSize(taxoTree)))
                       + '\n')
        nodeLst = listNodes(taxoTree)
        with gzip.open('model/' + name + '.pickle.gz', 'wb') as fout:
            pickle.dump(nodeLst, fout)
        # Single-argument call form prints identically on Python 2 and 3.
        print(name)

taxoTree_0_0
taxoTree_0_5
taxoTree_5000_0
taxoTree_5000_5
taxoTree_3000_0
taxoTree_3000_5
taxoTree_2000_0
taxoTree_2000_5
