In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import numpy as np

from treelib import Node, Tree
from gensim.models.doc2vec import Doc2Vec


In [3]:
def translate_to_tree(root, max_depth=2, n_terms=10):
    """Convert a taxonomy node hierarchy into a ``treelib.Tree``.

    The root is added with its name as both tag and identifier. Every
    descendant within ``max_depth`` levels is added with the result of
    ``node.data.top_terms(n_terms)`` as its tag and ``node.name`` as its
    identifier, attached under its parent's name. Nodes are inserted in
    depth-first pre-order (a child is followed by its own descendants before
    its next sibling).

    Args:
        root: Taxonomy root. Each node must expose ``name`` and ``children``;
            every non-root node must also expose ``data.top_terms(k)``.
        max_depth: Number of levels below the root to include. The default of
            2 keeps children and grandchildren only. ``None`` includes the
            whole hierarchy.
        n_terms: Argument forwarded to ``top_terms`` when building node tags.

    Returns:
        The populated ``Tree``.
    """
    tree = Tree()
    tree.create_node(root.name, root.name)

    def _attach_children(parent, depth):
        # Stop before touching `parent.children` once the depth budget is
        # spent, so nodes at the cut-off level are never inspected further.
        if max_depth is not None and depth > max_depth:
            return
        for node in parent.children:
            tree.create_node(node.data.top_terms(n_terms), node.name, parent=parent.name)
            _attach_children(node, depth + 1)

    _attach_children(root, 1)
    return tree

### Training Doc2Vec model for robustness metric

In [3]:
# Using the DBLP dataset from the original TaxoGen paper.
# Each line of the file becomes one document: a list of whitespace-separated tokens.
with open('input/documents.txt', 'r') as f:
    documents = [line.strip().split() for line in f]

In [4]:
from gensim.models.doc2vec import TaggedDocument

# Wrap each token list in a TaggedDocument whose single tag is its index in
# `documents`. The list is updated in place, so the isinstance guard keeps the
# cell idempotent: re-running it would otherwise wrap already-tagged documents
# in a second TaggedDocument.
for i, doc in enumerate(documents):
    if not isinstance(doc, TaggedDocument):
        documents[i] = TaggedDocument(doc, [i])

In [None]:
# Build the document-embedding model from `documents` using the project-local
# `embedding.doc2vec_embedding` helper.
# NOTE(review): presumably this trains a gensim Doc2Vec model (it is saved and
# later reloaded via Doc2Vec.load) -- confirm against the helper's source.
from embedding import doc2vec_embedding

model = doc2vec_embedding(documents)

In [6]:
# Persist the model to disk; a later cell reloads it from this same path.
model.save('output/doc2vec.model')

In [4]:
# Load the previously saved Doc2Vec model from disk and bind it to `model`,
# which the evaluation cells below pass to create_analysis_units.
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('output/doc2vec.model')

### Word2vec taxonomy

In [5]:
from evaluation.metrics import robustness, create_analysis_units

In [12]:
# Robustness evaluation for the taxonomy stored in the skip-gram pickle.
filename = 'output/skipgram_taxonomy.pkl'

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load taxonomy pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(filename, 'rb') as pkl_file:
    root = pickle.load(pkl_file)

analysis_units = create_analysis_units(translate_to_tree(root), model)

result = robustness(analysis_units)
print(result)

Robustness: 0.32 | Units: 5 | Total unit nodes: 25
	Unit: agent_based_modeling_and_simulation | Nodes: 5 | Outside nodes: 45 | Outside proportion: 0.45
	Unit: information_theoretic_security | Nodes: 5 | Outside nodes: 48 | Outside proportion: 0.48
	Unit: population_sizing | Nodes: 5 | Outside nodes: 74 | Outside proportion: 0.74
	Unit: hownet | Nodes: 5 | Outside nodes: 82 | Outside proportion: 0.82
	Unit: small_sample_size_problem | Nodes: 5 | Outside nodes: 91 | Outside proportion: 0.91



0.372 0.32 0.410

### FastText taxonomy

In [7]:
# Robustness evaluation for the taxonomy stored in the FastText pickle.
filename = 'output/fasttext_taxonomy.pkl'

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load taxonomy pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(filename, 'rb') as pkl_file:
    root = pickle.load(pkl_file)

analysis_units = create_analysis_units(translate_to_tree(root), model)

result = robustness(analysis_units)
print(result)

Robustness: 0.43400000000000005 | Units: 5 | Total unit nodes: 25
	Unit: chosen_plaintext | Nodes: 5 | Outside nodes: 1 | Outside proportion: 0.01
	Unit: ls_svms | Nodes: 5 | Outside nodes: 52 | Outside proportion: 0.52
	Unit: geoclef | Nodes: 5 | Outside nodes: 59 | Outside proportion: 0.59
	Unit: infosleuth | Nodes: 5 | Outside nodes: 79 | Outside proportion: 0.79
	Unit: policy_gradient | Nodes: 5 | Outside nodes: 92 | Outside proportion: 0.92



0.472 0.424 0.434