## Analyze the Results of the Category Extract Notebook
This notebook takes the two files generated by the Category Graph notebook and calculates some basic graph stats.

In [17]:
import pickle
import numpy as np
from tqdm import tqdm_notebook
import sys
sys.setrecursionlimit(2000)

In [10]:
def dfs(G, start, visited=None, depth=0, stats=None):
    if visited is None:
        visited = set()
        stats = {'min':-1, 'max':-1, 'paths':[], 'cycles':0}
    visited.add(start)
    if set(G[start]) - visited == set():
        #terminal_type = 'leaf'
        if G[start] != []:
            stats['cycles'] += 1 
            #terminal_type = 'cycle'
        #print('{}: {} @ {}'.format(terminal_type, start, depth))
        if depth < stats['min'] or stats['min'] == -1:
            stats['min'] = depth
        if depth > stats['max']:
            stats['max'] = depth
        stats['paths'].append(depth)
    for next in set(G[start]) - visited:
        dfs(G, next, visited, depth+1, stats)
    return stats
    
def graph_stats(G, roots):
    '''
    Do DFS and report the min and max depths for each "root"
    '''
    stats = {}
    for root in roots:
        stats[root] = dfs(G, root)
    return stats

In [11]:
# simple tree
test1 = {
    'a': ['b', 'c'],
    'b': ['d'],
    'c': ['e'],
    'd': [],
    'e': []
}
stats = graph_stats(test1, ['a'])
print(stats)
print('mean: {}  stdev: {}'.format(np.mean(stats['a']['paths']), np.std(stats['a']['paths'])))

{'a': {'max': 2, 'min': 2, 'cycles': 0, 'paths': [2, 2]}}
mean: 2.0  stdev: 0.0


In [12]:
# deeper, asymetrical tree
test2 = {
    'a': ['b', 'c'],
    'b': ['d', 'e'],
    'c': ['f', 'g'],
    'd': [],
    'e': ['h'],
    'f': [],
    'g': [],
    'h': []
}
stats = graph_stats(test2, ['a'])
print(stats)
print('mean: {}  stdev: {}'.format(np.mean(stats['a']['paths']), np.std(stats['a']['paths'])))

{'a': {'max': 3, 'min': 2, 'cycles': 0, 'paths': [3, 2, 2, 2]}}
mean: 2.25  stdev: 0.4330127018922193


In [13]:
# tree with multiple roots
test3 = {
    'a': ['d', 'c'],
    'b': ['c', 'e'],
    'c': [],
    'd': ['f'],
    'e': ['h'],
    'f': ['g'],
    'g': [],
    'h': []
}
stats = graph_stats(test3, ['a', 'b'])
for root in stats:
    print('*** {} ***'.format(root))
    print(stats[root])
    print('paths: {}'.format(len(stats[root]['paths'])))
    print('mean: {}  stdev: {}'.format(np.mean(stats[root]['paths']), 
                                       np.std(stats[root]['paths'])))
print('total paths: {}'.format(sum([len(stats[root]['paths']) for root in stats])))

*** b ***
{'max': 2, 'min': 1, 'cycles': 0, 'paths': [2, 1]}
paths: 2
mean: 1.5  stdev: 0.5
*** a ***
{'max': 3, 'min': 1, 'cycles': 0, 'paths': [3, 1]}
paths: 2
mean: 2.0  stdev: 1.0
total paths: 4


In [14]:
# graph with cycle
test4 = {
    'a': ['b', 'c'],
    'b': ['d'],
    'c': ['e'],
    'd': [],
    'e': ['a']
}
stats = graph_stats(test4, ['a'])
print(stats)
print('mean: {}  stdev: {}'.format(np.mean(stats['a']['paths']), np.std(stats['a']['paths'])))

{'a': {'max': 2, 'min': 2, 'cycles': 1, 'paths': [2, 2]}}
mean: 2.0  stdev: 0.0


In [18]:
with open('../data/ontology/readable_graph.p', 'rb') as fp:
    readable_graph = pickle.load(fp)
with open('../data/ontology/readable_roots.p', 'rb') as fp:
    readable_roots = pickle.load(fp)

In [19]:
stats = graph_stats(readable_graph, readable_roots)
for stat in tqdm_notebook(stats):
    print('{}: min={} max={} cycles={} paths={} mean={:.2f} stdev={:.2f}'.format(
        stat,
        stats[stat]['min'],
        stats[stat]['max'],  
        stats[stat]['cycles'],
        len(stats[stat]['paths']),
        np.mean(stats[stat]['paths']),
        np.std(stats[stat]['paths'])))

education: min=1 max=1438 cycles=8096 paths=632829 mean=1006.25 stdev=410.19
rights: min=1 max=1458 cycles=8259 paths=632779 mean=1010.24 stdev=422.94
hunger: min=1 max=1459 cycles=8162 paths=632690 mean=1001.81 stdev=429.25
space: min=1 max=1413 cycles=8054 paths=631576 mean=979.26 stdev=409.49
energy: min=1 max=1409 cycles=8125 paths=631122 mean=979.31 stdev=405.89
prevention: min=1 max=1417 cycles=8142 paths=632226 mean=974.81 stdev=412.66
aid: min=1 max=1397 cycles=8044 paths=631882 mean=979.38 stdev=395.38
biotechnology: min=1 max=1441 cycles=8311 paths=632608 mean=989.50 stdev=410.80
bioinformatics: min=1 max=1385 cycles=8015 paths=631482 mean=984.20 stdev=399.42
biological engineering: min=1 max=1440 cycles=8025 paths=632214 mean=989.50 stdev=427.60
justice: min=1 max=1384 cycles=8122 paths=632251 mean=973.50 stdev=393.04
political philosphy: min=0 max=0 cycles=0 paths=1 mean=0.00 stdev=0.00
environmental economics: min=1 max=1429 cycles=8136 paths=632113 mean=973.78 stdev=412.1