In [1]:
import networkx as nx
from tqdm import tqdm
import random
import numpy as np
import pickle
import os
import glob
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn


# Seed both the stdlib and NumPy global RNGs with the same value so that any
# random draws made later in the notebook are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Build one directed graph over WordNet synset names with an edge
# hypernym -> hyponym for every direct hyponym relation.
# Nouns ('n') are processed first, then verbs ('v'); both share the graph G.
G = nx.DiGraph()
for pos in ('n', 'v'):
    for synset in wn.all_synsets(pos):
        name = synset.name()
        # Register the synset explicitly so that synsets without any hyponym
        # still end up as nodes.
        G.add_node(name)
        for hypo in synset.hyponyms():
            # add_edge creates the hyponym node itself when it is missing.
            G.add_edge(name, hypo.name())


In [39]:
# Load the pickled psychology test nodes. Later cells iterate this object as
# (child, parents) pairs, where parents is a collection of parent names.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('../data/psychology/test_nodes.pickle', 'rb') as f:
    test = pickle.load(f)

In [40]:
# Coverage of the test pairs by the WordNet graph G.
#   total_parents   - number of parent mentions seen over all test pairs
#   parents_in_wnet - parent mentions that match a noun synset name in G
#   child_in_wnet   - children that match a noun synset name in G
#   found_child     - (underscored child, original parents) for matched children
# A term "matches" when any of the candidate names '<term>.n.00' ... '<term>.n.09'
# (spaces replaced by underscores) is a node of G.
total_parents = 0
parents_in_wnet = 0
child_in_wnet = 0
found_child = []

for child, parents in test:
    child = child.replace(' ', '_')
    if any(f'{child}.n.0{sense}' in G for sense in range(10)):
        child_in_wnet += 1
        found_child.append((child, parents))

    for parent in parents:
        total_parents += 1
        parent_key = parent.replace(' ', '_')
        if any(f'{parent_key}.n.0{sense}' in G for sense in range(10)):
            parents_in_wnet += 1

In [41]:
# Display the coverage counters: parent mentions seen, parent mentions found in
# G, children found in G, and the number of retained (child, parents) pairs.
total_parents, parents_in_wnet, child_in_wnet, len(found_child)

(1286, 801, 55, 55)

In [42]:
# Keep only those (child, parents) pairs from found_child for which at least
# one parent matches a noun synset name in G, i.e. one of the candidate names
# '<parent>.n.00' ... '<parent>.n.09' (spaces replaced by underscores) is a node.
# Each pair is kept at most once, with its full original parents list.
#
# This cell deliberately does not touch total_parents: that counter belongs to
# the coverage cell, and incrementing it here would inflate it on every run.
found_parents = []
for child, parents in found_child:
    has_known_parent = any(
        f"{parent.replace(' ', '_')}.n.0{sense}" in G
        for parent in parents
        for sense in range(10)
    )
    if has_known_parent:
        found_parents.append((child, parents))

In [43]:
# Number of pairs whose child and at least one parent were both found in G.
len(found_parents)

42

In [44]:
# Spot-check one retained pair. The child has spaces replaced by underscores,
# while the parents list keeps the original spelling.
found_parents[2]

('physiological_nystagmus', ['eye movement'])

In [45]:
# For every retained pair, resolve the child to the first candidate noun synset
# name present in G, then do the same for each of its parents. For each parent
# that resolves, record:
#   paths - length of the shortest directed path parent -> child in G,
#           or -1 when G has no such path
#   cp    - the resolved (parent_synset_name, child_synset_name), index-aligned
#           with paths
# Parents that do not resolve are skipped and add nothing to either list.
#
# This cell deliberately does not touch total_parents: that counter belongs to
# the coverage cell, and incrementing it here would inflate it on every run.
paths = []
cp = []
for child, parents in found_parents:
    child = child.replace(' ', '_')
    for child_sense in range(10):
        child_true_name = f'{child}.n.0{child_sense}'
        if child_true_name not in G:
            continue

        for parent in parents:
            parent = parent.replace(' ', '_')
            # A separate index name keeps the parent search from clobbering the
            # child's loop variable.
            for parent_sense in range(10):
                true_name = f'{parent}.n.0{parent_sense}'
                if true_name in G:
                    try:
                        paths.append(nx.shortest_path_length(G, true_name, child_true_name))
                    except nx.NetworkXNoPath:
                        paths.append(-1)

                    cp.append((true_name, child_true_name))
                    break
        # Only the first matching child sense is used.
        break

In [46]:
# Print the test entry (child and its parents) whose child is exactly 'afterimage'.
for child_name, parent_names in test:
    if child_name != 'afterimage':
        continue
    print(child_name, parent_names)

In [47]:
# Boolean mask (True = a parent -> child path exists) for entries from index 28
# onward, together with the resolved (parent, child) synset pair at index 29.
(np.array(paths) != -1)[28:], cp[29]

(array([False,  True, False, False,  True, False, False, False, False,
         True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True, False, False,
        False, False]),
 ('anticonvulsant.n.01', 'ethosuximide.n.01'))

In [48]:
# Sanity check of the graph direction: shortest directed path length from
# 'ant.n.01' to 'formica_fusca.n.01' in G.
nx.shortest_path_length(G, 'ant.n.01', 'formica_fusca.n.01')

2

In [49]:
# Summary of the path search: the lengths of the paths that exist, their mean,
# the number of pairs without a path (-1), and the total number of pairs.
paths_arr = np.array(paths)
reachable = paths_arr[paths_arr != -1]
reachable, reachable.mean(), (paths_arr == -1).sum(), len(paths)

(array([2, 2, 1, 0, 1, 4]), 1.6666666666666667, 51, 57)

In [50]:
# Read the full psychology taxonomy from a tab-delimited edge list into a
# directed graph. Later cells treat a node's predecessors as its parents.
mag = nx.read_edgelist('../data/psychology/all.edgelist', create_using=nx.DiGraph, delimiter='\t')

In [51]:
# Coverage of the whole mag taxonomy by the WordNet graph G. Every node of mag
# is treated as a child whose parents are its direct predecessors.
#   total_parents   - number of parent mentions seen over all nodes
#   parents_in_wnet - parent mentions that match a noun synset name in G
#   child_in_wnet   - nodes that match a noun synset name in G
#   found_child     - (underscored node name, original parents) for matched nodes
# A term "matches" when any of the candidate names '<term>.n.00' ... '<term>.n.09'
# (spaces replaced by underscores) is a node of G.
total_parents = 0
parents_in_wnet = 0
child_in_wnet = 0
found_child = []

for node in mag:
    child = node.replace(' ', '_')
    parents = list(mag.predecessors(node))

    if any(f'{child}.n.0{sense}' in G for sense in range(10)):
        child_in_wnet += 1
        found_child.append((child, parents))

    for parent in parents:
        total_parents += 1
        parent_key = parent.replace(' ', '_')
        if any(f'{parent_key}.n.0{sense}' in G for sense in range(10)):
            parents_in_wnet += 1

In [52]:
# Display the coverage counters for mag: parent mentions seen, parent mentions
# found in G, nodes found in G, and the number of retained (child, parents) pairs.
total_parents, parents_in_wnet, child_in_wnet, len(found_child)

(30032, 18989, 1173, 1173)

In [53]:
# Node and edge counts of mag. Each edge contributes exactly one parent mention,
# so the edge count should equal total_parents from the coverage cell.
len(mag.nodes()), len(mag.edges())

(23156, 30032)

In [54]:
# Spot-check the first retained (child, parents) pair.
found_child[0]

('perphenazine', ['psychiatry'])

In [55]:
# Keep only those (child, parents) pairs from found_child for which at least
# one parent matches a noun synset name in G, i.e. one of the candidate names
# '<parent>.n.00' ... '<parent>.n.09' (spaces replaced by underscores) is a node.
# Each pair is kept at most once, with its full original parents list.
#
# This cell deliberately does not touch total_parents: that counter belongs to
# the coverage cell, and incrementing it here would inflate it on every run.
found_parents = []
for child, parents in found_child:
    has_known_parent = any(
        f"{parent.replace(' ', '_')}.n.0{sense}" in G
        for parent in parents
        for sense in range(10)
    )
    if has_known_parent:
        found_parents.append((child, parents))

In [56]:
# Number of pairs whose child and at least one parent were both found in G.
len(found_parents)

1008

In [59]:
# For every retained pair, resolve the child to the first candidate noun synset
# name present in G, then do the same for each of its parents. For each parent
# that resolves, append to paths the length of the shortest directed path
# parent -> child in G, or -1 when G has no such path. Parents that do not
# resolve are skipped.
#
# This cell deliberately does not touch total_parents: that counter belongs to
# the coverage cell, and incrementing it here would inflate it on every run.
paths = []
for child, parents in found_parents:
    child = child.replace(' ', '_')
    for child_sense in range(10):
        child_true_name = f'{child}.n.0{child_sense}'
        if child_true_name not in G:
            continue

        for parent in parents:
            parent = parent.replace(' ', '_')
            # A separate index name keeps the parent search from clobbering the
            # child's loop variable.
            for parent_sense in range(10):
                true_name = f'{parent}.n.0{parent_sense}'
                if true_name in G:
                    try:
                        paths.append(nx.shortest_path_length(G, true_name, child_true_name))
                    except nx.NetworkXNoPath:
                        paths.append(-1)
                    break
        # Only the first matching child sense is used.
        break

In [60]:
# Summary of the path search: how many pairs have a path, the mean length of
# those paths, the number of pairs without a path (-1), and the total number
# of pairs.
paths_arr = np.array(paths)
reachable = paths_arr[paths_arr != -1]
len(reachable), reachable.mean(), (paths_arr == -1).sum(), len(paths)

(105, 1.7142857142857142, 1241, 1346)

In [61]:
# Number of pairs with path length 0, i.e. the parent and the child resolved to
# the same synset name.
(np.array(paths) == 0).sum()

10