In [10]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
!wget https://raw.githubusercontent.com/facebookresearch/poincare-embeddings/624584cfbad684d74bf034f8dbacd515230bedc4/wordnet/mammals_filter.txt
def get_mammal_relations():
    """Gets mammal subtree by doing transitive closure.
    Taken from https://github.com/facebookresearch/poincare-embeddings.
    """
    
    import re
    import pandas as pd
    import numpy as np
    from nltk.corpus import wordnet as wn
    from tqdm import tqdm 
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')


    
    edges = set()
    for synset in tqdm(wn.all_synsets(pos='n')):
        for hyper in synset.closure(lambda s: s.hypernyms()):
            edges.add((synset.name(), hyper.name()))

        for instance in synset.instance_hyponyms():
            for hyper in instance.closure(lambda s: s.instance_hypernyms()):
                edges.add((instance.name(), hyper.name()))
                for h in hyper.closure(lambda s: s.hypernyms()):
                    edges.add((instance.name(), h.name()))

    nouns = pd.DataFrame(list(edges), columns=['id1', 'id2'])
    nouns['weight'] = 1
    mammal_set = set(nouns[nouns.id2 == 'mammal.n.01'].id1.unique())
    mammal_set.add('mammal.n.01')
    mammals = nouns[nouns.id1.isin(mammal_set) & nouns.id2.isin(mammal_set)]
    with open('mammals_filter.txt', 'r') as fin:
        filt = re.compile(f'({"|".join([l.strip() for l in fin.readlines()])})')
    filtered_mammals = mammals[~mammals.id1.str.cat(' ' + mammals.id2).str.match(filt)]

    nouns.to_csv('noun_closure.csv', index=False)
    filtered_mammals.to_csv('mammal_closure.csv', index=False)
    mammal = pd.read_csv('mammal_closure.csv')
    print('Total unique nodes: ', len(np.unique(list(mammal.id1.values) + list(mammal.id2.values))))
    mammal_relations = [[mammal.id1[i].split('.')[0], mammal.id2[i].split('.')[0]] for i in range(len(mammal))]
    return mammal_relations
    

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
--2023-02-20 12:34:46--  https://raw.githubusercontent.com/facebookresearch/poincare-embeddings/624584cfbad684d74bf034f8dbacd515230bedc4/wordnet/mammals_filter.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 390 [text/plain]
Saving to: ‘mammals_filter.txt.6’


2023-02-20 12:34:47 (18.0 MB/s) - ‘mammals_filter.txt.6’ saved [390/390]



In [25]:
mammal_relations = get_mammal_relations()
print(mammal_relations[0:3])

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
82115it [00:05, 13799.40it/s]


Total unique nodes:  1180
[['kid', 'even-toed_ungulate'], ['crowbait', 'ungulate'], ['addax', 'ungulate']]
