In [1]:
from preprocessing.EntityNameRetriever import EntityNameRetriever
from preprocessing.graph import graph_from_edgelist, remove_void_types
import networkx as nx
from preprocessing.CorpusManager import CorpusManager
import pickle
from preprocessing.utils import save_data_with_pickle, load_data_with_pickle
import time
import random

# Directory in which the pickled utility files are stored.
# Previously: '../../source_files/pickles/' (relative location, kept for reference).
PICKLES_PATH = '/datahdd/vmanuel/MTNCI_datasets/source_files/pickles/'

# Edge list from which the ontology graph is built.
PATH_TO_EDGELIST = '/datahdd/vmanuel/MTNCI_datasets/source_files/pickles/dbpedia_edgelist_no_closure.tsv'
# Corpus from which the information is extracted.
CORPUS_PATH = '/datahdd/vmanuel/ELMo/Corpora/shuffled_text_with_words'
LOG = 'MEGAlog_4_3.txt'

# Run identifier; it prefixes the file name of every pickle listed below.
FILE_ID = '16_3'

GRAPH_PATH = '{}{}graph'.format(PICKLES_PATH, FILE_ID)
ENTITY_DICT_PATH = '{}{}entity_dict'.format(PICKLES_PATH, FILE_ID)
WORD_INDEXES_PATH = '{}{}word_indexes'.format(PICKLES_PATH, FILE_ID)
WORD_OCCURRENCE_INDEX_PATH = '{}{}word_occurrence_indexes'.format(PICKLES_PATH, FILE_ID)
FOUND_ENTITY_DICT_PATH = '{}{}found_entity_dict'.format(PICKLES_PATH, FILE_ID)
FINAL_TREE_PATH = '{}{}final_tree'.format(PICKLES_PATH, FILE_ID)

# Passed as the second argument of CorpusManager.read_corpus;
# presumably the number of corpus rows to read -- TODO confirm.
LENGTH = 100000

avoid_multilabeling = True

In [6]:
# Instantiate the project's entity-name retriever.
e = EntityNameRetriever()
# Load the pickled entity dictionary stored at ENTITY_DICT_PATH.
# Downstream cells iterate it with .items() and treat each value as a list of entities.
entity_dict = load_data_with_pickle(ENTITY_DICT_PATH)

In [5]:
# Load the pickled ontology graph.
G = load_data_with_pickle(GRAPH_PATH)

# A type is "void" when its entity list is exactly the empty list.
void_types = [
    type_name
    for type_name, members in entity_dict.items()
    if members == []
]

# remove_void_types is a project helper; presumably it returns the graph
# without the void types -- TODO confirm against preprocessing.graph.
pruned_G = remove_void_types(G, void_types)

In [11]:
# Every node of the pruned graph is used as a class (concept) name.
list_of_classes = list(pruned_G.nodes())

# Read the corpus, then register the entities of those concepts with the manager.
c = CorpusManager()
c.read_corpus(CORPUS_PATH, LENGTH)
c.create_all_entities(entity_dict, concepts=list_of_classes)

# Occurrence index of the entities, loaded from its pickle.
occurrences_of_entities = load_data_with_pickle(WORD_OCCURRENCE_INDEX_PATH)


read input corpus


100%|██████████| 100000/100000 [00:04<00:00, 23952.65it/s]


283021 words in vocab
118 entities
0 entity-tokens in vocab


In [7]:
# Display the class names taken from the nodes of the pruned graph.
list_of_classes

['NaturalEvent',
 'Earthquake',
 'SolarEclipse',
 'StormSurge',
 'Province',
 'HistoricalProvince',
 'TimePeriod',
 'CareerStation',
 'GeologicalPeriod',
 'HistoricalPeriod',
 'PeriodOfArtisticStyle',
 'PrehistoricalPeriod',
 'ProtohistoricalPeriod',
 'Year',
 'YearInSpaceflight',
 'AnatomicalStructure',
 'Artery',
 'BloodVessel',
 'Bone',
 'Brain',
 'Embryology',
 'Ligament',
 'Lymph',
 'Muscle',
 'Nerve',
 'Vein',
 'Territory',
 'OldTerritory',
 'Beverage',
 'Beer',
 'Vodka',
 'Wine',
 'ControlledDesignationOfOriginWine',
 'Department',
 'OverseasDepartment',
 'SportsSeason',
 'MotorsportSeason',
 'SportsTeamSeason',
 'BaseballSeason',
 'FootballLeagueSeason',
 'NCAATeamSeason',
 'SoccerClubSeason',
 'SoccerLeagueSeason',
 'FloweringPlant',
 'Grape',
 'SportCompetitionResult',
 'OlympicResult',
 'SnookerWorldRanking',
 'Tower',
 'Lighthouse',
 'WaterTower',
 'ReligiousBuilding',
 'Abbey',
 'Church',
 'Monastery',
 'Mosque',
 'Shrine',
 'Synagogue',
 'Temple',
 'MotorcycleRider',
 'Mo

In [None]:
# Entities that have at least one recorded occurrence.
found_entities = set(occurrences_of_entities.keys())

# For every concept keep only its entities that were found;
# concepts left with no found entity are dropped altogether.
found_entity_dict = {
    concept_name: matched
    for concept_name, matched in (
        (name, set(members).intersection(found_entities))
        for name, members in entity_dict.items()
    )
    if matched
}

In [None]:
# Attach the pickled word indexes (WORD_INDEXES_PATH) to the corpus manager.
c.word_indexes = load_data_with_pickle(WORD_INDEXES_PATH)


In [None]:
# Spot check: show the occurrences recorded for the entity 'aakash'.
# Presumably (row, token-indexes) pairs, as consumed by the export cell -- TODO confirm.
occurrences_of_entities['aakash']

In [None]:
# Display the concept -> found-entities mapping.
found_entity_dict

In [None]:
# Spot check: ask the corpus manager to check entity 'aakash' in corpus row 46355, verbosely.
# NOTE(review): the row number is hard-coded; presumably it was read off the
# occurrences of 'aakash' -- confirm it is still valid for the current corpus.
c.check_entity_in_row(ENTITY = 'aakash', ROW = 46355, verbose= True)

In [None]:
from collections import defaultdict

# Invert concept -> entities into entity -> [concepts]; an entity that belongs
# to several concepts collects all of them (multilabel).
entity_concept_dict = defaultdict(list)

for concept in found_entity_dict:
    entities = found_entity_dict[concept]
    for entity in entities:
        entity_concept_dict[entity].append(concept)

In [8]:
from collections import defaultdict

# Directory of the filtered dataset for the current run; FILE_ID ('16_3') comes
# from the configuration cell, so the run identifier is defined in one place only.
FILTERED_DATASET_PATH = '../source_files/vectors/' + FILE_ID + '/'

# Entities (mentions) of each split.
E_TRAIN_PATH = FILTERED_DATASET_PATH + 'filtered_entities_train'
E_VAL_PATH = FILTERED_DATASET_PATH + 'filtered_entities_val'
E_TEST_PATH = FILTERED_DATASET_PATH + 'filtered_entities_test'

E_train = load_data_with_pickle(E_TRAIN_PATH)
E_val = load_data_with_pickle(E_VAL_PATH)
E_test = load_data_with_pickle(E_TEST_PATH)

# Labels of each split, aligned position by position with the entities above.
Y_TRAIN_PATH = FILTERED_DATASET_PATH + 'filtered_Y_train'
Y_VAL_PATH = FILTERED_DATASET_PATH + 'filtered_Y_val'
Y_TEST_PATH = FILTERED_DATASET_PATH + 'filtered_Y_test'

Y_train = load_data_with_pickle(Y_TRAIN_PATH)
Y_val = load_data_with_pickle(Y_VAL_PATH)
Y_test = load_data_with_pickle(Y_TEST_PATH)

# Single-label mapping entity -> [label].
# NOTE: this rebinds `entity_dict`, replacing the dictionary loaded from
# ENTITY_DICT_PATH. When an entity occurs more than once, the last pair seen
# wins (order: train, val, test).
entity_dict = defaultdict(list)

# The loop variables are deliberately NOT named `e`, `E` or `Y`: `e` is the
# EntityNameRetriever instance created earlier and must not be overwritten.
for split_labels, split_entities in [(Y_train, E_train), (Y_val, E_val), (Y_test, E_test)]:
    # zip() would silently truncate misaligned data, so fail loudly instead.
    if len(split_labels) != len(split_entities):
        raise ValueError(
            'labels and entities are misaligned: {} labels vs {} entities'.format(
                len(split_labels), len(split_entities)))
    for label, mention in zip(split_labels, split_entities):
        entity_dict[mention] = [label]

In [12]:
# Display the single-label entity -> [label] mapping assembled from the filtered splits.
entity_dict

defaultdict(list,
            {'pashtuns': ['EthnicGroup'],
             'kilometres': ['Aircraft'],
             'art': ['TelevisionShow'],
             'history': ['Book'],
             'family': ['Band'],
             'philippines': ['Place'],
             'poem': ['Poem'],
             'ophichthus fasciatus': ['Fish'],
             'february': ['Holiday'],
             'eisenberg thuringia': ['Town'],
             'perth': ['Place'],
             'town': ['Settlement'],
             'meyers': ['Aircraft'],
             'election': ['Election'],
             'writer': ['Person'],
             'census': ['AnatomicalStructure'],
             'comune': ['Organisation'],
             'sinocyclocheilus': ['Fish'],
             'daughter': ['Mammal'],
             'tributary': ['River'],
             'press': ['Publisher'],
             'gretchen peters': ['Artist'],
             'soleidae': ['Animal'],
             'pruchnik': ['Town'],
             'john': ['Single'],
             'stan

In [10]:
def write_choi_like_dataset(out_dict, test_entities=None,
                            out_path='./choi-like-datasets/test.json'):
    """Append one example to a Choi-style JSON-lines file if it is a test example.

    The example is written only when its ``'mention_span'`` belongs to the
    test entities; otherwise the call does nothing (the file is not opened).

    Parameters
    ----------
    out_dict : dict
        JSON-serialisable example; must contain the key ``'mention_span'``.
    test_entities : container, optional
        Entities of the test split. When omitted, the notebook-level
        ``E_test`` is used (the original behaviour).
    out_path : str, optional
        File the example is appended to, one JSON object per line.
        Defaults to the path the original implementation used.

    Returns
    -------
    None
    """
    # Local import: the notebook imports `json` only in a later cell, so a
    # module-level lookup could fail with NameError on a fresh kernel.
    import json

    if test_entities is None:
        test_entities = E_test

    if out_dict['mention_span'] not in test_entities:
        return

    # Append mode: examples accumulate across calls.
    with open(out_path, 'a') as out:
        out.write(json.dumps(out_dict) + '\n')

In [15]:
from tqdm.notebook import tqdm
import json

# Sets give O(1) membership tests; the split an entity belongs to is decided
# once per entity instead of once per occurrence.
test_entities = set(E_test)
train_entities = set(E_train)
val_entities = set(E_val)

# Write one JSON object per line (Choi-style fields) for every occurrence of
# every entity, routed to the file of the split the entity belongs to.
with open('./choi-like-datasets/single_label_test.json', 'w') as out_test, \
     open('./choi-like-datasets/single_label_train.json', 'w') as out_train, \
     open('./choi-like-datasets/single_label_val.json', 'w') as out_val:

    for entity, concept in tqdm(entity_dict.items()):
        # Same priority as before: test, then train, then val.
        if entity in test_entities:
            out_file = out_test
        elif entity in train_entities:
            out_file = out_train
        elif entity in val_entities:
            out_file = out_val
        else:
            # Entity in no split: nothing would be written for it.
            continue

        # Number of whitespace-separated tokens covered by the mention.
        mention_length = len(entity.split(' '))

        for row, indexes in occurrences_of_entities[entity]:
            tokens = c.corpus[row]
            for index in indexes:
                out_dict = {
                    "y_str": concept,
                    "annot_id": 'MTNCI',
                    "mention_span": entity,
                    # Fix: the tokens BEFORE the mention are the left context and
                    # the tokens AFTER it are the right context (the two keys
                    # were swapped).
                    "left_context_token": tokens[:index],
                    "right_context_token": tokens[index + mention_length:],
                }
                out_file.write(json.dumps(out_dict) + '\n')
                    

HBox(children=(FloatProgress(value=0.0, max=25350.0), HTML(value='')))




In [None]:
# Number of concepts that have at least one found entity.
len(found_entity_dict)

In [None]:
# Persist the pruned ontology graph as the multilabel final tree.
# NOTE(review): the path hard-codes '16_3' and does not use FILE_ID / FINAL_TREE_PATH
# from the configuration cell -- confirm this separate location is intended.
save_data_with_pickle('../source_files/pickles/16_3_multilabel_final_tree', pruned_G)

In [None]:
# Persist the multilabel entity -> [concepts] mapping next to the notebook.
save_data_with_pickle('./entity_concepts_dict_(multilabel).pkl', entity_concept_dict)

In [19]:
# Distinct labels in use: the first (only) label of every entity.
{labels[0] for labels in entity_dict.values()}

{'Actor',
 'AdministrativeRegion',
 'Aircraft',
 'Airline',
 'Amphibian',
 'AnatomicalStructure',
 'Animal',
 'Arachnid',
 'Architect',
 'Artist',
 'ArtistDiscography',
 'Athlete',
 'AustralianRulesFootballPlayer',
 'Award',
 'Band',
 'BaseballPlayer',
 'BasketballPlayer',
 'Bird',
 'BodyOfWater',
 'Book',
 'Boxer',
 'Building',
 'BusCompany',
 'City',
 'Cleric',
 'Colour',
 'ComicsCharacter',
 'ComicsCreator',
 'Cricketer',
 'Crustacean',
 'Currency',
 'Cyclist',
 'Dam',
 'Diocese',
 'Disease',
 'EducationalInstitution',
 'Election',
 'EthnicGroup',
 'Eukaryote',
 'Fashion',
 'FictionalCharacter',
 'Film',
 'Fish',
 'Food',
 'Fungus',
 'GivenName',
 'Gymnast',
 'HistoricBuilding',
 'HistoricPlace',
 'Holiday',
 'Hospital',
 'Insect',
 'Island',
 'Lake',
 'Language',
 'Legislature',
 'Mammal',
 'MemberOfParliament',
 'MilitaryStructure',
 'MilitaryUnit',
 'Mollusca',
 'Monarch',
 'Mountain',
 'MountainRange',
 'Museum',
 'Musical',
 'MusicalWork',
 'Noble',
 'OfficeHolder',
 'Organisat

In [20]:
import torch
# Load the object serialized in '16_3_nickel.pth' and display it; the recorded
# output shows a dict with (at least) 'conf' and 'objects' entries.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints from a trusted source.
a = torch.load('../source_files/embeddings/16_3_nickel.pth')
a

{'conf': {'checkpoint': 'dbpedia.pth',
  'dset': 'dbpedia/MTNCI_single_label_edgelist.csv',
  'dim': 10,
  'manifold': 'poincare',
  'model': 'distance',
  'lr': 0.3,
  'epochs': 300,
  'batchsize': 10,
  'negs': 50,
  'burnin': 20,
  'dampening': 0.75,
  'ndproc': 4,
  'eval_each': 1,
  'fresh': True,
  'debug': False,
  'gpu': -1,
  'sym': False,
  'maxnorm': 500000,
  'sparse': True,
  'burnin_multiplier': 0.01,
  'neg_multiplier': 1.0,
  'quiet': False,
  'lr_type': 'constant',
  'train_threads': 2,
  'margin': 0.1,
  'eval': 'reconstruction',
  'epoch_start': 0},
 'objects': ['NaturalEvent',
  'Earthquake',
  'SolarEclipse',
  'StormSurge',
  'Province',
  'HistoricalProvince',
  'TimePeriod',
  'CareerStation',
  'GeologicalPeriod',
  'HistoricalPeriod',
  'PeriodOfArtisticStyle',
  'PrehistoricalPeriod',
  'ProtohistoricalPeriod',
  'Year',
  'YearInSpaceflight',
  'AnatomicalStructure',
  'Artery',
  'BloodVessel',
  'Bone',
  'Brain',
  'Embryology',
  'Ligament',
  'Lymph',
 

In [21]:
from MTNCI import MTNCI

In [23]:
# Instantiate MTNCI with two output specifications, one tagged 'euclid' and one
# tagged 'poincare'. The meaning of each 'dim' list and of `dims` is defined by
# the MTNCI class -- presumably layer sizes; TODO confirm against MTNCI.py.
model = MTNCI(
    input_d=1024,
    out_spec=[
        {'manifold': 'euclid', 'dim': [64, 10]},
        {'manifold': 'poincare', 'dim': [128, 128, 10]},
    ],
    dims=[512, 512],
)


In [27]:
def get_n_params(model):
    """Return the total number of scalar entries over all parameters of ``model``.

    ``model`` must expose ``parameters()``, and every parameter must expose
    ``size()`` returning an iterable of dimension extents. A parameter whose
    size is empty counts as one entry; a model without parameters gives 0.
    """
    def _entries(shape):
        # Product of the dimension extents (1 for an empty shape).
        count = 1
        for extent in shape:
            count *= extent
        return count

    return sum(_entries(param.size()) for param in model.parameters())

In [28]:
# Total number of parameter entries of the MTNCI model.
get_n_params(model)

907132

In [32]:
# NOTE(review): `lopez_data` is not defined in any visible cell of this notebook;
# this cell relies on leftover kernel state and will raise NameError after
# Restart & Run All unless the cell that loads it is restored.
len(lopez_data)

107253