In [1]:
from transformers import LongformerForMaskedLM, LongformerModel, LongformerTokenizer, LongformerConfig, AdamW, get_linear_schedule_with_warmup
from transformers.models.longformer.modeling_longformer import LongformerLMHead, _compute_global_attention_mask

# Generate entity_classes.json and relation_classes.json

In [4]:
import json
  
# Load the DocRED type definitions (the file has 'entities' and 'relations' sections).
# The context manager closes the file handle as soon as parsing is done, and the
# explicit encoding keeps the read independent of the platform's default encoding.
with open('../data/docred_joint/types.json', encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Pull the two top-level sections out of the parsed types file.
entities, relations = data['entities'], data['relations']

In [7]:
# Only the entity type names (the dict keys) are needed; iterating a dict
# yields its keys in insertion order.
entities_simplified = list(entities)

In [8]:
entities_simplified

['LOC', 'MISC', 'PER', 'NUM', 'TIME', 'ORG']

In [16]:
# Turn each relation's 'verbose' label into a CamelCase class name:
# title-case every word, then strip the spaces ('capital of' -> 'CapitalOf').
# Only spaces are removed, so other punctuation such as commas stays in the name.
relations_simplified = [
    info['verbose'].title().replace(" ", "")
    for info in relations.values()
]

In [17]:
relations_simplified

['CapitalOf',
 'Conflict',
 'Genre',
 'Operator',
 'LocatedInTheAdministrativeTerritorialEntity',
 'HasPart',
 'LanguagesSpoken,WrittenOrSigned',
 'LocatedInOrNextToBodyOfWater',
 'BasinCountry',
 'OriginalNetwork',
 'OwnedBy',
 'Publisher',
 'Composer',
 'NarrativeLocation',
 'Subsidiary',
 'InfluencedBy',
 'LocationOfFormation',
 'SisterCity',
 'Dissolved,AbolishedOrDemolished',
 'LegislativeBody',
 'FoundedBy',
 'League',
 'Country',
 'PlaceOfBirth',
 'Sibling',
 'HeadOfGovernment',
 'Location',
 'AppliesToJurisdiction',
 'StartTime',
 'EndTime',
 'PointInTime',
 'MemberOf',
 'LyricsBy',
 'Characters',
 'RecordLabel',
 'Employer',
 'MemberOfPoliticalParty',
 'Mother',
 'CountryOfCitizenship',
 'Spouse',
 'PlaceOfDeath',
 'Father',
 'SeparatedFrom',
 'NotableWork',
 'SubclassOf',
 'TerritoryClaimedBy',
 'PublicationDate',
 'DateOfDeath',
 'Inception',
 'Developer',
 'Series',
 'ProductionCompany',
 'Creator',
 'ParentTaxon',
 'EthnicGroup',
 'Performer',
 'Manufacturer',
 'PositionHe

In [18]:
# Both class lists are stored under a single "docred" key.
entity_dict = {"docred": entities_simplified}
relation_dict = {"docred": relations_simplified}

# Write the entity classes first, then the relation classes.
for class_path, class_payload in (
    ("../data/docred_joint/entity_classes.json", entity_dict),
    ("../data/docred_joint/relation_classes.json", relation_dict),
):
    with open(class_path, "w") as outfile:
        json.dump(class_payload, outfile)

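# Create the test JSONL (the cells below load test_joint.json; switch the input and output paths to build the train split)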

In [176]:
# Load the DocRED split that the following cells convert.
# NOTE: this rebinds `data`, which previously held the parsed types file.
# The context manager closes the file handle once parsing is done, and the
# explicit encoding keeps the read independent of the platform's default encoding.
with open('../data/docred_joint/test_joint.json', encoding='utf-8') as f:
    data = json.load(f)

In [177]:
data[0]

{'vertexSet': [[{'name': 'Jeff Prosserman',
    'pos': [0, 2],
    'sent_id': 0,
    'type': 'PER'},
   {'type': 'PER', 'pos': [7, 8], 'name': 'Prosserman', 'sent_id': 7},
   {'type': 'PER', 'pos': [6, 7], 'name': 'Prosserman', 'sent_id': 1},
   {'type': 'PER', 'pos': [7, 8], 'name': 'Prosserman', 'sent_id': 6},
   {'type': 'PER', 'pos': [13, 14], 'name': 'Prosserman', 'sent_id': 8},
   {'type': 'PER', 'pos': [0, 1], 'name': 'Prosserman', 'sent_id': 2},
   {'type': 'PER', 'pos': [0, 1], 'name': 'Prosserman', 'sent_id': 5}],
  [{'type': 'TIME', 'pos': [4, 8], 'name': 'November 2, 1983', 'sent_id': 0}],
  [{'name': 'Canadian', 'pos': [11, 12], 'sent_id': 0, 'type': 'LOC'}],
  [{'type': 'ORG', 'pos': [3, 5], 'name': 'LIVESTAGE °', 'sent_id': 1},
   {'name': 'LIVESTAGE °', 'pos': [10, 12], 'sent_id': 8, 'type': 'ORG'}],
  [{'type': 'ORG', 'pos': [11, 13], 'name': 'Gusto Goods', 'sent_id': 1}],
  [{'name': 'New York', 'pos': [19, 21], 'sent_id': 1, 'type': 'LOC'}],
  [{'name': 'Toronto', 'p

In [42]:
from transformers import AutoTokenizer, AutoConfig

# Tokenizer of the Longformer base checkpoint; used by every tokenization step in this notebook.
tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')

# Quick look at how a sample sentence is split into sub-word pieces.
# encode() returns token ids; converting them back shows the pieces themselves.
# The recorded output shows the ids wrapped in '<s>' ... '</s>' and the literal
# "[MASK]" split into ordinary pieces rather than kept as one token.
sample_ids = tokenizer.encode("This is a sentence from [MASK] training data")
tokens = tokenizer.convert_ids_to_tokens(sample_ids)
tokens

['<s>',
 'This',
 'Ġis',
 'Ġa',
 'Ġsentence',
 'Ġfrom',
 'Ġ[',
 'MAS',
 'K',
 ']',
 'Ġtraining',
 'Ġdata',
 '</s>']

In [178]:
def find_sub_list(sl,l):
    """Return every position at which the sequence ``sl`` occurs inside ``l``.

    Args:
        sl: The sub-list to look for.
        l: The list to search in.

    Returns:
        A list of ``(start, end)`` tuples in increasing ``start`` order such
        that ``l[start:end] == sl`` (``end`` is exclusive). Overlapping
        occurrences are all reported. An empty ``sl`` yields ``[]``.
    """
    sll = len(sl)
    # An empty needle has no first element to anchor on; previously this
    # raised IndexError as soon as ``l`` was non-empty.
    if sll == 0:
        return []

    first = sl[0]
    # Cheap first-element check before comparing the full slice.
    return [
        (ind, ind + sll)
        for ind, e in enumerate(l)
        if e == first and l[ind:ind + sll] == sl
    ]

def tokenize(string):
    """Split ``string`` into token strings using the notebook-level ``tokenizer``.

    The text is encoded to ids and the ids are converted back to token
    strings; the first and last entries of that list are then dropped
    (presumably the special start/end tokens added by ``encode`` — verify
    if a different tokenizer is used).
    """
    token_ids = tokenizer.encode(string)
    pieces = tokenizer.convert_ids_to_tokens(token_ids)
    return pieces[1:-1]

In [52]:
# Manual check on one mention: the first mention of the first entity cluster
# of the first document, together with the sentence it occurs in.
first_doc = data[0]
entity = first_doc['vertexSet'][0][0]
sents = first_doc['sents']

entity_name = entity['name']
# Sentences are stored as word lists; rebuild the plain text before tokenizing.
sent = ' '.join(sents[entity['sent_id']])

# Tokenize the mention text and its sentence, dropping the first and last
# token of each result.
entity_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(entity_name))[1:-1]
sent_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sent))[1:-1]

In [53]:
find_sub_list(entity_tokens,sent_tokens)

[(0, 3)]

In [54]:
sent_tokens

['G',
 'amb',
 'ier',
 'ĠIsland',
 'Ġis',
 'Ġan',
 'Ġisland',
 'Ġlocated',
 'Ġin',
 'ĠHowe',
 'ĠSound',
 'Ġnear',
 'ĠVancouver',
 'Ġ,',
 'ĠBritish',
 'ĠColumbia',
 'Ġ.']

In [169]:
# Load the DocRED type definitions under their own name (`data` now holds the split).
# The context manager closes the file handle once parsing is done, and the
# explicit encoding keeps the read independent of the platform's default encoding.
with open('../data/docred_joint/types.json', encoding='utf-8') as f:
    types = json.load(f)

# Maps a relation id (e.g. 'P1376') to its metadata, including the 'verbose' label.
relations_dict = types['relations']

In [170]:
relations_dict

{'P1376': {'symmetric': False, 'short': 'P1376', 'verbose': 'capital of'},
 'P607': {'symmetric': False, 'short': 'P607', 'verbose': 'conflict'},
 'P136': {'symmetric': False, 'short': 'P136', 'verbose': 'genre'},
 'P137': {'symmetric': False, 'short': 'P137', 'verbose': 'operator'},
 'P131': {'symmetric': False,
  'short': 'P131',
  'verbose': 'located in the administrative territorial entity'},
 'P527': {'symmetric': False, 'short': 'P527', 'verbose': 'has part'},
 'P1412': {'symmetric': False,
  'short': 'P1412',
  'verbose': 'languages spoken, written or signed'},
 'P206': {'symmetric': False,
  'short': 'P206',
  'verbose': 'located in or next to body of water'},
 'P205': {'symmetric': False, 'short': 'P205', 'verbose': 'basin country'},
 'P449': {'symmetric': False, 'short': 'P449', 'verbose': 'original network'},
 'P127': {'symmetric': False, 'short': 'P127', 'verbose': 'owned by'},
 'P123': {'symmetric': False, 'short': 'P123', 'verbose': 'publisher'},
 'P86': {'symmetric': Fal

In [179]:
def _span_key(start, end, sent_id):
    """Return the '<start>_<end>_<sent_id>' string that identifies a mention span."""
    return '_'.join(str(part) for part in (start, end, sent_id))


# For every document: map each mention's word-level span to a token-level span,
# then collect the relations (and the entities they involve) in token coordinates.
docs = []

for doc in data:
    new_doc = {}
    new_doc['vertexSet'] = doc['vertexSet']
    new_doc['labels'] = doc['labels']
    new_doc['title'] = doc['title']
    new_doc['sents'] = doc['sents']

    sents = doc['sents']
    # Tokenize each sentence exactly once; the result is reused for every
    # mention lookup below and stored on the output document.
    tokenized_sents = [tokenize(' '.join(sent)) for sent in sents]

    # Word-level span key -> mention dict, for every mention that could be
    # located in its tokenized sentence.
    entities_dict = {}
    for mentions in doc['vertexSet']:
        for entity in mentions:
            sent_id = entity['sent_id']
            word_start, word_end = entity['pos'][0], entity['pos'][1]

            # Tokenize the mention's words the way they appear inside the
            # sentence: the words are joined with ' * ' and, when the mention
            # does not start the sentence, prefixed with a '* ' placeholder so
            # that its first word is tokenized as a non-initial word. The '*'
            # pieces are filtered out again afterwards.
            # NOTE(review): a mention word that itself tokenizes to '*' or
            # 'Ġ*' is removed by the same filter.
            if word_start == 0:
                tokens = []
            else:
                tokens = ['* ']
            tokens.extend(sents[sent_id][word_start:word_end])
            entity_tokens = tokenize(' * '.join(tokens))
            entity_tokens = [tok for tok in entity_tokens if tok != '*' and tok != 'Ġ*']
            if word_start > 0:
                # Drop the leftover leading piece produced by the placeholder prefix.
                entity_tokens = entity_tokens[1:]

            sent_tokens = tokenized_sents[sent_id]

            # Skip the search for an empty token list: there is nothing to
            # anchor on, so it is treated as "not found".
            new_pos = find_sub_list(entity_tokens, sent_tokens) if entity_tokens else []

            if len(new_pos) > 0:
                # Only the first match is used; if the same token sequence also
                # occurs earlier in the sentence, the span points to that
                # earlier occurrence.
                # The mention dict is updated in place, so 'new_pos' is also
                # visible through doc['vertexSet'].
                entity['new_pos'] = new_pos[0]
                entities_dict[_span_key(word_start, word_end, sent_id)] = entity
            else:
                print("NEW ENTITY: ", entity)
                print("entity_tokens: ", entity_tokens)
                print("sent_tokens: ", sent_tokens)

    relations = []
    entities = []
    # Keys of mentions already added to `entities`; a mention that takes part
    # in several relations must be listed only once.
    seen_entity_keys = set()
    for relation in doc['labels']:
        # 'capital of' -> 'CapitalOf'
        relationship = relations_dict[relation['r']]['verbose'].title().replace(" ", "")

        # Relations are anchored on the first mention of the head and tail clusters.
        sub = doc['vertexSet'][relation['h']][0]
        obj = doc['vertexSet'][relation['t']][0]
        sub_dict_key = _span_key(sub['pos'][0], sub['pos'][1], sub['sent_id'])
        obj_dict_key = _span_key(obj['pos'][0], obj['pos'][1], obj['sent_id'])

        # None when the mention could not be located during the mapping step above.
        sub_ent = entities_dict.get(sub_dict_key)
        obj_ent = entities_dict.get(obj_dict_key)

        for key, ent in ((sub_dict_key, sub_ent), (obj_dict_key, obj_ent)):
            if ent is not None and key not in seen_entity_keys:
                seen_entity_keys.add(key)
                entities.append(ent)

        # A relation is kept only when both of its arguments were located.
        if sub_ent is not None and obj_ent is not None:
            relation_dict = {}
            relation_dict['relation'] = relationship
            # Arguments are referenced by their token-level span keys.
            relation_dict['sub'] = _span_key(sub_ent['new_pos'][0], sub_ent['new_pos'][1], sub_ent['sent_id'])
            relation_dict['obj'] = _span_key(obj_ent['new_pos'][0], obj_ent['new_pos'][1], obj_ent['sent_id'])
            relations.append(relation_dict)

    new_doc['entities'] = entities
    new_doc['relations'] = relations
    new_doc['tokenized_sents'] = tokenized_sents
    docs.append(new_doc)
        

In [180]:
docs[0]

{'vertexSet': [[{'name': 'Jeff Prosserman',
    'pos': [0, 2],
    'sent_id': 0,
    'type': 'PER',
    'new_pos': (0, 4)},
   {'type': 'PER',
    'pos': [7, 8],
    'name': 'Prosserman',
    'sent_id': 7,
    'new_pos': (7, 10)},
   {'type': 'PER',
    'pos': [6, 7],
    'name': 'Prosserman',
    'sent_id': 1,
    'new_pos': (9, 12)},
   {'type': 'PER',
    'pos': [7, 8],
    'name': 'Prosserman',
    'sent_id': 6,
    'new_pos': (7, 10)},
   {'type': 'PER',
    'pos': [13, 14],
    'name': 'Prosserman',
    'sent_id': 8,
    'new_pos': (16, 19)},
   {'type': 'PER',
    'pos': [0, 1],
    'name': 'Prosserman',
    'sent_id': 2,
    'new_pos': (0, 3)},
   {'type': 'PER',
    'pos': [0, 1],
    'name': 'Prosserman',
    'sent_id': 5,
    'new_pos': (0, 3)}],
  [{'type': 'TIME',
    'pos': [4, 8],
    'name': 'November 2, 1983',
    'sent_id': 0,
    'new_pos': (6, 10)}],
  [{'name': 'Canadian',
    'pos': [11, 12],
    'sent_id': 0,
    'type': 'LOC',
    'new_pos': (13, 14)}],
  [{'typ

In [181]:
def _shift_to_doc(sent_lengths, start, end, sent_id):
    """Turn a sentence-local token span into document-level offsets.

    The shift is the total token count of all sentences before ``sent_id``.
    """
    preceding = sum(sent_lengths[:sent_id])
    return start + preceding, end + preceding


# Flatten every converted document into one record with document-level token offsets.
pure_doc = []

for doc_index, doc in enumerate(docs):
    sent_lengths = [len(sent) for sent in doc['tokenized_sents']]

    # One flat token list for the whole document, sentences in order.
    flat_tokens = [tok for sent in doc['tokenized_sents'] for tok in sent]

    # Entity rows: [start, end, type].
    ner_rows = []
    for entity in doc['entities']:
        span_start, span_end = _shift_to_doc(
            sent_lengths, entity['new_pos'][0], entity['new_pos'][1], entity['sent_id']
        )
        ner_rows.append([span_start, span_end, entity['type']])

    # Relation rows: [sub_start, sub_end, obj_start, obj_end, relation name].
    relation_rows = []
    for relation in doc['relations']:
        # Argument keys have the form '<start>_<end>_<sent_id>'.
        sub_parts = [int(part) for part in relation['sub'].split('_')]
        obj_parts = [int(part) for part in relation['obj'].split('_')]

        sub_start, sub_end = _shift_to_doc(sent_lengths, sub_parts[0], sub_parts[1], sub_parts[-1])
        obj_start, obj_end = _shift_to_doc(sent_lengths, obj_parts[0], obj_parts[1], obj_parts[-1])

        relation_rows.append([sub_start, sub_end, obj_start, obj_end, relation['relation']])

    # Key order matters: it is the order in which the keys are serialized.
    pure_doc.append({
        'doc_key': str(doc_index),
        'document': '\n'.join([' '.join(sent) for sent in doc['sents']]),
        'tokens': flat_tokens,
        'ner': ner_rows,
        'relations': relation_rows,
    })

In [182]:
pure_doc[0]

{'doc_key': '0',
 'document': "Jeff Prosserman ( born November 2 , 1983 ) is a Canadian cross - platform producer and director with experience in film , interactive , and mobile content strategy , development , and production .\nPrior to founding LIVESTAGE ° , Prosserman was the Founder of Gusto Goods , a production company based in New York and Toronto .\nProsserman was born in Toronto , Ontario , Canada .\nHe produced , wrote , and directed Chasing Madoff , a 2010 feature documentary on an attempt to expose Bernie Madoff 's Ponzi scheme .\nChasing Madoff premiered at the International Documentary Film Festival Amsterdam and was distributed across the United States by Cohen Media Group and in Canada by Entertainment One .\nProsserman was also the Executive Producer of Monogamy , a feature film that premiered at the Tribeca Film Festival and was released by Oscilloscope Laboratories .\nFollowing the release of these films , Prosserman worked as a content producer to launch The Daily , 

In [183]:
# Write one JSON document per line (JSON Lines).
# ensure_ascii=False emits non-ASCII characters verbatim (the data contains
# characters such as 'Ġ' and '°'), so the file is opened as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open("../data/docred_pure/test.jsonl", 'w', encoding='utf-8') as f:
    for item in pure_doc:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")