In [None]:
!pip install torch torchvision datasets evaluate transformers[torch]

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m96.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [None]:
import pandas as pd
import os
import torch
import random
import pyarrow as pa
import numpy as np
import evaluate
import json


from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from datasets import Dataset


from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Selects which component is processed: it picks the `component_<n>` data
# sub-folder and which entity-pair rules `possible_relation` is defined with.
component = 2

In [None]:
# Resolve the base data directory: the current working directory for local
# runs, the mounted Google Drive folder when running on Colab.
if 'google.colab' not in str(get_ipython()):
    data_dir = os.getcwd()
else:
    from google.colab import drive
    drive.mount('/content/drive')
    data_dir = '/content/drive/MyDrive/Colab Notebooks/text_mining_re_data/'

# Every component keeps its files in its own sub-folder.
data_dir = os.path.join(data_dir, 'component_{}'.format(component))


Mounted at /content/drive


In [None]:
# Read the id -> label mapping stored next to the data.
with open(os.path.join(data_dir, 'id2label.json'), 'r') as handle:
    id2label = json.load(handle)

# JSON object keys are always strings, so convert them back to integer ids,
# then derive the reverse label -> id lookup from the converted mapping.
id2label = {int(raw_id): label for raw_id, label in id2label.items()}
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
def get_latest_annotations(test_raw):
    """Reduce each row's annotation list to its most recently updated entry.

    Args:
        test_raw: DataFrame with an 'annotations' column. Each cell is a list
            of dicts; when a list holds more than one entry, every entry must
            carry a comparable 'updated_at' value.

    Returns:
        The same DataFrame object. Its 'annotations' column is overwritten in
        place so that each cell is a one-element list holding the latest
        annotation, or an empty list when the row had no annotations.
    """
    def get_latest(x):
        # A row without annotations previously crashed on x[0]; pass it
        # through as an empty list instead.
        if len(x) == 0:
            return []
        latest = x[0]
        for annot in x[1:]:
            # Strict '>' keeps the earliest entry when timestamps tie.
            if annot['updated_at'] > latest['updated_at']:
                latest = annot
        return [latest]

    test_raw['annotations'] = test_raw['annotations'].apply(get_latest)

    return test_raw

In [None]:
def get_relation_text(text, first, second):
    """Build the classifier input for one candidate entity pair.

    Each entity span is wrapped on both sides with its label in square
    brackets (e.g. ``[DATE]July[DATE]``), and the document text lying between
    the end of `first` and the start of `second` is kept in between.

    Args:
        text: Full document text.
        first: Dict with 'start', 'end' and 'entity' for the first entity.
        second: Dict with the same keys for the second entity.

    Returns:
        The marked-up string ``<first><text in between><second>``.
    """
    def wrap(span):
        # Surround the span's surface text with its bracketed label.
        tag = '[' + '{}'.format(span['entity']) + ']'
        return tag + text[span['start']: span['end']] + tag

    head = wrap(first)
    tail = wrap(second)
    between = text[first['end']: second['start']]
    return head + between + tail

def existing_relation(relations, ent1_id, ent2_id):
    """Look up an annotated relation going from `ent1_id` to `ent2_id`.

    Args:
        relations: Iterable of dicts carrying 'from' and 'to' entity ids.
        ent1_id: Id expected under 'from'.
        ent2_id: Id expected under 'to'.

    Returns:
        The first matching relation dict, or ``False`` when there is none.
    """
    matches = (
        candidate
        for candidate in relations
        if candidate['from'] == ent1_id and candidate['to'] == ent2_id
    )
    return next(matches, False)

# Allowed pairings for the active component, as (label of the first entity,
# labels accepted for the second entity). The table is chosen once, when this
# cell runs, from the module-level `component` setting.
if component==1:
    _RELATION_RULES = (
        ('PLAYER', ['PLAYER', 'CLUB', 'NATIONALITY', 'COUNTRY', 'REFERENCE', 'POSITION', 'BIRTHDATE']),
        ('CLUB', ['CLUB', 'REFERENCE']),
    )
else:
    _RELATION_RULES = (
        ('UNEXPECTED EVENT', ['EMPLOYEE', 'TIME', 'DATE', 'LOCATION', 'CAUSE', 'SOLUTION']),
        ('EXPECTED EVENT', ['EMPLOYEE', 'TIME', 'DATE', 'LOCATION']),
        ('ACTIVITY', ['EMPLOYEE', 'TIME', 'DATE', 'LOCATION']),
        ('CAUSE', ['ACTIVITY']),
    )


def possible_relation(entities, ent1_id, ent2_id):
    """Tell whether the two entities' labels form an allowed pairing.

    Args:
        entities: Mapping from entity id to a dict with an 'entity' label.
        ent1_id: Id of the first (source) entity.
        ent2_id: Id of the second (target) entity.

    Returns:
        True when the (first label, second label) combination is listed in
        the rule table of the active component, False otherwise.
    """
    ent1, ent2 = entities[ent1_id], entities[ent2_id]
    return any(
        ent1['entity'] == source and ent2['entity'] in targets
        for source, targets in _RELATION_RULES
    )

def generate_possible_relations(entities, entities_list, relations=None, ratio=None, window=10):
    """Enumerate candidate entity pairs that could hold a relation.

    Entities are ordered by their sort key (`x[1]`, the start offset in the
    lists built by this notebook) and each one is paired with the entities
    that follow it within the look-ahead window. A pair is kept when
    `possible_relation` accepts its labels and, if `relations` is given, it
    is not already annotated.

    Args:
        entities: Mapping from entity id to its description dict.
        entities_list: List of sequences whose element 0 is the entity id and
            element 1 the sort key. NOTE: sorted in place, as before.
        relations: Optional annotated relations; matching pairs are excluded.
        ratio: Optional down-sampling factor. When truthy, at most
            ``int(ratio * len(relations))`` candidates are kept, chosen at
            random.
        window: How many following entities each entity is paired with
            (default 10, the previously hard-coded value).

    Returns:
        List of ``[first_id, second_id]`` pairs.

    Raises:
        ValueError: If `ratio` is given without `relations`; there is then
            nothing to scale the sample size by (this used to surface as an
            opaque TypeError from ``len(None)``).
    """
    entities_list.sort(key=lambda x: x[1])
    possible_relations = []
    for i, ent_one in enumerate(entities_list):
        for ent_two in entities_list[i + 1:i + 1 + window]:
            if not possible_relation(entities, ent_one[0], ent_two[0]):
                continue
            # Skip pairs that are already annotated as a relation.
            if relations is not None and existing_relation(relations, ent_one[0], ent_two[0]):
                continue
            possible_relations.append([ent_one[0], ent_two[0]])
    if ratio:
        if relations is None:
            raise ValueError("`ratio` requires `relations` to derive the sample size from")
        # random.sample needs an integer count; a fractional ratio would
        # otherwise pass a float and fail.
        sample_size = min(len(possible_relations), int(ratio * len(relations)))
        possible_relations = random.sample(possible_relations, sample_size)
    return possible_relations

def generate_test_relations(documents):
    """Turn annotated documents into a table of candidate relation inputs.

    For every row, the items of type 'labels' in the first annotation's
    'result' list are collected as entities, candidate pairs are produced by
    `generate_possible_relations`, and each pair is rendered with
    `get_relation_text`.

    Args:
        documents: DataFrame with an 'annotations' column (list whose first
            element holds a 'result' list) and a 'data' column (dict with the
            document 'text').

    Returns:
        DataFrame with a 'text' column (marked-up pair text) and an
        'entity_pairs' column (``[first surface text, second surface text]``).
    """
    pair_texts = []
    pair_surfaces = []
    for _, document in documents.iterrows():
        result_items = document['annotations'][0]['result']
        raw_text = document['data']['text']

        entities = {}
        entities_list = []
        for entry in result_items:
            # Only span annotations describe entities; skip everything else.
            if entry['type'] != 'labels':
                continue
            value = entry['value']
            entities[entry['id']] = {
                'start': value['start'],
                'end': value['end'],
                'entity': value['labels'][0],
                'text': value['text'],
            }
            entities_list.append([entry['id'], value['start'], value['end'], value['labels'][0]])

        # Order the entities by their start offset.
        entities_list.sort(key=lambda row: row[1])

        for pair in generate_possible_relations(entities, entities_list):
            head, tail = entities[pair[0]], entities[pair[1]]
            pair_texts.append(get_relation_text(raw_text, head, tail))
            pair_surfaces.append([head['text'], tail['text']])

    return pd.DataFrame(data={'text': pair_texts, 'entity_pairs': pair_surfaces})

def parse_ner_output(output):
    """Convert one NER output table into a single annotated-document row.

    Consecutive entries of ``output['label']`` that carry equal 'labels' are
    merged into one span: the span keeps the first entry's 'start', takes the
    last entry's 'end', and joins the entry texts with single spaces. Every
    resulting span receives a random 10-letter uppercase id.

    Args:
        output: Table (here the DataFrame returned by ``pd.read_json``) with
            a 'label' column of dicts holding 'start', 'end', 'text' and
            'labels', and a 'text' column whose element 0 is the document
            text.

    Returns:
        One-row DataFrame with 'annotations' (``[{'result': spans}]``) and
        'data' (``{'text': document text}``).
    """
    import random
    import string

    spans = []
    for token in output['label']:
        # Same label as the span currently being built: extend that span.
        if spans and token['labels'] == spans[-1]['value']['labels']:
            current = spans[-1]['value']
            current['end'] = token['end']
            current['text'] += ' ' + token['text']
            continue

        # Otherwise open a new span with a fresh random id.
        spans.append({
            'id': ''.join(random.choices(string.ascii_uppercase, k=10)),
            'type': 'labels',
            'value': {
                'end': token['end'],
                'text': token['text'],
                'start': token['start'],
                'labels': token['labels'],
            },
        })

    return pd.DataFrame({'annotations': [[{'result': spans}]], 'data': [{'text': output['text'][0]}]})

In [None]:
# Folder inside the component's data directory that holds the NER output
# files which are parsed into documents further down.
ner_dir = os.path.join(data_dir, 'ner_output')

In [None]:
# Parse every NER output file into a one-row frame and stack them once at the
# end; concatenating inside the loop re-copies the accumulated frame on every
# iteration. The listing is sorted because os.listdir() returns entries in
# arbitrary order and a later cell slices the first document, so the row
# order has to be reproducible.
ner_frames = [
    parse_ner_output(pd.read_json(os.path.join(ner_dir, file)))
    for file in sorted(os.listdir(ner_dir))
]

# pd.concat() raises on an empty list; keep the previous behaviour of
# yielding an empty frame when the folder contains no files.
test_raw = pd.concat(ner_frames, ignore_index=True) if ner_frames else pd.DataFrame()

In [None]:
test_raw

Unnamed: 0,annotations,data
0,"[{'result': [{'id': 'HIIYCAKUBX', 'type': 'lab...","{'text': 'On July 6, 1987, with the plant in M..."
1,"[{'result': [{'id': 'CVEBYZXBBO', 'type': 'lab...","{'text': 'At 2323 hours, 17 May 1988, in Mode ..."
2,"[{'result': [{'id': 'KQYTFAYHSN', 'type': 'lab...","{'text': 'On 2/18/87, at 0001 hours, during no..."
3,"[{'result': [{'id': 'YNMKNQWTDR', 'type': 'lab...","{'text': 'On March 26, 1988, while at 85% powe..."
4,"[{'result': [{'id': 'QXSYRCVTFS', 'type': 'lab...","{'text': 'On 3/22/88, at 0042, with the Plant ..."


In [None]:
# Build candidate relation pairs for the first document only (`[:1]`).
# NOTE: this rebinds `test_raw` from the raw documents to the candidate-pair
# table, so the cell is not idempotent -- re-run the loading cell before
# running it a second time.
test_raw = generate_test_relations(test_raw[:1])

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer of the DistilBERT base (uncased) checkpoint.
# NOTE(review): the classifier is loaded from a local "best_model" folder in a
# later cell; presumably it was fine-tuned with this same tokenizer -- confirm.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize one example and attach the model inputs to it.

    Args:
        examples: Mapping-like record (here a DataFrame row) with a 'text'
            entry.

    Returns:
        The same `examples` object with 'input_ids' and 'attention_mask' set
        from the module-level `tokenizer`, padded to the maximum length and
        truncated.
    """
    encoded = tokenizer(examples['text'], padding="max_length", truncation=True)

    # Read the fields by name: unpacking `.values()` into two variables only
    # works while the tokenizer returns exactly these two fields in this
    # order, and breaks as soon as it also emits e.g. `token_type_ids`.
    examples['input_ids'] = encoded['input_ids']
    examples['attention_mask'] = encoded['attention_mask']

    return examples


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenize row by row (axis=1); `tokenize_function` adds 'input_ids' and
# 'attention_mask' to every row.
test_raw = test_raw.apply(tokenize_function, axis=1)

In [None]:
class REset(torch.utils.data.Dataset):
    """Map-style dataset over a column-oriented table of relation candidates.

    `data` must expose ``.items()`` yielding (column name, indexable values)
    and contain an 'input_ids' column, whose length defines the dataset size.
    """

    # Columns returned untouched; every other column is turned into a tensor.
    _PASSTHROUGH = ('entity_pairs', 'text')

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for column, values in self.data.items():
            entry = values[idx]
            if column in self._PASSTHROUGH:
                sample[column] = entry
            else:
                sample[column] = torch.tensor(entry)

        return sample

test_dataset = REset(test_raw)

In [None]:
# Batches of 16 candidates; no shuffling is requested, so batches follow the
# dataset order.
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
# Load the saved "best_model" checkpoint from <data_dir>/models with the
# label mappings attached, then move it to the selected device.
model_dir = os.path.join(data_dir, 'models')

model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(model_dir, "best_model"),
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

In [None]:
import re

def output_select(logits, texts):
    """Pick the predicted class id for every row of `logits`.

    Args:
        logits: Tensor of class scores with the classes on the last dimension.
        texts: Marked-up input texts of the batch. Currently unused: a rule
            that masked out labels incompatible with the entity types found
            in the text was prototyped here but is disabled.

    Returns:
        Tensor holding the index of the highest score along the last
        dimension.
    """
    return torch.argmax(logits, dim=-1)

In [None]:
# Classify every candidate pair and collect
# (first entity text, predicted relation, second entity text) triples.
results = []
model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        logits = model(input_ids, attention_mask).logits.cpu()
        predicted = [id2label[class_id] for class_id in output_select(logits, batch['text']).tolist()]
        # After batching, 'entity_pairs' holds two batch-long sequences: all
        # first-entity texts at index 0 and all second-entity texts at index 1.
        first_texts = batch['entity_pairs'][0]
        second_texts = batch['entity_pairs'][1]
        results.extend(zip(first_texts, predicted, second_texts))

  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
# Keep only the pairs for which an actual relation was predicted, i.e. drop
# every triple labelled with the 'None' class.
final_results = [triple for triple in results if triple[1] != 'None']

In [None]:
len(results)

186

In [None]:
len(final_results)

137

In [None]:
final_results

[('July', 'happened_on', ','),
 ('July', 'happened_on', '1987'),
 ('July', 'happened_on', 'the'),
 ('July', 'happened_on', 'plant'),
 ('July', 'happened_on', '2'),
 (',', 'happened_on', 'the'),
 (',', 'done_by', 'plant'),
 (',', 'happened_on', '2'),
 (',', 'done_by', 'at'),
 (',', 'happened_on', 'amps'),
 ('in', 'happened_on', '2'),
 ('in', 'done_by', 'at'),
 ('in', 'happened_on', 'amps'),
 ('in', 'done_by', 'reactor'),
 ('in', 'happened_during', 'reactor'),
 ('5', 'happened_on', 'amps'),
 ('5', 'done_by', 'reactor'),
 ('5', 'happened_on', 'reactor'),
 ('5', 'done_by', 'scram'),
 ('5', 'happened_on', 'coolant'),
 ('power', 'happened_on', 'reactor'),
 ('power', 'done_by', 'scram'),
 ('power', 'happened_on', 'coolant'),
 ('power', 'done_by', 'pressure'),
 ('power', 'happened_on', 'a'),
 ('on', 'happened_on', 'coolant'),
 ('on', 'done_by', 'pressure'),
 ('on', 'happened_on', 'a'),
 ('on', 'done_by', 'main'),
 ('on', 'happened_on', 'trip'),
 ('occurred', 'happened_on', 'a'),
 ('occurred', 

In [None]:
import pickle
# Persist the predicted triples to "test_output" in the working directory,
# then read the file straight back into `a`.
with open("test_output", "wb") as sink:
    pickle.dump(final_results, sink)
with open("test_output", "rb") as source:
    a = pickle.load(source)

In [None]:
a

[('July', 'happened_on', ','),
 ('July', 'happened_on', '1987'),
 ('July', 'happened_on', 'the'),
 ('July', 'happened_on', 'plant'),
 ('July', 'happened_on', '2'),
 (',', 'happened_on', 'the'),
 (',', 'done_by', 'plant'),
 (',', 'happened_on', '2'),
 (',', 'done_by', 'at'),
 (',', 'happened_on', 'amps'),
 ('in', 'happened_on', '2'),
 ('in', 'done_by', 'at'),
 ('in', 'happened_on', 'amps'),
 ('in', 'done_by', 'reactor'),
 ('in', 'happened_during', 'reactor'),
 ('5', 'happened_on', 'amps'),
 ('5', 'done_by', 'reactor'),
 ('5', 'happened_on', 'reactor'),
 ('5', 'done_by', 'scram'),
 ('5', 'happened_on', 'coolant'),
 ('power', 'happened_on', 'reactor'),
 ('power', 'done_by', 'scram'),
 ('power', 'happened_on', 'coolant'),
 ('power', 'done_by', 'pressure'),
 ('power', 'happened_on', 'a'),
 ('on', 'happened_on', 'coolant'),
 ('on', 'done_by', 'pressure'),
 ('on', 'happened_on', 'a'),
 ('on', 'done_by', 'main'),
 ('on', 'happened_on', 'trip'),
 ('occurred', 'happened_on', 'a'),
 ('occurred', 