In [None]:
!pip install torch torchvision datasets evaluate transformers[torch]

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [None]:
import pandas as pd
import os
import shutil
import torch
import random
import json
import pyarrow as pa
import numpy as np
import evaluate

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from datasets import Dataset


from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

In [None]:
# Selects which dataset variant the notebook works on: it determines the
# `component_<n>` data sub-folder and which entity-pair rules `possible_relation` uses
# (component 1 has its own rule set, any other value uses the second rule set).
component = 2

In [None]:
# Resolve the base data folder: the mounted Google Drive when running on Colab,
# the current working directory otherwise.
running_on_colab = 'google.colab' in str(get_ipython())
if running_on_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    data_dir = '/content/drive/MyDrive/Colab Notebooks/text_mining_re_data/'
else:
    data_dir = os.getcwd()

# Every component keeps its files in its own sub-folder.
data_dir = os.path.join(data_dir, f'component_{component}')

train_path = os.path.join(data_dir, 'train_tm.json')
train_raw = pd.read_json(train_path)


Mounted at /content/drive


In [None]:
# Keep only the two columns the rest of the notebook reads.
train_raw = train_raw.loc[:, ['annotations', 'data']]

In [None]:
# Count how often each relation label occurs, looking only at the first
# annotation set of every document. Relation items without a label are ignored.
relations = {}
for annotation_sets in train_raw['annotations']:
    for item in annotation_sets[0]['result']:
        if item['type'] != 'relation' or len(item['labels']) == 0:
            continue
        label = item['labels'][0]
        relations[label] = relations.get(label, 0) + 1

In [None]:
# Label ids follow the order in which relation labels were first seen;
# the extra 'None' class (no relation) always gets the last id.
label_names = [*relations, 'None']
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Persist the id -> label mapping next to the data.
id2label_path = os.path.join(data_dir, 'id2label.json')
with open(id2label_path, 'w') as handle:
    json.dump(id2label, handle)

In [None]:
# Display the id -> label mapping built above.
id2label

{0: 'caused_by',
 1: 'happened_on',
 2: 'happened_during',
 3: 'solved_by',
 4: 'happened_at',
 5: 'done_by',
 6: 'refers_to',
 7: 'None'}

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer of the "distilbert-base-uncased" checkpoint; `tokenize_function` reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def existing_relation(relations, ent1_id, ent2_id):
    """Look up the relation that goes from ``ent1_id`` to ``ent2_id``.

    Args:
        relations: iterable of dicts carrying at least the keys 'from' and 'to'.
        ent1_id: id expected under 'from'.
        ent2_id: id expected under 'to'.

    Returns:
        The first matching relation dict, or ``False`` when there is none.
    """
    matches = (
        candidate
        for candidate in relations
        if candidate['from'] == ent1_id and candidate['to'] == ent2_id
    )
    return next(matches, False)

# Which entity types may follow a given first entity type depends on the component.
# Keys are the type of the first entity, values the admissible types of the second.
if component==1:
    _ALLOWED_SECOND_TYPES = {
        'PLAYER': ['PLAYER', 'CLUB', 'NATIONALITY', 'COUNTRY', 'REFERENCE', 'POSITION', 'BIRTHDATE'],
        'CLUB': ['CLUB', 'REFERENCE'],
    }
else:
    _ALLOWED_SECOND_TYPES = {
        'UNEXPECTED EVENT': ['EMPLOYEE', 'TIME', 'DATE', 'LOCATION', 'CAUSE', 'SOLUTION'],
        'EXPECTED EVENT': ['EMPLOYEE', 'TIME', 'DATE', 'LOCATION'],
        'ACTIVITY': ['EMPLOYEE', 'TIME', 'DATE', 'LOCATION'],
        'CAUSE': ['ACTIVITY'],
    }


def possible_relation(entities, ent1_id, ent2_id):
    """Tell whether the entity types of the two ids form an admissible pair.

    Args:
        entities: mapping from entity id to a dict with an 'entity' (type) key.
        ent1_id: id of the first entity of the candidate pair.
        ent2_id: id of the second entity of the candidate pair.

    Returns:
        True when the type of the second entity is allowed after the type of
        the first one for the active component, False otherwise.
    """
    ent1, ent2 = entities[ent1_id], entities[ent2_id]
    allowed = _ALLOWED_SECOND_TYPES.get(ent1['entity'])
    if allowed is None:
        return False
    return ent2['entity'] in allowed

In [None]:
def get_relation_text(text, first, second):
    """Build the classifier input for one entity pair.

    Each entity span is wrapped on both sides in a ``[<ENTITY TYPE>]`` marker and
    the two wrapped spans are joined by the text lying between the end of
    ``first`` and the start of ``second``.

    Args:
        text: full document text.
        first: dict with 'start', 'end' and 'entity' for the first entity.
        second: dict with 'start', 'end' and 'entity' for the second entity.

    Returns:
        The marked-up string ``[T1]span1[T1]<text in between>[T2]span2[T2]``.
    """
    def wrap(span):
        tag = '[{}]'.format(span['entity'])
        return tag + text[span['start']: span['end']] + tag

    wrapped_first = wrap(first)
    wrapped_second = wrap(second)
    between = text[first['end']: second['start']]
    return wrapped_first + between + wrapped_second

In [None]:
def generate_possible_relations(entities, entities_list, relations=None, ratio=None, window=10):
    """Enumerate admissible entity pairs that are not annotated as a relation.

    Entities are ordered by start offset and every entity is paired with the
    entities that follow it, up to ``window`` of them. A pair is kept when
    ``possible_relation`` accepts it and, if ``relations`` is given, when
    ``existing_relation`` finds no relation for it.

    Args:
        entities: mapping from entity id to its entity dict (passed through to
            ``possible_relation``).
        entities_list: list of ``[id, start, ...]`` records; index 0 is the
            entity id and index 1 the start offset. The list is not modified.
        relations: optional list of known relations used to filter out pairs
            that already carry a relation.
        ratio: optional cap factor; when set together with ``relations``, at
            most ``ratio * len(relations)`` pairs are kept, chosen with
            ``random.sample``. Without ``relations`` there is nothing to scale
            the cap by, so no down-sampling happens.
        window: how many following entities each entity is paired with.

    Returns:
        A list of ``[first_id, second_id]`` pairs.
    """
    # Sort a copy so the caller's list keeps its order.
    ordered = sorted(entities_list, key=lambda record: record[1])

    possible_relations = []
    for i, ent_one in enumerate(ordered):
        for ent_two in ordered[i + 1:i + 1 + window]:
            if not possible_relation(entities, ent_one[0], ent_two[0]):
                continue
            if relations is not None and existing_relation(relations, ent_one[0], ent_two[0]):
                continue
            possible_relations.append([ent_one[0], ent_two[0]])

    # Down-sample only when a reference relation count is available; previously
    # ratio with relations=None raised TypeError on len(None).
    if ratio and relations is not None:
        cap = min(len(possible_relations), ratio * len(relations))
        possible_relations = random.sample(possible_relations, cap)
    return possible_relations


In [None]:
def generate_train_relations(documents, none_ratio=5):
    """Turn annotated documents into (text, label) classification examples.

    For every document the first annotation set is read. Items of type
    'labels' become entities, items of type 'relation' with a non-empty label
    list become labelled relations; any other item type is ignored. Relations
    are stored so that 'from' is the entity that starts earlier in the text.
    In addition, pairs returned by ``generate_possible_relations`` are added
    with the label 'None'.

    Args:
        documents: DataFrame with an 'annotations' column (list of annotation
            sets, each holding a 'result' list) and a 'data' column (dict with
            a 'text' entry).
        none_ratio: passed as ``ratio`` to ``generate_possible_relations`` to
            cap the number of 'None' examples per document.

    Returns:
        DataFrame with the columns 'text' and 'label'; per document the 'None'
        examples come first, followed by the annotated relations.
    """
    texts = []
    labels = []
    for _, doc in documents.iterrows():
        annotations = doc['annotations'][0]['result']
        text = doc['data']['text']

        # First pass: collect all entities, so that relations can be resolved
        # regardless of where they appear in the result list.
        entities = {}
        entities_list = []
        for item in annotations:
            if item['type'] != 'labels':
                continue
            value = item['value']
            entity_type = value['labels'][0]
            entities[item['id']] = {'start': value['start'], 'end': value['end'], 'entity': entity_type}
            entities_list.append([item['id'], value['start'], value['end'], entity_type])

        # Second pass: collect labelled relations only (same type check as the
        # label-counting cell), oriented by text order of their entities.
        relations = []
        for item in annotations:
            if item['type'] != 'relation' or not item.get('labels'):
                continue
            source_id, target_id = item['from_id'], item['to_id']
            if entities[source_id]['start'] < entities[target_id]['start']:
                relations.append({'from': source_id, 'to': target_id, 'label': item['labels'][0]})
            else:
                relations.append({'from': target_id, 'to': source_id, 'label': item['labels'][0]})

        entities_list = sorted(entities_list, key=lambda record: record[1])

        none_relations = generate_possible_relations(entities, entities_list, relations=relations, ratio=none_ratio)
        for first_id, second_id in none_relations:
            texts.append(get_relation_text(text, entities[first_id], entities[second_id]))
            labels.append('None')

        for relation in relations:
            first = entities[relation['from']]
            second = entities[relation['to']]
            texts.append(get_relation_text(text, first, second))
            labels.append(relation['label'])

    return pd.DataFrame(data={'text': texts, 'label': labels})

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the global ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    encode_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples["text"], **encode_options)

In [None]:
# Document-level 80/20 split. The seed is fixed so that the split (and therefore
# the reported validation metrics) can be reproduced on a fresh kernel.
SPLIT_SEED = 42
train, val = train_test_split(train_raw, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Sanity check: (rows, columns) of the train and validation document frames.
print(train.shape, val.shape)

(24, 2) (6, 2)


In [None]:
def build_relation_dataset(documents):
    """Build a tokenized ``datasets.Dataset`` of relation examples.

    Steps: generate (text, label) examples with ``generate_train_relations``,
    replace label names by their ids from ``label2id`` (an unknown label name
    raises ``KeyError``), convert the frame to a ``Dataset`` and tokenize it
    with ``tokenize_function``.

    Args:
        documents: DataFrame of annotated documents as expected by
            ``generate_train_relations``.

    Returns:
        The tokenized dataset with the columns 'text', 'label' and the
        tokenizer outputs.
    """
    frame = generate_train_relations(documents)
    frame['label'] = [label2id[name] for name in frame['label']]
    # Public constructor instead of wrapping a raw pyarrow table by hand.
    dataset = Dataset.from_pandas(frame, preserve_index=False)
    return dataset.map(tokenize_function, batched=True)


train_dataset = build_relation_dataset(train)
val_dataset = build_relation_dataset(val)

Map:   0%|          | 0/338 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute macro F1 and accuracy from a (logits, labels) pair.

    The predicted class of each example is the argmax over the last axis of
    the logits.

    Returns:
        dict with the keys 'f1_score_macro' and 'accuracy'.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    macro_f1 = f1_score(labels, predicted_ids, average='macro')
    accuracy = accuracy_score(labels, predicted_ids)
    return {'f1_score_macro': macro_f1, 'accuracy': accuracy}

In [None]:
# Sequence-classification model on top of distilbert-base-uncased with one output
# per entry of id2label; both label mappings are handed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
BATCH_SIZE = 16
# Epoch count is looked up per component; both component 1 and every other
# component currently train for 10 epochs.
EPOCHS = {1: 10}.get(component, 10)
# Checkpoints and the final model are written below the component's data folder.
output_dir = os.path.join(data_dir, 'models')

In [None]:
# Start from an empty output folder: create it when missing, wipe and recreate it
# when it already holds files, and leave it alone when it exists but is empty.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
elif os.listdir(output_dir):
    shutil.rmtree(output_dir)
    os.makedirs(output_dir)

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluate and checkpoint once per epoch, keep a single checkpoint on disk and
# reload the checkpoint with the best macro F1 when training ends.
training_config = dict(
    output_dir=output_dir,
    evaluation_strategy='epoch',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1_score_macro',
)
training_args = TrainingArguments(**training_config)

In [None]:
# Wire model, arguments, datasets and the metric function into one Trainer.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Fine-tune the model; training_args requests evaluation and a checkpoint after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Score Macro,Accuracy
1,No log,1.294542,0.262421,0.625
2,No log,0.86766,0.311831,0.678571
3,No log,0.542268,0.683761,0.839286
4,No log,0.39525,0.729803,0.883929
5,No log,0.277729,0.831511,0.955357
6,No log,0.244315,0.831511,0.955357
7,No log,0.249396,0.831511,0.955357
8,No log,0.245767,0.831511,0.955357
9,No log,0.244671,0.831511,0.955357
10,No log,0.242451,0.831511,0.955357


TrainOutput(global_step=220, training_loss=0.43612383062189275, metrics={'train_runtime': 240.5146, 'train_samples_per_second': 14.053, 'train_steps_per_second': 0.915, 'total_flos': 447787716280320.0, 'train_loss': 0.43612383062189275, 'epoch': 10.0})

In [None]:
# Store the trainer's current model in a dedicated sub-folder of the output directory.
best_model_dir = os.path.join(output_dir, 'best_model')
trainer.save_model(best_model_dir)

In [None]:
def get_latest_annotations(test_raw):
    """Reduce every document's annotation list to its most recently updated entry.

    The 'annotations' column of ``test_raw`` is overwritten in place: each cell
    becomes a one-element list holding the annotation with the greatest
    'updated_at' value (the earliest one in the list wins on ties).

    Args:
        test_raw: DataFrame with an 'annotations' column of non-empty lists of
            dicts that carry an 'updated_at' key.

    Returns:
        The same DataFrame object that was passed in.
    """
    def newest_only(candidates):
        newest_idx = 0
        for idx in range(1, len(candidates)):
            if candidates[idx]['updated_at'] > candidates[newest_idx]['updated_at']:
                newest_idx = idx
        return [candidates[newest_idx]]

    test_raw['annotations'] = test_raw['annotations'].apply(newest_only)
    return test_raw

In [None]:
# Load the test documents, keep the two columns used downstream and reduce every
# document to its latest annotation (working on a copy of the column selection).
test_path = os.path.join(data_dir, 'test_tm.json')
test_columns = pd.read_json(test_path)[['annotations', 'data']]
test_raw = get_latest_annotations(test_columns.copy())

In [None]:
# Same pipeline as for train/val: relation examples -> label ids -> Dataset -> tokenized.
test_frame = generate_train_relations(test_raw)
test_frame['label'] = test_frame['label'].map(lambda name: label2id[name])
test_table = pa.Table.from_pandas(test_frame)

test_dataset = Dataset(test_table).map(tokenize_function, batched=True)

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

In [None]:
# Metrics on the test examples built from test_tm.json.
trainer.evaluate(test_dataset)

{'eval_loss': 0.3119843304157257,
 'eval_f1_score_macro': 0.8357792207792207,
 'eval_accuracy': 0.9464285714285714,
 'eval_runtime': 1.9703,
 'eval_samples_per_second': 56.843,
 'eval_steps_per_second': 3.553,
 'epoch': 10.0}

In [None]:
# Metrics on the training examples, for comparison with the validation and test scores.
trainer.evaluate(train_dataset)

{'eval_loss': 0.15298235416412354,
 'eval_f1_score_macro': 0.8499112397607437,
 'eval_accuracy': 0.9733727810650887,
 'eval_runtime': 5.9776,
 'eval_samples_per_second': 56.545,
 'eval_steps_per_second': 3.68,
 'epoch': 10.0}

In [None]:
# Metrics on the validation examples that were also evaluated after each training epoch.
trainer.evaluate(val_dataset)

{'eval_loss': 0.2777290940284729,
 'eval_f1_score_macro': 0.8315106793367664,
 'eval_accuracy': 0.9553571428571429,
 'eval_runtime': 2.1424,
 'eval_samples_per_second': 52.278,
 'eval_steps_per_second': 3.267,
 'epoch': 10.0}