In [102]:
import constants
import pickle
import json
from transformers import AutoTokenizer

In [103]:
def get_token_role_in_span_OTE(token_start: int, token_end: int, span_start: int, span_end: int):
    # Wrong Annoation (span) or wrong token start/end -> token's start can't be before the token's end
    if token_end <= token_start or span_start > span_end:
        return "N"
    if token_end < span_start or span_end < token_start:
        return "O"
    if token_start > span_start:
        return "I"
    else:
        return "B"


def get_token_role_in_span_OTE_old(token_start: int, token_end: int, span_start: int, span_end: int):
    # Wrong Annoation (span) or wrong token start/end -> token's start can't be before the token's end
    if token_end <= token_start or span_start > span_end:
        return "N"
    if token_start < span_start or token_end > span_end:
        return "O"
    if token_start > span_start:
        return "I"
    else:
        return "B"


def preprocess_example_OTE(example, tokenizer):
    print("example:", example)
    import json
    with open("example_dict.json", "w") as file:
       json.dump(example, file)
    input_text = example["text"] + "[SEP]" + example["aspect_category"]
    one_hot_output = [[0 for _ in constants.LABEL_TO_ID_OTE.keys()]
                      for _ in range(constants.MAX_TOKENS_ACD)]

    tokenized_input_text = tokenizer(input_text,
                                     return_offsets_mapping=True,
                                     padding="max_length",
                                     max_length=constants.MAX_TOKENS_ACD,
                                     truncation=True)

    for (token_start, token_end), token_labels in zip(tokenized_input_text["offset_mapping"], one_hot_output):
        for span in example["tags"]:
            role = get_token_role_in_span_OTE(
                token_start, token_end, span["start"], span["end"])
            if role == "B":
                token_labels[constants.LABEL_TO_ID_OTE["B"]] = 1
            elif role == "I":
                token_labels[constants.LABEL_TO_ID_OTE["I"]] = 1

        if token_labels[constants.LABEL_TO_ID_OTE["B"]] == 0 and token_labels[constants.LABEL_TO_ID_OTE["I"]] == 0:
            token_labels[constants.LABEL_TO_ID_OTE["O"]] = 1

    return {
        "input_ids": tokenized_input_text["input_ids"],
        "attention_mask": tokenized_input_text["attention_mask"],
        "offset_mapping": tokenized_input_text["offset_mapping"],
        "aspect_category": example["aspect_category"],
        "labels": one_hot_output
    }

In [104]:
tokenizer = AutoTokenizer.from_pretrained(constants.MODEL_NAME_OTE)

In [105]:
with open("example_dict.json", "r") as file:
    example = json.load(file)
example

{'text': 'Preis war nicht heiß, kümmerte den Service aber nicht.',
 'aspect_category': 'FOOD',
 'tags': [{'end': 4,
   'start': 0,
   'tag_with_polarity': 'FOOD-POSITIVE',
   'tag_with_polarity_and_type': 'FOOD-POSITIVE-explicit',
   'text': 'Suppe',
   'type': 'label-explicit',
   'label': 'FOOD',
   'polarity': 'POSITIVE'}],
 'id': '9372f1ee-58b3-425e-8a6b-775627bbd424'}

In [106]:
example = {'text': 'Preis war nicht heiß, kümmerte den Service aber nicht.',
           'aspect_category': 'FOOD',
           'tags': [{'end': 5,
                     'start': 0,
                     'tag_with_polarity': 'FOOD-POSITIVE',
                     'tag_with_polarity_and_type': 'FOOD-POSITIVE-explicit',
                     'text': 'Suppe',
                     'type': 'label-explicit',
                     'label': 'FOOD',
                     'polarity': 'POSITIVE'}],
           'id': '9372f1ee-58b3-425e-8a6b-775627bbd424'}

In [107]:
example["text"][example["tags"][0]["start"]:example["tags"][0]["end"]]

'Preis'

In [109]:
preprocess_example_OTE(example, tokenizer)["input_ids"][:20], preprocess_example_OTE(example, tokenizer)["labels"][:20]

example: {'text': 'Preis war nicht heiß, kümmerte den Service aber nicht.', 'aspect_category': 'FOOD', 'tags': [{'end': 5, 'start': 0, 'tag_with_polarity': 'FOOD-POSITIVE', 'tag_with_polarity_and_type': 'FOOD-POSITIVE-explicit', 'text': 'Suppe', 'type': 'label-explicit', 'label': 'FOOD', 'polarity': 'POSITIVE'}], 'id': '9372f1ee-58b3-425e-8a6b-775627bbd424'}
example: {'text': 'Preis war nicht heiß, kümmerte den Service aber nicht.', 'aspect_category': 'FOOD', 'tags': [{'end': 5, 'start': 0, 'tag_with_polarity': 'FOOD-POSITIVE', 'tag_with_polarity_and_type': 'FOOD-POSITIVE-explicit', 'text': 'Suppe', 'type': 'label-explicit', 'label': 'FOOD', 'polarity': 'POSITIVE'}], 'id': '9372f1ee-58b3-425e-8a6b-775627bbd424'}


([102,
  2020,
  285,
  255,
  1988,
  818,
  16947,
  30881,
  190,
  5502,
  494,
  255,
  566,
  103,
  161,
  28823,
  30909,
  103,
  0,
  0],
 [[1, 0, 0],
  [0, 1, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0],
  [1, 0, 0]])