In [81]:
label = "Frequency"

In [82]:
using_colab = False

In [83]:
if using_colab:
    from google.colab import drive

    drive.mount("/content/drive", force_remount=True)

In [84]:
import torch
import torch.nn as nn
import numpy as np
import importlib
import json
import random
import os
import re
from importlib import reload

In [85]:
if using_colab:
    dir_path = (
        "drive/Othercomputers/my_computer/dl-nlp_project_named-entity-recognition/"
    )
    # dir_path = "drive/MyDrive/dl-nlp_project_named-entity-recognition/"
    module_path = dir_path.replace("/", ".")
    # imports
    data_module = importlib.import_module(module_path + "data")
    load_data = data_module.load_data
    extract_sentences_and_labels = data_module.extract_sentences_and_labels
    generate_label_vocab = data_module.generate_label_vocab
    split_data = data_module.split_data

else:
    dir_path = "./"
    from data import (
        load_data,
        extract_sentences_and_labels,
        generate_label_vocab,
        split_data,
    )

In [86]:
train_file_path = dir_path + "data/train.json"
test_file_path = dir_path + "data/test.json"

In [87]:
train_data, test_data = load_data(train_file_path, test_file_path)
train_sentences, train_raw_labels = extract_sentences_and_labels(train_data)
test_sentences, test_raw_labels = extract_sentences_and_labels(test_data)

# Generate label vocabulary
label_vocab = generate_label_vocab(train_raw_labels + test_raw_labels)

In [88]:
SPECIAL_TOKEN = "<SPC>"


class Labels:
    def __init__(self, num_classes, names):
        super().__init__()
        self.names = names
        print(self.names)
        self.num_classes = num_classes

    def __getitem__(self, label_vector):
        return [self.names[idx] for idx, value in enumerate(label_vector) if value == 1]

    def decode(self, label_vector):
        return self.__getitem__(label_vector)

    def encode(self, names):
        indexes = []
        for name in names:
            index = self.names.index(name)
            indexes.append(index)
        tensor = torch.zeros(self.num_classes)
        for index in indexes:
            tensor[index] = 1
        return tensor

    def tensor2sentence(self, tensor):
        return [self.decode(vector) for vector in tensor]


ner_labels = Labels(
    num_classes=len(label_vocab) + 1, names=label_vocab + [SPECIAL_TOKEN]
)
id2label = ner_labels.decode
label2id = ner_labels.encode
ner_labels.num_classes

['NumberAffected', 'PvalueDiff', 'TimePoint', 'SubGroupDescription', 'Drug', 'ConfIntervalDiff', 'AllocationRatio', 'MinAge', 'Country', 'ResultMeasuredValue', 'Frequency', 'DiffGroupAbsValue', 'ObservedResult', 'SdDevChangeValue', 'FinalNumPatientsArm', 'ConfIntervalChangeValue', 'Precondition', 'SdDevResValue', 'PercentageAffected', 'DoseDescription', 'PublicationYear', 'SdDevBL', 'ObjectiveDescription', 'RelativeChangeValue', 'ConclusionComment', 'Journal', 'DoseValue', 'PValueChangeValue', 'AvgAge', 'AggregationMethod', 'NumberPatientsArm', 'CTDesign', 'Author', 'NumberPatientsCT', 'PMID', 'Title', '<SPC>']


37

In [89]:
def extract_sentences(json_file_path):
    with open(json_file_path, "r") as file:
        data = json.load(file)

    sentences = []

    for entry in data:
        for sentence in entry["sentences"]:
            tokens = sentence["words"]

            entities = sentence["entities"]
            labels_list = [torch.zeros(ner_labels.num_classes) for x in tokens]
            for label_entity in entities:
                start_pos = label_entity["start_pos"]
                end_pos = label_entity["end_pos"]
                label = label_entity["label"]
                label_id = label2id([label]).argmax().item()
                for label_index in range(start_pos, end_pos + 1):
                    labels_list[label_index][label_id] = 1
            sentence["tokens"] = tokens
            sentence["labels_list"] = labels_list
            sentences.append(sentence)

    return [x["tokens"] for x in sentences], [x["labels_list"] for x in sentences]

In [90]:
train_sentences, train_labels = extract_sentences(train_file_path)
test_sentences, test_labels = extract_sentences(test_file_path)
train_sentences, train_labels, val_sentences, val_labels = split_data(
    train_sentences, train_labels
)

print(len(train_sentences), len(train_labels))
print(len(val_sentences), len(val_labels))
print(len(test_sentences), len(test_labels))

1300 1300
145 145
385 385


In [96]:
data_file_name = f"{dir_path}data/labels/{label}.json"
if os.path.exists(data_file_name):
    with open(data_file_name, "r") as json_file:
        data = json.load(json_file)
else:
    data = {
        "sentences": [],
        "labels_lists": [],
    }

In [100]:
label_abbreviations = {
    "ObjectiveDescription": "OD",
    "Precondition": "PC",
    "RelativeChangeValue": "RCV",
    "DiffGroupAbsValue": "DGAV",
    "NumberPatientsCT": "NPC",
    "AllocationRatio": "AR",
    "DoseValue": "DV",
    "AggregatonMethod": "AM",
    "ResultMeasuredValue": "RMV",
    "SdDevResValue": "SDRV",
    "PvalueDiff": "PDiff",
    "ConfIntervalChangeValue": "CICV",
    "PValueChangeValue": "PVCV",
    "ConfIntervalDiff": "CID",
    "TimePoint": "TP",
    "PercentageAffected": "PA",
    "NumberAffected": "NA",
    "SubGroupDescription": "SGD",
    "MinAge": "MA",
    "Frequency": "F",
    "ObservedResult": "OR",
    "SdDevChangeValue": "SDCV",
    "FinalNumPatientsArm": "FNPA",
    "DoseDescription": "DD",
    "PublicationYear": "PY",
    "SdDevBL": "SDBL",
    "ConclusionComment": "CC",
    "Journal": "J",
    "AvgAge": "AA",
    "AggregationMethod": "AM",
    "NumberPatientsArm": "NPA",
    "CTDesign": "CTD",
    "Author": "A",
    "Title": "T",
    "Country": "C",
    "Drug": "D",
}
label_unabbreviations = {v: k for k, v in label_abbreviations.items()}

In [93]:
examples = []
examples_with_labels = []
for sentence, labels_list in zip(train_sentences, train_labels):
    new_sentence = sentence
    found = False
    for i, (token, labels) in enumerate(zip(sentence, labels_list)):
        if label in id2label(labels):
            found = True
            if not new_sentence[i].startswith("!!"):
                new_sentence[i] = f"!!{token}!!"
    if found:
        # print(" ".join(new_sentence))
        words = []
        for word, labels in zip(sentence, labels_list):
            # print(id2label(labels), word)
            abbreviated_labels = [
                label_abbreviations[label] if label in label_abbreviations else label
                for label in id2label(labels)
            ]
            words.append(f"{word} {abbreviated_labels}")
        # print(words)
        # print()
        examples.append(new_sentence)
        examples_with_labels.append(words)

print(len(examples))

55


In [106]:
label_unabbreviations["NPC"]

'NumberPatientsCT'

## Util Functions


In [34]:
def tokenize(input_str):
    pattern = r"!![^!]+!!|[-;/\.]|\w+|\S"
    # Find all matches
    tokens = re.findall(pattern, input_str)
    return tokens

In [22]:
def make_markdown_table(array, align: str = None):
    """
    Args:
        array: The array to make into a table. Mush be a rectangular array
               (constant width and height).
        align: The alignment of the cells : 'left', 'center' or 'right'.
    """
    # make sure every elements are strings
    array = [[str(elt) for elt in line] for line in array]
    # get the width of each column
    widths = [max(len(line[i]) for line in array) for i in range(len(array[0]))]
    # make every width at least 3 colmuns, because the separator needs it
    widths = [max(w, 3) for w in widths]
    # center text according to the widths
    array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array]

    # separate the header and the body
    array_head = array[0]
    array = array[1:]

    header = "| " + " | ".join(array_head) + " |"

    # alignment of the cells
    align = str(align).lower()  # make sure `align` is a lowercase string
    if align == "none":
        # we are just setting the position of the : in the table.
        # here there are none
        border_left = "| "
        border_center = " | "
        border_right = " |"
    elif align == "center":
        border_left = "|:"
        border_center = ":|:"
        border_right = ":|"
    elif align == "left":
        border_left = "|:"
        border_center = " |:"
        border_right = " |"
    elif align == "right":
        border_left = "| "
        border_center = ":| "
        border_right = ":|"
    else:
        raise ValueError("align must be 'left', 'right' or 'center'.")
    separator = (
        border_left + border_center.join(["-" * w for w in widths]) + border_right
    )

    # body of the table
    body = [""] * len(array)  # empty string list that we fill after
    for idx, line in enumerate(array):
        # for each line, change the body at the correct index
        body[idx] = "| " + " | ".join(line) + " |"
    body = "\n".join(body)

    return header + "\n" + separator + "\n" + body

## Code


In [122]:
import ast

index = random.randint(0, len(examples) - 1)
print(" ".join(examples[index]))
print(examples_with_labels[index])
print()
original_table = []
original_table.append(["Word", "Labels (Abrv.)", "Labels"])

for labels_str in examples_with_labels[index]:
    word = labels_str.split(" ")[0]
    labels = labels_str.split(" ")[1]
    label_list = ast.literal_eval(labels)
    original_table.append(
        [
            word,
            labels,
            [
                label_unabbreviations[label]
                if label in label_unabbreviations
                else label
                for label in label_list
            ],
        ]
    )
print(make_markdown_table(original_table, align="left"))

METHODS : In this randomized , open - label parallel study , !!twice!! !!-!! !!daily!! biphasic insulin aspart 30 ( 30 % soluble and 70 % protaminated insulin aspart ; BIAsp 30 ) plus metformin ( met ) was compared with !!once!! !!-!! !!daily!! insulin glargine ( glarg ) plus glimepiride ( glim ) in 255 insulin - na ï ve patients ( 131 male ; mean + / - SD age , 61 . 2 + / - 9 . 1 years ) .
['METHODS []', ': []', 'In []', 'this []', 'randomized []', ', []', 'open []', '- []', 'label []', 'parallel []', 'study []', ', []', "!!twice!! ['F']", "!!-!! ['F']", "!!daily!! ['F']", 'biphasic []', 'insulin []', 'aspart []', '30 []', '( []', '30 []', '% []', 'soluble []', 'and []', '70 []', '% []', 'protaminated []', 'insulin []', 'aspart []', '; []', 'BIAsp []', '30 []', ') []', 'plus []', 'metformin []', '( []', 'met []', ') []', 'was []', 'compared []', 'with []', "!!once!! ['F']", "!!-!! ['F']", "!!daily!! ['F']", 'insulin []', 'glargine []', '( []', 'glarg []', ') []', 'plus []', 'glimepiri

In [127]:
input_text = input("Input Sentence: ")
token_list = tokenize(input_text.strip('"'))
labels_list = [[] for x in token_list]
for i, token in enumerate(token_list):
    suggesting = i < len(original_table) and 0 < i
    if suggesting:
        suggestion = f", Suggestion: {original_table[i][1]}, {original_table[i][2]}"
        print(
            f"{i:^3}: {token:^20}{suggestion}",
        )
    else:
        print(f"{i:^3}: {token:^20}")
table = []
table.append(["Id", "Word", "Labels"])
input_label = label_abbreviations[label] if label in label_abbreviations else label
while len(input_label) > 0:
    start_idx = int(input(f"Start idx for {input_label}: "))
    end_idx = int(input(f"End idx for {input_label}: "))
    for i in range(start_idx, end_idx + 1):
        labels_list[i].append(
            label_unabbreviations[input_label]
            if input_label in label_unabbreviations
            else input_label
        )
    input_label = input("Label: ")
    while isinstance(input_label, int):
        input_label = input("Label: ")

token_list = [token.replace("!!", "") for token in token_list]
print(labels_list)
for i, (token, labels) in enumerate(zip(token_list, labels_list)):
    table.append([i, token, labels])
print(make_markdown_table(table, align="left"))
submit = input("Hit enter to submit")
if len(submit) == 0:
    data["sentences"].append(token_list)
    data["labels_lists"].append(labels_list)
    print("Data submitted")

 0 :       METHODS       
 1 :          :          , Suggestion: [], []
 2 :       Patients      , Suggestion: [], []
 3 :         with        , Suggestion: [], []
 4 :         type        , Suggestion: [], []
 5 :          2          , Suggestion: [], []
 6 :       diabetes      , Suggestion: [], []
 7 :          (          , Suggestion: [], []
 8 :         HbA         , Suggestion: [], []
 9 :          (          , Suggestion: [], []
10 :          1c         , Suggestion: [], []
11 :          )          , Suggestion: [], []
12 :          6          , Suggestion: [], []
13 :          .          , Suggestion: ['F'], ['Frequency']
14 :          8          , Suggestion: ['F'], ['Frequency']
15 :          -          , Suggestion: ['F'], ['Frequency']
16 :          9          , Suggestion: [], []
17 :          .          , Suggestion: [], []
18 :          8          , Suggestion: [], []
19 :          %          , Suggestion: [], []
20 :          [          , Suggestion: [], []
21 :        

ValueError: invalid literal for int() with base 10: ''

In [53]:
data_file_name = f"{dir_path}data/labels/{label}.json"
with open(data_file_name, "w") as json_file:
    json.dump(data, json_file, indent=2)
    print(f"{data_file_name} was updated.")

./data/labels/ConfIntervalChangeValue.json was updated.
