In [None]:
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

In [2]:
import torch
import torch.nn as nn
import numpy as np
import importlib
import json
import random
import os
import re
from importlib import reload

In [None]:
# If you are using Colab

dir_path = "drive/Othercomputers/my_computer/dl-nlp_project_named-entity-recognition/"
# dir_path = "drive/MyDrive/dl-nlp_project_named-entity-recognition/"
module_path = dir_path.replace("/", ".")
# imports
data_module = importlib.import_module(module_path + "data")
load_data = data_module.load_data
extract_sentences_and_labels = data_module.extract_sentences_and_labels
generate_label_vocab = data_module.generate_label_vocab
split_data = data_module.split_data

In [50]:
# If you are using local machine
from data import (
    load_data,
    extract_sentences_and_labels,
    generate_label_vocab,
    split_data,
)

dir_path = "./"

In [51]:
train_file_path = dir_path + "data/train.json"
test_file_path = dir_path + "data/test.json"

In [52]:
train_data, test_data = load_data(train_file_path, test_file_path)
train_sentences, train_raw_labels = extract_sentences_and_labels(train_data)
test_sentences, test_raw_labels = extract_sentences_and_labels(test_data)

# Generate label vocabulary
label_vocab = generate_label_vocab(train_raw_labels + test_raw_labels)

In [53]:
SPECIAL_TOKEN = "<SPC>"


class Labels:
    def __init__(self, num_classes, names):
        super().__init__()
        self.names = names
        print(self.names)
        self.num_classes = num_classes

    def __getitem__(self, label_vector):
        return [self.names[idx] for idx, value in enumerate(label_vector) if value == 1]

    def decode(self, label_vector):
        return self.__getitem__(label_vector)

    def encode(self, names):
        indexes = []
        for name in names:
            index = self.names.index(name)
            indexes.append(index)
        tensor = torch.zeros(self.num_classes)
        for index in indexes:
            tensor[index] = 1
        return tensor

    def tensor2sentence(self, tensor):
        return [self.decode(vector) for vector in tensor]


ner_labels = Labels(
    num_classes=len(label_vocab) + 1, names=label_vocab + [SPECIAL_TOKEN]
)
id2label = ner_labels.decode
label2id = ner_labels.encode
ner_labels.num_classes

['ObjectiveDescription', 'Precondition', 'ResultMeasuredValue', 'DoseDescription', 'DoseValue', 'NumberPatientsCT', 'NumberPatientsArm', 'Frequency', 'NumberAffected', 'Author', 'Drug', 'ConfIntervalChangeValue', 'ConclusionComment', 'AvgAge', 'DiffGroupAbsValue', 'MinAge', 'TimePoint', 'PvalueDiff', 'PMID', 'ConfIntervalDiff', 'PercentageAffected', 'CTDesign', 'FinalNumPatientsArm', 'Journal', 'PValueChangeValue', 'AllocationRatio', 'Title', 'RelativeChangeValue', 'Country', 'SdDevBL', 'AggregationMethod', 'PublicationYear', 'SdDevChangeValue', 'ObservedResult', 'SubGroupDescription', 'SdDevResValue', '<SPC>']


37

In [54]:
def extract_sentences(json_file_path):
    with open(json_file_path, "r") as file:
        data = json.load(file)

    sentences = []

    for entry in data:
        for sentence in entry["sentences"]:
            tokens = sentence["words"]

            entities = sentence["entities"]
            labels_list = [torch.zeros(ner_labels.num_classes) for x in tokens]
            for label_entity in entities:
                start_pos = label_entity["start_pos"]
                end_pos = label_entity["end_pos"]
                label = label_entity["label"]
                for label_index in range(start_pos, end_pos + 1):
                    # this needs to be changed as well, see below
                    labels_list[label_index] = label2id([label])
            sentence["tokens"] = tokens
            sentence["labels_list"] = labels_list
            sentences.append(sentence)

    return [x["tokens"] for x in sentences], [x["labels_list"] for x in sentences]

In [55]:
train_sentences, train_labels = extract_sentences(train_file_path)
test_sentences, test_labels = extract_sentences(test_file_path)
train_sentences, train_labels, val_sentences, val_labels = split_data(
    train_sentences, train_labels
)

print(len(train_sentences), len(train_labels))
print(len(val_sentences), len(val_labels))
print(len(test_sentences), len(test_labels))

1300 1300
145 145
385 385


In [18]:
label = "CTDesign"
examples = []
examples_with_labels = []
for sentence, labels_list in zip(train_sentences, train_labels):
    new_sentence = sentence
    found = False
    for i, (token, labels) in enumerate(zip(sentence, labels_list)):
        if label in id2label(labels):
            found = True
            if not new_sentence[i].startswith("!!"):
                new_sentence[i] = f"!!{token}!!"
    if found:
        # print(" ".join(new_sentence))
        words = []
        for word, labels in zip(sentence, labels_list):
            # print(id2label(labels), word)
            words.append(f"{word} {id2label(labels)}")
        # print(words)
        # print()
        examples.append(new_sentence)
        examples_with_labels.append(words)

In [58]:
example_count = 5

for i in range(min(example_count, len(examples))):
    index = random.randint(0, len(examples) - 1)
    print(" ".join(examples[index]))
    print(examples_with_labels[index])
    print()

METHODS : This was a multinational , 52 - week , openlabel , parallel - group , !!noninferiority!! , !!treat!! !!-!! !!to!! !!-!! !!target!! trial .
['METHODS []', ': []', 'This []', 'was []', 'a []', 'multinational []', ', []', '52 []', '- []', 'week []', ', []', 'openlabel []', ', []', 'parallel []', '- []', 'group []', ', []', "!!noninferiority!! ['CTDesign']", ', []', "!!treat!! ['CTDesign']", "!!-!! ['CTDesign']", "!!to!! ['CTDesign']", "!!-!! ['CTDesign']", "!!target!! ['CTDesign']", 'trial []', '. []']

OBJECTIVE : This multicenter , double - blind , !!treat!! !!-!! !!to!! !!-!! !!target!! , phase 3 trial evaluated the efficacy and safety of fast - acting insulin aspart ( faster aspart ) versus insulin aspart ( IAsp ) in adults with type 2 diabetes receiving basal insulin and oral antidiabetic agents .
['OBJECTIVE []', ': []', "This ['ObjectiveDescription']", "multicenter ['ObjectiveDescription']", ", ['ObjectiveDescription']", "double ['ObjectiveDescription']", "- ['ObjectiveDe

In [None]:
new_data = {}
new_data["sentences"] = []
new_data["labels_lists"] = []

In [None]:
# TODO: Set up the workflow to also retrieve the labels
input_text = input("Input: ")
sentences = input_text.strip().split("      ")
for sentence in sentences:
    tokenized_words = re.findall(r"\b\w+\b|-|!!.*?!!|[,.:]", sentence)
    words = [
        word
        if not word.startswith("!!") and not word.endswith("!!")
        else word.strip("!")
        for word in tokenized_words
    ]
    labels_list = [
        ["ObjectiveDescription"]
        if not word.startswith("!!") and not word.endswith("!!")
        else ["ObjectiveDescription", label]
        for word in tokenized_words
    ]
    # print(words)
    # print(labels_list)
    new_data["sentences"].append(words)
    new_data["labels_lists"].append(labels_list)

In [None]:
data_file_name = f"{dir_path}data/labels/{label}.json"
with open(data_file_name, "w") as json_file:
    json.dump(new_data, json_file, indent=4)
    print(f"{data_file_name} was updated.")