In [1]:
using_colab = False

In [2]:
if using_colab:
    from google.colab import drive

    drive.mount("/content/drive", force_remount=True)

In [3]:
import torch
import torch.nn as nn
import numpy as np
import importlib
import json
import random
import os
import re
from importlib import reload

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
if using_colab:
    dir_path = (
        "drive/Othercomputers/my_computer/dl-nlp_project_named-entity-recognition/"
    )
    # dir_path = "drive/MyDrive/dl-nlp_project_named-entity-recognition/"
    module_path = dir_path.replace("/", ".")
    # imports
    data_module = importlib.import_module(module_path + "data")
    load_data = data_module.load_data
    extract_sentences_and_labels = data_module.extract_sentences_and_labels
    generate_label_vocab = data_module.generate_label_vocab
    split_data = data_module.split_data

else:
    dir_path = "./"
    from data import (
        load_data,
        extract_sentences_and_labels,
        generate_label_vocab,
        split_data,
    )

In [5]:
train_file_path = dir_path + "data/train.json"
test_file_path = dir_path + "data/test.json"

In [6]:
train_data, test_data = load_data(train_file_path, test_file_path)
train_sentences, train_raw_labels = extract_sentences_and_labels(train_data)
test_sentences, test_raw_labels = extract_sentences_and_labels(test_data)

# Generate label vocabulary
label_vocab = generate_label_vocab(train_raw_labels + test_raw_labels)

In [7]:
SPECIAL_TOKEN = "<SPC>"


class Labels:
    def __init__(self, num_classes, names):
        super().__init__()
        self.names = names
        print(self.names)
        self.num_classes = num_classes

    def __getitem__(self, label_vector):
        return [self.names[idx] for idx, value in enumerate(label_vector) if value == 1]

    def decode(self, label_vector):
        return self.__getitem__(label_vector)

    def encode(self, names):
        indexes = []
        for name in names:
            index = self.names.index(name)
            indexes.append(index)
        tensor = torch.zeros(self.num_classes)
        for index in indexes:
            tensor[index] = 1
        return tensor

    def tensor2sentence(self, tensor):
        return [self.decode(vector) for vector in tensor]


ner_labels = Labels(
    num_classes=len(label_vocab) + 1, names=label_vocab + [SPECIAL_TOKEN]
)
id2label = ner_labels.decode
label2id = ner_labels.encode
ner_labels.num_classes

['Frequency', 'PMID', 'SdDevResValue', 'Title', 'Country', 'NumberPatientsArm', 'ConfIntervalDiff', 'PvalueDiff', 'AggregationMethod', 'ConfIntervalChangeValue', 'DiffGroupAbsValue', 'Journal', 'AllocationRatio', 'TimePoint', 'ConclusionComment', 'Drug', 'SdDevChangeValue', 'FinalNumPatientsArm', 'DoseValue', 'CTDesign', 'Author', 'ResultMeasuredValue', 'RelativeChangeValue', 'SubGroupDescription', 'SdDevBL', 'PublicationYear', 'NumberAffected', 'AvgAge', 'Precondition', 'NumberPatientsCT', 'PercentageAffected', 'ObservedResult', 'ObjectiveDescription', 'MinAge', 'DoseDescription', 'PValueChangeValue', '<SPC>']


37

In [8]:
def extract_sentences(json_file_path):
    with open(json_file_path, "r") as file:
        data = json.load(file)

    sentences = []

    for entry in data:
        for sentence in entry["sentences"]:
            tokens = sentence["words"]

            entities = sentence["entities"]
            labels_list = [torch.zeros(ner_labels.num_classes) for x in tokens]
            for label_entity in entities:
                start_pos = label_entity["start_pos"]
                end_pos = label_entity["end_pos"]
                label = label_entity["label"]
                label_id = label2id([label]).argmax().item()
                for label_index in range(start_pos, end_pos + 1):
                    labels_list[label_index][label_id] = 1
            sentence["tokens"] = tokens
            sentence["labels_list"] = labels_list
            sentences.append(sentence)

    return [x["tokens"] for x in sentences], [x["labels_list"] for x in sentences]

In [9]:
train_sentences, train_labels = extract_sentences(train_file_path)
test_sentences, test_labels = extract_sentences(test_file_path)
train_sentences, train_labels, val_sentences, val_labels = split_data(
    train_sentences, train_labels
)

print(len(train_sentences), len(train_labels))
print(len(val_sentences), len(val_labels))
print(len(test_sentences), len(test_labels))

1300 1300
145 145
385 385


In [10]:
additional_labels = []
for label in ner_labels.names:
    data_file_name = f"{dir_path}data/labels/{label}.json"
    if os.path.exists(data_file_name):
        with open(data_file_name, "r") as json_file:
            data = json.load(json_file)
    else:
        continue
    for labels_list in data["labels_lists"]:
        additional_labels.append([label2id(labels) for labels in labels_list])

print(len(additional_labels))

76


In [11]:
def calculate_entropy(label_counts, total_labels):
    entropy = torch.zeros(1)
    for i, label in enumerate(label_vocab):
        p = label_counts[i] / total_labels
        entropy += -p * torch.log(p)
    return entropy.item()

In [12]:
import csv

label_counts = torch.zeros(ner_labels.num_classes)
for label_list in train_labels + val_labels + test_labels:
    for label in label_list:
        label_counts += label
total_labels = label_counts.sum()

original_entropy = calculate_entropy(label_counts, total_labels)

label_counts = torch.zeros(ner_labels.num_classes)
for label_list in train_labels + val_labels + test_labels + additional_labels:
    for label in label_list:
        label_counts += label
total_labels = label_counts.sum()

new_entropy = calculate_entropy(label_counts, total_labels)

data = []
for i, label in enumerate(label_vocab):
    data.append(
        [
            label,
            label_counts[i].int().item(),
            (label_counts[i] / total_labels).item() * 100,
        ]
    )

csv_file_path = dir_path + "data/label_distribution.csv"

with open(csv_file_path, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Label", "Count", "Percentage"])
    writer.writerows(data)

print(f"Data has been written to {csv_file_path}")

print(f"Entropy: {new_entropy:.3f}")
print(
    f"Entropy Improvement: {(original_entropy - new_entropy) / original_entropy * 100:.3f}%"
)

Data has been written to ./data/label_distribution.csv
Entropy: 2.706
Entropy Improvement: -2.754%
