In [None]:
import os

import datasets
import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch
from accelerate import Accelerator
from huggingface_hub import create_repo, get_full_repo_name, notebook_login, Repository
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader
from TorchCRF import CRF
from tqdm.auto import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, get_scheduler, pipeline, Trainer, TrainingArguments

In [None]:
# Paths to the data: folder with the corpus .txt files and folder with the
# matching .ann annotation files (same base file name, different extension)
TEXT_FOLDER = './data/CADEC.v2/cadec/text/'
OG_ANN_FOLDER = './data/CADEC.v2/cadec/original/'

# Convert annotations into IOB

In [None]:
#####################################################
# ------------------- FUNCTIONS ------------------- #
#####################################################

# Function to read text from a .txt file
def read_text(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

# Function to read annotations from a .ann file
def read_annotations(file_path):
    """Parse a tab-separated .ann file into a list of character spans.

    Each non-comment line is expected to look like
    ``<id>\\t<label> <start> <end>[;<start> <end>...]\\t<text>``.
    Lines starting with ``#`` and blank lines are skipped.

    Discontinuous annotations (several ``start end`` pairs separated by ``;``)
    produce one entry per fragment, all sharing the same label.

    Args:
        file_path: Path to the .ann file (read as UTF-8, like ``read_text``).

    Returns:
        A list of dicts with keys ``"start"`` (int), ``"end"`` (int) and
        ``"label"`` (str), in file order.

    Raises:
        IndexError: If a non-blank, non-comment line has no second tab column.
        ValueError: If a fragment is not exactly two integer offsets.
    """
    annotations = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Skip blank lines and '#' lines
            if not line.strip() or line.startswith('#'):
                continue

            parts = line.rstrip('\n').split('\t')

            # Second column: the label, then the span description
            label, _, span_text = parts[1].partition(' ')

            # Split on ';' FIRST so every fragment keeps its own start/end pair.
            # Splitting on whitespace first would glue "end;start" together
            # and drop every fragment after the first one.
            for fragment in span_text.split(';'):
                start, end = fragment.split()
                annotations.append({"start": int(start), "end": int(end), "label": label})

    return annotations

# Update tags based on the annotations
def annotate_text(doc, annotations):
    """Build one IOB tag per token from character-offset annotations.

    A token belongs to an annotation when its character range
    ``[token.idx, token.idx + len(token))`` overlaps the annotation range
    ``[start, end)``. The first overlapping token is tagged ``B-<label>`` and
    the remaining ones ``I-<label>``. Using overlap (instead of looking for the
    first token that begins at or after ``start``) keeps the tags on the right
    tokens when an annotation boundary falls inside a token.

    Args:
        doc: Sequence of tokens exposing ``.idx`` (character offset) and
            ``len()`` (character length), e.g. the result of ``nlp(text)``.
        annotations: Iterable of dicts with ``"start"``, ``"end"`` and
            ``"label"`` keys, as produced by ``read_annotations``.

    Returns:
        A list of tag strings, one per token; untouched tokens are ``"O"``.
        When annotations overlap, the later annotation overwrites the earlier.
    """
    tags = ["O"] * len(doc)
    for annotation in annotations:
        start, end, label = annotation["start"], annotation["end"], annotation["label"]

        # Indices of every token sharing at least one character with the span
        covered = [
            i for i, token in enumerate(doc)
            if token.idx < end and token.idx + len(token) > start
        ]

        for position, token_index in enumerate(covered):
            prefix = "B" if position == 0 else "I"
            tags[token_index] = f"{prefix}-{label}"
    return tags

# Function that aligns tokens with labels
def align_labels_with_tokens(labels, word_ids, label_map=None):
    """Expand word-level tags to sub-token label ids.

    Args:
        labels: Word-level tag names (e.g. ``"B-ADR"``), indexed by word id.
        word_ids: One entry per sub-token: the index of the word it came from,
            or ``None`` for special tokens.
        label_map: Mapping from tag name to integer id. Defaults to the
            notebook-level ``label2id`` so existing two-argument calls keep
            working.

    Returns:
        A list of label ids, one per sub-token. Special tokens get ``-100``.
        The first sub-token of a word keeps the word's tag; following
        sub-tokens of a ``B-XXX`` word get the id of ``I-XXX``.
    """
    if label_map is None:
        # Fall back to the mapping defined in the notebook
        label_map = label2id

    new_labels = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            new_labels.append(-100)
        elif word_id != previous_word:
            # Start of a new word
            new_labels.append(label_map[labels[word_id]])
        else:
            # Continuation of the previous word: B-XXX becomes I-XXX.
            # Converting by tag name avoids relying on B ids being odd and
            # the matching I id being exactly B + 1.
            tag = labels[word_id]
            if isinstance(tag, str) and tag.startswith("B-"):
                inside_tag = "I-" + tag[2:]
                if inside_tag in label_map:
                    tag = inside_tag
            new_labels.append(label_map[tag])
        previous_word = word_id

    return new_labels

# Function that tokenizes the words and aligns the labels with the resulting tokens
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach aligned labels.

    ``examples`` must provide ``"tokens"`` (list of word lists) and
    ``"ner_tags"`` (list of tag lists). The notebook-level ``tokenizer`` is
    called with truncation on pre-split words; the returned encoding gets a
    ``"labels"`` entry built with ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )
    # One aligned label list per example, using that example's word ids
    encoded["labels"] = [
        align_labels_with_tokens(tags, encoded.word_ids(position))
        for position, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# count total amount each entity type is in the training set
def count_entities(dataset):
    """Count how often each tag occurs across all ``'ner_tags'`` in ``dataset``.

    Returns a plain dict mapping tag -> number of occurrences, with keys in
    order of first appearance.
    """
    tally = {}
    for example in dataset:
        for tag in example['ner_tags']:
            tally[tag] = tally.get(tag, 0) + 1
    return tally

# Compute metrics for the training evaluation
# def compute_metrics(eval_preds):
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)

#     # Remove ignored index (special tokens) and convert to labels
#     true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
#     true_predictions = [
#         [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": all_metrics["overall_precision"],
#         "recall": all_metrics["overall_recall"],
#         "f1": all_metrics["overall_f1"],
#         "accuracy": all_metrics["overall_accuracy"],
#     }

# Improved compute metrics for evaluation during training
def compute_metrics(eval_preds):
    """Compute per-label and aggregate F1 scores for an evaluation pass.

    Args:
        eval_preds: Pair ``(logits, labels)``; the arg-max over the last axis
            of ``logits`` is taken as the prediction and positions whose label
            is ``-100`` are ignored.

    Returns:
        Dict with ``"entity_metrics"`` (label -> precision/recall/f1),
        ``"macro_f1"`` (mean of the per-label F1 values) and ``"micro_f1"``
        (micro-averaged F1 over all kept token labels).

    NOTE(review): the per-label scores are computed on sequence-level presence
    (does the label occur anywhere in the sequence), not on token or span
    matches, and every entry of ``label_names`` (including 'O') takes part in
    both averages — confirm this is the intended metric.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Drop ignored positions (-100) and map ids back to tag names
    true_labels = [
        [label_names[gold] for gold in gold_row if gold != -100]
        for gold_row in labels
    ]
    true_predictions = [
        [label_names[pred] for (pred, gold) in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predictions, labels)
    ]

    entity_metrics = {}
    for entity in label_names:
        # Binary indicator per sequence: does this label appear at all?
        gold_presence = [int(entity in sequence) for sequence in true_labels]
        pred_presence = [int(entity in sequence) for sequence in true_predictions]

        entity_metrics[entity] = {
            "precision": precision_score(gold_presence, pred_presence),
            "recall": recall_score(gold_presence, pred_presence),
            "f1": f1_score(gold_presence, pred_presence),
        }

    # Flatten the per-sequence lists into single token-level lists
    flat_true_labels = []
    for sequence in true_labels:
        flat_true_labels.extend(sequence)
    flat_true_predictions = []
    for sequence in true_predictions:
        flat_true_predictions.extend(sequence)

    per_label_f1 = [entity_metrics[entity]["f1"] for entity in label_names]
    macro_f1 = np.mean(per_label_f1)
    micro_f1 = f1_score(flat_true_labels, flat_true_predictions, average='micro')

    return {
        "entity_metrics": entity_metrics,
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
    }

# Function that post-processes the predictions and returns the true labels and the true predictions
def postprocess(predictions, labels):
    """Convert prediction/label tensors to lists of tag names.

    Both tensors are detached, moved to CPU and converted to NumPy. Positions
    whose label is ``-100`` are dropped from both outputs.

    Returns:
        ``(true_labels, true_predictions)`` — note the order: labels first.
    """
    pred_array = predictions.detach().cpu().clone().numpy()
    gold_array = labels.detach().cpu().clone().numpy()

    # Reference tags, skipping ignored positions
    true_labels = []
    for gold_row in gold_array:
        true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted tags at the same (non-ignored) positions
    true_predictions = []
    for pred_row, gold_row in zip(pred_array, gold_array):
        true_predictions.append(
            [label_names[pred] for (pred, gold) in zip(pred_row, gold_row) if gold != -100]
        )

    return true_labels, true_predictions

# Transform data into desired HuggingFace format

In [None]:
# === Load the corpus and convert its annotations to IOB tags ===
# Path to data
TEXT_FOLDER = './data/CADEC.v2/cadec/text/'
OG_ANN_FOLDER = './data/CADEC.v2/cadec/original/'

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Get a list of all text files in the folder.
# Sorted because os.listdir returns entries in arbitrary order; a fixed order
# keeps the DataFrame index (and any seeded split built on it) reproducible.
text_files = sorted(f for f in os.listdir(TEXT_FOLDER) if f.endswith(".txt"))

# One {"tokens", "ner_tags"} record per document
data_arr = []

# Loop through each text file
for text_file in text_files:
    # Build file paths; only the extension is swapped so a ".txt" elsewhere
    # in the file name is left alone
    txt_file_path = os.path.join(TEXT_FOLDER, text_file)
    ann_file_path = os.path.join(OG_ANN_FOLDER, os.path.splitext(text_file)[0] + ".ann")

    # Read text from the .txt file
    text = read_text(txt_file_path)

    # Read annotations from the .ann file
    annotations = read_annotations(ann_file_path)

    # Process the text with spaCy
    doc = nlp(text)

    # One IOB tag per spaCy token
    tags_array = annotate_text(doc, annotations)

    # Create a word by word array
    words_array = [token.text for token in doc]

    # Store the data for this document
    data_arr.append({"tokens": words_array, "ner_tags": tags_array})

# Convert the list of dictionaries to a pandas dataframe
df = pd.DataFrame(data_arr)

# Preview the first rows
df.head()

In [None]:
#####################################################
# -- Pandas DataFrame to HuggingFace DatasetDict -- #
#####################################################

# Fixed seed so the train/val/test membership is the same on every run
SPLIT_SEED = 42

# Split data into train, val & test sets:
# 20% test, then 20% of the remainder as validation
data, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)


def _split_to_frame(split):
    """Return a fresh frame with 'id' (original index as str), 'tokens' and 'ner_tags'."""
    return pd.DataFrame({
        'id': split.index.values.astype(str),
        'tokens': split.tokens.values,
        'ner_tags': split.ner_tags.values
    })


# Convert to dataset dictionaries
train_dataframe = _split_to_frame(train)
dev_dataframe = _split_to_frame(val)
test_dataframe = _split_to_frame(test)

# From dictionaries to DatasetDict
train_dataset = datasets.Dataset.from_dict(train_dataframe)
dev_dataset = datasets.Dataset.from_dict(dev_dataframe)
test_dataset = datasets.Dataset.from_dict(test_dataframe)

# Join into one
raw_data = datasets.DatasetDict({'train': train_dataset, 'validation': dev_dataset, 'test': test_dataset})
raw_data

In [None]:
#####################################################
# ------------ Statistics from dataset ------------ #
#####################################################

# Tag counts per split
train_count = count_entities(raw_data['train'])
val_count = count_entities(raw_data['validation'])
test_count = count_entities(raw_data['test'])

# Independent copies under the names this cell has always defined, so later
# in-place edits of val_count / test_count do not affect them
entity_count_val = dict(val_count)
entity_count_test = dict(test_count)

# Total over all splits. Start from the train counts and add the other two
# with .get(), so a tag missing from one split (or present only in val/test)
# is still counted instead of raising KeyError.
entity_count = dict(train_count)
for split_count in (val_count, test_count):
    for key, value in split_count.items():
        entity_count[key] = entity_count.get(key, 0) + value

print ( "all datasets: \n", entity_count)
print ( "train dataset: \n", train_count)

In [None]:
# remove 'O' from all counts (including the test counts).
# pop(..., None) keeps this cell safe to re-run after 'O' is already gone.
for counts in (entity_count, train_count, val_count, test_count):
    counts.pop('O', None)

In [None]:
# One bar chart per split, all sharing the (sorted) train label set on the x axis
entity_labels = sorted(train_count)

bar_width = 0.2

# Bars and their ticks share the same x positions so the labels sit under the bars
positions = np.arange(len(entity_labels))

fig, axs = plt.subplots(1, 3, figsize=(15, 5))

split_counts = [train_count, val_count, test_count]
split_titles = ['Train', 'Val', 'Test']
split_colors = ['steelblue', 'darkolivegreen', 'darksalmon']

# `counts` (not `data`) so the DataFrame named `data` from the split cell is not overwritten
for ax, counts, title, color in zip(axs, split_counts, split_titles, split_colors):
    # .get(label, 0): a label seen in train may be absent from a smaller split
    heights = [counts.get(label, 0) for label in entity_labels]
    ax.bar(positions, heights, width=bar_width, color=color, label=title)
    ax.set_title(title, fontsize=18, color=color)
    ax.set_xlabel('Entity Labels', fontsize=16)
    ax.set_ylabel('Count', fontsize=16)
    ax.set_xticks(positions)
    ax.set_xticklabels(entity_labels, rotation=45, ha='right')
    ax.legend()

plt.tight_layout()
plt.show()

# Tokenize data

In [None]:
# plot total amount of labels starting with B compared to labels starting with I

# Entity types in a fixed order, so the B and I bars drawn side by side always
# belong to the same type (dict insertion order gives no such guarantee)
entity_types = sorted({key[2:] for key in entity_count if key.startswith(('B-', 'I-'))})

count_b_entities = {f'B-{entity}': entity_count.get(f'B-{entity}', 0) for entity in entity_types}
count_i_entities = {f'I-{entity}': entity_count.get(f'I-{entity}', 0) for entity in entity_types}

fig, ax = plt.subplots(figsize=(10, 6))
bar_width = 0.35

print(count_b_entities)
print(count_i_entities)

x_positions = np.arange(len(entity_types))
b_bars = ax.bar(x_positions, list(count_b_entities.values()), bar_width, label='B-Entities')
i_bars = ax.bar(x_positions + bar_width, list(count_i_entities.values()), bar_width, label='I-Entities')

ax.set_xlabel('Entity types')
ax.set_ylabel('Total count')
ax.set_title('Total count of entities starting with B and I')
# Tick centred between the B and I bar of each entity type
ax.set_xticks(x_positions + bar_width / 2)
ax.set_xticklabels(entity_types)
ax.legend()


plt.show()

# BERT

In [None]:
#####################################################
# ----------------- Get tokenizer ----------------- #
#####################################################

# Checkpoint the tokenizer is loaded from
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tag names; the position in this list is the label id
label_names = [
    'O',
    'B-ADR', 'I-ADR',
    'B-Disease', 'I-Disease',
    'B-Drug', 'I-Drug',
    'B-Finding', 'I-Finding',
    'B-Symptom', 'I-Symptom',
]

# id -> name and name -> id lookups
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Tokenize every split and replace the raw columns with the model inputs
original_columns = raw_data["train"].column_names
tokenized_datasets = raw_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=original_columns,
)

In [None]:
#####################################################
# ------------------- Get Model ------------------- #
#####################################################

# Get data collator with the previously defined tokenizer
# (used as `data_collator` by the Trainer and as `collate_fn` by the DataLoaders below)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Get the "seqeval" metric; it is fed with tag-name sequences in the
# accelerate training loop further down
metric = evaluate.load("seqeval")

# Get model from checkpoint, configured with the notebook's label maps
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Number of labels the model has (displayed as the cell output)
model.config.num_labels

# CRF-BERT


In [None]:
#####################################################
# ------------- Train Base-line Model ------------- #
#####################################################

# Set training arguments: evaluate and save once per epoch, 3 epochs,
# results written to "bert-finetuned-ner" and pushed to the Hub
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

# Set trainer: trains on the tokenized train split and evaluates on the
# validation split with the notebook's compute_metrics
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train
trainer.train()

# Push to hub (saving the train in HuggingFace)
# NOTE(review): presumably requires being logged in to the Hub beforehand — confirm
trainer.push_to_hub(commit_message="Training complete")

# BiLSTM-CRF

In [None]:
# === Load/prepare tokenizer ===
# This cell rebuilds tokenizer, label maps, tokenized data and model from
# scratch, then trains a model that has a CRF module attached.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tag names; the position in this list is the label id
label_names = ['O', 'B-ADR', 'I-ADR',
               'B-Disease', 'I-Disease',
               'B-Drug', 'I-Drug',
               'B-Finding', 'I-Finding',
               'B-Symptom', 'I-Symptom']

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

# Tokenize every split and drop the raw columns
tokenized_datasets = raw_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_data["train"].column_names,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# add CRF layer to the model
# NOTE(review): the CRF is only stored as an attribute here; nothing in this
# cell calls `model.crf`. Unless the model's forward pass uses it (not visible
# here), training behaves like the baseline — confirm.
model.crf = CRF(num_labels=model.config.num_labels)
model.config.num_labels

# Same hyper-parameters as the baseline, different output directory
args = TrainingArguments(
    "bert-finetuned-ner-CRF",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

trainer.push_to_hub(commit_message="Training complete")

# Optimizing and Training with Accelerator


In [None]:
# === Load/prepare tokenizer ===
# This cell rebuilds tokenizer, label maps, tokenized data and model from
# scratch, then trains a model that has a bidirectional LSTM attached.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tag names; the position in this list is the label id
label_names = ['O', 'B-ADR', 'I-ADR',
               'B-Disease', 'I-Disease',
               'B-Drug', 'I-Drug',
               'B-Finding', 'I-Finding',
               'B-Symptom', 'I-Symptom']

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

# Tokenize every split and drop the raw columns
tokenized_datasets = raw_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_data["train"].column_names,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

#add bilstm layer to the model
# NOTE(review): the LSTM is only stored as an attribute here; nothing in this
# cell calls `model.bilstm`. Unless the model's forward pass uses it (not
# visible here), training behaves like the baseline — confirm.
model.bilstm = torch.nn.LSTM(model.config.hidden_size, model.config.hidden_size, num_layers=1, bidirectional=True, batch_first=True)
model.config.num_labels

# Same hyper-parameters as the baseline, different output directory
args = TrainingArguments(
    "bert-finetuned-ner-bilstm",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

trainer.push_to_hub(commit_message="Training complete")

In [None]:
#####################################################
# ------------ Set-up with Accelerator ------------ #
#####################################################

# Replace this with your own checkpoint
# NOTE(review): this is a local directory path; it has to exist before this
# cell runs. `token_classifier` is not used by any later cell visible here.
model_checkpoint = 'bert-finetuned-ner-accelerate/'
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

# Get dataloader for train and evaluation (batches of 8, padded by the data collator;
# only the training loader is shuffled)
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

# Get model from the new pretrained checkpoint
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Set optimizer AdamW in this case
optimizer = AdamW(model.parameters(), lr=2e-5)

# Set accelerator
accelerator = Accelerator()

# Prepare: hand model, optimizer and both dataloaders to the accelerator and
# use the returned objects from here on
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

# Set accelerator train parameters
# The step count is taken from the prepared dataloader (one optimizer step per batch)
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear learning-rate schedule over all training steps, without warm-up
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
#####################################################
# --------------- HuggingFace Setup --------------- #
#####################################################

# Set model and repository name
model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
print(f'Repo name: {repo_name}')

# Create the repository under the same id that is cloned below, instead of a
# hard-coded user name that may differ from the logged-in account.
# exist_ok=True makes the cell safe to re-run once the repository exists.
create_repo(repo_name, repo_type="model", exist_ok=True)

# Set output directory and clone the repository into it
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

# Evaluation

In [None]:
#####################################################
# ----------- Training with Accelerator ----------- #
#####################################################

# One tick per optimizer step across all epochs
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (true_labels, true_predictions) in that order.
        # Unpacking them the other way round would feed the references as
        # predictions and swap the reported precision and recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    # Aggregate everything added during this epoch's evaluation pass
    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        # Only the main process writes the tokenizer and pushes to the Hub
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

In [None]:
#####################################################
# ------------ Evaluating the training ------------ #
#####################################################

# Initializing the trainer again, to run the evaluation
# NOTE(review): `model` and `args` are whatever the most recently executed
# cells left in those names — confirm they are the ones you intend to evaluate.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    # eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Evaluate the model on the test set (passed explicitly, since the trainer
# above has no eval_dataset)
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
test_results

In [None]:
# === Evaluation / metrics ===
# Print the results and the checkpoint used
print (f"checkpoint: {model_checkpoint}")
for key, value in test_results.items():
    if isinstance(value, (int, float)):
        # Scalar metrics are shown with three decimals
        print(f"{key}: {value:.3f}")
    elif not isinstance(value, dict):
        # Anything else that is not a nested dict is printed as-is
        print(f"{key}: {value}")
    # Nested dicts (the per-entity metrics) cannot take a float format spec
    # and are printed in detail below

# Print all entity metrics
entity_metrics = test_results["eval_entity_metrics"]
macro_f1 = test_results["eval_macro_f1"]
micro_f1 = test_results["eval_micro_f1"]

# Display the evaluation results
print("Entity Metrics:")
for entity, metrics in entity_metrics.items():
    print(f"{entity}: Precision={metrics['precision']}, Recall={metrics['recall']}, F1={metrics['f1']}")

print(f"Macro-Averaged F1: {macro_f1}")
print(f"Micro-Averaged F1: {micro_f1}")