# Prep

In [None]:
# Install dependencies
# %pip install -q -U ipywidgets transformers tqdm
# %pip install -q -U seqeval
# %pip install -q -U accelerate
# %pip install -q -U transformers[torch]
# %pip install -q --upgrade -U torch torchvision torchaudio torchtext
# %pip install -q dill==0.3.1.1
# %pip install -q numpy==1.14.3
# %pip install -q pyarrow==0.3.8
# %pip install -q multiprocess==0.70.16
# %pip install -q -U datasets==2.6.0
# %pip install fsspec==2023.9.2
# %pip install spacy
# %pip install spacy-en-core-web-sm
# !python3 -m spacy download en_core_web_sm

In [None]:
import torch
import torchtext
from datasets import load_dataset
import nltk
from datasets import DatasetDict, Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
import transformers
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
import numpy as np
import gc
from transformers import Trainer
from datasets import DatasetDict, Dataset
from transformers.pipelines import PIPELINE_REGISTRY
from pipeline import NER_Pipeline
from huggingface_hub import notebook_login

In [None]:
# Reproducibility and device configuration for the whole notebook.
SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator this notebook has in scope:
# torch and numpy (imported above as np).
torch.manual_seed(SEED)
np.random.seed(SEED)

# Ask cuDNN for deterministic kernels and switch off its autotuner, which can
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
# DEVICE.type is "cuda" for every CUDA device, including indexed ones like "cuda:0".
print(f"Using {'GPU' if DEVICE.type == 'cuda' else 'CPU'}.")

# Training Data

In [None]:
# Load the PLOD-CW dataset by its Hub identifier.
# NOTE(review): download_mode="force_redownload" is passed on every run, presumably to
# bypass a stale local cache; confirm it is still wanted, as it repeats the download each time.
dataset = load_dataset("surrey-nlp/PLOD-CW", cache_dir=None, download_mode="force_redownload")

In [None]:
# Label vocabulary for the tagging task. A label's position in this list is the integer
# class id used when labels are encoded for training and when predictions are decoded.
label_list = ['B-O', 'B-AC', 'B-LF', 'I-LF']
print(label_list)

In [None]:
# Bind each split of the loaded dataset to its own name and report its row count.
train = dataset['train']
print(f"train size: {len(train)}")
val = dataset['validation']
print(f"val size: {len(val)}")
test = dataset['test']
print(f"test size: {len(test)}")

def flatten(A):
    """
    Flatten arbitrarily nested lists into a single flat list, preserving order.

    Only `list` instances are expanded; any other element (including tuples)
    is kept as a single item.

    Args:
        A: An iterable whose elements may themselves be (nested) lists.

    Returns:
        A new flat list with the non-list elements in depth-first order.
    """
    flat_items = []
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(A)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend into the nested list; the parent iterator keeps its
                # position and is resumed once the nested one is exhausted.
                pending.append(iter(item))
                break
            flat_items.append(item)
        else:
            pending.pop()
    return flat_items

# Count how often each NER tag occurs over all training tokens.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from collections import Counter
flat = flatten(train["ner_tags"])
print(Counter(flat))

In [None]:
def decode_tags(tag_sequences, possible_tags):
    """
    Map every integer tag id in a batch of sequences to its textual label.

    Args:
        tag_sequences: A 2D list of integers, one inner list of tag ids per sequence.
        possible_tags: A list of label strings, indexed by tag id.

    Returns:
        A 2D list of label strings laid out exactly like tag_sequences.
    """
    decoded_tags = []
    for sequence in tag_sequences:
        decoded_tags.append([possible_tags[tag_id] for tag_id in sequence])
    return decoded_tags


def build_dataset(filtered_set, cw_set, num_of_samples):
    """
    Extends the cw split with up to num_of_samples rows from the filtered split.

    Rows are taken in the order they appear in filtered_set. A row is skipped when
    its token sequence already exists in cw_set or has already been picked, so the
    merge introduces no duplicate token sequences.

    Args:
        filtered_set: a split of the filtered dataset. Its "pos_tags" and "ner_tags"
            columns hold integer ids, which are decoded to text through
            filtered_set.features[<column>].feature.names.
        cw_set: a split of the cw dataset, providing "tokens", "pos_tags" and
            "ner_tags" columns (presumably already textual tags, since the decoded
            filtered rows are appended to them; verify against the dataset).
        num_of_samples: The maximum number of rows to add from the filtered set.

    Returns:
        new tokens, pos_tags and ner_tags lists: the cw rows followed by the added
        rows. The inputs are not modified.
    """
    # copy the cw columns so the caller's data is never mutated
    tokens = list(cw_set["tokens"])
    pos_tags = list(cw_set["pos_tags"])
    ner_tags = list(cw_set["ner_tags"])

    filtered_tokens = filtered_set["tokens"]

    # Pick row indices by walking the filtered rows in their original order, so
    # that every index refers to the very row that passed the duplicate check.
    seen = {tuple(row) for row in tokens}
    rows_to_add = []
    for index, row in enumerate(filtered_tokens):
        if len(rows_to_add) >= num_of_samples:
            break
        key = tuple(row)
        if key in seen:
            continue
        seen.add(key)
        rows_to_add.append(index)

    # decode only the selected rows from integer ids to textual tags
    pos_names = filtered_set.features["pos_tags"].feature.names
    ner_names = filtered_set.features["ner_tags"].feature.names
    filtered_pos_tags = filtered_set["pos_tags"]
    filtered_ner_tags = filtered_set["ner_tags"]

    for i in rows_to_add:
        tokens.append(filtered_tokens[i])
        pos_tags.append([pos_names[tag] for tag in filtered_pos_tags[i]])
        ner_tags.append([ner_names[tag] for tag in filtered_ner_tags[i]])

    return tokens, pos_tags, ner_tags

In [None]:
# When True, every sub-word piece of a word receives that word's label; when False, only
# the first piece is labelled and the remaining pieces get -100 (see tokenize_and_align_labels).
label_all_tokens = True
def encode_tags(tag_sequences, possible_tags):
    """
    Encodes sequences of string tags into sequences of integer tag ids.

    Args:
        tag_sequences: A 2D list of strings, one inner list of textual tags per sequence.
        possible_tags: A list of strings; a tag's id is its (first) position in this list.

    Returns:
        A 2D list of integers laid out exactly like tag_sequences.

    Raises:
        ValueError: If a tag does not occur in possible_tags.
    """
    # Build the lookup once instead of scanning possible_tags for every token.
    # setdefault keeps the first position of a repeated tag, matching list.index.
    tag_to_id = {}
    for position, tag in enumerate(possible_tags):
        tag_to_id.setdefault(tag, position)

    encoded_tags = []
    for row in tag_sequences:
        try:
            encoded_tags.append([tag_to_id[tag] for tag in row])
        except KeyError as missing:
            # keep the exception type callers would have seen from list.index
            raise ValueError(f"{missing.args[0]!r} is not in list") from None
    return encoded_tags
def tokenize_and_align_labels(data, tokenizer, task):
    """
    Tokenizes pre-split words and aligns the word-level tags with the resulting tokens.

    Args:
        data: A batch providing "tokens" (lists of words) and "<task>_tags"
            (lists of textual tags, one per word).
        tokenizer: The tokenizer to call; its output must provide word_ids().
        task: Prefix selecting the tag column, i.e. data[f"{task}_tags"].

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sequence; positions to be ignored by the loss are -100.
    """
    tokenized_inputs = tokenizer(data["tokens"], truncation=True, is_split_into_words=True)
    converted_tags = encode_tags(data[f"{task}_tags"], label_list)

    labels = []
    for batch_index, word_labels in enumerate(converted_tags):
        aligned = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_idx is None:
                # Special tokens have no word id; -100 is used so the loss skips them.
                aligned_label = -100
            elif word_idx == previous_word_idx and not label_all_tokens:
                # Continuation piece of a word, and only first pieces are labelled.
                aligned_label = -100
            else:
                # First piece of a word, or a continuation piece when
                # label_all_tokens is set: use the word's own label.
                aligned_label = word_labels[word_idx]
            aligned.append(aligned_label)
            previous_word_idx = word_idx
        labels.append(aligned)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

## Pre Processing

### Lemmatization

In [None]:
# Fetch the WordNet corpus for the nltk WordNetLemmatizer used in lemmatize_list.
nltk.download('wordnet')

In [None]:
def combine_lists_elementwise(list_A, list_B):
    """
    Combines two 2D lists of strings element-wise into a 2D list of tuples.

    Args:
        list_A: A 2D list of strings (e.g., [['A', 'A', 'A'], ['A', 'A', 'A']]).
        list_B: Another 2D list of strings with the same dimensions as list_A.

    Returns:
        A 2D list of tuples, where each tuple combines corresponding elements from
        list_A and list_B. Two empty lists give an empty list.

    Raises:
        ValueError: If the number of rows differs, or if any pair of corresponding
            rows differs in length.
    """
    if len(list_A) != len(list_B):
        raise ValueError("Dimensions of lists A and B must be equal.")

    combined = []
    for row_a, row_b in zip(list_A, list_B):
        # Check every row, not just the first: zip would otherwise silently drop
        # the surplus elements of the longer row.
        if len(row_a) != len(row_b):
            raise ValueError("Dimensions of lists A and B must be equal.")
        combined.append(list(zip(row_a, row_b)))
    return combined

In [None]:
def convert_pos_tag(nltk_tag):
    """
    Translates a coarse POS tag (e.g., NOUN, VERB, ADJ) into the single-letter
    part-of-speech code expected by the lemmatizer.

    Args:
        nltk_tag: The POS tag string to translate.

    Returns:
        'n', 'v', 'a' or 'r' for tags the lemmatizer can use, '' for known tags
        that are deliberately not lemmatized, or None if the tag is not known.
    """
    # Tags that have a lemmatizer equivalent.
    lemmatizable = {
        'NOUN': 'n',
        'PROPN': 'n',  # proper nouns are treated as nouns
        'VERB': 'v',
        'AUX': 'v',    # auxiliary verbs are treated as verbs
        'ADJ': 'a',
        'ADV': 'r',
    }
    # Known tags the lemmatizer does not handle: numbers, conjunctions, pronouns,
    # symbols, interjections, prepositions, punctuation, determiners, other, particles.
    not_lemmatized = (
        'NUM', 'CCONJ', 'PRON', 'SCONJ', 'SYM', 'INTJ',
        'ADP', 'PUNCT', 'DET', 'X', 'PART',
    )

    if nltk_tag in lemmatizable:
        return lemmatizable[nltk_tag]
    if nltk_tag in not_lemmatized:
        return ''
    return None

In [None]:
def lemmatize_list(data, pos_tags):
    """
    Lemmatizes a 2D list of tokens using NLTK's WordNet lemmatizer.

    Args:
        data: A 2D list of strings (tokens) to be lemmatized.
        pos_tags: A 2D list of POS tag strings with the same dimensions as data;
            each tag is translated with convert_pos_tag before lemmatizing.

    Returns:
        A 2D list containing the lemmatized tokens. Tokens whose tag has no
        lemmatizer equivalent are returned unchanged.

    Raises:
        ValueError: If data and pos_tags differ in dimensions (raised by
            combine_lists_elementwise).
    """

    # Initialize the WordNet lemmatizer
    lemmatizer = nltk.WordNetLemmatizer()

    wordnet_tags = [[convert_pos_tag(tag) for tag in row] for row in pos_tags]
    paired = combine_lists_elementwise(data, wordnet_tags)

    # convert_pos_tag yields '' for tags it deliberately skips and None for tags it
    # does not know at all. Neither is a usable part of speech for the lemmatizer,
    # so in both cases the token is passed through untouched.
    return [
        [lemmatizer.lemmatize(token, pos) if pos else token for token, pos in row]
        for row in paired
    ]

### Pre-Processing Pipeline

In [None]:
def pre_process_data(tokens, pos_tags):
    """
    Runs the token pre-processing steps: lemmatize, then lowercase.

    Args:
        tokens: A 2D list of strings (tokens).
        pos_tags: A 2D list of POS tags matching tokens, passed to lemmatize_list.

    Returns:
        A 2D list of lemmatized, lowercased tokens.
    """
    lemmatized = lemmatize_list(tokens, pos_tags)

    lowered = []
    for row in lemmatized:
        lowered.append([token.lower() for token in row])
    return lowered

In [None]:
# Task prefix: selects the "<task>_tags" column during tokenization and is part of
# the training output directory name.
task = "ner"

In [None]:
# Lemmatize and lowercase the tokens of every split.
train_tokens = pre_process_data(train["tokens"], train["pos_tags"])
val_tokens = pre_process_data(val["tokens"], val["pos_tags"])
test_tokens = pre_process_data(test["tokens"], test["pos_tags"])
# Keep the untouched tokens only to show a before/after example for the first row.
original_train_tokens = train["tokens"]
original_val_tokens = val["tokens"]
print(f"original train tokens: {original_train_tokens[0]}\npre-processed train tokens: {train_tokens[0]}")
print(f"original val tokens: {original_val_tokens[0]}\npre-processed val tokens: {val_tokens[0]}")

# Rebuild the dataset with the pre-processed tokens and the original tag columns.
# Note: this rebinds `dataset`; the raw splits stay reachable through train / val / test.
dataset = DatasetDict({
    "train": Dataset.from_dict({"tokens": train_tokens, "pos_tags": train["pos_tags"], "ner_tags": train["ner_tags"]}),
    "validation": Dataset.from_dict({"tokens": val_tokens, "pos_tags": val["pos_tags"], "ner_tags": val["ner_tags"]}),
    "test": Dataset.from_dict({"tokens": test_tokens, "pos_tags": test["pos_tags"], "ner_tags": test["ner_tags"]}),
})

## Getting Extra Samples

In [None]:
# Load the PLOD-filtered dataset, the source of the extra training rows merged in below.
filtered_dataset = load_dataset("surrey-nlp/PLOD-filtered")

In [None]:
# Bind each split of the filtered dataset to its own name and report its row count.
filtered_train = filtered_dataset["train"]
print(f"train size: {len(filtered_train)}")
filtered_val = filtered_dataset["validation"]
print(f"val size: {len(filtered_val)}")
filtered_test = filtered_dataset["test"]
print(f"test size: {len(filtered_test)}")

In [None]:
# Number of extra rows requested from the filtered training split.
medium = 10000
# Merge those rows into the raw cw training split, then pre-process the merged tokens
# (the raw `train` split is used here, so every row is lemmatized/lowercased exactly once).
tokens_medium, pos_tags_medium, ner_tags_medium = build_dataset(filtered_train, train, medium)
tokens_medium = pre_process_data(tokens_medium, pos_tags_medium)
print(f"num of medium samples: {len(tokens_medium)}")

In [None]:
# Wrap the enlarged training data in a DatasetDict; it only has a "train" split.
medium_datasets_dict = {
    "train": Dataset.from_dict({"tokens": tokens_medium, "pos_tags": pos_tags_medium, "ner_tags": ner_tags_medium})
}
medium_dataset = DatasetDict(medium_datasets_dict)

# Model

## Tokenizer and Model

In [None]:
model_checkpoint = "roberta-base"
# Use AutoTokenizer because it defaults to the fast tokenizer implementation (asserted on the
# next line); tokenize_and_align_labels calls word_ids() on its output to align the labels.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
# Token-classification model with one output class per entry in label_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

## Training

In [None]:
# Collator handed to the Trainer to assemble batches from the tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric, used in compute_metrics for the overall precision/recall/f1/accuracy.
# NOTE(review): load_metric is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package; confirm against the version this notebook is pinned to.
metric = load_metric("seqeval")

In [None]:
def compute_metrics(p):
    """
    Computes evaluation metrics from a (predictions, labels) pair.

    Positions whose label is -100 are dropped, the remaining ids are mapped to
    their label_list names, and both the seqeval metric and a per-label
    classification report are computed. The report is printed.

    Args:
        p: A pair (predictions, labels); predictions carry the class scores on
            axis 2, labels hold the label ids with -100 at ignored positions.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only the positions that carry a real label (ignored index is -100).
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, label_id in zip(predicted_row, label_row):
            if label_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[label_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    overall_results = metric.compute(predictions=true_predictions, references=true_labels)

    # The classification report works on flat label sequences.
    flat_labels = [name for row in true_labels for name in row]
    flat_predictions = [name for row in true_predictions for name in row]
    print(classification_report(flat_labels, flat_predictions, labels = label_list))

    summary = {}
    for key in ("precision", "recall", "f1", "accuracy"):
        summary[key] = overall_results[f"overall_{key}"]
    return summary

In [None]:
def intermediate_func(data):
    """Adapter for Dataset.map: tokenizes a batch with the notebook's tokenizer and task."""
    return tokenize_and_align_labels(data, tokenizer, task)
# Tokenize both datasets in batches: the pre-processed cw splits and the enlarged training set.
tokenized_datasets = dataset.map(intermediate_func, batched=True)
medium_tokenized_dataset = medium_dataset.map(intermediate_func, batched=True)

# Training configuration: evaluate every 7000 steps, log every 500, train for 3 epochs.
# NOTE(review): save_steps=0 appears to disable step checkpoints, in which case
# load_best_model_at_end / metric_for_best_model have no checkpoint to reload. Confirm, then
# either set save_steps to a multiple of eval_steps or drop load_best_model_at_end.
training_args = TrainingArguments(
    f"{model_checkpoint}-finetuned-GROUP-{task}",
    evaluation_strategy ='steps',
    eval_steps = 7000,
    logging_steps = 500,
    save_total_limit = 1,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

# Train on the enlarged ("medium") training set and evaluate on the cw validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Clear CUDA memory before training: collect unreferenced Python objects first, then
# release the GPU memory PyTorch is caching but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop configured by training_args.
trainer.train()

In [None]:
# Clear CUDA memory again after training: collect unreferenced Python objects first, then
# release the GPU memory PyTorch is caching but no longer using.
gc.collect()
torch.cuda.empty_cache()

## Save

In [None]:
# Save the fine-tuned model to the local directory "roberta_save".
# Only the model is saved here; the tokenizer is not written to this directory.
trainer.model.save_pretrained("roberta_save")

In [None]:
# Log in to the Hugging Face Hub from the notebook so the pushes below are authenticated.
notebook_login()

In [None]:
# Publish the tokenizer and the fine-tuned model to the same Hub repository, so the
# pipeline further down can load both from that one repository id.
tokenizer.push_to_hub("SurtMcGert/NLP-group-CW-roberta-ner-tagging")
trainer.model.push_to_hub("SurtMcGert/NLP-group-CW-roberta-ner-tagging")

# Pipeline

In [None]:
# Register custom pipeline
PIPELINE_REGISTRY.register_pipeline(
    "NER_NLP_tagger",
    pipeline_class = NER_Pipeline,
    pt_model = AutoModelForTokenClassification
)

In [None]:
# Build the custom pipeline from the model published to the Hub above.
ner_tagger = pipeline("NER_NLP_tagger", model = "SurtMcGert/NLP-group-CW-roberta-ner-tagging")

In [None]:
# NOTE(review): requires_update() is not defined anywhere in this notebook; presumably it is
# provided by NER_Pipeline. Confirm it exists and what it does; its return value is unused here.
ner_tagger.requires_update()

### Quick Test

In [None]:
# Smoke test: tag one sentence containing two long-form / abbreviation pairs.
output = ner_tagger("this is a test on our Natural Language Processing (NLP) tagging Artificial Intelligence (AI).")
print(output)

# Server

In [None]:
%streamlit run server.py