# Prep

In [None]:
# Install dependencies
# %pip install -q -U ipywidgets transformers tqdm
# %pip install -q -U seqeval
# %pip install -q -U accelerate
# %pip install -q -U transformers[torch]
# %pip install -q --upgrade -U torch torchvision torchaudio torchtext
# %pip install -q dill==0.3.1.1
# %pip install -q numpy==1.14.3
# %pip install -q pyarrow==0.3.8
# %pip install -q multiprocess==0.70.16
# %pip install -q -U datasets==2.6.0
# %pip install fsspec==2023.9.2
# %pip install spacy
# %pip install spacy-en-core-web-sm
# !python -m spacy download en_core_web_sm

In [1]:
import datasets
import torch
import torchtext
from datasets import load_dataset, Features, Value
import matplotlib.pyplot as plt
import nltk
from datasets import DatasetDict, Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
import transformers
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline, Pipeline
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
import numpy as np
import dill
import gc
from transformers import Trainer
import torch.nn as nn
from datasets import DatasetDict, Dataset
from sklearn.pipeline import Pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from pipeline import NER_Pipeline
import spacy

In [21]:
# Reproducibility: fix the PyTorch RNG seed and request deterministic cuDNN kernels.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
device_label = "GPU" if DEVICE.type == "cuda" else "CPU"
print(f"Using {device_label}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


# Pre-Processing

In [22]:
# Tag set for the token classifier; its length is later passed to the model as num_labels.
label_list = [
    'B-O',
    'B-AC',
    'B-LF',
    'I-LF',
]
print(label_list)

['B-O', 'B-AC', 'B-LF', 'I-LF']


## Lemmatization

In [4]:
# Fetch the WordNet corpus so nltk.WordNetLemmatizer (used in lemmatize_list) has its data.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def combine_lists_elementwise(list_A, list_B):
  """
  Combines two 2D lists element-wise into a 2D list of tuples.

  Args:
      list_A: A 2D list (e.g., [['A', 'A', 'A'], ['A', 'A']]). Rows may have
          different lengths from one another.
      list_B: Another 2D list with the same number of rows as list_A, where
          every row has the same length as the corresponding row of list_A.

  Returns:
      A 2D list of tuples, where each tuple pairs the corresponding elements
      of list_A and list_B. Two empty lists yield an empty list.

  Raises:
      ValueError: If the number of rows differs, or if any pair of
          corresponding rows differ in length.
  """

  if len(list_A) != len(list_B):
    raise ValueError("Dimensions of lists A and B must be equal.")

  combined = []
  for row_a, row_b in zip(list_A, list_B):
    # Check every row, not just the first: zip() would otherwise silently
    # truncate the longer of two mismatched rows.
    if len(row_a) != len(row_b):
      raise ValueError("Dimensions of lists A and B must be equal.")
    combined.append(list(zip(row_a, row_b)))

  return combined

In [29]:
def convert_pos_tag(nltk_tag):
    """
    Maps a coarse POS tag to the single-letter POS code given to the lemmatizer.

    Args:
        nltk_tag: An upper-case coarse POS tag such as 'NOUN', 'VERB' or 'ADJ'.

    Returns:
        'n', 'v', 'a' or 'r' for tags that have a lemmatizer POS, '' for known
        tags that the lemmatizer does not handle, or None when the tag is not
        in the table at all.
    """

    # Tags that map onto a lemmatizer POS code.
    lemmatizer_pos = {
        'NOUN': 'n',
        'PROPN': 'n',
        'VERB': 'v',
        'AUX': 'v',
        'ADJ': 'a',
        'ADV': 'r',
    }
    # Known tags with no lemmatizer POS (numbers, conjunctions, pronouns, symbols,
    # interjections, prepositions, punctuation, determiners, particles, other).
    not_lemmatized = {
        'NUM', 'CCONJ', 'PRON', 'SCONJ', 'SYM', 'INTJ',
        'ADP', 'PUNCT', 'DET', 'X', 'PART',
    }

    if nltk_tag in lemmatizer_pos:
        return lemmatizer_pos[nltk_tag]
    if nltk_tag in not_lemmatized:
        return ''
    return None

In [41]:
def lemmatize_list(data, pos_tags):
    """
    Lemmatizes a 2D list of tokens using NLTK's WordNet lemmatizer.

    Args:
        data: A 2D list of strings (tokens) to be lemmatized.
        pos_tags: A 2D list of POS tags with the same dimensions as data; each
            tag is translated with convert_pos_tag before lemmatizing.

    Returns:
        A 2D list containing the lemmatized tokens. Tokens whose tag has no
        lemmatizer POS (or is unknown to convert_pos_tag) are returned unchanged.

    Raises:
        ValueError: If data and pos_tags do not have matching dimensions
            (raised by combine_lists_elementwise).
    """

    # Initialize the WordNet lemmatizer
    lemmatizer = nltk.WordNetLemmatizer()

    lemmatizer_pos = [[convert_pos_tag(tag) for tag in row] for row in pos_tags]
    tagged_rows = combine_lists_elementwise(data, lemmatizer_pos)

    # `not pos` covers both '' (known tag without a lemmatizer POS) and None
    # (tag missing from convert_pos_tag's table); neither is a usable POS code,
    # so those tokens are passed through untouched.
    return [
        [token if not pos else lemmatizer.lemmatize(token, pos) for token, pos in row]
        for row in tagged_rows
    ]

## Pre-Processing Pipeline

In [42]:
def pre_process_data(tokens, pos_tags):
    """
    Runs the pre-processing steps on a 2D list of tokens.

    Args:
        tokens: A 2D list of strings (tokens).
        pos_tags: A 2D list of POS tags, forwarded to lemmatize_list.

    Returns:
        A 2D list of lemmatized, lower-cased tokens.
    """
    lemmatized_rows = lemmatize_list(tokens, pos_tags)

    lowered_rows = []
    for row in lemmatized_rows:
        lowered_rows.append([token.lower() for token in row])
    return lowered_rows

In [4]:
# Task identifier. NOTE(review): not referenced by any other visible cell — confirm it is still needed.
task = "ner"

# Model

## Tokenizer

In [23]:
model_checkpoint = "bert-base-uncased"
#model_checkpoint = "xlnet-base-cased"

# AutoTokenizer is used because it defaults to the fast tokenizers, whereas the
# BERT tokenizer class does not.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# Fail early if a slow (non-fast) tokenizer was returned.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Write the tokenizer files into the model save directory.
tokenizer.save_pretrained("model_saves/BERT_ex4_six_save")

('model_saves/BERT_ex4_six_save\\tokenizer_config.json',
 'model_saves/BERT_ex4_six_save\\special_tokens_map.json',
 'model_saves/BERT_ex4_six_save\\vocab.txt',
 'model_saves/BERT_ex4_six_save\\added_tokens.json',
 'model_saves/BERT_ex4_six_save\\tokenizer.json')

In [24]:
# Local directory of the saved model (the same directory the tokenizer is saved into).
MODEL_NAME = "model_saves/BERT_ex4_six_save"

# Token-classification head sized to the tag set.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
)

In [51]:
# Smoke test: run the loaded model directly on a short, pre-split sentence.
text = ["this", "is", "a", "NLP", "test"]
model_inputs = tokenizer(text, truncation=True, is_split_into_words=True, return_tensors="pt")

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    predictions = model(model_inputs["input_ids"], attention_mask=model_inputs["attention_mask"])

# NOTE: `logits` keeps its original name, but it holds the arg-max label id per
# word-piece for the single sequence in the batch, with the first and last
# positions dropped (presumably the special tokens added by the tokenizer).
logits = predictions.logits.argmax(-1)[0].tolist()[1:-1]

# Convert all ids in one call (plain ints rather than 0-d tensors), then drop
# the same first/last positions so tokens and label ids stay aligned.
all_tokens = tokenizer.convert_ids_to_tokens(
    model_inputs["input_ids"][0].tolist(), skip_special_tokens=False
)[1:-1]

output = list(zip(all_tokens, logits))
print(output)

['this', 'is', 'a', 'nl', '##p', 'test']
[('this', 0), ('is', 0), ('a', 0), ('nl', 1), ('##p', 1), ('test', 0)]


# Pipeline

In [7]:
# Make the custom NER_Pipeline class available to transformers.pipeline() under
# the task name "NER_NLP_tagger", backed by a token-classification model.
PIPELINE_REGISTRY.register_pipeline(
    "NER_NLP_tagger",
    pipeline_class=NER_Pipeline,
    pt_model=AutoModelForTokenClassification,
)

In [8]:
# Build the custom tagging pipeline from the locally saved model directory.
ner_tagger = pipeline("NER_NLP_tagger", model=MODEL_NAME)

### Quick Test

In [9]:
# Run the custom pipeline end-to-end on a raw sentence.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: argument 'ids': 'list' object cannot be interpreted as an integer",
# raised somewhere inside the pipeline call; the pipeline implementation is not
# visible here — TODO fix it there so this cell runs cleanly.
output = ner_tagger("this is a test on our NLP tagging AI.")
print(output)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ModelOutput([('logits', tensor([[[ 4.2329, -0.7156, -1.7449, -1.7011],
         [ 5.8678, -2.2680, -1.7264, -2.0507],
         [ 6.1571, -2.5399, -1.9748, -2.1765],
         [ 6.5044, -2.3500, -2.1571, -2.3708],
         [ 6.3640, -2.5742, -1.9788, -2.1533],
         [ 6.4246, -2.7203, -1.9795, -1.9951],
         [ 5.5785, -2.1480, -0.9883, -2.6478],
         [ 0.0527,  2.4808, -0.1440, -1.8209],
         [-0.1638,  2.9137, -0.5655, -1.9580],
         [ 2.3796, -1.0876, -0.4989, -0.9725],
         [ 2.1622, -1.8587, -0.2954,  0.4922],
         [ 2.4821,  0.2444, -2.8167, -0.1760],
         [ 5.6708, -1.8396, -2.2162, -2.1163],
         [ 4.5670, -0.3235, -2.8016, -1.5900]]]))])


TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

# Server

In [None]:
%python server.py