In [2]:
from datasets import load_dataset
from dotenv import load_dotenv

import torch

import os
from pathlib import Path
# Load environment variables (python-dotenv) before reading any of them.
load_dotenv()

# Hugging Face access token taken from the environment; None if HUGGING_FACE is unset.
access_key = os.getenv("HUGGING_FACE")

# Absolute project root and the two sub-folders derived from it.
root_dir = os.path.abspath("/work3/s174159/ET_LLM_RAG/")
model_dir = Path(root_dir) / "models"
articles_dir = Path(root_dir) / "Articles"

In [None]:
# NOTE(review): this checkpoint name is not referenced by any later cell shown
# here; the tokenizer and model cells load "distilbert-base-uncased" instead.
model_name = "microsoft/deberta-v3-base"

In [3]:
# Report whether PyTorch can use a CUDA device in this session.
torch.cuda.is_available()

True

In [4]:
# Load the "wnut_17" dataset, using model_dir as the cache directory.
wnut = load_dataset("wnut_17", cache_dir=model_dir)

In [5]:
# Display a summary of the training split.
wnut["train"]

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 3394
})

In [6]:
# Display the feature schema of the training split.
wnut["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product'], id=None), length=-1, id=None)}

In [7]:
# Names of the NER tags; the integer ids stored in "ner_tags" index into this list.
label_list = wnut["train"].features["ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [8]:
from transformers import AutoTokenizer
# Tokenizer of the "distilbert-base-uncased" checkpoint, using model_dir as the cache directory.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir=model_dir)

In [9]:
# Keep the first training example as a running sample and print it.
example = wnut["train"][0]
print(example)

{'id': '0', 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]}


In [10]:
# Tokenize the already word-split sample, then map the resulting ids back to
# token strings to see how the tokenizer splits the words.
tokeniezd_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokeniezd_input["input_ids"])
print(tokens)

['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']


In [11]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split sentences and align NER tags to the tokens.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per
            sentence) and a "ner_tags" entry (one list of tag ids per sentence,
            indexed by word position).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry. Each
        label row has one value per produced token: the word's tag id on the
        first token of every word, and -100 on tokens that belong to no word
        and on the remaining tokens of a multi-token word.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        for current_word in tokenized_inputs.word_ids(batch_index=batch_idx):
            # Only the first token of a real word carries that word's tag.
            starts_new_word = current_word is not None and current_word != last_word
            token_labels.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned_labels.append(token_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [12]:
# Run the tokenize-and-align step over the dataset, processing examples in batches.
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

In [13]:
from transformers import DataCollatorForTokenClassification

In [14]:
# Batch collator for token classification, built around the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
import evaluate
# Load the "seqeval" metric, using model_dir as the cache directory.
seqeval = evaluate.load("seqeval", cache_dir=model_dir)

In [16]:
# First tag id of the sample sentence.
example["ner_tags"][0]

0

In [17]:
import numpy as np

# Tag names of the sample sentence, looked up from its integer tag ids.
labels = [label_list[tag_id] for tag_id in example["ner_tags"]]

In [18]:
import numpy as np

# Tag names of the sample sentence, looked up from its integer tag ids.
labels = [label_list[tag_id] for tag_id in example["ner_tags"]]

def compute_metrics(p):
    """Score token-level predictions with seqeval.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores with the classes on axis 2; ``labels`` holds the
            reference tag ids, with -100 marking positions to ignore.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop every position whose reference label is the ignore value -100.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [19]:
# NER tag names in id order: a tag's position in this tuple is its integer id.
_NER_TAGS = (
    "O",
    "B-corporation",
    "I-corporation",
    "B-creative-work",
    "I-creative-work",
    "B-group",
    "I-group",
    "B-location",
    "I-location",
    "B-person",
    "I-person",
    "B-product",
    "I-product",
)

# Forward (id -> name) and reverse (name -> id) lookups built from the same
# tuple so the two mappings cannot drift apart.
id2label = dict(enumerate(_NER_TAGS))
label2id = {tag: idx for idx, tag in id2label.items()}

In [21]:
from transformers import AutoModelForTokenClassification

# Token-classification model initialised from the "distilbert-base-uncased"
# checkpoint. The number of output classes follows the label mapping, so the
# two cannot disagree.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    cache_dir=model_dir,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from huggingface_hub import login
# Log in to the Hugging Face Hub with the token read from the HUGGING_FACE environment variable.
login(token= access_key)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /zhome/01/d/127159/.cache/huggingface/token
Login successful


In [32]:
from transformers import TrainingArguments

# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch so that load_best_model_at_end can restore the best checkpoint.
training_args = TrainingArguments(
    output_dir = f"{model_dir}/EntityLinking",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 20,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    # Without an explicit metric the "best" checkpoint is chosen by validation
    # loss. In the logged run that loss is lowest after epoch 1 and rises
    # afterwards while F1 keeps improving, so selecting on loss would restore
    # the weakest-F1 checkpoint. Select on the F1 from compute_metrics instead.
    metric_for_best_model = "f1",
    greater_is_better = True,
    report_to = ["none"],
    push_to_hub = False,
)

In [33]:
from transformers import Trainer

# Wire model, data, collation and metrics into a Trainer.
# NOTE(review): evaluation (and, via load_best_model_at_end, checkpoint
# selection) runs on the "test" split - confirm this is intended rather than
# a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.268, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.276919,0.612546,0.307692,0.409624,0.94267
2,No log,0.28442,0.571669,0.31418,0.405502,0.94344
3,0.082200,0.290867,0.537389,0.392956,0.453961,0.946432
4,0.082200,0.313066,0.514007,0.391103,0.444211,0.945022
5,0.030300,0.357241,0.549932,0.372567,0.444199,0.945449
6,0.030300,0.353986,0.563492,0.39481,0.464305,0.946475
7,0.030300,0.363251,0.550995,0.410565,0.470526,0.946219
8,0.013300,0.380479,0.559033,0.364226,0.441077,0.94562
9,0.013300,0.372805,0.52669,0.411492,0.462019,0.946176
10,0.006700,0.39386,0.561379,0.377201,0.45122,0.94609


In [27]:
from transformers import pipeline

# Sample sentence used to try out NER predictions.
text = "Jason Derulo is having a concert in Copenhagen Denmark. The place if the event is in Royal arena, close to Malmø. This means that swedish citizens can take the train"
# NOTE(review): this pipeline loads the Hub checkpoint "Tirendaz/my_ner_model",
# not the `model` object fine-tuned in this notebook - confirm that is intended.
classifier = pipeline("ner", model="Tirendaz/my_ner_model")
classifier(text)

[{'entity': 'B-person',
  'score': 0.53076243,
  'index': 1,
  'word': 'jason',
  'start': 0,
  'end': 5},
 {'entity': 'I-person',
  'score': 0.36224654,
  'index': 2,
  'word': 'der',
  'start': 6,
  'end': 9},
 {'entity': 'I-person',
  'score': 0.36672172,
  'index': 3,
  'word': '##ulo',
  'start': 9,
  'end': 12},
 {'entity': 'B-location',
  'score': 0.3662869,
  'index': 9,
  'word': 'copenhagen',
  'start': 36,
  'end': 46},
 {'entity': 'B-location',
  'score': 0.33535883,
  'index': 10,
  'word': 'denmark',
  'start': 47,
  'end': 54},
 {'entity': 'B-location',
  'score': 0.35388476,
  'index': 19,
  'word': 'royal',
  'start': 85,
  'end': 90},
 {'entity': 'B-location',
  'score': 0.25323465,
  'index': 20,
  'word': 'arena',
  'start': 91,
  'end': 96},
 {'entity': 'B-location',
  'score': 0.36940467,
  'index': 24,
  'word': 'mal',
  'start': 107,
  'end': 110},
 {'entity': 'B-location',
  'score': 0.38407275,
  'index': 26,
  'word': '##ø',
  'start': 111,
  'end': 112}]

In [30]:
import pandas as pd

def tag_sentence(text: str):
    """Tag every token of ``text`` with the NER label predicted by ``model``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        text: raw input sentence(s); tokenized with truncation enabled.

    Returns:
        pandas.DataFrame with one row per token and the columns "word" (the
        decoded token) and "tag" (the label name with the highest score).
    """
    # Put the inputs on the model's own device instead of assuming CUDA, so the
    # function also works on CPU-only hosts or when the model sits elsewhere.
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)

    # Inference only: switch off dropout and gradient tracking for the forward
    # pass, then restore whatever train/eval mode the model was in before.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    # Scores of the first (only) sequence in the batch, normalised per token.
    probs = outputs[0][0].softmax(1)
    token_ids = inputs['input_ids'][0]

    # Pair each decoded token with the name of its highest-probability tag.
    word_tags = [
        (tokenizer.decode(token_ids[position].item()), id2label[tag_id.item()])
        for position, tag_id in enumerate(probs.argmax(axis=1))
    ]

    return pd.DataFrame(word_tags, columns=['word', 'tag'])

In [31]:
# Tag the sample sentence with the notebook's in-memory `model`.
tag_sentence(text)

Unnamed: 0,word,tag
0,[CLS],O
1,jason,B-person
2,der,I-person
3,##ulo,I-person
4,is,O
5,having,O
6,a,O
7,concert,O
8,in,O
9,copenhagen,B-location
