In [5]:
from datasets import load_dataset
from dotenv import load_dotenv
import os
from pathlib import Path
# Load variables
load_dotenv()
# change dir root (one above)
access_key = os.getenv("HUGGING_FACE")
root_dir = os.path.abspath("/work3/s174159/ET_LLM_RAG/")
model_dir = Path(root_dir, "models")
articles_dir = Path(root_dir, "Articles")

In [105]:
wnut = load_dataset("mnaguib/WikiNER", "en", cache_dir=model_dir)

In [101]:
wnut["train"]

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 3394
})

In [102]:
wnut["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product'], id=None), length=-1, id=None)}

In [85]:
label_list = wnut["train"].features[f"ner_tags"].feature.names
label_list

AttributeError: 'Value' object has no attribute 'names'

In [106]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir=model_dir)

In [107]:
example = wnut["train"][0]
print(example)

{'id': 'en-doc3688-sent43', 'words': ['Shortly', 'afterwards', ',', 'it', 'was', 'ceded', 'to', 'the', 'Aydınoğulları', 'principality', 'that', 'stationed', 'a', 'powerful', 'navy', 'in', 'the', 'harbour', 'of', 'Ayasuluğ', '(', 'the', 'present-day', 'Selçuk', ',', 'next', 'to', 'Ephesus', ')', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]}


In [109]:
tokeniezd_input = tokenizer(example["words"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokeniezd_input["input_ids"])
print(tokens)

['[CLS]', 'shortly', 'afterwards', ',', 'it', 'was', 'ceded', 'to', 'the', 'a', '##yd', '##ı', '##no', '##gul', '##lar', '##ı', 'principality', 'that', 'stationed', 'a', 'powerful', 'navy', 'in', 'the', 'harbour', 'of', 'a', '##yas', '##ulu', '##g', '(', 'the', 'present', '-', 'day', 'se', '##lc', '##uk', ',', 'next', 'to', 'ep', '##hes', '##us', ')', '.', '[SEP]']


In [112]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation = True, is_split_into_words=True)

    labels = []

    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index = i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else: 
                label_ids.append(-100)

            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [113]:
tokenized_wnut = wnut.map(
    tokenize_and_align_labels,
    batched = True,
    
)

Map:   0%|          | 0/129376 [00:00<?, ? examples/s]

Map:   0%|          | 0/14398 [00:00<?, ? examples/s]

In [114]:
from transformers import DataCollatorForTokenClassification

In [115]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [116]:
import evaluate
seqeval = evaluate.load("seqeval", cache_dir=model_dir)

In [123]:
example["ner_tags"][0]

0

In [117]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]

TypeError: 'Sequence' object is not subscriptable

In [45]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
}

In [46]:
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
    }
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

In [70]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels = 13,
    id2label = id2label,
    label2id= label2id,
    cache_dir = model_dir
)

model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/deberta-v3-base", 
    num_labels = 13, 
    id2label = id2label,
    label2id = label2id,
    cache_dir = model_dir)

model = AutoModelForTokenClassification.from_pretrained(
    "FacebookAI/roberta-base", 
    num_labels = 13, 
    id2label = id2label,
    label2id = label2id,
    cache_dir = model_dir)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
from huggingface_hub import login
login(token= access_key)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /zhome/01/d/127159/.cache/huggingface/token
Login successful


In [67]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir = f"{model_dir}/EntityLinking",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size=16,
    num_train_epochs = 2,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    report_to = ["none"],
    push_to_hub = False, 
)

In [68]:
from transformers import Trainer

trainer = Trainer(
    model = model,   
    train_dataset = tokenized_wnut["train"],
    eval_dataset = tokenized_wnut["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    args = training_args
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.268, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [69]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [53]:
from transformers import pipeline

text = "My name is Sarah, I live in London"
classifier = pipeline("ner", model="Tirendaz/my_ner_model")
classifier(text)

config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/266M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'entity': 'B-person',
  'score': 0.5315515,
  'index': 4,
  'word': 'sarah',
  'start': 11,
  'end': 16},
 {'entity': 'B-location',
  'score': 0.39324522,
  'index': 9,
  'word': 'london',
  'start': 28,
  'end': 34}]

In [59]:
import pandas as pd

def tag_sentence(text:str):
    # convert our text to a  tokenized sequence
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to("cpu")
    # get outputs
    outputs = model(**inputs)
    # convert to probabilities with softmax
    probs = outputs[0][0].softmax(1)
    # get the tags with the highest probability
    word_tags = [(tokenizer.decode(inputs['input_ids'][0][i].item()), id2label[tagid.item()]) 
                  for i, tagid in enumerate (probs.argmax(axis=1))]

    return pd.DataFrame(word_tags, columns=['word', 'tag'])

In [62]:
tag_sentence(text)

Unnamed: 0,word,tag
0,[CLS],O
1,my,O
2,name,O
3,is,O
4,sarah,B-person
5,",",O
6,i,O
7,live,O
8,in,O
9,london,B-location
