## Installation and imports

In [None]:
!pip install datasets
!pip install seqeval

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (AutoConfig,
                          AutoTokenizer,
                          AutoModelForTokenClassification,
                          DataCollatorForTokenClassification,
                          TrainingArguments,
                          Trainer,
                          pipeline)
from seqeval.metrics import f1_score

In [None]:
# Mount Google Drive inside the Colab VM and switch to the project folder so
# that the relative paths used later in this notebook (the CSV file and the
# training output directory) resolve against it.
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/machine_learning')
print(f'Working directory: {os.getcwd()}')

Mounted at /content/drive
Working directory: /content/drive/My Drive/machine_learning


In [None]:
# Name of the pretrained checkpoint that the tokenizer, config and model are loaded from.
base_model = 'xlm-roberta-base'
# Relative directory passed to TrainingArguments as output_dir.
trained_model = 'xlm-roberta-base-for-ner'

## Preparing the dataset

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

NameError: name 'AutoTokenizer' is not defined

In [None]:
# Smoke-test the tokenizer on one sentence: show each sub-word token next to
# its vocabulary ID (special tokens included).
random_sample_text = 'This is going to be a smashing named entity recognition model'
tokens = tokenizer(random_sample_text).tokens()
input_ids = tokenizer.encode(random_sample_text, return_tensors="pt")
pd.DataFrame([tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Tokens,<s>,▁This,▁is,▁going,▁to,▁be,▁a,▁sma,shing,▁na,med,▁enti,ty,▁recognition,▁model,</s>
Input IDs,0,3293,83,7730,47,186,10,44446,54700,24,4806,77630,939,230466,3299,2


In [None]:
# Read the token-level CSV, forward-fill the sentence id onto the rows where
# it is missing, then drop every row that still contains a missing value.
data = (
    pd.read_csv(r'model_ner_dataset.csv', encoding='latin-1')
    .assign(**{'Sentence #': lambda frame: frame['Sentence #'].ffill()})
    .dropna(how='any')
)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [None]:
# Collapse the token rows into one row per sentence, holding the sentence's
# words and tags as two parallel lists, then wrap the frame in a Dataset.
grouped_data = (
    data.groupby('Sentence #')
    .agg(Word=('Word', list), Tag=('Tag', list))
    .reset_index()
)
dataset = Dataset.from_pandas(grouped_data)
print(dataset)
print(dataset[0]['Word'])

Dataset({
    features: ['Sentence #', 'Word', 'Tag'],
    num_rows: 47959
})
['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']


In [None]:
# Sort the distinct tags before numbering them. Iterating a bare set of
# strings can yield a different order in every Python process, which would
# give the same tag a different id after each kernel restart.
tags = sorted(set(data['Tag']))
# id -> tag and tag -> id, built from the same ordering so they are exact inverses.
index2tag = dict(enumerate(tags))
tag2index = {tag: idx for idx, tag in index2tag.items()}
print(index2tag)

{0: 'I-per', 1: 'I-gpe', 2: 'B-per', 3: 'I-eve', 4: 'I-nat', 5: 'B-art', 6: 'I-tim', 7: 'B-nat', 8: 'O', 9: 'I-art', 10: 'B-geo', 11: 'B-gpe', 12: 'I-geo', 13: 'B-tim', 14: 'I-org', 15: 'B-eve', 16: 'B-org'}


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to the tokens.

    Args:
        examples: Batch mapping with a "Word" entry (one list of words per
            sentence) and a "Tag" entry (one list of tags per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For every sentence
        the first token of each word carries that word's tag id; tokens that
        map to no word and the remaining tokens of a word carry -100.
    """
    encoded = tokenizer(examples["Word"],
                        truncation=True,
                        is_split_into_words=True)

    def _labels_for(sentence_idx, sentence_tags):
        # Walk the token -> word map and label only the token that opens a word.
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index=sentence_idx):
            opens_new_word = word is not None and word != last_word
            aligned.append(tag2index[sentence_tags[word]] if opens_new_word else -100)
            last_word = word
        return aligned

    encoded["labels"] = [_labels_for(sentence_idx, sentence_tags)
                         for sentence_idx, sentence_tags in enumerate(examples["Tag"])]
    return encoded

In [None]:
# Tokenize and label every sentence in batches, dropping the raw columns so
# that only the fields produced by tokenize_and_align_labels remain.
dataset = dataset.map(tokenize_and_align_labels,
                      batched=True,
                      remove_columns=['Sentence #', 'Word', 'Tag'])
print(dataset)

Map:   0%|          | 0/47959 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 47959
})


In [None]:
# Hold out 10% of the sentences for evaluation. The fixed seed makes the
# shuffle, and therefore the split and the reported metrics, repeatable.
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 43163
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4796
    })
})


## Defining a metric for evaluation

In [None]:
def align_predictions(predictions, label_ids):
    """Turn raw scores and label ids into per-sentence lists of tag strings.

    Args:
        predictions: Score array; the argmax over axis 2 is used as the
            predicted tag id for each (example, position).
        label_ids: Array of gold tag ids indexed by (example, position);
            positions holding -100 are skipped.

    Returns:
        Tuple ``(preds_list, labels_list)``: for every example, the predicted
        and the gold tag names at the positions that are not ignored.
    """
    predicted_ids = np.argmax(predictions, axis=2)
    preds_list, labels_list = [], []
    for predicted_row, gold_row in zip(predicted_ids, label_ids):
        # Keep (gold, predicted) pairs only where the gold label is real.
        kept = [(index2tag[gold], index2tag[predicted])
                for gold, predicted in zip(gold_row, predicted_row)
                if gold != -100]
        labels_list.append([gold_tag for gold_tag, _ in kept])
        preds_list.append([predicted_tag for _, predicted_tag in kept])
    return preds_list, labels_list

def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``.

    Returns:
        Dict with a single "f1" entry holding the result of ``f1_score``.
    """
    predicted_tags, gold_tags = align_predictions(eval_pred.predictions,
                                                  eval_pred.label_ids)
    # f1_score takes the gold sequences first, then the predictions.
    score = f1_score(gold_tags, predicted_tags)
    return {"f1": score}

## Training

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
# Config for the base checkpoint with one label per tag and both
# id -> tag and tag -> id mappings attached.
model_config = AutoConfig.from_pretrained(base_model,
                                          num_labels=len(index2tag),
                                          id2label=index2tag, label2id=tag2index)

In [None]:
# Load the pretrained weights under the config above and move the model to
# the selected device. The classifier weights are newly initialized (see the
# message printed below), so the model must be trained before use.
model = AutoModelForTokenClassification.from_pretrained(base_model,
                                                        config=model_config).to(device)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning settings: evaluate and checkpoint once per epoch for 3 epochs,
# write checkpoints under `trained_model`, and keep everything local
# (no Hub upload, no external experiment tracker).
training_args = TrainingArguments(
    output_dir=trained_model,
    log_level='error',
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    weight_decay=0.01,
    disable_tqdm=False,
    push_to_hub=False,
    report_to='none',
)



In [None]:
# Collator that assembles the tokenized examples into batches for token
# classification; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Wire the model, the train/test splits, the collator and the F1 metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a warning when the Trainer is built.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [None]:
trainer.train() # fine-tune with the settings in training_args; the per-epoch metrics table is shown below

Epoch,Training Loss,Validation Loss,F1
1,0.1105,0.096917,0.827213
2,0.0882,0.086352,0.837567
3,0.0755,0.084975,0.842893


TrainOutput(global_step=5397, training_loss=0.10497294785204653, metrics={'train_runtime': 1851.4327, 'train_samples_per_second': 69.94, 'train_steps_per_second': 2.915, 'total_flos': 3703655974240212.0, 'train_loss': 0.10497294785204653, 'epoch': 3.0})

## Evaluate the trained model

In [None]:
def tag_text(text):
    """Tag a single piece of text and return a token-by-token table.

    Uses the global ``tokenizer`` and ``model``.

    Args:
        text: Raw string to tag.

    Returns:
        pandas.DataFrame with a 'Tokens' column (sub-word tokens, special
        tokens included) and a 'Tags' column (predicted tag for each token).
    """
    # Tokenize once and reuse the encoding for both the token strings and the IDs.
    encoding = tokenizer(text, return_tensors='pt')
    tokens = encoding.tokens()
    # Follow the model's own device so this keeps working after the model is
    # reloaded or moved.
    input_ids = encoding.input_ids.to(model.device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # First output holds the logits: one score per tag for every token.
        outputs = model(input_ids)[0]
    # Take argmax to get the most likely tag id per token
    predictions = torch.argmax(outputs, dim=2)
    # Decode with the mapping stored in the model config so that the ids
    # always match the weights that are actually loaded.
    id2label = model.config.id2label
    preds = [id2label[tag_id] for tag_id in predictions[0].tolist()]
    return pd.DataFrame(list(zip(tokens, preds)), columns=['Tokens', 'Tags'])

In [None]:
# Example sentence, assembled from adjacent string literals instead of
# backslash continuations inside one literal.
text = (
    'Aside from an unfortunate incident with a pig, '
    'David Cameron successfully guided the United Kingdom '
    'through a turbulent period.'
)
print(text)
tag_text(text)

Aside from an unfortunate incident with a pig, David Cameron successfully guided the United Kingdom through a turbulent period.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
Tokens,<s>,▁A,side,▁from,▁an,▁un,fortuna,te,▁incident,▁with,...,d,▁the,▁United,▁Kingdom,▁through,▁a,▁turbulent,▁period,.,</s>
Tags,O,O,O,O,O,O,O,O,O,O,...,O,O,B-geo,I-geo,O,O,O,O,O,O




## Use for predictions

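Run at least the cells under the first header above ("Installation and imports"), which install the packages, import the libraries and set the working directory. Then run the cells below: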

In [None]:
# Checkpoint directory to load, relative to the working directory.
# NOTE(review): this rebinds `trained_model`, which earlier in the notebook
# names the training output directory.
trained_model = r'xlm-roberta-base-for-ner/checkpoint-5397'
# Example sentences to run through the NER pipeline.
texts = ['David Cameron was a pretty good politician', 'But Ed Milliband did a good job at opposition too']

In [None]:
# Reload the model and tokenizer from the checkpoint directory. This rebinds
# the global `model` and `tokenizer` names.
model = AutoModelForTokenClassification.from_pretrained(trained_model)
tokenizer = AutoTokenizer.from_pretrained(trained_model)

In [None]:
# NER pipeline built from the reloaded model and tokenizer. As configured it
# reports one entry per sub-word token (see the output of the next cell).
pipe = pipeline('ner', model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [None]:
# Tag the example sentences; the result holds one list of token-level
# predictions per input text.
pipe(texts)

[[{'entity': 'B-per',
   'score': np.float32(0.99710983),
   'index': 1,
   'word': '▁David',
   'start': 0,
   'end': 5},
  {'entity': 'I-per',
   'score': np.float32(0.9950244),
   'index': 2,
   'word': '▁Cameron',
   'start': 6,
   'end': 13}],
 [{'entity': 'B-per',
   'score': np.float32(0.9927408),
   'index': 2,
   'word': '▁Ed',
   'start': 4,
   'end': 6},
  {'entity': 'I-per',
   'score': np.float32(0.99251384),
   'index': 3,
   'word': '▁Milli',
   'start': 7,
   'end': 12},
  {'entity': 'I-per',
   'score': np.float32(0.9923205),
   'index': 4,
   'word': 'band',
   'start': 12,
   'end': 16}]]