In [2]:
!pip install datasets transformers


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Configs

In [4]:
LABEL2IDX = {
    'O': 0,
    'B-DAT': 1,
    'B-PER': 2,
    'B-ORG': 3,
    'B-LOC': 4,
    'B-EVE': 5,
    'I-DAT': 6,
    'I-PER': 7,
    'I-ORG': 8,
    'I-LOC': 9,
    'I-EVE': 10
}

IDX2LABEL = {i: k for k, i in LABEL2IDX.items()}

CLS = [101]
SEP = [102]
VALUE_TOKEN = [0]
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
EPOCHS = 4
NUM_CLASS = 11
LEARNING_RATE = 5e-5

In [5]:
from datasets import concatenate_datasets, load_dataset
from transformers import XLMRobertaTokenizerFast
import ast

model_name = 'xlm-roberta-base'
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
data_files = "/content/drive/MyDrive/Colab Notebooks/NER-datasets/shuffled-100000.csv"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [6]:
def parse_tokens_and_labels(example):
    # Ensure 'tokens' and 'labels' are in the expected format
    if isinstance(example['tokens'], str):
        example['tokens'] = ast.literal_eval(example['tokens'])
    if isinstance(example['labels'], str):
        example['labels'] = ast.literal_eval(example['labels'])
    return example


In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],truncation=True,max_length=512,
 is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [8]:

dataset = load_dataset('csv', data_files=data_files)
dataset = dataset['train'].select(range(60000))
# Apply the parsing function to the entire dataset
parsed_dataset = dataset.map(parse_tokens_and_labels)
# rename labels to net_tags
parsed_dataset = parsed_dataset.rename_column("labels", "ner_tags")
tokenized_dataset = parsed_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=parsed_dataset.column_names,)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 60000
})

In [10]:
# Printing the values and lengths of each column in the first item of tokenized_dataset
print(f"Input IDs: {tokenized_dataset[0]['input_ids']}, length: {len(tokenized_dataset[0]['input_ids'])}")
print(f"Attention Mask: {tokenized_dataset[0]['attention_mask']}, length: {len(tokenized_dataset[0]['attention_mask'])}")
print(f"Labels: {tokenized_dataset[0]['labels']}, length: {len(tokenized_dataset[0]['labels'])}")



Input IDs: [0, 151375, 13919, 270, 1807, 33477, 234051, 178, 22838, 2254, 1807, 175, 43493, 92871, 178, 2900, 39310, 65, 2254, 1807, 5373, 175, 1262, 8573, 1901, 9280, 84975, 4392, 90401, 23027, 3392, 13087, 6, 5, 2], length: 35
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], length: 35
Labels: [-100, 2, 7, 1, 6, 6, -100, 0, 0, 0, -100, 0, 0, 4, 0, -100, 0, 0, 0, -100, 0, 0, 4, -100, -100, 0, 0, 3, -100, 0, 0, 0, 0, -100, -100], length: 35


In [11]:
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][0]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][1]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][-2]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][-1]))

<s>
ابراهیم
.
</s>


In [12]:
print(tokenizer.convert_ids_to_tokens(tokenized_dataset[0]['input_ids']))


['<s>', '▁ابراهیم', '▁احمد', '▁از', '▁سال', '▁۱۹', '۴۹', '▁به', '▁مدت', '▁دو', '▁سال', '▁در', '▁زندان', '▁بغداد', '▁به', '▁سر', '▁برد', '▁و', '▁دو', '▁سال', '▁نیز', '▁در', '▁کر', 'کو', 'ک', '▁تحت', '▁نظارت', '▁شهر', 'بانی', '▁عراق', '▁قرار', '▁داشت', '▁', '.', '</s>']


In [13]:
# Split the combined dataset into train, validation, and test sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_eval_split = train_test_split['test'].train_test_split(test_size=0.5)

train_dataset = train_test_split['train']
eval_dataset = train_eval_split['train']
test_dataset = train_eval_split['test']
# Print out info of each dataset
print(f"Training set: {train_dataset} samples")
print(f"Validation set: {eval_dataset} samples")
print(f"Test set: {test_dataset} samples")


Training set: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48000
}) samples
Validation set: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6000
}) samples
Test set: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6000
}) samples


In [14]:
from transformers import DataCollatorForTokenClassification
import torch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
train_dataset.set_format("torch")
eval_dataset.set_format("torch")
test_dataset.set_format("torch")

In [16]:
train_sample = train_dataset[0]
print(type(train_sample['input_ids']))

<class 'torch.Tensor'>


In [17]:
# check the paddings with -100s
batch = data_collator([train_dataset[i] for i in range(2)])
batch["labels"]

tensor([[-100,    5,    5, -100,    0, -100,    0, -100, -100,    0,    0,    5,
            5, -100,    5, -100,    0,    0,    0,    0, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    2, -100, -100,    7, -100,    0,    0,    0, -100,
         -100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0, -100,    0, -100,    0,    0,    0, -100, -100]])

In [18]:
for i in range(2):
    print(train_dataset[i]["labels"])

tensor([-100,    5,    5, -100,    0, -100,    0, -100, -100,    0,    0,    5,
           5, -100,    5, -100,    0,    0,    0,    0, -100, -100])
tensor([-100,    0,    0,    2, -100, -100,    7, -100,    0,    0,    0, -100,
        -100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, -100,    0, -100,    0,    0,    0, -100, -100])


In [None]:
IDX2LABEL

{0: 'O',
 1: 'B-DAT',
 2: 'B-PER',
 3: 'B-ORG',
 4: 'B-LOC',
 5: 'B-EVE',
 6: 'I-DAT',
 7: 'I-PER',
 8: 'I-ORG',
 9: 'I-LOC',
 10: 'I-EVE'}

In [None]:
LABEL2IDX

{'O': 0,
 'B-DAT': 1,
 'B-PER': 2,
 'B-ORG': 3,
 'B-LOC': 4,
 'B-EVE': 5,
 'I-DAT': 6,
 'I-PER': 7,
 'I-ORG': 8,
 'I-LOC': 9,
 'I-EVE': 10}

In [19]:
!pip install seqeval evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=46d67a34a422bae11c8a0a53c913bab221aa67018bc40eeaf8bf3dade443bb74
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully buil

In [20]:
label_list = [k for k,v in LABEL2IDX.items()]
label_list

['O',
 'B-DAT',
 'B-PER',
 'B-ORG',
 'B-LOC',
 'B-EVE',
 'I-DAT',
 'I-PER',
 'I-ORG',
 'I-LOC',
 'I-EVE']

In [21]:
import numpy as np
from seqeval.metrics import classification_report
import evaluate

accuracy = evaluate.load('accuracy')
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    #############
    cr = classification_report(true_labels, true_predictions,digits=4)
    fname = "report.txt"
    file = open(fname,'w')
    file.write(cr)
    file.close()
    #############
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [22]:
from transformers import XLMRobertaForTokenClassification, TrainingArguments, Trainer
model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=NUM_CLASS, id2label=IDX2LABEL,label2id=LABEL2IDX)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
samples = train_dataset[:8]
batch = data_collator([train_dataset[i] for i in range(i)])
{k: v.shape for k, v in batch.items()}


{'input_ids': torch.Size([1, 22]),
 'attention_mask': torch.Size([1, 22]),
 'labels': torch.Size([1, 22])}

In [24]:
 model.config.max_position_embeddings

514

In [25]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48000
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6000
})


In [26]:
training_args = TrainingArguments(
    output_dir = "/tmp/Persina-NER",
    learning_rate= LEARNING_RATE,
    per_device_train_batch_size= TRAIN_BATCH_SIZE,
    per_device_eval_batch_size= EVAL_BATCH_SIZE,
    num_train_epochs= EPOCHS,
    weight_decay= 0,
    warmup_ratio = 0,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    )
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1903,0.188054,0.616172,0.639141,0.627447,0.933629
2,0.1637,0.172187,0.642108,0.645787,0.643943,0.938545
3,0.1302,0.173222,0.643289,0.674051,0.658311,0.939779
4,0.1058,0.183911,0.641763,0.697502,0.668472,0.940547


TrainOutput(global_step=24000, training_loss=0.16075026718775431, metrics={'train_runtime': 4967.3245, 'train_samples_per_second': 38.653, 'train_steps_per_second': 4.832, 'total_flos': 7007397811530336.0, 'train_loss': 0.16075026718775431, 'epoch': 4.0})

In [27]:
model.config.num_labels

11

In [28]:
validation_results = trainer.evaluate()

In [29]:
validation_results

{'eval_loss': 0.18391144275665283,
 'eval_precision': 0.6417627213944335,
 'eval_recall': 0.6975021006798564,
 'eval_f1': 0.6684724916724623,
 'eval_accuracy': 0.940547244399659,
 'eval_runtime': 37.994,
 'eval_samples_per_second': 157.92,
 'eval_steps_per_second': 19.74,
 'epoch': 4.0}

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
        )
test_results = trainer.evaluate()

In [31]:
test_results

{'eval_loss': 0.18083029985427856,
 'eval_precision': 0.6398718405126379,
 'eval_recall': 0.7004130621151897,
 'eval_f1': 0.6687751153445453,
 'eval_accuracy': 0.9406996841289708,
 'eval_runtime': 34.0218,
 'eval_samples_per_second': 176.357,
 'eval_steps_per_second': 22.045}

In [32]:
with open('report.txt') as f:
    for line in f.readlines():
        print(line)

              precision    recall  f1-score   support



         DAT     0.7316    0.7965    0.7627      4668

         EVE     0.5058    0.5693    0.5356       462

         LOC     0.5722    0.6402    0.6043      3702

         ORG     0.5885    0.6167    0.6023      1688

         PER     0.6299    0.6902    0.6587      2311



   micro avg     0.6399    0.7004    0.6688     12831

   macro avg     0.6056    0.6626    0.6327     12831

weighted avg     0.6403    0.7004    0.6690     12831



In [33]:
model.save_pretrained("./finetuned-XLM-R")
tokenizer.save_pretrained("./finetuned-XLM-R")

('./finetuned-XLM-R/tokenizer_config.json',
 './finetuned-XLM-R/special_tokens_map.json',
 './finetuned-XLM-R/sentencepiece.bpe.model',
 './finetuned-XLM-R/added_tokens.json',
 './finetuned-XLM-R/tokenizer.json')