In [1]:
!pip install datasets transformers



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Configs

In [3]:
LABEL2IDX = {
    'O': 0,
    'B-DAT': 1,
    'B-PER': 2,
    'B-ORG': 3,
    'B-LOC': 4,
    'B-EVE': 5,
    'I-DAT': 6,
    'I-PER': 7,
    'I-ORG': 8,
    'I-LOC': 9,
    'I-EVE': 10
}

IDX2LABEL = {i: k for k, i in LABEL2IDX.items()}

CLS = [101]
SEP = [102]
VALUE_TOKEN = [0]
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
EPOCHS = 4
NUM_CLASS = 11
LEARNING_RATE = 5e-5

In [4]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from datasets import concatenate_datasets, load_dataset
from transformers import AutoTokenizer
import ast
# model_name = 'sbunlp/fabert'
model_name = 'PartAI/TookaBERT-Base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_files = "/content/drive/MyDrive/Colab Notebooks/NER-datasets/shuffled-100000.csv"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
def parse_tokens_and_labels(example):
    # Ensure 'tokens' and 'labels' are in the expected format
    if isinstance(example['tokens'], str):
        example['tokens'] = ast.literal_eval(example['tokens'])
    if isinstance(example['labels'], str):
        example['labels'] = ast.literal_eval(example['labels'])
    return example


In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],truncation=True,max_length=512,
 is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [8]:

dataset = load_dataset('csv', data_files=data_files)
dataset = dataset['train'].select(range(60000))
# Apply the parsing function to the entire dataset
parsed_dataset = dataset.map(parse_tokens_and_labels)
# rename labels to net_tags
parsed_dataset = parsed_dataset.rename_column("labels", "ner_tags")
tokenized_dataset = parsed_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=parsed_dataset.column_names,)

In [9]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 60000
})

In [10]:
# Printing the values and lengths of each column in the first item of tokenized_dataset
print(f"Input IDs: {tokenized_dataset[0]['input_ids']}, length: {len(tokenized_dataset[0]['input_ids'])}")
print(f"Attention Mask: {tokenized_dataset[0]['attention_mask']}, length: {len(tokenized_dataset[0]['attention_mask'])}")
print(f"Labels: {tokenized_dataset[0]['labels']}, length: {len(tokenized_dataset[0]['labels'])}")

Input IDs: [2, 4534, 3492, 698, 802, 207, 468, 691, 1646, 799, 11960, 687, 3477, 7293, 15825, 2144, 680, 799, 11960, 890, 687, 21543, 1896, 2852, 26962, 2122, 921, 836, 169, 3], length: 30
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], length: 30
Labels: [-100, 2, 7, 1, 6, 6, -100, 0, 0, 0, -100, 0, 0, 4, 0, 0, 0, 0, -100, 0, 0, 4, 0, 0, 3, 0, 0, 0, 0, -100], length: 30


In [11]:
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][0]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][1]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][-2]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][-1]))

<s>
ابراهیم
.
</s>


In [12]:
print(tokenizer.convert_ids_to_tokens(tokenized_dataset[0]['input_ids']))


['<s>', '▁ابراهیم', '▁احمد', '▁از', '▁سال', '▁۱۹', '۴۹', '▁به', '▁مدت', '▁دو', '\u200cسال', '▁در', '▁زندان', '▁بغداد', '▁به\u200cسر', '▁برد', '▁و', '▁دو', '\u200cسال', '▁نیز', '▁در', '▁کرکوک', '▁تحت', '▁نظارت', '▁شهربانی', '▁عراق', '▁قرار', '▁داشت', '▁.', '</s>']


In [13]:
# Split the combined dataset into train, validation, and test sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_eval_split = train_test_split['test'].train_test_split(test_size=0.5)

train_dataset = train_test_split['train']
eval_dataset = train_eval_split['train']
test_dataset = train_eval_split['test']
# Print out info of each dataset
print(f"Training set: {train_dataset} samples")
print(f"Validation set: {eval_dataset} samples")
print(f"Test set: {test_dataset} samples")


Training set: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48000
}) samples
Validation set: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6000
}) samples
Test set: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6000
}) samples


In [14]:
from transformers import DataCollatorForTokenClassification
import torch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
train_dataset.set_format("torch")
eval_dataset.set_format("torch")
test_dataset.set_format("torch")

In [16]:
train_sample = train_dataset[0]
print(type(train_sample['input_ids']))

<class 'torch.Tensor'>


In [17]:
# check the paddings with -100s
batch = data_collator([train_dataset[i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    2, -100,    7,    0, -100,    0,    1,    6,    6,
         -100,    0, -100,    0,    0,    0,    0, -100],
        [-100,    3,    8,    8,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0, -100]])

In [18]:
for i in range(2):
    print(train_dataset[i]["labels"])

tensor([-100,    0,    0,    2, -100,    7,    0, -100,    0,    1,    6,    6,
        -100,    0, -100,    0,    0,    0,    0, -100])
tensor([-100,    3,    8,    8,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0, -100])


In [None]:
IDX2LABEL

{0: 'O',
 1: 'B-DAT',
 2: 'B-PER',
 3: 'B-ORG',
 4: 'B-LOC',
 5: 'B-EVE',
 6: 'I-DAT',
 7: 'I-PER',
 8: 'I-ORG',
 9: 'I-LOC',
 10: 'I-EVE'}

In [None]:
LABEL2IDX

{'O': 0,
 'B-DAT': 1,
 'B-PER': 2,
 'B-ORG': 3,
 'B-LOC': 4,
 'B-EVE': 5,
 'I-DAT': 6,
 'I-PER': 7,
 'I-ORG': 8,
 'I-LOC': 9,
 'I-EVE': 10}

In [19]:
!pip install seqeval evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=ab8d3b3eb5aa6ce3a7f8d54dbfe6daa4e883e70ccfc772da39fe6b5d5bd91a3d
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully buil

In [20]:
label_list = [k for k,v in LABEL2IDX.items()]
label_list

['O',
 'B-DAT',
 'B-PER',
 'B-ORG',
 'B-LOC',
 'B-EVE',
 'I-DAT',
 'I-PER',
 'I-ORG',
 'I-LOC',
 'I-EVE']

In [21]:
import numpy as np
from seqeval.metrics import classification_report
import evaluate

accuracy = evaluate.load('accuracy')
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    #############
    cr = classification_report(true_labels, true_predictions,digits=4)
    fname = "report.txt"
    file = open(fname,'w')
    file.write(cr)
    file.close()
    #############
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [22]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=NUM_CLASS, id2label=IDX2LABEL,label2id=LABEL2IDX)

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at PartAI/TookaBERT-Base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
samples = train_dataset[:8]
batch = data_collator([train_dataset[i] for i in range(i)])
{k: v.shape for k, v in batch.items()}


{'input_ids': torch.Size([1, 20]),
 'attention_mask': torch.Size([1, 20]),
 'labels': torch.Size([1, 20])}

In [24]:
 model.config.max_position_embeddings

512

In [25]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48000
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6000
})


In [26]:
training_args = TrainingArguments(
    output_dir = "/tmp/Persina-NER",
    learning_rate= LEARNING_RATE,
    per_device_train_batch_size= TRAIN_BATCH_SIZE,
    per_device_eval_batch_size= EVAL_BATCH_SIZE,
    num_train_epochs= EPOCHS,
    weight_decay= 0,
    warmup_ratio = 0,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    )
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1826,0.178696,0.619997,0.659087,0.638945,0.934855


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1826,0.178696,0.619997,0.659087,0.638945,0.934855
2,0.1395,0.166542,0.644831,0.674503,0.659333,0.939975
3,0.0895,0.189895,0.648498,0.688685,0.667987,0.941341
4,0.0557,0.226416,0.655253,0.691768,0.673016,0.941263


TrainOutput(global_step=24000, training_loss=0.12692874081929525, metrics={'train_runtime': 3107.7117, 'train_samples_per_second': 61.782, 'train_steps_per_second': 7.723, 'total_flos': 5938031401455504.0, 'train_loss': 0.12692874081929525, 'epoch': 4.0})

In [27]:
model.config.num_labels

11

In [28]:
validation_results = trainer.evaluate()

In [29]:
validation_results

{'eval_loss': 0.2264159917831421,
 'eval_precision': 0.6552529751040373,
 'eval_recall': 0.6917681516879913,
 'eval_f1': 0.673015634959319,
 'eval_accuracy': 0.941263011978867,
 'eval_runtime': 29.8538,
 'eval_samples_per_second': 200.98,
 'eval_steps_per_second': 25.122,
 'epoch': 4.0}

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
        )
test_results = trainer.evaluate()

In [32]:
test_results

{'eval_loss': 0.22537760436534882,
 'eval_precision': 0.6687366008723294,
 'eval_recall': 0.6957926313360511,
 'eval_f1': 0.6819963811821471,
 'eval_accuracy': 0.9432087357159591,
 'eval_runtime': 42.1566,
 'eval_samples_per_second': 142.327,
 'eval_steps_per_second': 17.791}

In [33]:
with open('report.txt') as f:
    for line in f.readlines():
        print(line)

              precision    recall  f1-score   support



         DAT     0.7600    0.7909    0.7752      4544

         EVE     0.4791    0.5228    0.5000       417

         LOC     0.6118    0.6333    0.6224      3736

         ORG     0.6261    0.6458    0.6358      1818

         PER     0.6513    0.6814    0.6660      2486



   micro avg     0.6687    0.6958    0.6820     13001

   macro avg     0.6257    0.6548    0.6399     13001

weighted avg     0.6689    0.6958    0.6821     13001

