In [1]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Configs

In [3]:
LABEL2IDX = {
    'O': 0,
    'B-DAT': 1,
    'B-PER': 2,
    'B-ORG': 3,
    'B-LOC': 4,
    'B-EVE': 5,
    'I-DAT': 6,
    'I-PER': 7,
    'I-ORG': 8,
    'I-LOC': 9,
    'I-EVE': 10
}

IDX2LABEL = {i: k for k, i in LABEL2IDX.items()}

CLS = [101]
SEP = [102]
VALUE_TOKEN = [0]
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
EPOCHS = 4
NUM_CLASS = 11
LEARNING_RATE = 5e-5

In [4]:
from datasets import concatenate_datasets, load_dataset
from transformers import AutoTokenizer
import ast

model_name = 'sbunlp/fabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
train= "/content/drive/MyDrive/Colab Notebooks/NER-datasets/24-EVE/train.csv"
eval =  "/content/drive/MyDrive/Colab Notebooks/NER-datasets/24-EVE/eval.csv"
test =  "/content/drive/MyDrive/Colab Notebooks/NER-datasets/24-EVE/test.csv"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/552k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
def parse_tokens_and_labels(example):
    # Ensure 'tokens' and 'labels' are in the expected format
    if isinstance(example['tokens'], str):
        example['tokens'] = ast.literal_eval(example['tokens'])
    if isinstance(example['labels'], str):
        example['labels'] = ast.literal_eval(example['labels'])
    return example


In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],truncation=True,max_length=512,
 is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [7]:
dataset_train = load_dataset('csv', data_files=train)['train']
dataset_eval = load_dataset('csv', data_files=eval)['train']
dataset_test = load_dataset('csv', data_files=test)['train']

dataset_all = concatenate_datasets([dataset_train, dataset_eval, dataset_test])

# Apply the parsing function to the entire dataset
parsed_dataset = dataset_all.map(parse_tokens_and_labels)

# Rename labels to ner_tags
parsed_dataset = parsed_dataset.rename_column("labels", "ner_tags")

# Tokenize and align labels
tokenized_dataset = parsed_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=parsed_dataset.column_names)

# Split the tokenized dataset back into train, eval, and test
dataset_length = len(dataset_train), len(dataset_eval), len(dataset_test)
print(dataset_length)



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

(24000, 3000, 3000)


In [8]:
combined = {
    'train': tokenized_dataset.select(range(dataset_length[0])),
    'eval': tokenized_dataset.select(range(dataset_length[0], dataset_length[0] + dataset_length[1])),
    'test': tokenized_dataset.select(range(dataset_length[0] + dataset_length[1], sum(dataset_length)))}


In [9]:
combined

{'train': Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 24000
 }),
 'eval': Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 3000
 }),
 'test': Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 3000
 })}

In [10]:
# Printing the values and lengths of each column in the first item of tokenized_dataset
# print(f"Tokens: {tokenized_dataset[0]['tokens']}, length: {len(tokenized_dataset[0]['tokens'])}")
# print(f"NER Tags: {tokenized_dataset[0]['ner_tags']}, length: {len(tokenized_dataset[0]['ner_tags'])}")
print(f"Input IDs: {combined['train'][0]['input_ids']}, length: {len(combined['train'][0]['input_ids'])}")
print(f"Token Type IDs: {combined['train'][0]['token_type_ids']}, length: {len(combined['train'][0]['token_type_ids'])}")
print(f"Attention Mask: {combined['train'][0]['attention_mask']}, length: {len(combined['train'][0]['attention_mask'])}")
print(f"Labels: {combined['train'][0]['labels']}, length: {len(combined['train'][0]['labels'])}")



Input IDs: [101, 2297, 2425, 790, 1763, 1715, 1809, 622, 3587, 2367, 2305, 6666, 2425, 599, 5912, 6111, 6344, 5912, 3103, 5893, 4331, 2841, 2580, 622, 3103, 5202, 4331, 2841, 2580, 3492, 4331, 2841, 2580, 8769, 3674, 622, 5504, 34416, 3498, 14987, 2345, 117, 102], length: 43
Token Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], length: 43
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], length: 43
Labels: [-100, 1, 1, 6, -100, -100, -100, 0, 0, -100, 0, 1, 6, 0, 0, 2, 7, 0, 3, 8, 8, 8, 8, 0, 3, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], length: 43


In [11]:
 print(tokenizer.decode(combined['train'][0]['input_ids'][0]))
 print(tokenizer.decode(combined['train'][0]['input_ids'][1]))
 print(tokenizer.decode(combined['train'][0]['input_ids'][-2]))
 print(tokenizer.decode(combined['train'][0]['input_ids'][-1]))

[CLS]
در
.
[SEP]


In [12]:
print(tokenizer.convert_ids_to_tokens(combined['train'][0]['input_ids']))


['[CLS]', 'در', 'سال', '۱', '##۳', '##۸', '##۵', 'و', 'هز', '##مان', 'با', 'دومین', 'سال', 'ر', 'رئیسڸجمهور', 'محمود', 'احمدیڸنژاد', 'رئیسڸجمهور', 'دولت', 'نهم', 'جمهوری', 'اسلامی', 'ایران', 'و', 'دولت', 'دهم', 'جمهوری', 'اسلامی', 'ایران', 'نظام', 'جمهوری', 'اسلامی', 'ایران', 'معاونت', 'علمی', 'و', 'فناوری', 'رئیسڸجمهوری', 'آغاز', 'بهڸکار', 'کرد', '.', '[SEP]']


In [13]:
train_dataset = combined['train']
eval_dataset = combined['eval']
test_dataset = combined['test']
# Print out info of each dataset
print(f"Training set: {train_dataset} samples")
print(f"Validation set: {eval_dataset} samples")
print(f"Test set: {test_dataset} samples")


Training set: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 24000
}) samples
Validation set: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3000
}) samples
Test set: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3000
}) samples


In [14]:
from transformers import DataCollatorForTokenClassification
import torch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
train_dataset.set_format("torch")
eval_dataset.set_format("torch")
test_dataset.set_format("torch")

In [16]:
train_sample = train_dataset[0]
print(type(train_sample['input_ids']))

<class 'torch.Tensor'>


In [17]:
# check the paddings with -100s
batch = data_collator([train_dataset[i] for i in range(2)])
batch["labels"]

tensor([[-100,    1,    1,    6, -100, -100, -100,    0,    0, -100,    0,    1,
            6,    0,    0,    2,    7,    0,    3,    8,    8,    8,    8,    0,
            3,    8,    8,    8,    8,    8,    8,    8,    8,    0,    0,    0,
            0,    0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    3,    8,    0,    0,    0,    0,    0, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100]])

In [18]:
for i in range(2):
    print(train_dataset[i]["labels"])

tensor([-100,    1,    1,    6, -100, -100, -100,    0,    0, -100,    0,    1,
           6,    0,    0,    2,    7,    0,    3,    8,    8,    8,    8,    0,
           3,    8,    8,    8,    8,    8,    8,    8,    8,    0,    0,    0,
           0,    0,    0,    0,    0,    0, -100])
tensor([-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    3,    8,    0,    0,    0,    0,    0, -100])


In [None]:
IDX2LABEL

{0: 'O',
 1: 'B-DAT',
 2: 'B-PER',
 3: 'B-ORG',
 4: 'B-LOC',
 5: 'B-EVE',
 6: 'I-DAT',
 7: 'I-PER',
 8: 'I-ORG',
 9: 'I-LOC',
 10: 'I-EVE'}

In [None]:
LABEL2IDX

{'O': 0,
 'B-DAT': 1,
 'B-PER': 2,
 'B-ORG': 3,
 'B-LOC': 4,
 'B-EVE': 5,
 'I-DAT': 6,
 'I-PER': 7,
 'I-ORG': 8,
 'I-LOC': 9,
 'I-EVE': 10}

In [19]:
!pip install seqeval evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=98f23848cce7f79578076051dd944469cf01299fa24d73ab5a8eaf2397b5d7e8
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully buil

In [20]:
label_list = [k for k,v in LABEL2IDX.items()]
label_list

['O',
 'B-DAT',
 'B-PER',
 'B-ORG',
 'B-LOC',
 'B-EVE',
 'I-DAT',
 'I-PER',
 'I-ORG',
 'I-LOC',
 'I-EVE']

In [21]:
import numpy as np
from seqeval.metrics import classification_report
import evaluate

accuracy = evaluate.load('accuracy')
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    #############
    cr = classification_report(true_labels, true_predictions,digits=4)
    fname = "report.txt"
    file = open(fname,'w')
    file.write(cr)
    file.close()
    #############
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [22]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=NUM_CLASS, id2label=IDX2LABEL,label2id=LABEL2IDX)

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at sbunlp/fabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
samples = train_dataset[:8]
batch = data_collator([train_dataset[i] for i in range(i)])
{k: v.shape for k, v in batch.items()}


{'input_ids': torch.Size([1, 43]),
 'token_type_ids': torch.Size([1, 43]),
 'attention_mask': torch.Size([1, 43]),
 'labels': torch.Size([1, 43])}

In [24]:
 model.config.max_position_embeddings

512

In [25]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 24000
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})


In [26]:
training_args = TrainingArguments(
    output_dir = "/tmp/Persina-NER",
    learning_rate= LEARNING_RATE,
    per_device_train_batch_size= TRAIN_BATCH_SIZE,
    per_device_eval_batch_size= EVAL_BATCH_SIZE,
    num_train_epochs= 10,
    weight_decay= 0,
    warmup_ratio = 0,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    )
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2123,0.190349,0.619659,0.565244,0.591202,0.934415
2,0.1561,0.181921,0.625846,0.655061,0.64012,0.938607
3,0.0934,0.226279,0.607863,0.650285,0.628359,0.935177


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2123,0.190349,0.619659,0.565244,0.591202,0.934415
2,0.1561,0.181921,0.625846,0.655061,0.64012,0.938607
3,0.0934,0.226279,0.607863,0.650285,0.628359,0.935177
4,0.0584,0.270044,0.632244,0.685719,0.657897,0.937464
5,0.0393,0.30494,0.639891,0.652904,0.646332,0.938108
6,0.0258,0.349839,0.608581,0.673086,0.63921,0.9354
7,0.0141,0.407417,0.622605,0.695887,0.657209,0.937464
8,0.0101,0.440622,0.619835,0.681713,0.649303,0.937385
9,0.0031,0.479243,0.629472,0.6831,0.65519,0.938279
10,0.0029,0.500242,0.636117,0.686489,0.660344,0.938818


TrainOutput(global_step=30000, training_loss=0.06741897463401159, metrics={'train_runtime': 4056.5126, 'train_samples_per_second': 59.164, 'train_steps_per_second': 7.396, 'total_flos': 7620600956350128.0, 'train_loss': 0.06741897463401159, 'epoch': 10.0})

In [27]:
model.config.num_labels

11

In [28]:
validation_results = trainer.evaluate()

In [29]:
validation_results

{'eval_loss': 0.5002424120903015,
 'eval_precision': 0.6361170592433976,
 'eval_recall': 0.6864889847481128,
 'eval_f1': 0.6603438055720213,
 'eval_accuracy': 0.9388176226276221,
 'eval_runtime': 14.2644,
 'eval_samples_per_second': 210.313,
 'eval_steps_per_second': 26.289,
 'epoch': 10.0}

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
        )
test_results = trainer.evaluate()

In [31]:
test_results

{'eval_loss': 0.48781275749206543,
 'eval_precision': 0.6255054881571346,
 'eval_recall': 0.6821546700267759,
 'eval_f1': 0.6526030287048896,
 'eval_accuracy': 0.9401253552380582,
 'eval_runtime': 23.2967,
 'eval_samples_per_second': 128.774,
 'eval_steps_per_second': 16.097}

In [32]:
with open('report.txt') as f:
    for line in f.readlines():
        print(line)

              precision    recall  f1-score   support



         DAT     0.7443    0.7562    0.7502      2256

         EVE     0.4844    0.5688    0.5232       218

         LOC     0.5532    0.6339    0.5908      1863

         ORG     0.5626    0.6050    0.5831       876

         PER     0.6082    0.6954    0.6489      1136



   micro avg     0.6255    0.6822    0.6526      6349

   macro avg     0.5905    0.6519    0.6192      6349

weighted avg     0.6299    0.6822    0.6544      6349



In [34]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
model.push_to_hub("fabert-base_finetuned")

model.safetensors:   0%|          | 0.00/495M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pouria82/fabert-base_finetuned/commit/c06f275c68798518673ae7e95eaf001d3bc2fbab', commit_message='Upload BertForTokenClassification', commit_description='', oid='c06f275c68798518673ae7e95eaf001d3bc2fbab', pr_url=None, pr_revision=None, pr_num=None)