In [1]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Configs

In [3]:
LABEL2IDX = {
    'O': 0,
    'B-DAT': 1,
    'B-PER': 2,
    'B-ORG': 3,
    'B-LOC': 4,
    'B-EVE': 5,
    'I-DAT': 6,
    'I-PER': 7,
    'I-ORG': 8,
    'I-LOC': 9,
    'I-EVE': 10
}

IDX2LABEL = {i: k for k, i in LABEL2IDX.items()}

MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
EPOCHS = 10
NUM_CLASS = 11
LEARNING_RATE = 1e-5

In [4]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from datasets import concatenate_datasets, load_dataset
from transformers import AutoTokenizer
import ast
model_name = 'sbunlp/fabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_files = "/content/drive/MyDrive/Colab Notebooks/NER-datasets/shuffled-100000.csv"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/552k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
def parse_tokens_and_labels(example):
    # Ensure 'tokens' and 'labels' are in the expected format
    if isinstance(example['tokens'], str):
        example['tokens'] = ast.literal_eval(example['tokens'])
    if isinstance(example['labels'], str):
        example['labels'] = ast.literal_eval(example['labels'])
    return example


In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],truncation=True,max_length=512,
 is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [8]:
dataset = load_dataset('csv', data_files=data_files)
dataset = dataset['train'].select(range(60000))
# Apply the parsing function to the entire dataset
parsed_dataset = dataset.map(parse_tokens_and_labels)
# rename labels to net_tags
parsed_dataset = parsed_dataset.rename_column("labels", "ner_tags")
tokenized_dataset = parsed_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=parsed_dataset.column_names,)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 60000
})

In [10]:
# Printing the values and lengths of each column in the first item of tokenized_dataset
print(f"Input IDs: {tokenized_dataset[0]['input_ids']}, length: {len(tokenized_dataset[0]['input_ids'])}")
print(f"Attention Mask: {tokenized_dataset[0]['attention_mask']}, length: {len(tokenized_dataset[0]['attention_mask'])}")
print(f"Labels: {tokenized_dataset[0]['labels']}, length: {len(tokenized_dataset[0]['labels'])}")

Input IDs: [101, 5865, 4096, 2303, 2425, 790, 1427, 1456, 1427, 2299, 3366, 2377, 5002, 2310, 2297, 6192, 11116, 35553, 1316, 2758, 622, 2377, 5002, 2310, 2518, 2297, 47495, 3754, 5679, 28380, 4864, 2563, 2434, 117, 102], length: 35
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], length: 35
Labels: [-100, 2, 7, 1, 6, 6, -100, -100, -100, 0, 0, 0, -100, -100, 0, 0, 4, 0, -100, 0, 0, 0, -100, -100, 0, 0, 4, 0, 0, 3, 0, 0, 0, 0, -100], length: 35


In [11]:
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][0]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][1]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][-2]))
 print(tokenizer.decode(tokenized_dataset[0]['input_ids'][-1]))

[CLS]
ابراهیم
.
[SEP]


In [12]:
print(tokenizer.convert_ids_to_tokens(tokenized_dataset[0]['input_ids']))

['[CLS]', 'ابراهیم', 'احمد', 'از', 'سال', '۱', '##۹', '##۴', '##۹', 'به', 'مدت', 'دو', '##ڸس', '##ال', 'در', 'زندان', 'بغداد', 'بهڸس', '##ر', 'برد', 'و', 'دو', '##ڸس', '##ال', 'نیز', 'در', 'کرکوک', 'تحت', 'نظارت', 'شهربانی', 'عراق', 'قرار', 'داشت', '.', '[SEP]']


In [13]:
# Split the combined dataset into train, validation, and test sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_eval_split = train_test_split['test'].train_test_split(test_size=0.5)

train_dataset = train_test_split['train']
eval_dataset = train_eval_split['train']
test_dataset = train_eval_split['test']
# Print out info of each dataset
print(f"Training set: {train_dataset} samples")
print(f"Validation set: {eval_dataset} samples")
print(f"Test set: {test_dataset} samples")


Training set: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 48000
}) samples
Validation set: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6000
}) samples
Test set: Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6000
}) samples


In [14]:
from transformers import DataCollatorForTokenClassification
import torch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
train_dataset.set_format("torch")
eval_dataset.set_format("torch")
test_dataset.set_format("torch")

In [16]:
train_sample = train_dataset[0]
print(type(train_sample['input_ids']))

<class 'torch.Tensor'>


In [17]:
# check the paddings with -100s
batch = data_collator([train_dataset[i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100,    0,    0,    0,    0,    0,    0,    0,    0,
            1,    6,    0,    0,    0,    0,    0, -100],
        [-100,    0,    1,    6,    1, -100,    6,    6, -100, -100, -100,    0,
            0,    0,    0,    0,    0,    0,    0,    1,    6,    6,    0,    0,
            0,    0,    0,    0,    0, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])

In [18]:
for i in range(2):
    print(train_dataset[i]["labels"])

tensor([-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, -100,    0,    0,    0,    0,    0,    0,    0,    0,
           1,    6,    0,    0,    0,    0,    0, -100])
tensor([-100,    0,    1,    6,    1, -100,    6,    6, -100, -100, -100,    0,
           0,    0,    0,    0,    0,    0,    0,    1,    6,    6,    0,    0,
           0,    0,    0,    0,    0, -100])


In [19]:
IDX2LABEL

{0: 'O',
 1: 'B-DAT',
 2: 'B-PER',
 3: 'B-ORG',
 4: 'B-LOC',
 5: 'B-EVE',
 6: 'I-DAT',
 7: 'I-PER',
 8: 'I-ORG',
 9: 'I-LOC',
 10: 'I-EVE'}

In [20]:
LABEL2IDX

{'O': 0,
 'B-DAT': 1,
 'B-PER': 2,
 'B-ORG': 3,
 'B-LOC': 4,
 'B-EVE': 5,
 'I-DAT': 6,
 'I-PER': 7,
 'I-ORG': 8,
 'I-LOC': 9,
 'I-EVE': 10}

In [21]:
!pip install seqeval evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=1f1225fbed2a22dae2e250d62669707e839ef92fd2aa3c3d7dbaffb444e27d44
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully buil

In [22]:
label_list = [k for k,v in LABEL2IDX.items()]
label_list

['O',
 'B-DAT',
 'B-PER',
 'B-ORG',
 'B-LOC',
 'B-EVE',
 'I-DAT',
 'I-PER',
 'I-ORG',
 'I-LOC',
 'I-EVE']

In [23]:
import numpy as np
from seqeval.metrics import classification_report
import evaluate

accuracy = evaluate.load('accuracy')
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    #############
    cr = classification_report(true_labels, true_predictions,digits=4)
    fname = "report.txt"
    file = open(fname,'w')
    file.write(cr)
    file.close()
    #############
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [25]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoConfig
import torch.nn as nn

model_name = 'pouria82/faBERT-peymaarman-finetuned'
config = AutoConfig.from_pretrained(model_name)
config.attention_probs_dropout_prob = 0.2  #  default (
config.hidden_dropout_prob = 0.2  # default (0.1)
config.num_labels = NUM_CLASS
config.id2label = IDX2LABEL
config.label2id = LABEL2IDX
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)

# Reinitialize the classifier layer
model.classifier = nn.Linear(model.config.hidden_size, config.num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at pouria82/faBERT-peymaarman-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([21]) in the checkpoint and torch.Size([11]) in the model instantiated
- classifier.weight: found shape torch.Size([21, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
config.num_labels

11

In [27]:
samples = train_dataset[:8]
batch = data_collator([train_dataset[i] for i in range(i)])
{k: v.shape for k, v in batch.items()}


{'input_ids': torch.Size([1, 44]),
 'token_type_ids': torch.Size([1, 44]),
 'attention_mask': torch.Size([1, 44]),
 'labels': torch.Size([1, 44])}

In [28]:
 model.config.max_position_embeddings

512

In [29]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 48000
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6000
})


In [30]:
print(model.name_or_path)

pouria82/faBERT-peymaarman-finetuned


In [31]:
LEARNING_RATE

1e-05

In [32]:
from transformers import EarlyStoppingCallback
training_args = TrainingArguments(
    "faBERT-peymaarman-finetuned",
    learning_rate= LEARNING_RATE,
    per_device_train_batch_size= TRAIN_BATCH_SIZE,
    per_device_eval_batch_size= EVAL_BATCH_SIZE,
    num_train_epochs= 10,
    weight_decay= 0.025,#*********************
    warmup_ratio = 0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    lr_scheduler_type="linear",
    greater_is_better=True,
    push_to_hub=True,
    )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2242,0.244413,0.550148,0.666261,0.602662,0.917344
2,0.1766,0.2062,0.583286,0.696988,0.635088,0.927747
3,0.1585,0.186576,0.618514,0.700791,0.657087,0.934753
4,0.1321,0.194018,0.618862,0.713188,0.662686,0.935467
5,0.1164,0.196402,0.625257,0.693642,0.657676,0.936471
6,0.102,0.219309,0.608163,0.717371,0.658268,0.934122
7,0.0914,0.219916,0.620921,0.716459,0.665278,0.936188
8,0.0825,0.228289,0.624057,0.710678,0.664557,0.936779
9,0.0753,0.231437,0.628251,0.701932,0.663051,0.936992
10,0.0678,0.244719,0.619633,0.705735,0.659887,0.935782


TrainOutput(global_step=60000, training_loss=0.14665414435068766, metrics={'train_runtime': 5478.1379, 'train_samples_per_second': 87.621, 'train_steps_per_second': 10.953, 'total_flos': 1.4663141958213888e+16, 'train_loss': 0.14665414435068766, 'epoch': 10.0})

In [33]:
model.config.num_labels

11

In [34]:
validation_results = trainer.evaluate()

In [35]:
validation_results

{'eval_loss': 0.219915509223938,
 'eval_precision': 0.6209214949574847,
 'eval_recall': 0.7164587770003042,
 'eval_f1': 0.6652777287333592,
 'eval_accuracy': 0.9361875965002573,
 'eval_runtime': 17.5947,
 'eval_samples_per_second': 341.011,
 'eval_steps_per_second': 42.626,
 'epoch': 10.0}

In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
        )
test_results = trainer.evaluate()

In [37]:
test_results

{'eval_loss': 0.22072747349739075,
 'eval_precision': 0.622855989776014,
 'eval_recall': 0.7064927138170444,
 'eval_f1': 0.6620433259455208,
 'eval_accuracy': 0.93544418959265,
 'eval_runtime': 17.9513,
 'eval_samples_per_second': 334.237,
 'eval_steps_per_second': 41.78}

In [38]:
with open('report.txt') as f:
    for line in f.readlines():
        print(line)

              precision    recall  f1-score   support



         DAT     0.7272    0.7959    0.7600      4484

         EVE     0.3997    0.5264    0.4544       454

         LOC     0.5620    0.6530    0.6041      3942

         ORG     0.5753    0.6378    0.6050      1742

         PER     0.6200    0.7111    0.6624      2485



   micro avg     0.6229    0.7065    0.6620     13107

   macro avg     0.5768    0.6648    0.6172     13107

weighted avg     0.6257    0.7065    0.6634     13107

