In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00

In [None]:
# Mount Google Drive (Colab only) so the dataset CSV under /content/drive is readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Configs

In [None]:
# NER tag inventory: 'O' first, then the five B- tags, then the matching I- tags.
# The position of a tag in this tuple is its integer id.
LABEL2IDX = {
    tag: tag_id
    for tag_id, tag in enumerate((
        'O',
        'B-DAT', 'B-PER', 'B-ORG', 'B-LOC', 'B-EVE',
        'I-DAT', 'I-PER', 'I-ORG', 'I-LOC', 'I-EVE',
    ))
}

# Reverse lookup: integer id -> tag string.
IDX2LABEL = {tag_id: tag for tag, tag_id in LABEL2IDX.items()}

# Sequence length, batch sizes and optimisation hyperparameters.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 2e-5

In [None]:
from datasets import concatenate_datasets, load_dataset
from transformers import AutoTokenizer
import ast

# Hugging Face Hub checkpoint id; reused below for both the tokenizer and the masked-LM model.
model_name = 'sbunlp/fabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# CSV on the mounted Drive; the loaded dataset exposes 'tokens' and 'labels' columns.
data_files = "/content/drive/MyDrive/Colab Notebooks/NER-datasets/shuffled-100000.csv"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/552k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def parse_tokens_and_labels(example):
    """Turn stringified 'tokens' / 'labels' columns of one example back into Python objects.

    Any of the two fields that is a ``str`` is parsed with ``ast.literal_eval``;
    fields that are already non-string values are left as they are.

    Args:
        example: mapping with 'tokens' and 'labels' entries (one dataset row).

    Returns:
        The same ``example`` object, modified in place.
    """
    for column in ('tokens', 'labels'):
        raw_value = example[column]
        if isinstance(raw_value, str):
            # CSV storage flattens lists to their repr; literal_eval restores them safely.
            example[column] = ast.literal_eval(raw_value)
    return example

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split word lists with the module-level ``tokenizer``.

    Despite the name, no label alignment is performed here: only the tokenizer
    output for ``examples["tokens"]`` is returned.

    Args:
        examples: batch mapping whose "tokens" entry holds one list of words per example.

    Returns:
        The tokenizer's encoding of the batch.
    """
    # is_split_into_words=True tells the tokenizer each example is already a list of words.
    return tokenizer(examples["tokens"], is_split_into_words=True)


In [None]:
# Read the CSV into a DatasetDict; all rows end up in a single 'train' split.
dataset = load_dataset('csv', data_files=data_files)


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Inspect the loaded splits, column names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 100000
    })
})

In [None]:
# Step 1: decode the stringified list columns, one example at a time.
# Step 2: tokenize the decoded word lists in batches (batched=True).
parsed_dataset = dataset.map(parse_tokens_and_labels)
tokenized_dataset = parsed_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Confirm the tokenizer columns were added alongside 'tokens' and 'labels'.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100000
    })
})

In [None]:
# Spot-check the first example's attention mask (the tokenizer call requested no padding).
tokenized_dataset['train'][0]['attention_mask']

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [None]:
# Re-display the DatasetDict before narrowing it to the 'train' split.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100000
    })
})

In [None]:
# Keep only the 'train' split: from here on tokenized_dataset is a Dataset, not a DatasetDict.
# NOTE(review): this rebinding is not idempotent — re-running the cell indexes the
# already-unwrapped Dataset with 'train'; restart and run all instead.
tokenized_dataset = tokenized_dataset['train']

In [None]:
# Drop the raw word lists and the segment ids in a single call; the remaining
# columns are 'labels', 'input_ids' and 'attention_mask'.
tokenized_dataset = tokenized_dataset.remove_columns(['tokens', 'token_type_ids'])

In [None]:
# Verify which columns remain after the removal above.
tokenized_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 100000
})

In [None]:
# Number of token ids per packed training row (same value as MAX_LEN in the config cell).
block_size = 128

def group_texts(examples):
    """Pack a batch of tokenized examples into fixed-length blocks for masked-LM training.

    Every column except an incoming "labels" column is flattened across the batch
    and cut into consecutive chunks of ``block_size`` items. The trailing
    remainder that does not fill a whole block is dropped.

    The incoming "labels" column is deliberately ignored: it is overwritten with
    a copy of the chunked ``input_ids`` anyway, and it may hold word-level
    annotations whose length differs from the sub-word ``input_ids``. Sizing the
    chunks on such a column would silently discard tokens.

    Args:
        examples: batch mapping of column name -> list of per-example lists.
            Must contain "input_ids".

    Returns:
        dict mapping each grouped column name to a list of ``block_size``-long
        lists, plus "labels" set to a copy of the chunked "input_ids".
    """
    from itertools import chain

    # Flatten each column once (linear time, unlike sum(list_of_lists, [])).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k]))
        for k in examples.keys()
        if k != "labels"
    }
    # The token stream itself defines how many full blocks are available.
    total_length = len(concatenated_examples["input_ids"])
    # Drop the small remainder so every block has exactly block_size items.
    total_length = (total_length // block_size) * block_size
    # Split each flattened column into consecutive block_size chunks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Masked-LM targets start out as the unmasked inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Pack the tokenized examples into fixed-length rows. With batched=True, group_texts
# receives one batch at a time, so the dropped remainder is per batch.
lm_dataset = tokenized_dataset.map(group_texts, batched=True)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Inspect the packed dataset: the row count is now the number of blocks, not examples.
lm_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 19919
})

In [None]:
# Sanity check: group_texts copies input_ids into labels, so the two columns must match.
lm_dataset['labels'] == lm_dataset['input_ids']

True

In [None]:
# Print every column of the first packed block in lm_dataset together with its length.
first_block = lm_dataset[0]
for title, column in (
    ("labels", "labels"),
    ("Input IDs", "input_ids"),
    ("Attention Mask", "attention_mask"),
):
    values = first_block[column]
    print(f"{title}: {values}, length: {len(values)}")


labels: [101, 5865, 4096, 2303, 2425, 790, 1427, 1456, 1427, 2299, 3366, 2377, 5002, 2310, 2297, 6192, 11116, 35553, 1316, 2758, 622, 2377, 5002, 2310, 2518, 2297, 47495, 3754, 5679, 28380, 4864, 2563, 2434, 117, 102, 101, 5298, 2548, 2297, 5167, 3053, 14833, 2297, 4761, 6888, 41147, 2747, 2735, 2358, 2358, 6753, 5298, 2297, 7620, 5295, 798, 2003, 2856, 2602, 622, 7726, 12687, 2342, 2307, 24550, 2352, 3260, 4443, 34307, 2400, 2342, 117, 102, 101, 5351, 17094, 2303, 7933, 36314, 2342, 622, 2299, 3070, 6944, 2305, 18861, 6572, 2902, 10107, 16528, 25419, 3734, 5786, 18171, 3257, 11332, 2369, 117, 102, 101, 6257, 10193, 2322, 2297, 46711, 2368, 790, 1763, 1715, 1419, 6318, 3355, 2902, 23165, 2299, 8099, 2297, 4062, 4937, 6318, 3355, 2580, 2861, 622, 3640, 2299, 3529, 31808], length: 128
Input IDs: [101, 5865, 4096, 2303, 2425, 790, 1427, 1456, 1427, 2299, 3366, 2377, 5002, 2310, 2297, 6192, 11116, 35553, 1316, 2758, 622, 2377, 5002, 2310, 2518, 2297, 47495, 3754, 5679, 28380, 4864, 2563, 2

In [None]:
 # Decode the first two and the last two token ids of the first packed block.
 first_block_ids = lm_dataset[0]['input_ids']
 for position in (0, 1, -2, -1):
     print(tokenizer.decode(first_block_ids[position]))

[CLS]
ابراهیم
سمت
مسول


In [None]:
# Show the first packed block as word-piece strings to see how the examples were joined.
print(tokenizer.convert_ids_to_tokens(lm_dataset[0]['input_ids']))

['[CLS]', 'ابراهیم', 'احمد', 'از', 'سال', '۱', '##۹', '##۴', '##۹', 'به', 'مدت', 'دو', '##ڸس', '##ال', 'در', 'زندان', 'بغداد', 'بهڸس', '##ر', 'برد', 'و', 'دو', '##ڸس', '##ال', 'نیز', 'در', 'کرکوک', 'تحت', 'نظارت', 'شهربانی', 'عراق', 'قرار', 'داشت', '.', '[SEP]', '[CLS]', 'جی', 'زی', 'در', 'چندین', 'جنگ', 'رپ', 'در', '##ڸم', '##قابل', 'رپر', '##هایی', 'چون', 'ال', 'ال', 'کول', 'جی', 'در', 'اوایل', 'دهه', '۹', '##۰', 'شرکت', 'کرده', 'و', 'برنده', '##ڸشده', 'بود', 'که', 'نقشهڸای', 'برای', 'شروع', 'کارهای', 'آیندهڸاش', 'شده', 'بود', '.', '[SEP]', '[CLS]', 'کریم', 'بیک', 'از', 'اهالی', 'ایروان', 'بود', 'و', 'به', 'دلیل', 'مخالفت', 'با', 'حکام', 'روسیه', 'توسط', 'کاس', '##اکو', '##فسکی', 'رئیس', 'روسی', 'قزاق', 'خانه', 'مسموم', 'شد', '.', '[SEP]', '[CLS]', 'مهندس', 'زع', '##یم', 'در', 'پلن', '##وم', '۱', '##۳', '##۸', '##۲', 'جبهه', 'ملی', 'توسط', 'مشترکین', 'به', 'عضویت', 'در', 'شورای', 'مرکزی', 'جبهه', 'ملی', 'ایران', 'انتخاب', 'و', 'سپس', 'به', 'سمت', 'مسول']


In [None]:
from datasets import DatasetDict

# Fixed seed so the partition is identical on every Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% of the packed blocks, then halve that hold-out into eval and test
# (80% / 10% / 10% overall).
train_test_split = lm_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_eval_split = train_test_split['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_dataset = train_test_split['train']
eval_dataset = train_eval_split['train']
test_dataset = train_eval_split['test']
combined_dataset = DatasetDict({
    'train': train_dataset,
    'eval': eval_dataset,
    'test': test_dataset
})
combined_dataset


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 15935
    })
    eval: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1992
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1992
    })
})

In [None]:
from transformers import AutoModelForMaskedLM
# Load the same checkpoint as the tokenizer, wrapped in the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from transformers import DataCollatorForLanguageModeling
# Collator for masked-LM batches; mlm_probability=0.15 is the fraction of tokens
# selected for prediction in each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Labels of one stored training row, before the collator is applied.
combined_dataset['train'][0]['labels']

[102,
 101,
 4015,
 4447,
 6435,
 15474,
 2299,
 3848,
 2297,
 12503,
 2316,
 5723,
 2317,
 622,
 9493,
 2299,
 16170,
 3048,
 2466,
 3710,
 2386,
 15888,
 2317,
 117,
 102,
 101,
 18463,
 3429,
 6193,
 622,
 47881,
 35673,
 38918,
 3001,
 18890,
 2311,
 2299,
 6805,
 8693,
 1314,
 18890,
 6880,
 622,
 6426,
 4564,
 3931,
 622,
 5863,
 13603,
 6250,
 2295,
 18890,
 2311,
 2299,
 2821,
 4541,
 6880,
 563,
 10054,
 622,
 6426,
 5815,
 2602,
 2317,
 117,
 102,
 101,
 793,
 792,
 790,
 3147,
 5914,
 14176,
 2297,
 40749,
 1312,
 790,
 1427,
 1662,
 1715,
 2371,
 790,
 1427,
 1911,
 1419,
 2297,
 2425,
 790,
 1427,
 1662,
 1715,
 3553,
 2352,
 4373,
 2303,
 5914,
 4472,
 5172,
 2299,
 6019,
 111,
 792,
 1809,
 5172,
 2312,
 30837,
 112,
 622,
 3956,
 3103,
 2318,
 36656,
 22111,
 2313,
 563,
 14354,
 622,
 26918,
 2502,
 10035,
 28974,
 11406,
 6040,
 47766,
 2400,
 2342,
 117,
 102]

In [None]:
# Collate two training rows and inspect the resulting labels: positions that were not
# selected for masking show -100 (the value the loss ignores), while selected positions
# keep their original token id. These -100 entries are not padding.
batch = data_collator([combined_dataset['train'][i] for i in range(2)])
batch["labels"]

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          2386,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100, 35673,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  6880,  -100,  -100,  -100,
          -100,  -100,  -100,  2317,   117,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,   790,  -100,  -100,  -100,
          2371,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  3553,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,   792,  -100,  -100,  -100,  -100,   112,  -100,  -100,  -100,
          -100,  -100,  -100,  2313,   563,  -100,  -100,  -100,  -100,  -100,
          -100, 11406,  -100, 47766,  -100,  -100,  

In [None]:
import torch
# Make every split hand back torch tensors instead of Python lists.
for split_name in ('train', 'eval', 'test'):
    combined_dataset[split_name].set_format("torch")

In [None]:
# Confirm the format switch: a column of a training row should now be a torch.Tensor.
train_sample = combined_dataset['train'][0]
print(type(train_sample['attention_mask']))


<class 'torch.Tensor'>


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the training arguments below set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Training configuration. Hyperparameters come from the config cell
# (TRAIN_BATCH_SIZE, EVAL_BATCH_SIZE, LEARNING_RATE, EPOCHS) so there is one place to tune them.
training_args = TrainingArguments(
    f"{model_name}-finetuned-MLM",        # output directory for checkpoints
    evaluation_strategy="epoch",          # evaluate once per epoch ...
    save_strategy="epoch",                # ... and checkpoint on the same schedule
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    load_best_model_at_end=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    push_to_hub=True,
)
# The collator applies the random masking per batch; the 'eval' split is used for
# the per-epoch validation loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset["train"],
    eval_dataset=combined_dataset["eval"],
    data_collator=data_collator,
)

In [None]:
# Run training; evaluation and checkpointing happen once per epoch as configured above.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.7594,2.587179
2,2.577,2.537119


Epoch,Training Loss,Validation Loss
1,2.7594,2.587179
2,2.577,2.537119
3,2.4622,2.433369
4,2.4142,2.411033
5,2.3263,2.384483


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


TrainOutput(global_step=9960, training_loss=2.5255478579356487, metrics={'train_runtime': 1670.7199, 'train_samples_per_second': 47.689, 'train_steps_per_second': 5.962, 'total_flos': 5243902731571200.0, 'train_loss': 2.5255478579356487, 'epoch': 5.0})

In [None]:
import math
# Perplexity on the validation split, computed as exp(eval_loss).
# NOTE(review): the loss from this call need not equal the last validation loss logged
# during training — presumably because the collator draws new random masks on every
# pass; confirm before comparing numbers across calls.
eval_results = trainer.evaluate(combined_dataset['eval'])
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 11.41


In [None]:
# Full metrics dictionary returned by the validation run.
print(eval_results)

{'eval_loss': 2.434494733810425, 'eval_runtime': 10.6979, 'eval_samples_per_second': 186.204, 'eval_steps_per_second': 23.276, 'epoch': 5.0}


In [None]:
# Perplexity on the held-out test split, computed as exp(eval_loss).
# The metric keys keep the 'eval_' prefix even for the test split.
test_results = trainer.evaluate(combined_dataset['test'])
print(f"Perplexity: {math.exp(test_results['eval_loss']):.2f}")

Perplexity: 10.86


In [None]:
# Full metrics dictionary returned by the test run.
print(test_results)

{'eval_loss': 2.3848636150360107, 'eval_runtime': 10.8367, 'eval_samples_per_second': 183.82, 'eval_steps_per_second': 22.977, 'epoch': 5.0}
