In [1]:
! pip install transformers torch datasets wandb



In [2]:
import re
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import wandb

In [3]:
def cleaning_sentence(sentence):
    words = sentence.split()
    words = [re.sub(r'[^\u0D80-\u0DFF\s]', '', word).strip() for word in words]
    words = [word for word in words if word != ""]

    return " ".join(words)

In [4]:
train_path = "/kaggle/input/dakshina-train/wiki-filt.train.text.shuf.txt"
validation_path = "/kaggle/input/dakshina-valid/wiki-filt.valid.text.shuf.txt"

training_data = []
with open(train_path, "r", encoding="utf-8") as file:
    for line in file:
        training_data.append(cleaning_sentence(line.strip()))
    print('Training data processing completed!')
    print(f'Total training data: {len(training_data)}')

validation_data = []
with open(validation_path, "r", encoding="utf-8") as file:
    for line in file:
        validation_data.append(cleaning_sentence(line.strip()))
    print('Validation data processing completed!')
    print(f'Total validation data: {len(validation_data)}')

Training data processing completed!
Total training data: 200629
Validation data processing completed!
Total validation data: 28623


In [5]:
model_name = "Ransaka/sinhala-bert-medium-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/485k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/988k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/202M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


generation_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [6]:
train_dataset = Dataset.from_dict({"text": training_data})
eval_dataset = Dataset.from_dict({"text": validation_data})

In [7]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 200629
})

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)


data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

Map:   0%|          | 0/200629 [00:00<?, ? examples/s]

Map:   0%|          | 0/28623 [00:00<?, ? examples/s]

In [None]:
token = '############'
!huggingface-cli login --token "$token"

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
training_args = TrainingArguments(
    output_dir="./sinhala_bert_finetuned_12epoch",
    overwrite_output_dir=True,
    push_to_hub=True,
    hub_model_id="Sameera827/sinhala-bert-dakshina_finetuned_epoch12",
    num_train_epochs=12,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()


eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111287128888926, max=1.0)…

Epoch,Training Loss,Validation Loss
1,4.3057,4.233554
2,4.123,4.0999
3,4.0089,4.022184
4,3.8924,3.955532
5,3.8047,3.949266
6,3.7303,3.875835
7,3.6662,3.856209
8,3.6379,3.840752
9,3.6053,3.794821
10,3.5305,3.793829


Evaluation results: {'eval_loss': 3.775315046310425, 'eval_runtime': 167.8515, 'eval_samples_per_second': 170.526, 'eval_steps_per_second': 10.658, 'epoch': 12.0}


In [12]:
import math
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 43.61
