In [1]:

from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
%%capture

!pip install --upgrade bitsandbytes transformers peft accelerate datasets trl

In [27]:
from datasets import load_dataset
from transformers import MPNetTokenizer, DataCollatorForLanguageModeling
from transformers import MPNetForMaskedLM, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [34]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:
dataset = (load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
          .shard(num_shards=10, index=0)
          )

split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
temp_eval_test_dataset = split_dataset["test"]


split_eval_test = temp_eval_test_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset = split_eval_test["train"]
test_dataset = split_eval_test["test"]




In [38]:
tokenizer = MPNetTokenizer.from_pretrained("microsoft/mpnet-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

train_hf = train_dataset.map(tokenize_function, batched=True)
eval_hf = eval_dataset.map(tokenize_function, batched=True)
test_hf = test_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

Map:   0%|          | 0/368 [00:00<?, ? examples/s]

In [44]:

model = MPNetForMaskedLM.from_pretrained("microsoft/mpnet-base").to(device)

trainer = Trainer(
    model=model,

    train_dataset=train_hf,
    data_collator=data_collator,
    eval_dataset=eval_hf,


    args= TrainingArguments(
                  output_dir="./results",
                  per_device_train_batch_size=16,
                  per_device_eval_batch_size=16,
                  evaluation_strategy="steps",
                  logging_steps=25,
            )
)




In [45]:
trainer.train()


Step,Training Loss,Validation Loss
25,2.0507,1.680796
50,1.9036,1.68121
75,1.914,1.657127
100,1.9638,1.670325
125,1.8576,1.696592
150,1.9272,1.663147
175,1.8612,1.508343
200,1.7471,1.705768
225,1.7533,1.54663
250,1.6893,1.661774


TrainOutput(global_step=552, training_loss=1.777963924235192, metrics={'train_runtime': 326.155, 'train_samples_per_second': 27.015, 'train_steps_per_second': 1.692, 'total_flos': 579774435229440.0, 'train_loss': 1.777963924235192, 'epoch': 3.0})

In [61]:
from torch.utils.data import DataLoader
test_dataloader = DataLoader(test_hf, batch_size=16, shuffle=False)


for batch in trainer.get_train_dataloader():
    # print(batch)
    input_ids = batch["input_ids"].to(device)
    labels = batch["labels"].to(device)


    with torch.no_grad():
        outputs = model(input_ids=input_ids, labels=labels)
        predictions = outputs.logits

    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)
    predicted_token_ids = torch.argmax(predictions[0, mask_token_index[1], :], dim=-1)
    predicted_tokens = tokenizer.batch_decode(predicted_token_ids)
    print(f"Predicted tokens for masked positions: {predicted_tokens}")
    print()

    break


Predicted tokens for masked positions: ['-', 'di', '-', 'br', '##s', '##s', '##s', '-', '##s', '-', '##s', '##s', '##s', '>', '-', '##s', '##s', '##s', '##s', '##s', '-', '##s', 'kit', '##s', "'", '-', '##s', '##s', '##s', '##s', '##ens', '"', 'and', '##s', '-', '##s', '##s', '##s', '##s', '##s', 'br', '##s', '=', '##s', '##s', '>', '##s', '##s', '-', '##ne', '##s', '-', '-', '##s', '##s', '-', '##s', '-', 'br', 'liner', '##s', '##s', '##s', '-', 'br', '##s', '-', '##s', '>', '##s', '##s', '##s', '=', 'advertisement', '##s', '##s', 'to', '</s>', 'dr', '##s', '>', '##s', '##s', '-', '-', '##s', '##s', '##s', '=', '-', '##s', '-', '"', '=', '##s']

