In [2]:

from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
%%capture

!pip install --upgrade bitsandbytes transformers peft accelerate datasets trl

In [8]:
from datasets import load_dataset
from transformers import MPNetTokenizer, DataCollatorForLanguageModeling
from transformers import MPNetForMaskedLM, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch.nn as nn
from transformers import MPNetModel, MPNetTokenizer, AdamW
from torch.utils.data import DataLoader, Dataset


In [5]:



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
dataset = (load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
          .shard(num_shards=10, index=0)
          )

split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
temp_eval_test_dataset = split_dataset["test"]


split_eval_test = temp_eval_test_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset = split_eval_test["train"]
test_dataset = split_eval_test["test"]




README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [10]:

tokenizer = MPNetTokenizer.from_pretrained("microsoft/mpnet-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

train_hf = train_dataset.map(tokenize_function, batched=True)
eval_hf = eval_dataset.map(tokenize_function, batched=True)
test_hf = test_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/472k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

Map:   0%|          | 0/2937 [00:00<?, ? examples/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

Map:   0%|          | 0/368 [00:00<?, ? examples/s]

In [22]:

# Determine if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import torch
from torch.utils.data import Dataset
from transformers import MPNetTokenizer

class HuggingFaceDatasetWrapper(Dataset):
    def __init__(self, hf_dataset, tokenizer, masking_prob=0.15, max_length=128):
        self.hf_dataset = hf_dataset
        self.tokenizer = tokenizer
        self.masking_prob = masking_prob  # Control the percentage of tokens to mask
        self.max_length = max_length

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]

        # Tokenize the text and pad to max_length if needed
        encoding = self.tokenizer(item['text'], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        # Create labels (copy of input_ids) where some tokens will be masked
        labels = input_ids.clone()

        # Apply the masking process (control the percentage of tokens masked)
        probability_matrix = torch.full(labels.shape, self.masking_prob)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # Replace masked tokens with [MASK] token (for input_ids)
        mask_token_id = self.tokenizer.mask_token_id
        input_ids[masked_indices] = mask_token_id

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


In [23]:
import torch.nn as nn
from transformers import MPNetModel

class CustomMaskedLM(nn.Module):
    def __init__(self, pretrained_model_name="microsoft/mpnet-base"):
        super(CustomMaskedLM, self).__init__()
        # Load the pre-trained MPNet transformer model
        self.transformer = MPNetModel.from_pretrained(pretrained_model_name)

        # Add a linear head for masked language modeling (MLM)
        self.classifier = nn.Linear(self.transformer.config.hidden_size, self.transformer.config.vocab_size)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Forward pass through the transformer backbone
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # The hidden states of the last layer (sequence of token embeddings)
        hidden_states = transformer_outputs.last_hidden_state

        # Forward pass through the classifier head (masked language modeling head)
        logits = self.classifier(hidden_states)

        # If labels are provided, compute loss for MLM task
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Flatten the logits and labels to match the format required for CrossEntropyLoss
            loss = loss_fct(logits.view(-1, self.transformer.config.vocab_size), labels.view(-1))

        return logits, loss


In [24]:
from transformers import AdamW

# Training loop with evaluation after each epoch
def train(model, train_dataloader, eval_dataloader, optimizer, device, epochs=3):
    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        total_train_loss = 0

        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()  # Zero gradients
            logits, loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss.backward()  # Backpropagation
            optimizer.step()  # Optimization

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")

        # Evaluate the model on the evaluation dataset after each epoch
        avg_eval_loss = evaluate(model, eval_dataloader, device)
        print(f"Evaluation Loss after epoch {epoch + 1}: {avg_eval_loss:.4f}")

# Evaluation function to compute loss on the evaluation dataset
def evaluate(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    total_eval_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits, loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(dataloader)
    return avg_eval_loss




In [25]:


# Load the tokenizer
tokenizer = MPNetTokenizer.from_pretrained("microsoft/mpnet-base")

# Wrap Hugging Face datasets for training and evaluation
wrapped_train_dataset = HuggingFaceDatasetWrapper(test_hf, tokenizer=tokenizer, masking_prob=0.20)  # 20% masking
wrapped_eval_dataset = HuggingFaceDatasetWrapper(eval_hf, tokenizer=tokenizer, masking_prob=0.20)   # 20% masking

train_dataloader = DataLoader(wrapped_train_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(wrapped_eval_dataset, batch_size=16, shuffle=False)


In [26]:
# Initialize model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomMaskedLM().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Start training the model for 3 epochs
train(model, train_dataloader, eval_dataloader, optimizer, device, epochs=3)


Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Training Loss: 9.4837
Evaluation Loss after epoch 1: 8.8564
Epoch 2/3, Training Loss: 8.9823
Evaluation Loss after epoch 2: 8.6419
Epoch 3/3, Training Loss: 8.7877
Evaluation Loss after epoch 3: 8.4730


In [31]:
from torch.utils.data import DataLoader
wrapped_test_dataset = HuggingFaceDatasetWrapper(test_hf, tokenizer=tokenizer, masking_prob=0.20)

# Prepare the test DataLoader
test_dataloader = DataLoader(wrapped_test_dataset, batch_size=16, shuffle=False)
def predict_masked_tokens_on_dataset(model, tokenizer, dataloader, device):
    model.eval()  # Set model to evaluation mode
    all_predictions = []

    with torch.no_grad():  # Disable gradient calculation for inference
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Get model predictions (logits)
            logits, _ = model(input_ids=input_ids, attention_mask=attention_mask)

            # Find masked token positions in the batch (ignore padding)
            mask_token_indices = torch.where(input_ids == tokenizer.mask_token_id)

            # Store the predictions for the current batch
            batch_predictions = []

            for i in range(len(mask_token_indices[0])):
                batch_idx = mask_token_indices[0][i]  # Index in the batch
                token_idx = mask_token_indices[1][i]  # Index in the sequence

                # Ensure that we are not predicting padding tokens
                if input_ids[batch_idx, token_idx] != tokenizer.pad_token_id:
                    # Get predicted token ID with highest probability
                    predicted_token_id = torch.argmax(logits[batch_idx, token_idx, :], dim=-1).item()

                    # Decode predicted token ID back into word
                    predicted_token = tokenizer.decode([predicted_token_id])
                    batch_predictions.append(predicted_token)
                else:
                    batch_predictions.append("<pad>")

            # Append batch predictions to overall list
            all_predictions.append(batch_predictions)

    return all_predictions

# Perform inference on the test dataset
predictions = predict_masked_tokens_on_dataset(model, tokenizer, test_dataloader, device)

# Print predictions for each batch
for i, batch_predictions in enumerate(predictions):
    print(f"Batch {i+1} predictions: {batch_predictions}")



Batch 1 predictions: ['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>