<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/Unsupervised_Pretraining_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import torch
import torch.nn as nn
from transformers import BertForMaskedLM, BertTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import wordnet

# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define custom dataset for Masked Language Modeling
class MLMDataset(Dataset):
    """Dataset that tokenizes raw texts and applies BERT-style MLM corruption.

    Each item is a ``(input_ids, attention_mask, labels)`` tuple of 1-D tensors
    of length ``max_length``. ``labels`` holds the original token id at every
    position selected for prediction and ``-100`` everywhere else, so the loss
    is computed on the selected positions only.

    Args:
        texts: Sequence of raw strings.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors; it must also expose
            ``all_special_ids``, ``mask_token_id`` and ``vocab_size``.
        max_length: Length every sequence is padded/truncated to.
        mlm_probability: Chance that a maskable token is selected as a
            prediction target.
    """

    def __init__(self, texts, tokenizer, max_length=128, mlm_probability=0.15):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mlm_probability = mlm_probability

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a length-1 sequence. Clone so the in-place corruption below
        # never touches tensors owned by the tokenizer output.
        input_ids = encoding['input_ids'].squeeze(0).clone()
        attention_mask = encoding['attention_mask'].squeeze(0)

        labels = input_ids.clone()

        # Only real tokens may become targets: skip padding (attention 0) and
        # special tokens such as [CLS]/[SEP].
        maskable = attention_mask.bool()
        for special_id in self.tokenizer.all_special_ids:
            maskable &= input_ids != special_id

        probability_matrix = torch.full(
            labels.shape, float(self.mlm_probability), dtype=torch.float
        )
        probability_matrix.masked_fill_(~maskable, 0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()

        # Short sentences can easily draw zero targets; a sample without any
        # target contributes nothing to the loss, so force one.
        if self.mlm_probability > 0 and maskable.any() and not masked_indices.any():
            candidates = maskable.nonzero(as_tuple=True)[0]
            chosen = candidates[torch.randint(len(candidates), (1,))]
            masked_indices[chosen] = True

        labels[~masked_indices] = -100  # Only compute loss on masked tokens

        # Corrupt the inputs at the selected positions (otherwise the model
        # would simply see the answer): 80% become [MASK] ...
        use_mask_token = masked_indices & torch.bernoulli(
            torch.full(labels.shape, 0.8)
        ).bool()
        input_ids[use_mask_token] = self.tokenizer.mask_token_id

        # ... 10% become a random vocabulary id (half of the remaining 20%) ...
        use_random_token = masked_indices & ~use_mask_token & torch.bernoulli(
            torch.full(labels.shape, 0.5)
        ).bool()
        random_ids = torch.randint(
            self.tokenizer.vocab_size, labels.shape, dtype=torch.long
        )
        input_ids[use_random_token] = random_ids[use_random_token]
        # ... and the final 10% keep the original token.

        return input_ids, attention_mask, labels

# Define MaskedLanguageModel class
class MaskedLanguageModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained ``BertForMaskedLM``.

    Args:
        model_name: Checkpoint identifier handed to
            ``BertForMaskedLM.from_pretrained``.
    """

    def __init__(self, model_name="bert-base-uncased"):
        super().__init__()
        self.model = BertForMaskedLM.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, masked_lm_labels=None):
        """Run the wrapped model and return its ``(loss, logits)`` pair.

        ``masked_lm_labels`` is forwarded to the wrapped model as its
        ``labels`` argument; the returned loss is whatever that model reports.
        """
        result = self.model(
            labels=masked_lm_labels,
            attention_mask=attention_mask,
            input_ids=input_ids,
        )
        return result.loss, result.logits

# Tiny demo corpus used for pretraining
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question.",
]

# Load the tokenizer for the same checkpoint name the model defaults to
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Carve out a validation split (test_size=0.2) with a fixed seed
train_texts, val_texts = train_test_split(
    texts,
    test_size=0.2,
    random_state=42,
)

# Training pipeline: dataset, then a shuffling loader
train_dataset = MLMDataset(train_texts, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Validation pipeline: dataset, then an in-order loader
val_dataset = MLMDataset(val_texts, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Build the model, then move it onto the selected device
mlm_model = MaskedLanguageModel()
mlm_model = mlm_model.to(device)

# Initialize optimizer
# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is
# deprecated upstream in favour of torch.optim.AdamW. eps and weight_decay are
# pinned to the defaults of the transformers implementation (1e-6 and 0.0) so
# the optimisation hyperparameters stay what this script used before.
optimizer = torch.optim.AdamW(
    mlm_model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Training loop: each epoch runs one optimisation pass over the training
# loader, then one gradient-free pass over the validation loader.
for epoch in range(3):  # Train for 3 epochs
    mlm_model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        loss, logits = mlm_model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Report the mean over all batches of the epoch, not just the last batch.
    # An empty loader yields NaN instead of an undefined name or a division
    # by zero.
    num_train_batches = len(train_dataloader)
    train_loss = train_loss / num_train_batches if num_train_batches else float("nan")
    print(f"Epoch {epoch + 1}, Training loss: {train_loss}")

    # Validation
    mlm_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            loss, logits = mlm_model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels)
            val_loss += loss.item()

    num_val_batches = len(val_dataloader)
    val_loss = val_loss / num_val_batches if num_val_batches else float("nan")
    print(f"Validation loss: {val_loss}")