In [2]:
# Install necessary libraries if not already installed
!pip install torch transformers datasets

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from datasets import load_dataset
import math



Defaulting to user installation because normal site-packages is not writeable


In [3]:
# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")



Using device: cuda


In [4]:
# Load the BookCorpus dataset from Hugging Face
dataset = load_dataset('bookcorpus', split='train')
print("Total samples:", len(dataset))


Total samples: 74004228


In [5]:
# Limit to 100k samples as specified
dataset = dataset.select(range(100000))
print("Subset size:", len(dataset))


Subset size: 100000


In [6]:
# Initialize the BERT tokenizer (WordPiece)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [7]:
# Tokenization and Preprocessing Function
def preprocess(text):
    # Tokenize and encode input text
    encoding = tokenizer(text, 
                         add_special_tokens=True, 
                         truncation=True,
                         max_length=128, 
                         padding='max_length', 
                         return_tensors='pt')
    
    # Flatten tensor to (seq_length)
    input_ids = encoding['input_ids'].squeeze()
    attention_mask = encoding['attention_mask'].squeeze()
    
    # Create segment ids (all zeros for single sentence input)
    segment_ids = torch.zeros_like(input_ids)
    
    return input_ids, attention_mask, segment_ids


In [8]:
from torch.utils.data import Dataset

# Custom Dataset Class for BookCorpus
class BookCorpusDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        text = self.dataset[idx]['text']
        input_ids, attention_mask, segment_ids = preprocess(text)
        return input_ids, attention_mask, segment_ids


In [9]:
# Create DataLoader for 
train_dataset = BookCorpusDataset(dataset)
train_loader = DataLoader(train_dataset, batch_size= 8, shuffle=True)

# Sample check
for batch in train_loader:
    input_ids, attention_mask, segment_ids = batch
    print("Input IDs:", input_ids.shape)
    print("Attention Mask:", attention_mask.shape)
    print("Segment IDs:", segment_ids.shape)
    break


Input IDs: torch.Size([8, 128])
Attention Mask: torch.Size([8, 128])
Segment IDs: torch.Size([8, 128])


Tokenization and Masking for MLM

In [10]:
import random

def mask_tokens(input_ids, tokenizer, mask_prob=0.15):
    """
    Prepare masked tokens inputs/labels for masked language modeling.
    """
    labels = input_ids.clone()
    
    # Masking (15% of tokens)
    probability_matrix = torch.full(labels.shape, mask_prob)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # Only compute loss on masked tokens
    
    # Replace 80% of the time with [MASK]
    indices_replaced = (torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices)
    input_ids[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    
    # Replace 10% of the time with random token
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long).to(input_ids.device)  # Move to the same device
    input_ids[indices_random] = random_words[indices_random]
    
    # 10% of the time, keep the original token (already done by default)
    return input_ids, labels



Implementation: BERTEmbedding



In [11]:
import torch
import torch.nn as nn

class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, hidden_dim, max_position_embeddings, segment_vocab_size=2, dropout_prob=0.1):
        super(BERTEmbedding, self).__init__()
        # Token Embeddings
        self.token_embeddings = nn.Embedding(vocab_size, hidden_dim)
        
        # Position Embeddings
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_dim)
        
        # Segment Embeddings (For sentence pairs in NLI tasks)
        self.segment_embeddings = nn.Embedding(segment_vocab_size, hidden_dim)
        
        # Layer Normalization and Dropout
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=1e-12)
        self.dropout = nn.Dropout(dropout_prob)
    
    def forward(self, input_ids, segment_ids):
        seq_length = input_ids.size(1)
        
        # Position IDs [0, 1, 2, ..., seq_length-1]
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        
        # Get Embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        segment_embeddings = self.segment_embeddings(segment_ids)
        
        # Sum and Normalize
        embeddings = token_embeddings + position_embeddings + segment_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        
        return embeddings


Implementation: MultiHeadSelfAttention



In [12]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads, dropout_prob=0.1):
        super(MultiHeadSelfAttention, self).__init__()
        assert hidden_dim % num_heads == 0
        
        # Parameters
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        
        # Linear layers for Q, K, V
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        
        # Output projection
        self.out = nn.Linear(hidden_dim, hidden_dim)
        
        # Dropout for attention scores
        self.dropout = nn.Dropout(dropout_prob)
        
    def forward(self, hidden_states, attention_mask):
        batch_size, seq_length, hidden_dim = hidden_states.size()
        
        # Linear projections
        query = self.query(hidden_states)
        key = self.key(hidden_states)
        value = self.value(hidden_states)
        
        # Reshape to (batch_size, num_heads, seq_length, head_dim)
        query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Scaled Dot-Product Attention
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attention_scores += attention_mask
        
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)
        
        context = torch.matmul(attention_probs, value)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, hidden_dim)
        
        output = self.out(context)
        
        return output


Implementation: FeedForward



In [13]:
class FeedForward(nn.Module):
    def __init__(self, hidden_dim, intermediate_dim, dropout_prob=0.1):
        super(FeedForward, self).__init__()
        self.dense1 = nn.Linear(hidden_dim, intermediate_dim)
        self.activation = nn.GELU()
        self.dense2 = nn.Linear(intermediate_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=1e-12)
    
    def forward(self, hidden_states):
        intermediate = self.activation(self.dense1(hidden_states))
        output = self.dense2(intermediate)
        output = self.dropout(output)
        output = self.layer_norm(output + hidden_states)  # Residual Connection
        return output


Implementation: BERTLayer



In [14]:
class BERTLayer(nn.Module):
    def __init__(self, hidden_dim, num_heads, intermediate_dim, dropout_prob=0.1):
        super(BERTLayer, self).__init__()
        self.attention = MultiHeadSelfAttention(hidden_dim, num_heads, dropout_prob)
        self.feed_forward = FeedForward(hidden_dim, intermediate_dim, dropout_prob)
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=1e-12)
        self.dropout = nn.Dropout(dropout_prob)
    
    def forward(self, hidden_states, attention_mask):
        # Multi-Head Self Attention
        attention_output = self.attention(hidden_states, attention_mask)
        attention_output = self.dropout(attention_output)
        attention_output = self.layer_norm(attention_output + hidden_states)
        
        # Feed Forward Network
        layer_output = self.feed_forward(attention_output)
        
        return layer_output


In [15]:
class BERTModel(nn.Module):
    def __init__(self, vocab_size, hidden_dim=768, num_layers=12, num_heads=12, intermediate_dim=3072, max_position_embeddings=512, segment_vocab_size=2, dropout_prob=0.1):
        super(BERTModel, self).__init__()
        
        # Embedding Layer
        self.embedding = BERTEmbedding(vocab_size, hidden_dim, max_position_embeddings, segment_vocab_size, dropout_prob)
        
        # Stacking BERT Layers
        self.layers = nn.ModuleList([
            BERTLayer(hidden_dim, num_heads, intermediate_dim, dropout_prob) for _ in range(num_layers)
        ])
    
    def forward(self, input_ids, segment_ids, attention_mask):
        # Input Embedding
        hidden_states = self.embedding(input_ids, segment_ids)
        
        # Apply attention mask to ignore padding tokens
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        
        # Pass through each BERT Layer
        for layer in self.layers:
            hidden_states = layer(hidden_states, extended_attention_mask)
        
        return hidden_states


In [16]:
class MLMHead(nn.Module):
    def __init__(self, hidden_dim, vocab_size):
        super(MLMHead, self).__init__()
        self.dense = nn.Linear(hidden_dim, hidden_dim)
        self.activation = nn.GELU()
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=1e-12)
        
        # Output layer
        self.decoder = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        logits = self.decoder(hidden_states)
        
        return logits


In [17]:
class BERTForMaskedLM(nn.Module):
    def __init__(self, vocab_size, hidden_dim=768, num_layers=12, num_heads=12, intermediate_dim=3072, max_position_embeddings=512, segment_vocab_size=2, dropout_prob=0.1):
        super(BERTForMaskedLM, self).__init__()
        
        # Base BERT Model
        self.bert = BERTModel(vocab_size, hidden_dim, num_layers, num_heads, intermediate_dim, max_position_embeddings, segment_vocab_size, dropout_prob)
        
        # MLM Head
        self.mlm_head = MLMHead(hidden_dim, vocab_size)
    
    def forward(self, input_ids, segment_ids, attention_mask):
        # Forward pass through BERT
        hidden_states = self.bert(input_ids, segment_ids, attention_mask)
        
        # Get logits for masked language modeling
        logits = self.mlm_head(hidden_states)
        
        return logits


In [18]:
import torch.optim as optim
from torch.nn import CrossEntropyLoss

# Initialize model and move to GPU
vocab_size = tokenizer.vocab_size
model = BERTForMaskedLM(vocab_size).to(device)

# Optimizer (AdamW)
optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# Loss Function (CrossEntropy for MLM)
criterion = CrossEntropyLoss()

# Training Loop
epochs = 3  # Feel free to adjust

model.train()  # Set model to training mode
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_loss = 0
    
    for batch in train_loader:
        input_ids, attention_mask, segment_ids = [x.to(device) for x in batch]
        
        # Masking for MLM
        input_ids, labels = mask_tokens(input_ids, tokenizer)
        input_ids, labels = input_ids.to(device), labels.to(device)
        
        # Forward pass
        logits = model(input_ids, segment_ids, attention_mask)
        
        # Calculate loss
        loss = criterion(logits.view(-1, vocab_size), labels.view(-1))
        total_loss += loss.item()
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Average Loss: {avg_loss:.4f}")




Epoch 1/3
Average Loss: 0.6750
Epoch 2/3
Average Loss: 0.5441
Epoch 3/3
Average Loss: 0.5051


In [19]:
# Save model weights
torch.save(model.state_dict(), 'bert_mlm_weights.pth')
print("Model weights saved.")

Model weights saved.


Task 2. Sentence Embedding with Sentence BERT

In [49]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler


In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [51]:
!pip install transformers
from transformers import BertTokenizer, BertModel


Defaulting to user installation because normal site-packages is not writeable


In [60]:
# 1. Load or Initialize the Base BERT Model from Task 1
# =============================================================================
# custom BERT from Task 1, load it here.

import torch
from transformers import BertTokenizer, BertModel

# Device configuration: use CUDA if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
base_bert = BertModel.from_pretrained('bert-base-uncased')
base_bert.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [43]:
# 2. Prepare the SNLI Dataset
# =============================================================================
class SNLIDataset(Dataset):
    def __init__(self, split='train', max_samples=10000, max_length=64):
        full_dataset = load_dataset("snli", split=split)
        # Filter out examples with missing labels (-1 indicates an invalid label)
        filtered_dataset = full_dataset.filter(lambda x: x['label'] != -1)
        # Select a subset using the select() method
        self.dataset = filtered_dataset.select(range(min(max_samples, len(filtered_dataset))))
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        # Get sentence pair and label: premise and hypothesis
        sent1 = item['premise']
        sent2 = item['hypothesis']
        label = item['label']
        return sent1, sent2, label

def collate_fn(batch):
    # Unpack batch items: each is a tuple (sent1, sent2, label)
    sents1, sents2, labels = zip(*batch)
    encoding1 = tokenizer(list(sents1), padding=True, truncation=True, max_length=64, return_tensors="pt")
    encoding2 = tokenizer(list(sents2), padding=True, truncation=True, max_length=64, return_tensors="pt")
    labels = torch.tensor(labels, dtype=torch.long)
    return encoding1, encoding2, labels


In [53]:
# Create DataLoader for training data.
train_dataset = SNLIDataset(split="train", max_samples=5000)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


In [54]:
# 3. Define the Sentence-BERT Model with a Siamese Network Structure
# =============================================================================
class SentenceBERT(nn.Module):
    def __init__(self, base_bert, hidden_size=768, num_classes=3):
        super(SentenceBERT, self).__init__()
        self.bert = base_bert  # Shared BERT encoder for both sentences.
        # Classifier: takes concatenated features [u, v, |u - v|].
        self.classifier = nn.Linear(hidden_size * 3, num_classes)
    
    def forward(self, encoding1, encoding2):
        # Encode first sentence.
        outputs1 = self.bert(
            input_ids=encoding1['input_ids'].to(device),
            attention_mask=encoding1['attention_mask'].to(device)
        )
        # Encode second sentence.
        outputs2 = self.bert(
            input_ids=encoding2['input_ids'].to(device),
            attention_mask=encoding2['attention_mask'].to(device)
        )
        # Extract the [CLS] token embedding.
        u = outputs1.last_hidden_state[:, 0]  # Shape: [batch_size, hidden_size]
        v = outputs2.last_hidden_state[:, 0]  # Shape: [batch_size, hidden_size]
        
        # Compute the element-wise absolute difference between embeddings.
        diff = torch.abs(u - v)
        # Concatenate the embeddings and their difference: [u, v, |u-v|].
        combined = torch.cat([u, v, diff], dim=1)
        # Compute the logits using the classifier.
        logits = self.classifier(combined)
        return logits, u, v  # Also returning individual embeddings for optional cosine-similarity analysis.


In [55]:
# Instantiate the SentenceBERT model and move it to the device.
model = SentenceBERT(base_bert=base_bert).to(device)


In [56]:
# 4. Set Up the Loss Function and Optimizer
# =============================================================================
criterion = nn.CrossEntropyLoss()  # Implements the Softmax and cross-entropy loss.
optimizer = optim.Adam(model.parameters(), lr=2e-5)


In [57]:
# 5. Training Loop
# =============================================================================
num_epochs = 3  # Adjust the number of epochs as needed.
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_idx, (encoding1, encoding2, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        logits, u, v = model(encoding1, encoding2)
        loss = criterion(logits, labels.to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if (batch_idx + 1) % 50 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")
    
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}] Average Loss: {avg_loss:.4f}")


Epoch [1/3], Step [50/313], Loss: 1.0671
Epoch [1/3], Step [100/313], Loss: 1.1226
Epoch [1/3], Step [150/313], Loss: 0.9482
Epoch [1/3], Step [200/313], Loss: 0.9279
Epoch [1/3], Step [250/313], Loss: 0.8351
Epoch [1/3], Step [300/313], Loss: 0.8868
Epoch [1/3] Average Loss: 0.9892
Epoch [2/3], Step [50/313], Loss: 0.8520
Epoch [2/3], Step [100/313], Loss: 0.7347
Epoch [2/3], Step [150/313], Loss: 0.7055
Epoch [2/3], Step [200/313], Loss: 0.9689
Epoch [2/3], Step [250/313], Loss: 0.7676
Epoch [2/3], Step [300/313], Loss: 0.6561
Epoch [2/3] Average Loss: 0.7650
Epoch [3/3], Step [50/313], Loss: 0.4416
Epoch [3/3], Step [100/313], Loss: 0.8020
Epoch [3/3], Step [150/313], Loss: 0.7125
Epoch [3/3], Step [200/313], Loss: 0.5011
Epoch [3/3], Step [250/313], Loss: 0.8470
Epoch [3/3], Step [300/313], Loss: 0.6946
Epoch [3/3] Average Loss: 0.5489


In [58]:
torch.save(model.state_dict(), "sentence_bert_model.pt")
print("Model saved as sentence_bert_model.pt")

Model saved as sentence_bert_model.pt


Task 3. Evaluation and Analysis

In [59]:
import torch
from torch.utils.data import DataLoader

# Assuming SNLIDataset and collate_fn are already defined as before.
# Load the test split (you can also use 'validation' if preferred)
test_dataset = SNLIDataset(split="validation", max_samples=1000)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

model.eval()  # Set model to evaluation mode

correct = 0
total = 0
with torch.no_grad():
    for encoding1, encoding2, labels in test_loader:
        logits, _, _ = model(encoding1, encoding2)
        predictions = torch.argmax(logits, dim=1)
        correct += (predictions == labels.to(device)).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Test Accuracy: 65.20%
