## Install the necessary libraries


In [None]:
%%capture
# Install the Hugging Face `transformers` package plus the helper packages listed below.
! pip install tqdm boto3 requests regex sentencepiece sacremoses
! pip install transformers

# Mount Google Drive and change into the assignment folder; the dataset files are later
# opened through the relative prefix 'dbpedia_' (e.g. dbpedia_train.json).
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP3

## BERT Features

In this part, you will use BERT features to classify DBPedia articles.
The data is already pre-processed, and the data loader is implemented below.

In [None]:
# Basics: dataset, data loaders, Classifier
import collections
import json
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd


SPLITS = ['train', 'dev', 'test']

class DBPediaDataset(Dataset):
  '''DBPedia dataset stored as a JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object. The code
    reads the 'label' key (here and in `collate_fn`) and the 'sentence' key
    (in `collate_fn`) of each record.

    Args:
      path[str]: path to the original data.
  '''
  def __init__(self, path):
    # Explicit UTF-8 keeps decoding independent of the platform locale, and
    # blank lines (e.g. a trailing newline) are skipped instead of making
    # json.loads raise.
    with open(path, encoding='utf-8') as fin:
      self._data = [json.loads(line) for line in fin if line.strip()]
    self._n_classes = len({datum['label'] for datum in self._data})

  def __getitem__(self, index):
    '''Returns the parsed record stored at `index`.'''
    return self._data[index]

  def __len__(self):
    '''Returns the number of records that were loaded.'''
    return len(self._data)

  @property
  def n_classes(self):
    '''Number of distinct 'label' values found in the file.'''
    return self._n_classes

  @staticmethod
  def collate_fn(tokenizer, device, batch):
    '''The collate function that compresses a training batch.
      Args:
        tokenizer: callable invoked as
          tokenizer(list_of_sentences, return_tensors='pt', padding=True);
          it must return a mapping from names to tensors.
        device[torch.device]: device the returned tensors are moved to.
        batch[list[dict[str, Any]]]: data in the batch.
      Returns:
        labels[torch.LongTensor]: the labels in the batch.
        sentences[dict[str, torch.Tensor]]: sentences converted by tokenizers.
    '''
    labels = torch.tensor([datum['label'] for datum in batch]).long().to(device)
    sentences = tokenizer(
        [datum['sentence'] for datum in batch],
        return_tensors='pt',  # pt = pytorch style tensor
        padding=True)
    # Iterate over a snapshot of the keys because the values are reassigned
    # inside the loop; the tokenizer's own container type is preserved.
    for key in list(sentences.keys()):
      sentences[key] = sentences[key].to(device)
    return labels, sentences

def construct_datasets(prefix, batch_size, tokenizer, device):
  '''Constructs datasets and data loaders.

    One dataset/loader pair is built per entry of SPLITS, reading the file
    f'{prefix}{split}.json'. Only the 'train' loader shuffles. The tokenizer
    and device are bound into the collate function when this function is
    called, so rebinding a `tokenizer` variable afterwards does not change
    what the returned loaders use.

    Args:
      prefix[str]: prefix of the dataset (e.g., dbpedia_).
      batch_size[int]: maximum number of examples in a batch.
      tokenizer: model tokenizer that converts sentences to integer tensors.
      device[torch.device]: the device (cpu/gpu) that the tensor should be on.
    Returns:
      datasets[dict[str, Dataset]]: a dict of constructed datasets.
      dataloaders[dict[str, DataLoader]]: a dict of constructed data loaders.
  '''
  def collate(batch):
    # Same collate logic for every split: tokenize and move tensors to `device`.
    return DBPediaDataset.collate_fn(tokenizer, device, batch)

  # Plain dicts: a defaultdict created without a factory behaves like a dict
  # anyway and only obscures the documented return type.
  datasets = {}
  dataloaders = {}
  for split in SPLITS:
    datasets[split] = DBPediaDataset(f'{prefix}{split}.json')
    dataloaders[split] = DataLoader(
        datasets[split],
        batch_size=batch_size,
        shuffle=(split == 'train'),
        collate_fn=collate)
  return datasets, dataloaders

# 1.1 Classification with BERT

In [None]:
# 1.1: [CODE] put your implementation of the classifier here
import torch
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim=768, hidden_dim=32, output_dim=14):
        super().__init__()
        # Layers are created in fc1 -> fc2 order so that parameter
        # initialisation consumes the random stream in that order.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Maps features `x` to unnormalised class scores (logits)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
# Hyperparameters for the frozen-BERT [CLS] baseline. These names are module-level
# and are read again by later cells of this notebook.
batch_size = 32
classifier_hidden_size = 32
num_epochs = 1  # As per the assignment, train for 1 epoch

# Load BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')

# Move BERT model to GPU if available
if torch.cuda.is_available():
    bert_model = bert_model.cuda()

# Construct datasets and dataloaders. The tokenizer and device passed here are bound
# into each loader's collate function, so batches arrive already on BERT's device.
datasets, dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=bert_model.device)

# Initialize the classifier
classifier = Classifier(
    bert_model.config.hidden_size,  # Input size (768 for BERT base)
    classifier_hidden_size,         # Hidden layer size
    datasets['train'].n_classes     # Output size (number of classes)
).to(bert_model.device)

# Define optimizer and loss function. Only the classifier's parameters are handed to
# the optimizer, so BERT's weights are never updated in this cell.
optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
loss_func = nn.CrossEntropyLoss()

# Training loop for 1 epoch
for epoch in range(num_epochs):
    classifier.train()  # Set classifier to training mode
    pbar = tqdm.tqdm(dataloaders['train'], desc=f"Epoch {epoch+1}")

    for labels, sentences in pbar:
        # Move labels to the same device as the model (GPU if available).
        # The collate function already did this, so the call is a no-op safeguard.
        labels = labels.to(bert_model.device)

        # Extract [CLS] token embeddings from BERT (frozen BERT)
        with torch.no_grad():  # We don't want to compute gradients for BERT
            unpooled_features = bert_model(**sentences)['last_hidden_state']  # [B, L, D]
            cls_features = unpooled_features[:, 0, :]  # Extract [CLS] token (first token) [B, D]

        # Forward pass through the classifier
        outputs = classifier(cls_features)  # [B, num_classes]

        # Calculate loss
        loss = loss_func(outputs, labels)

        # Backpropagation and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the most recent batch loss on the progress bar.
        pbar.set_postfix({'loss': loss.item()})

def evaluate(model, dataloader):
    """Returns the accuracy (in percent) of `model` on frozen-BERT [CLS] features.

    Args:
        model: classifier applied to the [CLS] feature of each sentence.
        dataloader: yields (labels, sentences) batches as produced by
            DBPediaDataset.collate_fn.

    Returns:
        float: 100 * correct / total over every example in `dataloader`.

    Note:
        Features come from the module-level `bert_model`.
    """
    model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for labels, sentences in tqdm.tqdm(dataloader, desc="Evaluating"):
            labels = labels.to(bert_model.device)

            # Extract [CLS] token embeddings from BERT (frozen BERT)
            unpooled_features = bert_model(**sentences)['last_hidden_state']
            cls_features = unpooled_features[:, 0, :]  # Extract [CLS] token

            # Score with the model that was passed in. The previous code used the
            # global `classifier` here, silently ignoring the `model` argument.
            outputs = model(cls_features)

            # Get predicted class by taking argmax over output logits
            _, predicted_labels = torch.max(outputs, dim=1)

            total_correct += (predicted_labels == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples * 100.0
    return accuracy

# Evaluate on development set and test set after training.
# `evaluate` returns accuracies already scaled to percent.
dev_accuracy = evaluate(classifier, dataloaders['dev'])
test_accuracy = evaluate(classifier, dataloaders['test'])

print(f"\nDevelopment Set Accuracy: {dev_accuracy:.2f}%")
print(f"Test Set Accuracy: {test_accuracy:.2f}%")

# Function to set random seeds for reproducibility
def set_seed(seed):
    """Seeds Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer seed. It is reduced modulo 2**32 for NumPy, which only
            accepts seeds in that range.
    """
    # Function-scope imports keep this helper self-contained.
    import random
    import numpy as np

    random.seed(seed)
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed every visible GPU, not only the current one.
        torch.cuda.manual_seed_all(seed)

# Per-seed accuracies (percent) collected across the runs below.
accuracies_dev = []
accuracies_test = []

# State dict and dev accuracy of the best run seen so far.
best_model_state = None
best_dev_accuracy = 0.0

# Run experiments 5 times with different random seeds
for seed in range(5):
    print(f"Running experiment with seed {seed}")

    # Seed before building the classifier so its initial weights depend on `seed`.
    set_seed(seed)

    # A new classifier object is created for every seed, so the state dict kept for
    # an earlier seed is not touched by the training that follows.
    classifier = Classifier(
        bert_model.config.hidden_size,
        classifier_hidden_size,
        datasets['train'].n_classes).to(bert_model.device)

    optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)

    classifier.train()
    pbar = tqdm.tqdm(dataloaders['train'], desc=f"Training Epoch")

    # One pass over the training loader per seed.
    for labels, sentences in pbar:
        labels = labels.to(bert_model.device)

        # BERT is used as a frozen feature extractor: no gradients are recorded here.
        with torch.no_grad():
            unpooled_features = bert_model(**sentences)['last_hidden_state']
            cls_features = unpooled_features[:, 0, :]  # Extract [CLS] token

        outputs = classifier(cls_features)

        # Compute loss (`loss_func` is the CrossEntropyLoss defined earlier in this cell)
        loss = loss_func(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    dev_accuracy = evaluate(classifier, dataloaders['dev'])
    test_accuracy = evaluate(classifier, dataloaders['test'])

    accuracies_dev.append(dev_accuracy)
    accuracies_test.append(test_accuracy)

    print(f"Seed {seed} - Development Set Accuracy: {dev_accuracy:.2f}%")
    print(f"Seed {seed} - Test Set Accuracy: {test_accuracy:.2f}%")

    # Save the best model based on development set accuracy
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_model_state = classifier.state_dict()

# Calculate mean and standard deviation of development accuracies
# (np.std with default arguments, i.e. the population standard deviation).
mean_dev_acc = np.mean(accuracies_dev)
std_dev_acc = np.std(accuracies_dev)

# Load the best model and evaluate it on the test set again (for reporting purposes).
# The weights are loaded into the classifier object left over from the last seed.
classifier.load_state_dict(best_model_state)
best_test_accuracy = evaluate(classifier, dataloaders['test'])

# Print final results
print(f"\nMean Development Accuracy: {mean_dev_acc:.2f}% ± {std_dev_acc:.2f}%")
print(f"Best Test Accuracy (from best dev model): {best_test_accuracy:.2f}%")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 313/313 [00:36<00:00,  8.52it/s, loss=0.586]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.84it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.33it/s]



Development Set Accuracy: 96.40%
Test Set Accuracy: 96.70%
Running experiment with seed 0


Training Epoch: 100%|██████████| 313/313 [00:35<00:00,  8.84it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.59it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.84it/s]


Seed 0 - Development Set Accuracy: 96.10%
Seed 0 - Test Set Accuracy: 96.60%
Running experiment with seed 1


Training Epoch: 100%|██████████| 313/313 [00:36<00:00,  8.58it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.26it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.63it/s]


Seed 1 - Development Set Accuracy: 96.80%
Seed 1 - Test Set Accuracy: 96.80%
Running experiment with seed 2


Training Epoch: 100%|██████████| 313/313 [00:37<00:00,  8.33it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.29it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.58it/s]


Seed 2 - Development Set Accuracy: 95.20%
Seed 2 - Test Set Accuracy: 95.90%
Running experiment with seed 3


Training Epoch: 100%|██████████| 313/313 [00:37<00:00,  8.35it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.31it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.56it/s]


Seed 3 - Development Set Accuracy: 95.10%
Seed 3 - Test Set Accuracy: 95.20%
Running experiment with seed 4


Training Epoch: 100%|██████████| 313/313 [00:37<00:00,  8.35it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.23it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.40it/s]


Seed 4 - Development Set Accuracy: 97.60%
Seed 4 - Test Set Accuracy: 97.70%


Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.54it/s]


Mean Development Accuracy: 96.16% ± 0.95%
Best Test Accuracy (from best dev model): 97.70%





# 1.2 Classification with Mean-Pooling and Max-Pooling

In [None]:
def mean_pooling(token_embeddings, attention_mask):
    """Averages token embeddings over the positions where the mask is non-zero.

    Args:
        token_embeddings: tensor indexed as [batch, position, feature].
        attention_mask: tensor indexed as [batch, position]; positions holding 0
            contribute nothing to the sum.

    Returns:
        Tensor indexed as [batch, feature]: masked sum divided by the per-row
        mask total, which is clamped to at least 1e-9 to avoid dividing by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * weights).sum(dim=1)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1).clamp(min=1e-9)
    return masked_total / token_counts

def max_pooling(token_embeddings, attention_mask):
    """Takes the element-wise maximum of token embeddings over unmasked positions.

    Args:
        token_embeddings: tensor indexed as [batch, position, feature]. It is
            NOT modified; the previous implementation overwrote the masked
            positions of the caller's tensor in place.
        attention_mask: tensor indexed as [batch, position]; positions holding 0
            are excluded from the maximum.

    Returns:
        Tensor indexed as [batch, feature] with the per-feature maximum.
    """
    padding_positions = attention_mask.unsqueeze(-1).expand(token_embeddings.size()) == 0

    # Masked positions get a very low value so they never win the max. For
    # floating dtypes the value is limited to what the dtype can represent
    # (e.g. float16 cannot hold -1e9).
    fill_value = -1e9
    if token_embeddings.is_floating_point():
        fill_value = max(fill_value, torch.finfo(token_embeddings.dtype).min)

    # masked_fill returns a new tensor, leaving the input untouched.
    masked_embeddings = token_embeddings.masked_fill(padding_positions, fill_value)
    return torch.max(masked_embeddings, dim=1)[0]

# Evaluation function for both classifiers
def evaluate_pooling(model, dataloader, pooling_type='mean'):
    """Returns the accuracy (in percent) of `model` on pooled BERT token features.

    Args:
        model: classifier applied to the pooled sentence feature.
        dataloader: yields (labels, sentences) batches; `sentences` must contain
            an 'attention_mask' entry.
        pooling_type: 'mean' selects mean_pooling; any other value selects
            max_pooling.

    Returns:
        float: 100 * correct / total over every example in `dataloader`.

    Note:
        Token embeddings come from the module-level `bert_model`.
    """
    pool = mean_pooling if pooling_type == 'mean' else max_pooling

    model.eval()
    hits = 0
    seen = 0

    with torch.no_grad():
        for labels, sentences in tqdm.tqdm(dataloader, desc="Evaluating"):
            labels = labels.to(bert_model.device)

            hidden = bert_model(**sentences).last_hidden_state
            sentence_features = pool(hidden, sentences['attention_mask'])

            logits = model(sentence_features)
            predictions = logits.argmax(dim=1)

            hits += (predictions == labels).sum().item()
            seen += labels.size(0)

    return hits / seen * 100.0

# Training loop for mean-pooling classifier
def train_and_evaluate_pooling(pooling_type='mean'):
    """Trains and scores a pooled-feature classifier for seeds 0 through 4.

    For every seed a new Classifier is trained for one pass over the training
    loader on pooled BERT token embeddings (BERT runs under no_grad), then
    scored on the dev and test loaders. Afterwards the mean and standard
    deviation of the dev accuracies are printed, together with the test
    accuracy of the run that had the highest dev accuracy.

    Args:
        pooling_type: 'mean' selects mean_pooling; any other value selects
            max_pooling.

    Note:
        Reads the module-level `bert_model`, `datasets`, `dataloaders`,
        `classifier_hidden_size` and `loss_func`. Returns nothing.
    """
    pool = mean_pooling if pooling_type == 'mean' else max_pooling

    dev_scores = []
    test_scores = []
    top_state = None
    top_dev_score = 0.0

    for seed in range(5):
        print(f"\nRunning {pooling_type}-pooling experiment with seed {seed}")
        set_seed(seed)

        # New classifier per seed, created after seeding.
        model = Classifier(
            bert_model.config.hidden_size,
            classifier_hidden_size,
            datasets['train'].n_classes,
        ).to(bert_model.device)
        adam = torch.optim.Adam(model.parameters(), lr=5e-4)

        model.train()
        progress = tqdm.tqdm(dataloaders['train'], desc="Training Epoch")
        for labels, sentences in progress:
            labels = labels.to(bert_model.device)

            # Feature extraction only: no gradients flow into BERT.
            with torch.no_grad():
                hidden = bert_model(**sentences).last_hidden_state
                pooled = pool(hidden, sentences['attention_mask'])

            batch_loss = loss_func(model(pooled), labels)

            adam.zero_grad()
            batch_loss.backward()
            adam.step()

            progress.set_postfix({'loss': batch_loss.item()})

        dev_score = evaluate_pooling(model, dataloaders['dev'], pooling_type)
        test_score = evaluate_pooling(model, dataloaders['test'], pooling_type)
        dev_scores.append(dev_score)
        test_scores.append(test_score)

        print(f"Seed {seed} - Development Set Accuracy: {dev_score:.2f}%")
        print(f"Seed {seed} - Test Set Accuracy: {test_score:.2f}%")

        # Track the run with the highest dev accuracy.
        if dev_score > top_dev_score:
            top_dev_score = dev_score
            top_state = model.state_dict()

    dev_mean = np.mean(dev_scores)
    dev_std = np.std(dev_scores)

    # Re-score the best run on the test split, reusing the last model object.
    model.load_state_dict(top_state)
    top_test_score = evaluate_pooling(model, dataloaders['test'], pooling_type)

    print(f"\n{pooling_type.capitalize()}-Pooling Results:")
    print(f"Mean Development Accuracy: {dev_mean:.2f}% ± {dev_std:.2f}%")
    print(f"Best Test Accuracy (from best dev model): {top_test_score:.2f}%")

# Run experiments for both pooling methods.
# Each call runs seeds 0-4 and prints its own summary; nothing is returned.
print("Running Mean-Pooling Experiments...")
train_and_evaluate_pooling('mean')

print("\nRunning Max-Pooling Experiments...")
train_and_evaluate_pooling('max')

Running Mean-Pooling Experiments...

Running mean-pooling experiment with seed 0


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.18it/s, loss=0.342]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.37it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.59it/s]


Seed 0 - Development Set Accuracy: 96.70%
Seed 0 - Test Set Accuracy: 95.90%

Running mean-pooling experiment with seed 1


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.21it/s, loss=0.345]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.34it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.48it/s]


Seed 1 - Development Set Accuracy: 97.10%
Seed 1 - Test Set Accuracy: 96.40%

Running mean-pooling experiment with seed 2


Training Epoch: 100%|██████████| 313/313 [00:37<00:00,  8.24it/s, loss=0.681]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.21it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.47it/s]


Seed 2 - Development Set Accuracy: 96.80%
Seed 2 - Test Set Accuracy: 96.70%

Running mean-pooling experiment with seed 3


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.13it/s, loss=0.234]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.34it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.59it/s]


Seed 3 - Development Set Accuracy: 96.90%
Seed 3 - Test Set Accuracy: 96.40%

Running mean-pooling experiment with seed 4


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.19it/s, loss=0.479]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.32it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.57it/s]


Seed 4 - Development Set Accuracy: 97.30%
Seed 4 - Test Set Accuracy: 97.10%


Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.40it/s]



Mean-Pooling Results:
Mean Development Accuracy: 96.96% ± 0.22%
Best Test Accuracy (from best dev model): 97.10%

Running Max-Pooling Experiments...

Running max-pooling experiment with seed 0


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.22it/s, loss=1.32]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.23it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.54it/s]


Seed 0 - Development Set Accuracy: 68.10%
Seed 0 - Test Set Accuracy: 68.20%

Running max-pooling experiment with seed 1


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.18it/s, loss=1.83]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.38it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.56it/s]


Seed 1 - Development Set Accuracy: 59.80%
Seed 1 - Test Set Accuracy: 63.10%

Running max-pooling experiment with seed 2


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.23it/s, loss=1.67]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.30it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.44it/s]


Seed 2 - Development Set Accuracy: 71.70%
Seed 2 - Test Set Accuracy: 70.40%

Running max-pooling experiment with seed 3


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.14it/s, loss=1.38]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.12it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.52it/s]


Seed 3 - Development Set Accuracy: 59.90%
Seed 3 - Test Set Accuracy: 59.50%

Running max-pooling experiment with seed 4


Training Epoch: 100%|██████████| 313/313 [00:38<00:00,  8.21it/s, loss=1.42]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.31it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.57it/s]


Seed 4 - Development Set Accuracy: 66.40%
Seed 4 - Test Set Accuracy: 66.40%


Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.54it/s]


Max-Pooling Results:
Mean Development Accuracy: 65.18% ± 4.68%
Best Test Accuracy (from best dev model): 70.40%





# 1.4 Fine-tuning BERT with [CLS] Features

In [None]:
# hyperparameters
batch_size = 32
classifier_hidden_size = 32
num_epochs = 1  # declared here so this cell does not depend on an earlier cell's value

# Initialize classifier
classifier = Classifier(
    bert_model.config.hidden_size,
    classifier_hidden_size,
    datasets['train'].n_classes).to(bert_model.device)

# Unfreeze only encoder layers 10 and 11 of BERT; every other BERT parameter stays frozen.
params = []
for name, param in bert_model.named_parameters():
    # Select layers 10 and 11 from BERT
    if name.startswith('encoder.layer.10') or name.startswith('encoder.layer.11'):
        param.requires_grad = True
        params.append(param)
    else:
        param.requires_grad = False

# Combine BERT and classifier parameters for optimization
optimizer = torch.optim.Adam(params + list(classifier.parameters()), lr=5e-4)
loss_func = nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    classifier.train()
    # BERT is being trained here too, so it must be in train mode as well; this matches
    # the multi-seed fine-tuning loop further down, which calls bert_model.train().
    bert_model.train()
    pbar = tqdm.tqdm(dataloaders['train'], desc=f"Epoch {epoch+1}")

    for labels, sentences in pbar:
        labels = labels.to(bert_model.device)

        # Forward pass through BERT (now with trainable last two layers).
        # No torch.no_grad() here: gradients must reach layers 10 and 11.
        outputs = bert_model(**sentences)
        cls_features = outputs.last_hidden_state[:, 0, :]  # [CLS] token

        # Forward pass through classifier
        outputs = classifier(cls_features)

        # Calculate loss
        loss = loss_func(outputs, labels)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix({'loss': loss.item()})

def evaluate_finetuning(model, classifier, dataloader):
    """Returns the accuracy (in percent) of an encoder + classifier pair.

    Args:
        model: encoder whose output exposes `last_hidden_state`; the feature at
            position 0 of every sequence is fed to the classifier.
        classifier: module mapping that feature to class logits.
        dataloader: yields (labels, sentences) batches.

    Returns:
        float: 100 * correct / total over every example in `dataloader`.

    Note:
        Both modules are switched to eval mode and left that way.
    """
    model.eval()
    classifier.eval()

    hits = 0
    seen = 0
    with torch.no_grad():
        for labels, sentences in tqdm.tqdm(dataloader, desc="Evaluating"):
            labels = labels.to(model.device)

            first_token = model(**sentences).last_hidden_state[:, 0, :]
            logits = classifier(first_token)
            predictions = logits.argmax(dim=1)

            hits += (predictions == labels).sum().item()
            seen += labels.size(0)

    return hits / seen * 100.0

# Score the fine-tuned BERT + classifier pair on the dev and test splits.
# Note: evaluate_finetuning leaves both modules in eval mode.
dev_accuracy = evaluate_finetuning(bert_model, classifier, dataloaders['dev'])
test_accuracy = evaluate_finetuning(bert_model, classifier, dataloaders['test'])

print(f"\nDevelopment Set Accuracy: {dev_accuracy:.2f}%")
print(f"Test Set Accuracy: {test_accuracy:.2f}%")

def set_seed(seed):
    """Set random seeds for reproducibility (Python, NumPy and PyTorch).

    Args:
        seed: integer seed. It is reduced modulo 2**32 for NumPy, which only
            accepts seeds in that range.
    """
    # Function-scope imports keep this helper self-contained.
    import random
    import numpy as np

    random.seed(seed)
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed every visible GPU, not only the current one.
        torch.cuda.manual_seed_all(seed)

# Per-seed accuracies (percent) and the best run seen so far.
accuracies_dev = []
accuracies_test = []
best_dev_accuracy = 0.0

# Pristine pretrained weights, loaded once. Without restoring them, every seed would
# continue from the weights fine-tuned by the previous seed (and by the single run
# above), so the five runs would not be independent and the reported mean/std would
# describe cumulative training rather than seed variance.
pretrained_state = AutoModel.from_pretrained('bert-base-cased').state_dict()

loss_func = nn.CrossEntropyLoss()

# Run experiments with 5 different seeds
for seed in range(5):
    print(f"\nRunning experiment with seed {seed}")
    set_seed(seed)

    # Reset BERT to the pretrained checkpoint. load_state_dict copies values into the
    # existing parameters, so the model stays on its current device.
    bert_model.load_state_dict(pretrained_state)

    classifier = Classifier(
        bert_model.config.hidden_size,
        classifier_hidden_size,
        datasets['train'].n_classes).to(bert_model.device)

    # Collect parameters to be fine-tuned: encoder layers 10 and 11 only.
    params = []
    for name, param in bert_model.named_parameters():
        if name.startswith('encoder.layer.10') or name.startswith('encoder.layer.11'):
            param.requires_grad = True
            params.append(param)
        else:
            param.requires_grad = False

    # Combine BERT and classifier parameters for optimization. A new optimizer per
    # seed also discards Adam's moment estimates from the previous run.
    optimizer = torch.optim.Adam(params + list(classifier.parameters()), lr=5e-4)

    # Training loop
    classifier.train()
    bert_model.train()

    pbar = tqdm.tqdm(dataloaders['train'], desc=f"Training (seed {seed})")

    for labels, sentences in pbar:
        labels = labels.to(bert_model.device)

        # Forward pass through BERT (gradients enabled for the unfrozen layers)
        outputs = bert_model(**sentences)
        cls_features = outputs.last_hidden_state[:, 0, :]

        # Forward pass through classifier
        outputs = classifier(cls_features)

        # Calculate loss
        loss = loss_func(outputs, labels)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix({'loss': loss.item()})

    # Evaluation
    bert_model.eval()
    dev_accuracy = evaluate_finetuning(bert_model, classifier, dataloaders['dev'])
    test_accuracy = evaluate_finetuning(bert_model, classifier, dataloaders['test'])

    accuracies_dev.append(dev_accuracy)
    accuracies_test.append(test_accuracy)

    print(f"Seed {seed} - Development Set Accuracy: {dev_accuracy:.2f}%")
    print(f"Seed {seed} - Test Set Accuracy: {test_accuracy:.2f}%")

    # Remember the test accuracy of the run with the best dev accuracy
    # (only the scores are kept, not the weights).
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_test_accuracy = test_accuracy

# Calculate statistics
mean_dev_acc = np.mean(accuracies_dev)
std_dev_acc = np.std(accuracies_dev)

# Print final results
print("\nFine-tuning Results:")
print(f"Mean Development Accuracy: {mean_dev_acc:.2f}% ± {std_dev_acc:.2f}%")
print(f"Best Test Accuracy (from best dev model): {best_test_accuracy:.2f}%")

Epoch 1: 100%|██████████| 313/313 [00:49<00:00,  6.32it/s, loss=0.0183]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.31it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.46it/s]



Development Set Accuracy: 98.00%
Test Set Accuracy: 97.60%

Running experiment with seed 0


Training (seed 0): 100%|██████████| 313/313 [00:50<00:00,  6.16it/s, loss=0.0222]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.22it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.47it/s]


Seed 0 - Development Set Accuracy: 99.20%
Seed 0 - Test Set Accuracy: 99.30%

Running experiment with seed 1


Training (seed 1): 100%|██████████| 313/313 [00:50<00:00,  6.18it/s, loss=0.0122]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.20it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.58it/s]


Seed 1 - Development Set Accuracy: 99.10%
Seed 1 - Test Set Accuracy: 99.30%

Running experiment with seed 2


Training (seed 2): 100%|██████████| 313/313 [00:50<00:00,  6.19it/s, loss=0.0128]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.33it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.59it/s]


Seed 2 - Development Set Accuracy: 99.10%
Seed 2 - Test Set Accuracy: 98.80%

Running experiment with seed 3


Training (seed 3): 100%|██████████| 313/313 [00:50<00:00,  6.16it/s, loss=0.0046]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.38it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.60it/s]


Seed 3 - Development Set Accuracy: 99.50%
Seed 3 - Test Set Accuracy: 99.70%

Running experiment with seed 4


Training (seed 4): 100%|██████████| 313/313 [00:50<00:00,  6.17it/s, loss=0.00906]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.34it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.48it/s]

Seed 4 - Development Set Accuracy: 98.70%
Seed 4 - Test Set Accuracy: 99.20%

Fine-tuning Results:
Mean Development Accuracy: 99.12% ± 0.26%
Best Test Accuracy (from best dev model): 99.70%





# 1.5 GPT-2

In [None]:
from transformers import GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import numpy as np
import tqdm

# Initialize GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2Model.from_pretrained('gpt2')

# GPT-2 ships without a padding token; reuse EOS so that batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Move GPT-2 to GPU if available
if torch.cuda.is_available():
    gpt2_model = gpt2_model.cuda()

# Freeze GPT-2 parameters
for param in gpt2_model.parameters():
    param.requires_grad = False

# Build loaders that tokenize with the GPT-2 tokenizer. The existing `dataloaders`
# bound the BERT tokenizer when construct_datasets was called, so rebinding the
# `tokenizer` name above does not change what they produce: they would keep feeding
# BERT word-piece ids into GPT-2.
gpt2_datasets, gpt2_dataloaders = construct_datasets(
    prefix='dbpedia_',
    batch_size=batch_size,
    tokenizer=tokenizer,
    device=gpt2_model.device)


def _last_token_features(hidden_states, attention_mask):
    """Selects, per sequence, the hidden state of the last unmasked position.

    Args:
        hidden_states: tensor indexed as [batch, position, feature].
        attention_mask: tensor indexed as [batch, position]; 1 marks a real
            token, 0 marks padding.

    Returns:
        Tensor indexed as [batch, feature]. Plain `[:, -1, :]` indexing would
        return a padding position for every sequence shorter than the longest
        one in the batch when padding is appended on the right.
    """
    positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
    # Largest position index whose mask is 1 (works for left or right padding).
    last_index = (attention_mask * positions).argmax(dim=1)
    batch_index = torch.arange(hidden_states.size(0), device=hidden_states.device)
    return hidden_states[batch_index, last_index]


def evaluate_gpt2(model, classifier, dataloader):
    """Returns the accuracy (in percent) of `classifier` on frozen GPT-2 features.

    Args:
        model: GPT-2 encoder whose output exposes `last_hidden_state`.
        classifier: module mapping the last real token's feature to logits.
        dataloader: yields (labels, sentences) batches; `sentences` must contain
            an 'attention_mask' entry.

    Returns:
        float: 100 * correct / total over every example in `dataloader`.
    """
    model.eval()
    classifier.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for labels, sentences in tqdm.tqdm(dataloader, desc="Evaluating"):
            labels = labels.to(model.device)

            # GPT-2 is causal, so the last real token is the only position that
            # has attended to the whole sentence.
            outputs = model(**sentences)
            features = _last_token_features(
                outputs.last_hidden_state, sentences['attention_mask'])

            # Forward pass through classifier
            outputs = classifier(features)
            _, predicted_labels = torch.max(outputs, dim=1)

            total_correct += (predicted_labels == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples * 100.0
    return accuracy

def set_seed(seed):
    """Seeds PyTorch's CPU generator and, when available, the CUDA generator."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

# Lists to store accuracies
accuracies_dev = []
accuracies_test = []
best_dev_accuracy = 0.0
best_model_state = None

# Run experiments with 5 different seeds
for seed in range(5):
    print(f"\nRunning experiment with seed {seed}")
    set_seed(seed)

    # Initialize classifier (a new object per seed, created after seeding)
    classifier = Classifier(
        gpt2_model.config.hidden_size,  # GPT-2 hidden size
        classifier_hidden_size,
        gpt2_datasets['train'].n_classes
    ).to(gpt2_model.device)

    # Optimizer: only the classifier is trained, GPT-2 stays frozen.
    optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)
    loss_func = nn.CrossEntropyLoss()

    # Training loop
    classifier.train()
    pbar = tqdm.tqdm(gpt2_dataloaders['train'], desc=f"Training (seed {seed})")

    for labels, sentences in pbar:
        labels = labels.to(gpt2_model.device)

        # Get GPT-2 features from the last real (non-padding) token
        with torch.no_grad():
            outputs = gpt2_model(**sentences)
            features = _last_token_features(
                outputs.last_hidden_state, sentences['attention_mask'])

        # Forward pass through classifier
        outputs = classifier(features)

        # Calculate loss
        loss = loss_func(outputs, labels)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix({'loss': loss.item()})

    # Evaluation
    dev_accuracy = evaluate_gpt2(gpt2_model, classifier, gpt2_dataloaders['dev'])
    test_accuracy = evaluate_gpt2(gpt2_model, classifier, gpt2_dataloaders['test'])

    accuracies_dev.append(dev_accuracy)
    accuracies_test.append(test_accuracy)

    print(f"Seed {seed} - Development Set Accuracy: {dev_accuracy:.2f}%")

    # Save best model
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_model_state = classifier.state_dict()

# Calculate statistics
mean_dev_acc = np.mean(accuracies_dev)
std_dev_acc = np.std(accuracies_dev)

# Load best model and get its test accuracy
classifier.load_state_dict(best_model_state)
best_test_accuracy = evaluate_gpt2(gpt2_model, classifier, gpt2_dataloaders['test'])

# Print final results
print("\nGPT-2 Feature Extraction Results:")
print(f"Mean Development Accuracy: {mean_dev_acc:.2f}% ± {std_dev_acc:.2f}%")
print(f"Best Test Accuracy (from best dev model): {best_test_accuracy:.2f}%")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]


Running experiment with seed 0


Training (seed 0): 100%|██████████| 313/313 [00:41<00:00,  7.59it/s, loss=2.43]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.66it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.82it/s]


Seed 0 - Development Set Accuracy: 23.10%

Running experiment with seed 1


Training (seed 1): 100%|██████████| 313/313 [00:40<00:00,  7.66it/s, loss=2.57]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.64it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.04it/s]


Seed 1 - Development Set Accuracy: 23.90%

Running experiment with seed 2


Training (seed 2): 100%|██████████| 313/313 [00:40<00:00,  7.65it/s, loss=2.5]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.69it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.82it/s]


Seed 2 - Development Set Accuracy: 18.10%

Running experiment with seed 3


Training (seed 3): 100%|██████████| 313/313 [00:41<00:00,  7.58it/s, loss=2.33]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.64it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  9.02it/s]


Seed 3 - Development Set Accuracy: 20.90%

Running experiment with seed 4


Training (seed 4): 100%|██████████| 313/313 [00:41<00:00,  7.61it/s, loss=2.35]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.69it/s]
Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.92it/s]


Seed 4 - Development Set Accuracy: 29.80%


Evaluating: 100%|██████████| 32/32 [00:03<00:00,  8.95it/s]


GPT-2 Feature Extraction Results:
Mean Development Accuracy: 23.16% ± 3.88%
Best Test Accuracy (from best dev model): 28.50%



