In [11]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
import os
import numpy as np
import torch
from sklearn.model_selection import train_test_split

def read_imdb_split(split_dir):
    split_texts = []
    split_labels = []
    for label_dir in ['pos', 'neg']:
        for filename in os.listdir(os.path.join(split_dir, label_dir)):
            if filename.endswith('.txt'):
                with open(os.path.join(split_dir, label_dir, filename), 'r', encoding='utf-8') as f:
                    split_texts.append(f.read())
                split_labels.append(1 if label_dir == 'pos' else 0)
    return split_texts, np.array(split_labels)

train_texts, train_labels = read_imdb_split('aclImdb/train')
val_texts, val_labels = read_imdb_split('aclImdb/test')

train_texts, train_labels = train_texts[:100], train_labels[:100] # use only first 100 examples

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


In [13]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = np.array(labels, dtype=np.int64)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.int64)
        return item

    def __len__(self):
        return len(self.labels)

In [14]:
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=True)

In [17]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        num_correct = 0
        num_total = 0
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            preds = outputs.logits.argmax(dim=-1)
            num_correct += (preds == labels).sum().item()
            num_total += labels.size(0)
        acc = num_correct / num_total
        print(f"Epoch {epoch}: val acc = {acc:.3f}")


Epoch 0: val acc = 1.000
Epoch 1: val acc = 1.000
Epoch 2: val acc = 1.000
Epoch 3: val acc = 1.000
Epoch 4: val acc = 1.000
Epoch 5: val acc = 1.000
Epoch 6: val acc = 1.000
Epoch 7: val acc = 1.000
Epoch 8: val acc = 1.000
Epoch 9: val acc = 1.000


In [18]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        # Call the model without the `offsets` argument   
        predicted_label = model(text)

        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            # Call the model without the `offsets` argument
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count


In [19]:
import matplotlib.pyplot as plt
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
import time

# Hyperparameters
EPOCHS = 10 # epoch
LR = [0.01, 1, 2]  # learning rate
BATCH_SIZE = 43 # batch size for training

criterion = torch.nn.CrossEntropyLoss()

accuracy_lr = []



# Collect the accuracy values at each epoch

for lr in LR:
  optimizer = torch.optim.SGD(model.parameters(), lr=lr)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
  total_accu = None

  train_accuracies = []
  val_accuracies = []

  for epoch in range(1, EPOCHS + 1):
      # Train the model and collect the training accuracy
      epoch_start_time = time.time()
      train(train_loader)
      train_acc = evaluate(train_loader)
      train_accuracies.append(train_acc)
      
      # Evaluate the model and collect the validation accuracy
      val_acc = evaluate(val_loader)
      val_accuracies.append(val_acc)
      
      # Adjust the learning rate if the validation accuracy did not improve
      if total_accu is not None and total_accu > val_acc:
        scheduler.step()
      else:
        total_accu = val_acc
      
      print('-' * 59)
      print('| end of epoch {:3d} | time: {:5.2f}s | '
            'train accuracy {:8.3f} | val accuracy {:8.3f}'.format(epoch,
                                                                  time.time() - epoch_start_time,
                                                                  train_acc, val_acc))
      print('-' * 59)
 
  accuracy_lr.append(val_acc)

print(accuracy_lr)

# Plot the training and validation accuracies
plt.plot(train_accuracies, label='Training accuracy')
plt.plot(val_accuracies, label='Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

ValueError: too many values to unpack (expected 2)