## Fine-Tuning BERT for Grammatical Classification

We will use a pretrained BERT model from HuggingFace and the Corpus of Linguistic Acceptability (CoLA) to train a model that classifies a sentence's grammatical correctness. As a reminder, CoLA is a collection of "10657 sentences from 23 linguistics publications, expertly annotated for acceptability (grammaticality) by their original authors", with the canonical problem being to build a binary classifier.

We begin by defining our constants and loading our training data.

In [1]:
import io
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm, trange

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (
    TensorDataset, DataLoader, RandomSampler, SequentialSampler
)
# from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import (
    AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
)


ModuleNotFoundError: No module named 'torch'

In [None]:
# Define constants.
BATCH_SIZE = 32
EPOCHS = 1

# Pick the compute device: CUDA first, then Apple's MPS backend, then CPU.
# `torch.backends.mps` is looked up defensively because it is missing on
# older PyTorch builds, where a direct attribute access would raise.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Directory holding the raw CoLA .tsv files. Set the COLA_DATA_DIR environment
# variable to use a different location without editing the notebook.
DATA_DIR = os.environ.get('COLA_DATA_DIR', '/Users/justinsima/dir/datasets/CoLA/raw')

# Max number of tokens for each sequence. 512 used by paper.
MAX_LEN = 128

# Number of batches to use before proceeding to next step (training, validation, testing).
# This is for debugging purposes; set to 'None' for full run.
MAX_BATCHES = 5


In [None]:
# Load training data.
# The file is read as tab-separated with no header row, so the column names
# are supplied explicitly.
data_train = pd.read_csv(
    os.path.join(DATA_DIR, 'in_domain_train.tsv'),  # DATA_DIR is the constant defined above
    names=['source', 'label', 'notes', 'sentence'],
    delimiter='\t',
    header=None
)

data_train.head()

### Preprocessing

We next add our classification (CLS) and separator (SEP) tokens to our sentences, and tokenize each sample. We'll then convert each token to its corresponding id in BERT's vocabulary, zero-pad each sequence, and make our masks. Finally, we'll separate our dataset into training and validation sets, and wrap each in a torch DataLoader.


In [None]:
# Pull out the labels and wrap every sentence in BERT's [CLS] ... [SEP] markers.
labels = data_train['label'].values
sentences = [
    ' '.join(('[CLS]', sentence, '[SEP]'))
    for sentence in data_train['sentence'].values
]

# Split each marked-up sentence into tokens with the lower-casing
# 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_sentences = list(map(tokenizer.tokenize, sentences))


In [None]:
# Convert tokens to index in BERT vocabulary.
unpadded_sentence_ids = [tokenizer.convert_tokens_to_ids(s) for s in tokenized_sentences]

# Truncate anything longer than MAX_LEN; otherwise the pad width below would be
# negative and np.pad would raise. The trailing token (the [SEP] marker added
# when the sentences were prepared) is kept as the last position.
unpadded_sentence_ids = [
    ids if len(ids) <= MAX_LEN else ids[:MAX_LEN - 1] + ids[-1:]
    for ids in unpadded_sentence_ids
]

# Right-pad every sequence with zeros up to MAX_LEN.
sentence_ids = np.array([
    np.pad(
        np.array(ids),
        pad_width=(0, MAX_LEN - len(ids)),
        mode='constant',
        constant_values=0,
    )
    for ids in unpadded_sentence_ids
])

# Attention masks: 1 over the real tokens, 0 over the padding.
attention_masks = np.array([
    np.concatenate([np.ones(len(ids)), np.zeros(MAX_LEN - len(ids))])
    for ids in unpadded_sentence_ids
])


In [None]:
# Separate training and validation sets (90% / 10%).
# train_test_split returns a (train, validation) pair for each input array, in
# the order the arrays are passed: ids, labels, masks.
# A fixed random_state makes the split reproducible between runs, and
# stratifying on the labels keeps the label proportions the same in both parts.
train_features, val_features, train_labels, val_labels, train_masks, val_masks = train_test_split(
    sentence_ids,
    labels,
    attention_masks,
    test_size=0.1,
    random_state=42,
    stratify=labels
)


In [None]:
# Turn each NumPy split into an int64 torch tensor, one train/validation pair
# at a time.
train_features, val_features = (
    torch.from_numpy(split).long() for split in (train_features, val_features)
)
train_labels, val_labels = (
    torch.from_numpy(split).long() for split in (train_labels, val_labels)
)
train_masks, val_masks = (
    torch.from_numpy(split).long() for split in (train_masks, val_masks)
)


In [None]:
# Create data loaders.
# Each dataset item is an (ids, mask, label) triple.

# Training batches are drawn in random order.
dataset_train = TensorDataset(train_features, train_masks, train_labels)
train_sampler = RandomSampler(dataset_train)
train_loader = DataLoader(dataset=dataset_train, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation is read in a fixed order so every evaluation pass (including a
# debug run cut short by a batch limit) sees the same examples.
dataset_val = TensorDataset(val_features, val_masks, val_labels)
val_sampler = SequentialSampler(dataset_val)
val_loader = DataLoader(dataset=dataset_val, sampler=val_sampler, batch_size=BATCH_SIZE)


### Training Our Model
Now we're ready to fine-tune a BERT model. What follows is a fairly standard training loop using HuggingFace's AdamW optimizer. The loss on both the training and validation sets is shown below.

In [None]:
# Load pretrained 'bert-base-uncased' with a two-label classification head,
# wrap it in nn.DataParallel, and move it to the selected device.
base_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
model = nn.DataParallel(module=base_model)
model.to(DEVICE)


In [None]:
# Prepare parameters for training.
# Parameters are split into two optimizer groups: names containing 'bias' or
# 'LayerNorm.weight' get no weight decay, everything else gets 0.1.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

# The per-group option must be spelled 'weight_decay'. An unrecognized key such
# as 'weight_decay_rate' is not read by AdamW, so the optimizer's default
# decay would apply to every parameter instead.
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.1
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]

In [None]:
# Build the optimizer and its learning-rate scheduler.
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)

# The schedule spans every batch of every epoch, with no warmup steps.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
# Calculate accuracy.
def accuracy_score(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    `preds` holds one row of class scores per example; the predicted class is
    the argmax along axis 1. `labels` is flattened before comparison, so a
    column vector of labels works as well as a flat one.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected).sum()
    return hits / expected.size


In [None]:
# Training loop.
# For every epoch: one pass over train_loader with optimizer and scheduler
# updates, then one evaluation pass over val_loader. Per-batch losses are
# appended to loss_train_list / loss_val_list for plotting afterwards.
loss_train_list = []
loss_val_list = []

for epoch in trange(EPOCHS, leave=True, desc='Epoch:'):
    model.train()

    # Initialize epoch tracking variables.
    time_start = datetime.datetime.now()
    loss_train, accuracy_train = 0.0, 0.0
    nb_tr_examples, nb_tr_steps = 0, 0
    val_loss, val_accuracy = 0.0, 0.0
    n_val_steps = 0

    # tqdm wraps the loader itself (not enumerate) so the bar knows the total.
    for step, batch in enumerate(tqdm(train_loader, leave=True, desc='Batches:')):
        # Store tensors and move to device.
        batch_sequences, batch_masks, batch_labels = (t.to(DEVICE) for t in batch)

        optimizer.zero_grad()

        # Feed model and calculate loss / accuracy.
        outputs = model(batch_sequences, token_type_ids=None, attention_mask=batch_masks, labels=batch_labels)
        # nn.DataParallel can hand back one loss per replica; mean() reduces
        # that to a scalar and leaves an already-scalar loss unchanged.
        loss = outputs['loss'].mean()
        batch_train_loss = loss.item()
        loss_train_list.append(batch_train_loss)
        logits = outputs['logits'].detach().cpu().numpy()
        np_labels = batch_labels.to('cpu').numpy()
        batch_train_accuracy = accuracy_score(logits, np_labels)
        accuracy_train += batch_train_accuracy

        # Backwards step.
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Update train tracking statistics.
        loss_train += batch_train_loss
        nb_tr_examples += batch_sequences.size(0)
        nb_tr_steps += 1

        # TODO: Remove. Only for debugging.
        if MAX_BATCHES and nb_tr_steps == MAX_BATCHES:
            break

    time_elapsed = datetime.datetime.now() - time_start

    # Evaluate each epoch.
    model.eval()

    with torch.no_grad():
        for batch in val_loader:
            batch_sequences, batch_masks, batch_labels = (t.to(DEVICE) for t in batch)

            output = model(batch_sequences, token_type_ids=None, attention_mask=batch_masks, labels=batch_labels)
            logits = output['logits'].detach().cpu().numpy()
            np_labels = batch_labels.to('cpu').numpy()

            batch_val_accuracy = accuracy_score(logits, np_labels)
            batch_val_loss = output['loss'].mean()
            loss_val_list.append(batch_val_loss.item())
            val_loss += batch_val_loss.item()
            val_accuracy += batch_val_accuracy
            n_val_steps += 1

            # TODO: Remove. Only for debugging.
            if MAX_BATCHES and n_val_steps == MAX_BATCHES:
                break

    # Average over the batches actually processed. Dividing by len(loader)
    # under-reports every figure when MAX_BATCHES stops a pass early. max(..., 1)
    # guards against an empty loader.
    train_steps = max(nb_tr_steps, 1)
    val_steps = max(n_val_steps, 1)

    print(f'Epoch: {epoch}, \
        Average Time per Batch: {time_elapsed / train_steps}, \
        Training Loss: {loss_train / train_steps} \
        Training Accuracy: {accuracy_train / train_steps} \
        Validation Loss: {val_loss / val_steps} \
        Validation Accuracy: {val_accuracy / val_steps}')


In [None]:
# Plot the per-batch training loss recorded during the training loop.
fig_train, ax_train = plt.subplots(figsize=(15, 8))
ax_train.set_title("Training loss")
ax_train.set_xlabel("Batch")
ax_train.set_ylabel("Loss")
ax_train.plot(loss_train_list)
plt.show()


In [None]:
# Plot the per-batch validation loss recorded during the evaluation passes.
fig_val, ax_val = plt.subplots(figsize=(15, 8))
ax_val.set_title("Validation loss")
ax_val.set_xlabel("Batch")
ax_val.set_ylabel("Loss")
ax_val.plot(loss_val_list)
plt.show()


### Evaluation
We can now prepare our testing data and make predictions. We'll measure our performance on the test set using the Matthews correlation coefficient (MCC), which is the standard metric for this problem.


In [None]:
# Prepare test data for model.
# Same preprocessing as the training data: add [CLS]/[SEP], tokenize, map to
# vocabulary ids, pad to MAX_LEN, and build attention masks.
data_test = pd.read_csv(
    os.path.join(DATA_DIR, 'out_of_domain_dev.tsv'),
    names=['source', 'label', 'notes', 'sentence'],
    delimiter='\t',
    header=None
)

sentences_test = ["[CLS] " + sentence + " [SEP]" for sentence in data_test['sentence'].values]
labels_test = data_test['label'].values
tokenized_sentences_test = [tokenizer.tokenize(sentence) for sentence in sentences_test]

unpadded_sentence_ids_test = [tokenizer.convert_tokens_to_ids(s) for s in tokenized_sentences_test]

# Truncate anything longer than MAX_LEN; otherwise the pad width below would be
# negative and np.pad would raise. The trailing [SEP] id is kept as the last
# position.
unpadded_sentence_ids_test = [
    ids if len(ids) <= MAX_LEN else ids[:MAX_LEN - 1] + ids[-1:]
    for ids in unpadded_sentence_ids_test
]

# Right-pad every sequence with zeros up to MAX_LEN.
sentence_ids_test = np.array([
    np.pad(
        np.array(ids),
        pad_width=(0, MAX_LEN - len(ids)),
        mode='constant',
        constant_values=0,
    )
    for ids in unpadded_sentence_ids_test
])

# Attention masks: 1 over the real tokens, 0 over the padding.
attention_masks_test = np.array([
    np.concatenate([np.ones(len(ids)), np.zeros(MAX_LEN - len(ids))])
    for ids in unpadded_sentence_ids_test
])

test_sequences = torch.tensor(sentence_ids_test)
test_masks = torch.tensor(attention_masks_test)
test_labels = torch.tensor(labels_test)

# Sequential sampling keeps predictions in the same order as the input file.
dataset_test = TensorDataset(test_sequences, test_masks, test_labels)
test_sampler = SequentialSampler(dataset_test)
test_loader = DataLoader(dataset_test, sampler=test_sampler, batch_size=BATCH_SIZE)


In [None]:
# Run the model over the test loader in eval mode, collecting each batch's
# logits and labels as NumPy arrays.
model.eval()

preds, true_state = [], []

for test_step, batch in tqdm(enumerate(test_loader)):
    # Cast every tensor in the batch to int64 and move it to the device.
    batch_sequences, batch_masks, batch_labels = (
        tensor.long().to(DEVICE) for tensor in batch
    )

    # TODO: Remove. Only for debugging.
    if MAX_BATCHES and test_step == MAX_BATCHES:
        break

    # No labels are passed here, so only the logits are read from the output.
    with torch.no_grad():
        output = model(batch_sequences, token_type_ids=None, attention_mask=batch_masks)

    logits = output['logits'].detach().cpu().numpy()
    np_labels = batch_labels.to('cpu').numpy()
    preds.append(logits)
    true_state.append(np_labels)


In [None]:
# Score the whole test run at once: flatten the per-batch logits and labels
# into per-example lists, take the argmax class for each example, and compute
# the correlation coefficient over all examples together.
flattened_predictions = [row for batch_logits in preds for row in batch_logits]
flat_true_labels = [label for batch_truth in true_state for label in batch_truth]
flat_predictions = np.argmax(flattened_predictions, axis=1).flatten()

mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print(f"Test Matthew's Correlation Coefficient: {mcc}")