# Extracting features for downstream tasks

We will use a pretrained BERT model (https://huggingface.co/bert-base-uncased) to extract features from Rotten Tomatoes movie reviews. Subsequently, we'll train a simple 2-layer neural network on these features to classify whether a review is positive or negative.

In [None]:
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from datasets import load_dataset_builder, load_dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch import optim
from torch.utils.data import DataLoader, TensorDataset

# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Tokenizer that belongs to the pretrained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the matching encoder, put it in evaluation mode (it is only used for
# feature extraction here) and move it to `device`, i.e. the GPU if possible
bert_model = BertModel.from_pretrained("bert-base-uncased").eval().to(device)

In [None]:
# Load the dataset: print its description and feature schema first,
# then fetch the three splits used in the rest of the notebook
dataset_name = 'rotten_tomatoes'
ds_builder = load_dataset_builder(dataset_name)
ds_info = ds_builder.info
print(ds_info.description)
print(ds_info.features)

ds_train, ds_validation, ds_test = [
    load_dataset(dataset_name, split=split_name)
    for split_name in ("train", "validation", "test")
]

# Last expression of the cell: shows a summary of the training split
ds_train

In [None]:
# Quick look at how long the training reviews are, in characters and in
# space-separated words
train_texts = [row['text'] for row in ds_train]
nr_chars = list(map(len, train_texts))
nr_words = [len(text.split(' ')) for text in train_texts]

# Quantile levels 0.0, 0.1, ..., 1.0
deciles = np.linspace(0, 1, 11)
print("Number of character quantiles", np.quantile(nr_chars, deciles))
print("Number of words quantiles", np.quantile(nr_words, deciles))

In [None]:
def text_to_features(ds):
    """Encode every review in ``ds`` into a BERT feature vector.

    Each row's ``'text'`` is tokenized and passed through the module-level
    ``bert_model`` with gradient tracking disabled; the model's
    ``pooler_output`` is kept as the feature vector of that row.

    Args:
        ds: Dataset split that exposes ``num_rows`` and iterates over dict
            rows holding a ``'text'`` string and a ``'label'`` value.

    Returns:
        Tuple ``(features, labels)``. ``features`` is the concatenation of
        the pooled outputs along the first dimension (one row per review),
        ``labels`` is a float tensor of shape ``(num_rows, 1)``. Both are
        placed on ``device``.
    """
    features_list = []
    labels_list = []
    # Single pass over the split: collect the feature and the label of a row
    # together so both lists are guaranteed to stay aligned.
    for row in tqdm(ds, total=ds.num_rows):
        # truncation=True cuts over-long reviews down to the tokenizer's
        # maximum model input length instead of letting the forward pass fail.
        tokens = tokenizer(
            row['text'], return_tensors='pt', truncation=True
        ).to(device)
        with torch.no_grad():
            output = bert_model(**tokens)
        features_list.append(output.pooler_output)
        labels_list.append(row['label'])

    if features_list:
        features = torch.cat(features_list)
    else:
        # torch.cat cannot concatenate an empty list; return a 0-row matrix
        # with the encoder's hidden size so downstream shape logic still works.
        features = torch.empty(
            (0, bert_model.config.hidden_size), device=device
        )

    # Explicit row count (rather than -1) keeps the reshape valid for an
    # empty split as well. Labels go to `device` because the features live there.
    labels = torch.tensor(labels_list) \
        .reshape((len(labels_list), 1)) \
        .float() \
        .to(device)
    return features, labels

In [None]:
# Turn the reviews of each split into (features, labels) tensor pairs,
# processing the splits in the order train -> validation -> test
(x_train, y_train), (x_validation, y_validation), (x_test, y_test) = [
    text_to_features(split) for split in (ds_train, ds_validation, ds_test)
]

In [None]:
# A small feed-forward classifier: Linear -> ReLU -> Linear -> Sigmoid
class SimpleNN(nn.Module):
    """Two dense layers with a ReLU in between and a sigmoid on the output."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Hidden layer followed by its activation
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        # Output layer; the sigmoid squashes its result into (0, 1)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply fc1, ReLU, fc2 and the sigmoid to ``x``, in that order."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Hyperparameters of the classifier and of the training run
input_size = x_train.size(1)  # width of the extracted feature vectors
hidden_size = 128  # units in the hidden layer
output_size = 1  # a single output node suffices for binary classification
batch_size = 32  # samples per training batch

# Build the classifier and place it on the same device as the features
model = SimpleNN(input_size, hidden_size, output_size).to(device)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam;
# the learning rate is halved after every 25 scheduler steps
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.5)

In [None]:
# Create dataLoaders
def create_dataloader(x, y, bs, shuffle=True):
    """Wrap feature and label tensors in a batched ``DataLoader``.

    Args:
        x: Feature tensor; its first dimension indexes the samples.
        y: Label tensor with the same first dimension as ``x``.
        bs: Batch size.
        shuffle: Whether the samples are reshuffled on every pass. Defaults
            to ``True``, which is what training wants; pass ``False`` for
            evaluation splits.

    Returns:
        A ``DataLoader`` over ``TensorDataset(x, y)``.
    """
    dataset = TensorDataset(x, y)
    return DataLoader(dataset, batch_size=bs, shuffle=shuffle)

# Only the training split is shuffled. Validation and test are iterated in a
# fixed order so their batches, and therefore any batch-averaged metrics,
# are identical from one evaluation to the next.
dl_train = create_dataloader(x_train, y_train, batch_size)
dl_validation = create_dataloader(x_validation, y_validation, batch_size, shuffle=False)
dl_test = create_dataloader(x_test, y_test, batch_size, shuffle=False)

print("Number of training steps per epoch:", len(dl_train))

In [None]:
def eval_model(dl):
    """Evaluate the module-level ``model`` on one data split.

    The loss is averaged per sample: each batch loss is weighted by the
    number of samples in that batch, so a smaller final batch does not count
    as much as a full one. This assumes ``criterion`` returns the mean over
    the batch, as ``nn.BCELoss()`` does with its default reduction.

    Args:
        dl: Iterable of ``(inputs, targets)`` batches, e.g. a ``DataLoader``.
            The tensors must already be on the same device as ``model``.

    Returns:
        Tuple ``(avg_loss, accuracy)`` of Python floats, where a prediction
        counts as correct when thresholding it at 0.5 matches the target.

    Raises:
        ZeroDivisionError: If ``dl`` yields no samples.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0
    with torch.no_grad():
        for inputs, targets in dl:
            preds = model(inputs)
            batch_n = targets.shape[0]
            # Undo the per-batch mean so batches of different size are
            # combined with the right weight.
            loss_sum += criterion(preds, targets).item() * batch_n
            n_correct += ((preds > 0.5) == targets).sum().item()
            # Count samples as they are seen instead of reaching into the
            # dataset's internals; this works for any dataset type.
            n_samples += batch_n
    avg_loss = loss_sum / n_samples
    accuracy = n_correct / n_samples
    return avg_loss, accuracy

In [None]:
# Baseline: check the performance of the untrained model on the training split.
# eval_model returns (avg_loss, accuracy); as the last expression of the cell
# the tuple is shown as the cell output when run in a notebook.
eval_model(dl_train)

In [None]:
# Train the simple model, reporting validation metrics every 5th epoch and
# once more after the final epoch
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for inputs, targets in dl_train:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Read the learning rate before stepping the scheduler so the logged
    # value is the one this epoch was actually trained with.
    epoch_lr = scheduler.get_last_lr()[0]
    scheduler.step()

    # Mean of the per-batch training losses of this epoch
    avg_loss = np.mean(batch_losses)
    # `epoch % 5 == 0` alone stops reporting at epoch 96 of 100; also report
    # the last epoch so the final state of the model is visible.
    is_last_epoch = epoch == num_epochs - 1
    if epoch % 5 == 0 or is_last_epoch:
        val_loss, val_accuracy = eval_model(dl_validation)
        print(f'Epoch [{epoch+1}/{num_epochs}], lr {epoch_lr}, Train Loss: {avg_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

In [None]:
# Final check: evaluate the model on the test split.
# eval_model returns (avg_loss, accuracy); as the last expression of the cell
# the tuple is shown as the cell output when run in a notebook.
eval_model(dl_test)