In [None]:
from transformers import pipeline

In [None]:
# Build a masked-language-model pipeline backed by the uncased BERT base checkpoint.
model = pipeline(
    task='fill-mask',
    model='bert-base-uncased',
)

In [None]:
# Run the fill-mask pipeline on a sentence with one [MASK] placeholder;
# the call is the last expression so its result is displayed as the cell output.
masked_sentence = "Hello world! What a [MASK] day it is!"
model(masked_sentence)

# Extracting features for downstream tasks

In [None]:
import numpy as np
from transformers import BertTokenizer, BertModel, BertConfig
from datasets import load_dataset_builder, load_dataset
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
# (To force CPU execution, e.g. while debugging, assign device = "cpu" instead.)
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Load the pre-trained uncased BERT tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# eval() and to() both return the module, so the calls are chained:
# switch to evaluation mode first, then move the weights to the chosen device.
model = BertModel.from_pretrained("bert-base-uncased").eval().to(device)

In [None]:
# Inspect the dataset's metadata (description and feature schema)
# through its builder before loading any split.
ds_builder = load_dataset_builder("rotten_tomatoes")
ds_info = ds_builder.info
print(ds_info.description)
print(ds_info.features)

In [None]:
# Load the three splits of the dataset, in the order train, validation, test.
ds_train, ds_validation, ds_test = (
    load_dataset("rotten_tomatoes", split=split_name)
    for split_name in ("train", "validation", "test")
)

# Display a summary of the training split as the cell output.
ds_train

In [None]:
# Quick look at how long the training texts are, in characters and in words.
# The texts are pulled out once so the dataset rows are only decoded a single time.
train_texts = [dct['text'] for dct in ds_train]

nr_chars = [len(text) for text in train_texts]
# str.split() without an argument splits on runs of whitespace and drops empty
# strings, so doubled, leading or trailing spaces are not counted as extra
# "words" (split(' ') would count each of them as an empty word).
nr_words = [len(text.split()) for text in train_texts]

# Minimum, the nine deciles in between, and the maximum.
quantile_levels = np.linspace(0, 1, 11)
print("Number of character quantiles", np.quantile(nr_chars, quantile_levels))
print("Number of words quantiles", np.quantile(nr_words, quantile_levels))

In [None]:
# Embed every training text with the pre-trained BERT encoder and keep its
# `pooler_output` as a fixed feature vector for the downstream classifier.
#
# NOTE: `model` must still be the BertModel loaded earlier; the name is rebound
# to the SimpleNN classifier further down, so re-running this cell after that
# point requires reloading the encoder first.
#
# Texts are encoded in mini-batches instead of one forward pass per example:
#   * padding=True pads each batch to its longest text; the attention mask
#     produced by the tokenizer is forwarded to the model through **tokens.
#   * truncation=True caps every input at the tokenizer's maximum length so an
#     unusually long text cannot exceed what the encoder accepts.
feature_batch_size = 64  # texts per forward pass; lower this if memory is tight

x_train = []
with torch.no_grad():  # inference only, no autograd graph is needed
    for start in tqdm(range(0, ds_train.num_rows, feature_batch_size)):
        # Slicing the dataset yields a dict of columns; take the batch of texts.
        batch_texts = ds_train[start:start + feature_batch_size]['text']
        tokens = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors='pt',
        ).to(device)
        output = model(**tokens)
        x_train.append(output.pooler_output)
x_train = torch.cat(x_train)

# Exactly one feature row per training example, in dataset order.
assert x_train.shape[0] == ds_train.num_rows

In [None]:
# Training labels as a float column vector of shape (N, 1) on the compute
# device, matching the single-output classifier and the BCELoss used below.
label_values = [row['label'] for row in ds_train]
y_train = torch.tensor(label_values).float().reshape(-1, 1).to(device)

In [None]:
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
class SimpleNN(nn.Module):
    """Two-layer fully connected network with a sigmoid output.

    The forward pass is Linear -> ReLU -> Linear -> Sigmoid, so every output
    element lies in [0, 1].

    Args:
        input_size: Number of features in each input row.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of output units.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in the order they are applied.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to sigmoid activations.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with last dimension ``output_size`` and values in [0, 1].
        """
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Hyper-parameters for the classifier and its training.
input_size = x_train.size(1)  # width of one feature row in x_train
hidden_size = 128             # units in the hidden layer
output_size = 1               # a single sigmoid output per example
batch_size = 32               # mini-batch size used by the training DataLoader

# Build the classifier and place it on the selected device.
# Note: this rebinds `model`, which until now referred to the BERT encoder.
model = SimpleNN(input_size, hidden_size, output_size).to(device)

In [None]:
# Pair every feature row with its label and serve the pairs in shuffled
# mini-batches of `batch_size` examples.
dataset = TensorDataset(x_train, y_train)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Baseline: score the freshly initialised (still untrained) classifier on the
# training batches so the numbers after training have a reference point.
#
# The loss is created here because the notebook only defines the training
# `criterion` in a later cell; without this line a Restart & Run All stops
# in this cell with a NameError.
criterion = nn.BCELoss()

preds, labels = [], []
with torch.no_grad():  # evaluation only, no gradients needed
    for inputs, targets in dataloader:
        # Forward pass
        outputs = model(inputs)
        preds.append(outputs)
        labels.append(targets)
preds = torch.cat(preds)
labels = torch.cat(labels)

# baseline loss and accuracy (a prediction above 0.5 counts as label 1)
print("Loss:", criterion(preds, labels))
print("Accuracy", ((preds > 0.5) == labels).float().mean())

In [None]:
# Binary cross-entropy, applied to the sigmoid outputs of the classifier.
criterion = nn.BCELoss()

# Adam over all classifier parameters, starting at a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Halve the learning rate after every 25 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, gamma=0.5, step_size=25)

In [None]:
# Fit the classifier with mini-batch gradient descent, using the `criterion`,
# `optimizer` and `scheduler` defined earlier in the notebook.
num_epochs = 100
for epoch in range(num_epochs):
    batch_losses = []
    for inputs, targets in dataloader:
        # Clear gradients left over from the previous update, then run the
        # forward pass and measure the loss on this batch.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Back-propagate and apply one optimizer update.
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # The learning-rate schedule advances once per epoch, not once per batch.
    scheduler.step()

    # Mean of the per-batch losses; reported on every fifth epoch (1, 6, 11, ...).
    avg_loss = np.mean(batch_losses)
    if epoch % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], lr {scheduler.get_last_lr()[0]}, Loss: {avg_loss:.4f}')

In [None]:
# Put the trained classifier into evaluation mode before scoring it.
# SimpleNN is built only from Linear, ReLU and Sigmoid layers, so this is not
# expected to change its outputs; it is kept as the usual pre-evaluation step.
model = model.eval()

In [None]:
# Score the trained classifier on the same training batches it was fitted on.
# (ds_validation and ds_test are loaded earlier but are not evaluated here,
# so these numbers describe the fit to the training data only.)
batch_preds, batch_labels = [], []
with torch.no_grad():
    for inputs, targets in dataloader:
        # Forward pass
        outputs = model(inputs)
        batch_preds.append(outputs)
        batch_labels.append(targets)
preds = torch.cat(batch_preds)
labels = torch.cat(batch_labels)

# loss and accuracy after training (a prediction above 0.5 counts as label 1)
print("Loss:", criterion(preds, labels))
print("Accuracy", ((preds > 0.5) == labels).float().mean())