In [1]:
%%capture
! pip install datasets transformers


In [2]:
from tqdm.notebook import tqdm
from IPython import display

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn

from datasets import load_dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, DataCollatorForSeq2Seq

In [3]:
#####################################
###### DO NOT CHANGE THIS CELL ######
#####################################

BASE_MODEL_NAME = 't5-small'

BATCH_SIZE = 32
LEARNING_RATE = 1e-5
EPOCHS = 10

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [4]:
DEVICE

device(type='cuda', index=0)

In [5]:
# Download IMDB and drop the 'unsupervised' split, leaving only train and test.
dataset = load_dataset('imdb')
del dataset['unsupervised']
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [6]:
def id2label(ids):
    """Translate integer class ids into label words.

    Args:
        ids: iterable of integer class ids (0 for 'negative', 1 for 'positive').

    Returns:
        list[str]: the label word for each id, in input order.
    """
    names = ['negative', 'positive']
    words = []
    for class_id in ids:
        words.append(names[class_id])
    return words

def label2id(labels):
    """Translate label words into integer class ids.

    Args:
        labels: iterable of label strings.

    Returns:
        list[int]: 0 for 'negative', 1 for 'positive', and 2 for any other
        string (e.g. free-form text produced by the model).
    """
    known_ids = {'negative': 0, 'positive': 1}
    unknown_id = 2
    return list(map(lambda word: known_ids.get(word, unknown_id), labels))

In [7]:
# Fast T5 tokenizer for the checkpoint named by BASE_MODEL_NAME; shared by the
# preprocessing, collation and decoding code below.
tokenizer = T5TokenizerFast.from_pretrained(BASE_MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
def preprocess_input(text):
    """Normalise one raw review before tokenization.

    The text is lower-cased first, then every '<br />' tag is replaced by a
    single space (so upper-case '<BR />' tags are caught as well).

    Args:
        text: raw review string.

    Returns:
        str: the cleaned review.
    """
    return text.lower().replace('<br />', ' ')

def map_function(row):
    """Tokenize a batch of reviews together with their label words.

    Intended for `dataset.map(..., batched=True)`.

    Args:
        row: batch dict with a 'text' list of raw reviews and a 'label' list
            of integer class ids.

    Returns:
        dict: the tokenizer output for the cleaned reviews (truncated to 256
        tokens) plus a 'labels' entry holding the token ids of the label words.
    """
    cleaned_reviews = list(map(preprocess_input, row['text']))

    encoded_reviews = tokenizer(cleaned_reviews, truncation=True, max_length=256)
    # Targets are the label words themselves, tokenized as text.
    encoded_targets = tokenizer(id2label(row['label']))

    return {**encoded_reviews, 'labels': encoded_targets.input_ids}


# Tokenize every split in batches with map_function.
# NOTE(review): this rebinds `dataset`, so the raw version is no longer reachable
# under this name after the cell has run.
dataset = dataset.map(map_function, batched=True)
# Expose only the columns the model consumes, as torch tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [12]:
# Pretrained T5 checkpoint (BASE_MODEL_NAME) used both for the zero-shot baseline
# and as the starting point for fine-tuning.
model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
# Collator: pads each batch to its own longest sequence and returns torch tensors.
col_fn = DataCollatorForSeq2Seq(
    tokenizer,
    padding='longest',
    return_tensors='pt',
)

# Training loader: shuffled.
train_loader = torch.utils.data.DataLoader(
    dataset['train'], shuffle=True, collate_fn=col_fn, batch_size=BATCH_SIZE
)

# Evaluation loader: no shuffling requested, otherwise the same settings.
test_loader = torch.utils.data.DataLoader(
    dataset['test'], collate_fn=col_fn, batch_size=BATCH_SIZE
)

In [15]:
def train_loop(model, loader, optimizer):
    """Run one training pass over `loader`.

    Args:
        model: model whose forward pass returns an object with a `.loss`.
        loader: iterable of batches; each batch must support `.to(device)` and
            `**` unpacking into the model call.
        optimizer: optimizer bound to the model's parameters.

    Returns:
        dict: {'train_loss': mean of the per-batch loss values}.
    """
    model.train()
    losses = []

    for batch in tqdm(loader, desc='Training:'):
        optimizer.zero_grad()

        batch = batch.to(model.device)
        loss = model(**batch).loss

        # Read the scalar before backward, exactly as a plain Python float.
        loss_value = loss.item()
        loss.backward()
        optimizer.step()

        losses.append(loss_value)

    return {'train_loss': np.mean(losses)}

def _predict(model, row):
    return model.generate(
        input_ids=row.input_ids,
        attention_mask=row.attention_mask,
        max_length=5
    )

def tokenizer_ids_to_label(all_input_ids):
    """Decode a batch of token-id sequences into strings, dropping special tokens.

    Args:
        all_input_ids: batch of token-id sequences.

    Returns:
        The decoded strings, one per input sequence.
    """
    decoded_texts = tokenizer.batch_decode(
        all_input_ids,
        skip_special_tokens=True,
    )
    return decoded_texts

def valid_loop(model, loader, compute_metrics):
    """Generate predictions for every batch in `loader` and score them.

    Both the reference labels and the generated sequences are decoded to text
    and mapped to class ids with `label2id`, so any generated text that is not
    exactly 'negative' or 'positive' is counted as class 2 (always wrong).

    Args:
        model: seq2seq model used for generation (switched to eval mode here).
        loader: iterable of batches with `input_ids`, `attention_mask` and
            `labels`; each batch must support `.to(device)`.
        compute_metrics: callable accepting `y_true` and `y_pred` keyword
            arguments.

    Returns:
        dict: {'valid_acc': compute_metrics(y_true=..., y_pred=...)}.
    """
    model.eval()

    # DataCollatorForSeq2Seq pads `labels` with `label_pad_token_id` (-100 by
    # default). That value is not a vocabulary id, so it has to be swapped for
    # the real pad token before decoding; `skip_special_tokens=True` then drops it.
    ignore_id = getattr(
        getattr(loader, 'collate_fn', None), 'label_pad_token_id', -100
    )

    all_true = []
    all_pred = []

    with torch.no_grad():
        for row in tqdm(loader, desc='Validating:'):
            # Rebind instead of relying on `.to()` mutating the batch in place.
            row = row.to(model.device)
            pred = _predict(model, row)

            labels = row.labels.detach().cpu()
            labels = labels.masked_fill(labels == ignore_id, tokenizer.pad_token_id)

            all_true += labels.tolist()
            all_pred += pred.detach().cpu().tolist()

    all_true = label2id(tokenizer_ids_to_label(all_true))
    all_pred = label2id(tokenizer_ids_to_label(all_pred))

    return {'valid_acc': compute_metrics(y_true=all_true, y_pred=all_pred)}

In [16]:
# AdamW over all model parameters, using the learning rate from the config cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Metric callable handed to valid_loop (called with y_true= / y_pred= keywords).
compute_metrics = accuracy_score

In [17]:
# Baseline: score the pretrained checkpoint on the test set before any training.
model.to(DEVICE)
model.eval()

zero_shot_results = valid_loop(model, test_loader, accuracy_score)
print("Zero-shot accuracy:", zero_shot_results['valid_acc'])


Validating::   0%|          | 0/782 [00:00<?, ?it/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Zero-shot accuracy: 0.0


In [35]:
# Test-set indices of the reviews to inspect. Assigned in this cell so the name
# exists when the notebook is run top to bottom (previously this cell raised
# NameError on a fresh kernel because nothing above it defined `smp_idx`).
smp_idx = [1, 20000, 15000, 10000, 4000]

samples = [dataset['test'][i] for i in smp_idx]
for s in samples:
    # Add a leading batch dimension and move the tensors to the model's device.
    s['input_ids'] = s['input_ids'].unsqueeze(0).to(DEVICE)
    s['attention_mask'] = s['attention_mask'].unsqueeze(0).to(DEVICE)

# Show what free-form generation produces for each sampled review.
for i, s in enumerate(samples):
    output_ids = model.generate(
        input_ids=s['input_ids'],
        attention_mask=s['attention_mask'],
        max_length=5
    )
    prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    true_label = tokenizer.decode(s['labels'], skip_special_tokens=True)
    review_text = tokenizer.decode(s['input_ids'][0], skip_special_tokens=True)

    print(f"Sample {i+1}:")
    print("Review :", review_text)
    print("True   :", true_label)
    print("Pred   :", prediction)
    print("---")


Sample 1:
Review : worth the entertainment value of a rental, especially if you like action movies. this one features the usual car chases, fights with the great van damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. all of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before. the plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the mexican in a hollywood movie from the 1940s. all passably acted but again nothing special. i thought the main villains were pretty well done and fairly well acted. by the end of the movie you certainly knew who the good guys were and weren't. there w

In [23]:
# First token id of each label word, tokenized without special tokens.
positive_id, negative_id = (
    tokenizer(word, add_special_tokens=False).input_ids[0]
    for word in ("positive", "negative")
)


In [28]:
def zero_shot_predict(model, dataloader):
    """Zero-shot accuracy from the first decoder step's label-token probabilities.

    For each example a single decoder step is run from the decoder start token,
    and the example is classified 'positive' when the probability of the first
    'positive' token exceeds that of the first 'negative' token, otherwise
    'negative'. Unlike free-form generation, every prediction is therefore one
    of the two classes.

    Args:
        model: seq2seq model; it is put in eval mode and moved to DEVICE.
        dataloader: iterable of dict-like batches with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        Accuracy of the predictions against the decoded reference labels.
    """
    # Redundant with the notebook's import cell; kept so the function body is
    # self-contained.
    from sklearn.metrics import accuracy_score

    model.eval()
    model.to(DEVICE)

    positive_id = tokenizer("positive", add_special_tokens=False).input_ids[0]
    negative_id = tokenizer("negative", add_special_tokens=False).input_ids[0]

    # DataCollatorForSeq2Seq pads `labels` with `label_pad_token_id` (-100 by
    # default), which cannot be decoded; it is replaced by the pad token below.
    ignore_id = getattr(
        getattr(dataloader, 'collate_fn', None), 'label_pad_token_id', -100
    )

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Zero-shot predicting"):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            batch_size = batch['input_ids'].shape[0]
            # Build the one-token decoder prompt directly on the target device.
            decoder_start = torch.full(
                (batch_size, 1),
                model.config.decoder_start_token_id,
                dtype=torch.long,
                device=DEVICE,
            )

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                decoder_input_ids=decoder_start
            )

            logits_first_token = outputs.logits[:, 0, :]
            probs = torch.nn.functional.softmax(logits_first_token, dim=-1)

            # One vectorized comparison and one host transfer per batch instead
            # of a device sync for every row.
            is_positive = (probs[:, positive_id] > probs[:, negative_id]).tolist()
            all_preds.extend(
                "positive" if flag else "negative" for flag in is_positive
            )

            labels = batch['labels']
            labels = labels.masked_fill(labels == ignore_id, tokenizer.pad_token_id)
            all_labels.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

    all_preds_ids = label2id(all_preds)
    all_true_ids = label2id(all_labels)

    return accuracy_score(all_true_ids, all_preds_ids)


In [29]:
# Zero-shot accuracy using the first-step label-token comparison
# (zero_shot_predict) rather than free-form generation.
zero_shot_acc = zero_shot_predict(model, test_loader)
print("Zero-shot accuracy on test set:", zero_shot_acc)


Zero-shot predicting:   0%|          | 0/782 [00:00<?, ?it/s]

Zero-shot accuracy on test set: 0.75552


In [34]:
import torch.nn.functional as F

# Pull the sampled test reviews, add a batch dimension and move them to DEVICE.
samples = [dataset['test'][i] for i in smp_idx]
for s in samples:
    for key in ('input_ids', 'attention_mask'):
        s[key] = s[key].unsqueeze(0).to(DEVICE)

for i, s in enumerate(samples):
    # Single decoder step starting from the decoder start token.
    batch_size = s['input_ids'].shape[0]
    decoder_start = torch.full(
        (batch_size, 1),
        model.config.decoder_start_token_id,
        dtype=torch.long,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model(
            input_ids=s['input_ids'],
            attention_mask=s['attention_mask'],
            decoder_input_ids=decoder_start,
        )

    # Distribution over the vocabulary for the first generated position.
    logits_first_token = outputs.logits[:, 0, :]
    probs = F.softmax(logits_first_token, dim=-1)

    # Whichever label token is more probable wins; ties go to "negative".
    if probs[0, positive_id] > probs[0, negative_id]:
        prediction = "positive"
    else:
        prediction = "negative"
    true_label = tokenizer.decode(s['labels'], skip_special_tokens=True)
    review_text = tokenizer.decode(s['input_ids'][0], skip_special_tokens=True)

    print(f"Sample {i+1}:")
    print("Review :", review_text)
    print("True   :", true_label)
    print("Pred   :", prediction)
    print("---")


Sample 1:
Review : worth the entertainment value of a rental, especially if you like action movies. this one features the usual car chases, fights with the great van damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. all of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before. the plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the mexican in a hollywood movie from the 1940s. all passably acted but again nothing special. i thought the main villains were pretty well done and fairly well acted. by the end of the movie you certainly knew who the good guys were and weren't. there w

In [33]:
# Row indices into dataset['test'] for the reviews shown by the sample-inspection cells.
# NOTE(review): this cell sits below the cells that read `smp_idx`; consider
# moving it above them so the notebook runs top to bottom.
smp_idx = [1, 20000, 15000, 10000, 4000]