In [1]:
# from datasets import load_dataset

# #dataset = load_dataset("yelp_review_full")
# dataset = load_dataset("amazon_reviews_multi", "en", "train")
# dataset["train"][100]

#---
# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True)

#tokenized_datasets = dataset.map(tokenize_function, batched=True)

#---
#tokenized_datasets.set_format("torch")

#small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10))

#tokenized_dataset = tokenized_dataset.remove_columns(["review_body"])

# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

#---
#eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)


In [2]:
import datasets
from datasets import load_dataset

def fetch_dataset(
    dataset_name: str="amazon_reviews_multi",
    configuration: str="en",
    split: str="train"
) -> datasets.arrow_dataset.Dataset:
    '''
    Load one split of a dataset through the HuggingFace ``datasets`` library.

    Args:
        dataset_name: Dataset identifier handed to ``load_dataset``.
        configuration: Configuration name, passed as the second positional
            argument to ``load_dataset``.
        split: Name of the split to request.

    Returns:
        The object ``load_dataset`` produces for the requested split.
    '''
    return load_dataset(dataset_name, configuration, split=split)



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import typing
from transformers import AutoTokenizer

def tokenize_dataset(
    tokenizer: AutoTokenizer, 
    dataset: datasets.arrow_dataset.Dataset,
    sample=True,
    sample_size: int = 10,
    seed: int = 42
) -> datasets.arrow_dataset.Dataset:
    '''
    Tokenize the HuggingFace dataset object and format it for use in later PyTorch logic.

    Args:
        tokenizer: Callable tokenizer; invoked on batches of the "review_body" column
            with ``padding="max_length"`` and ``truncation=True``.
        dataset: Dataset containing at least the "review_body" and "stars" columns.
        sample: When truthy, return a small shuffled subset instead of the full
            dataset (useful for quick tests).
        sample_size: Number of rows to keep when ``sample`` is truthy. Clamped to
            the dataset length so small datasets do not raise.
        seed: Seed used for the shuffle that precedes the down-sampling.

    Returns:
        The tokenized dataset (or a down-sampled subset of it) with "stars" renamed
        to "labels" and the output format set to torch tensors.
    '''
    tokenized_dataset = dataset.map(
        lambda batch: tokenizer(batch["review_body"], padding="max_length", truncation=True),
        batched=True
    )
    # The loops in this notebook call ``model(**batch)``, so the target has to
    # arrive under the keyword "labels".
    tokenized_dataset = tokenized_dataset.rename_column("stars", "labels")

    # Only expose the columns that actually exist: some tokenizers do not emit
    # "token_type_ids", and asking set_format for a missing column would fail.
    wanted_columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
    present_columns = [
        name for name in wanted_columns if name in tokenized_dataset.column_names
    ]
    tokenized_dataset.set_format(type="torch", columns=present_columns)

    if not sample:
        return tokenized_dataset

    # Down-sample to speed things up for testing.
    n_rows = min(sample_size, len(tokenized_dataset))
    return tokenized_dataset.shuffle(seed=seed).select(range(n_rows))

In [4]:
from torch.utils.data import DataLoader

def create_dataloader(
    tokenized_dataset: datasets.arrow_dataset.Dataset,
    batch_size: int = 16,
    shuffle: bool = True
):
    '''
    Wrap a tokenized dataset in a PyTorch ``DataLoader``.

    Args:
        tokenized_dataset: Dataset to iterate over.
        batch_size: Number of rows per batch.
        shuffle: Forwarded to ``DataLoader`` as its ``shuffle`` argument.

    Returns:
        The constructed ``DataLoader``.
    '''
    return DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=shuffle)

from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler

def configure_scheduler_optimizer(
    model: typing.Any,
    dataloader: typing.Any,
    learning_rate: float,
    num_training_steps: int
) -> tuple[typing.Any, typing.Any]:
    '''
    Build an AdamW optimizer over the model's parameters together with a
    linear learning-rate schedule (zero warm-up steps) attached to it.

    Args:
        model: Object exposing ``parameters()``.
        dataloader: Accepted by the signature but not read inside this function.
        learning_rate: Learning rate handed to AdamW.
        num_training_steps: Total number of steps handed to the scheduler.

    Returns:
        ``(lr_scheduler, optimizer)`` - scheduler first, optimizer second.
    '''
    adamw = AdamW(model.parameters(), lr=learning_rate)
    schedule_options = dict(
        name="linear",
        optimizer=adamw,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    return get_scheduler(**schedule_options), adamw

import torch
from tqdm.auto import tqdm

def transfer_learn(
    model: typing.Any, 
    dataloader: typing.Any,
    learning_rate: float = 5e-5,
    num_epochs: int = 5,
    progress_bar: bool = True
)-> typing.Any:
    '''
    Fine-tune ``model`` on the batches yielded by ``dataloader``.

    The model is moved to CUDA when available (CPU otherwise), put in training
    mode, and updated in place with AdamW and a linear learning-rate schedule.

    Args:
        model: Model whose forward pass accepts a batch as keyword arguments and
            returns an object with a ``loss`` attribute.
        dataloader: Sized iterable of dict batches whose values have ``.to(device)``.
        learning_rate: Learning rate for the optimizer.
        num_epochs: Number of full passes over ``dataloader``.
        progress_bar: Whether to display a tqdm progress bar.

    Returns:
        The same ``model`` object that was passed in, after training.
    '''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    num_training_steps = num_epochs * len(dataloader)

    lr_scheduler, optimizer = configure_scheduler_optimizer(
        model=model,
        dataloader=dataloader,
        learning_rate=learning_rate,
        num_training_steps=num_training_steps
    )

    # Keep the bar in its own variable: rebinding the bool parameter to a tqdm
    # object made later truth-tests depend on tqdm's own __bool__ behaviour.
    bar = tqdm(total=num_training_steps) if progress_bar else None

    model.train()
    try:
        for _ in range(num_epochs):
            for batch in dataloader:
                batch = {key: value.to(device) for key, value in batch.items()}
                loss = model(**batch).loss
                loss.backward()

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                if bar is not None:
                    bar.update(1)
    finally:
        # Close the bar even if a step raises, so the notebook output is left tidy.
        if bar is not None:
            bar.close()
    return model


In [5]:
import pprint

# Load the default split (fetch_dataset defaults: "amazon_reviews_multi", "en", "train")
# and pretty-print a single record to see which fields are available.
dataset = fetch_dataset()
pprint.pprint(dataset[100])

Found cached dataset amazon_reviews_multi (/Users/apmcm/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


{'language': 'en',
 'product_category': 'sports',
 'product_id': 'product_en_0610451',
 'review_body': 'Two nights in the water tide to our dock in the lake..... I’d '
                'say something liked this.',
 'review_id': 'en_0143676',
 'review_title': 'Let the picture tell you how go this is',
 'reviewer_id': 'reviewer_en_0377453',
 'stars': 1}


In [6]:

# One checkpoint name drives both the tokenizer and the model so the two cannot drift apart.
MODEL_CHECKPOINT = "bert-base-cased"
# The "stars" values are used directly as class indices (tokenize_dataset only renames
# the column, and the record printed earlier in this notebook has stars == 1, so they
# are not zero-based). Assuming ratings run 1-5, six output classes are needed for
# index 5 to be valid; index 0 is then never a target. TODO: confirm the rating range.
NUM_LABELS = 6

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
tokenized_dataset = tokenize_dataset(tokenizer=tokenizer, dataset=dataset, sample=True)
dataloader = create_dataloader(tokenized_dataset=tokenized_dataset)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)
# transfer_learn trains in place and returns the object it was given, so
# `transfer_learned_model` and `model` refer to the same model afterwards.
transfer_learned_model = transfer_learn(
    model=model,
    dataloader=dataloader
)

Loading cached processed dataset at /Users/apmcm/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-e8cc4a26d4442ff6.arrow
Loading cached shuffled indices for dataset at /Users/apmcm/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-7bed94b504cfb269.arrow
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a

In [7]:
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
metric = evaluate.load("accuracy")

# transfer_learn already moves the model to this device; repeating the call here
# keeps the cell from depending on that side effect.
model.to(device)
model.eval()

eval_dataset = fetch_dataset(split="test")
tokenized_eval_dataset = tokenize_dataset(tokenizer=tokenizer, dataset=eval_dataset, sample=True)
# Batch order has no effect on accuracy, so evaluation does not need shuffling.
eval_dataloader = create_dataloader(
    tokenized_dataset=tokenized_eval_dataset,
    shuffle=False
)

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for scoring.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class = index of the largest logit along the last dimension.
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Found cached dataset amazon_reviews_multi (/Users/apmcm/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)
                                                                              

{'accuracy': 0.3}

In [8]:
def test_func(**kwargs):
    return kwargs['test']