In [1]:
from datasets import load_dataset

# Load the IMDB review dataset by its Hugging Face Hub identifier.
raw_data = load_dataset(path="stanfordnlp/imdb")
# Bare trailing expression: lets the notebook render the DatasetDict summary.
raw_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [2]:
import re 

# Matches the HTML line-break variants "<br>", "<br/>" and "<br />".
re_br = re.compile(r'<br\s*?/?>')


def _br_to_newline(batch):
    """Return the batch's ``text`` column with every ``re_br`` match replaced by a newline.

    Written for ``map(..., batched=True)``: ``batch["text"]`` is a list of
    review strings and the returned dict overwrites that column.
    """
    cleaned = []
    for review in batch['text']:
        cleaned.append(re_br.sub("\n", review))
    return {"text": cleaned}


# Apply the cleanup to every split, then spot-check one cleaned test review.
raw_data = raw_data.map(_br_to_newline, batched=True)
print(raw_data['test'][1]['text'])

Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before.

The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. All passably acted but again nothing special.

I thought the main villains were pretty well done and fairly well acted. By the end of the movie you certainly knew who the good guys were and weren't. There was an emotional l

In [3]:
import torch 

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
# from transformers import TrainingArguments


# --- Checkpoint locations ---------------------------------------------------
model_checkpoint = "bert-base-uncased"
output_dir = "C:/Users/Steven/IMDB/bert-finetuned-imdb-sentiment-accelerate-epoch_3"
config = f"{output_dir}/config"

# --- Label vocabulary -------------------------------------------------------
# label2id is the exact inverse of id2label, so derive it instead of repeating it.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# --- Tokenizer, collator and model ------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the weights come from `model_checkpoint` (the base checkpoint);
# `output_dir` and `config` are defined above but not passed to the load. The
# recorded output of this cell reports classifier.weight / classifier.bias as
# newly initialized -- confirm that evaluating this head is what is intended.
classifier_kwargs = dict(
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    **classifier_kwargs,
)
# training_args = TrainingArguments(
#     per_device_train_batch_size=8,
#     fp16=True,
#     learning_rate=3e-5,
#     output_dir="bert-finetuned-imdb-sentiment-accelerate",
# )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_and_split(examples):
    """Tokenize a batch of reviews, splitting over-long ones into several rows.

    Written for ``Dataset.map(..., batched=True)``. Because
    ``return_overflowing_tokens=True`` is set, a review that does not fit in
    ``max_length`` tokens produces more than one output row, so the returned
    batch may contain more rows than ``examples`` did. Each output row gets the
    label of the review it was cut from.

    Args:
        examples: Batch mapping that provides a ``"text"`` column (list of
            review strings) and a ``"label"`` column (one label per review).

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` removed and
        a ``"labels"`` entry added, holding one label per output row.

    Raises:
        KeyError: If ``examples`` has no ``"text"`` or ``"label"`` column.
    """
    # NOTE(review): padding='max_length' already pads every row to 512 tokens
    # here, so any later per-batch padding has nothing left to do -- confirm
    # this is intended, since it makes every batch full-length.
    result = tokenizer(
        examples["text"],
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_overflowing_tokens=True,
    )

    # sample_map[j] is the index (within this batch) of the review that output
    # row j was produced from.
    sample_map = result.pop("overflow_to_sample_mapping")

    # Read the label column by name. Looping over every column and overwriting
    # "labels" each time only yields the right values when "label" happens to
    # be the last column of the batch.
    source_labels = examples["label"]
    result["labels"] = [source_labels[i] for i in sample_map]
    return result


In [5]:
from torch.utils.data import DataLoader

# Only the test split is evaluated in this notebook. Shuffle it with a fixed
# seed so the example order is the same on every run.
test_data = raw_data['test'].shuffle(seed=42)

# tokenize_and_split can return more rows than it receives (long reviews are
# split), so the original columns are dropped rather than kept alongside.
tokenized_test = test_data.map(
    tokenize_and_split,
    batched=True,
    remove_columns=test_data.column_names
)

# Pin host memory only when a CUDA device is actually present. The evaluation
# cell falls back to CPU when CUDA is unavailable, and unconditionally asking
# for pinning against 'cuda:0' would not match that fallback.
use_cuda = torch.cuda.is_available()
test_dataloader = DataLoader(
    tokenized_test,
    batch_size=8,
    collate_fn=data_collator,
    pin_memory=use_cuda,
    pin_memory_device='cuda:0' if use_cuda else '',
)

In [6]:
import torch
import evaluate

from tqdm.auto import tqdm

# Combined classification metrics, plus a running record of rounded results.
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
Overall_eval = {'accuracy':[], 'f1':[], 'precision':[], 'recall':[]}

# Use the first GPU when available, otherwise run on CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

model.eval()
# Iterating through tqdm keeps the bar in step with the loop, and the context
# manager closes it even if the run is interrupted part-way. Gradients are not
# needed for evaluation, so they are disabled for the whole loop.
with tqdm(test_dataloader, total=len(test_dataloader)) as progress_bar, torch.no_grad():
    for batch in progress_bar:
        batch = batch.to(device)
        outputs = model(**batch)

        # Predicted class = index of the largest logit for each row.
        predictions = outputs.logits.argmax(dim=-1)

        # Hand the metric CPU tensors explicitly instead of relying on it to
        # move device tensors itself.
        metric.add_batch(
            predictions=predictions.cpu(),
            references=batch["labels"].cpu(),
        )

eval_metric = metric.compute()
print(eval_metric)
# Keep a rounded copy of each score in the running record.
for key, value in eval_metric.items():
    Overall_eval[key].append(round(value, 3))
print(Overall_eval)

  0%|          | 0/3565 [00:00<?, ?it/s]

device: cuda:0


  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 