In [None]:
!pip uninstall transformers
!pip cache purge

Found existing installation: transformers 4.39.3
Uninstalling transformers-4.39.3:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.39.3.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? 

In [1]:
!pip install datasets transformers accelerate evaluate



In [2]:
import datasets
import torch

import evaluate

from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from transformers import AutoTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [3]:
torch.manual_seed(123)

<torch._C.Generator at 0x7927f45d0150>

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
tokenizer = AutoTokenizer.from_pretrained("muhtasham/bert-tiny-mlm-finetuned-imdb-finetuned-emotion")
model = AutoModelForSequenceClassification.from_pretrained("muhtasham/bert-tiny-mlm-finetuned-imdb-finetuned-emotion", num_labels=4, ignore_mismatched_sizes=True)

tokenizer_config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/978 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at muhtasham/bert-tiny-mlm-finetuned-imdb-finetuned-emotion and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([6, 128]) in the checkpoint and torch.Size([4, 128]) in the model instantiated
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model = model.to(device)

In [9]:
dataset = datasets.load_dataset('imdb')

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256, return_tensors='pt').to(device)

In [11]:
train = dataset['train'].map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [12]:
val = dataset['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [13]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=1,
    per_device_train_batch_size=64
    )

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=391, training_loss=0.799497326316736, metrics={'train_runtime': 24.6375, 'train_samples_per_second': 1014.715, 'train_steps_per_second': 15.87, 'total_flos': 15890995200000.0, 'train_loss': 0.799497326316736, 'epoch': 1.0})

In [16]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, device='cuda'):
        self.encodings = encodings
        self.labels = labels
        self.device = device

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(self.device)
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
labels = val['label']
clear_val = val.map(remove_columns=['text', 'label'])
clear_val.set_format('pt')

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [18]:
clear_val

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [19]:
val_loader = DataLoader(clear_val, batch_size=64)

In [20]:
predict = []
for batch in val_loader:
  output = model(input_ids=batch['input_ids'].to(device),
                 token_type_ids=batch['token_type_ids'].to(device),
                 attention_mask=batch['attention_mask'].to(device))
  predict.extend(torch.argmax(output.logits, dim=1))

In [21]:
acc = evaluate.load('accuracy')

In [22]:
print(acc.compute(predictions=predict, references=labels))

{'accuracy': 0.78876}
