In [1]:
import torch

# Prefer Apple's Metal (MPS) backend, but fall back to the CPU so the notebook
# still runs on machines or PyTorch builds where MPS is not available.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

In [2]:
from datasets import load_dataset

# Load only the first 1,000 examples of the TREC training split.
trec = load_dataset('trec', split='train[:1000]')

Using custom data configuration default
Reusing dataset trec (/Users/xinli/.cache/huggingface/datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9)


In [3]:
from transformers import AutoTokenizer

# Tokenizer that matches the 'bert-base-uncased' checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Encode every question text; each sequence is truncated and padded so that
# all of them come out exactly 512 tokens long.
tokens = tokenizer(
    trec['text'],
    padding='max_length',
    truncation=True,
    max_length=512,
)

In [4]:
import numpy as np

# One-hot encode the coarse labels: one row per example, one column per
# class id in 0..max(label), with a 1 in the column of the example's class.
coarse_ids = trec['label-coarse']
one_hot_shape = (len(trec), max(coarse_ids) + 1)

labels = np.zeros(one_hot_shape)
labels[np.arange(one_hot_shape[0]), coarse_ids] = 1

# Preview the first five rows.
labels[:5]

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [5]:
# Convert the NumPy one-hot matrix into a torch tensor (rebinds `labels`).
labels = torch.Tensor(labels)

In [6]:
class TrecDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized examples with their label rows.

    Args:
        tokens: Indexable container whose items expose ``ids`` and
            ``attention_mask`` attributes (one item per example).
        labels: Indexable container of per-example labels, typically a
            2-D tensor with one row per example.
    """

    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` as a dict.

        The dict has the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``; every value is a freshly allocated tensor.
        """
        encoding = self.tokens[idx]
        label = self.labels[idx]

        if isinstance(label, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on
            # every item; clone().detach() yields the same independent copy.
            label = label.clone().detach()
        else:
            label = torch.tensor(label)

        return {
            'input_ids': torch.tensor(encoding.ids),
            'attention_mask': torch.tensor(encoding.attention_mask),
            'labels': label
        }

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)

# Pair the tokenized questions with their one-hot label rows.
dataset = TrecDataset(tokens, labels)

In [7]:
# Serve the dataset in mini-batches of 64 examples.
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=64)

In [8]:
from transformers import BertForSequenceClassification, BertConfig

# Start from the 'bert-base-uncased' configuration and size the classification
# head to the number of coarse labels present in `trec`.
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = max(trec['label-coarse'])+1

# Load the pretrained encoder weights. Constructing the model directly from
# `config` only builds the architecture with freshly initialised weights, so
# freezing `model.bert` and training just the head would sit on top of a
# random encoder. The classification head is still newly initialised.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', config=config
).to(device)

In [9]:
'''
Fine-tuning the entire BERT model on a first-gen M1 Mac is not going to work,
but we can still fine-tune the classification head.
Freezing all BERT layer parameters.
Leaving fine-tuning to just the final few classification layers.
'''

'\nFine-tuning the entire BERT model on a first-gen M1 Mac is not going to work,\nbut we can still fine-tune the classification head.\nFreezing all BERT layer parameters.\nLeaving fine-tuning to just the final few classification layers.\n'

In [10]:
# Freeze the BERT encoder: turn off gradient tracking for every parameter
# under `model.bert`, leaving the remaining parameters of the model trainable.
for param in model.bert.parameters():
    param.requires_grad = False

In [11]:
# Put the model into training mode before the optimisation loop.
model.train()

# Adam over the model's parameters with a learning rate of 5e-5.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

In [None]:
from time import time
from tqdm.auto import tqdm

# Wall-clock duration of each optimisation step, in seconds.
loop_time = []

# Wrap the loader in tqdm to get a progress bar over the batches.
loop = tqdm(loader, leave=True)
for batch in loop:
    # Move every tensor of the batch ('input_ids', 'attention_mask',
    # 'labels') onto the training device; this transfer is not timed.
    batch_mps = {
        name: tensor.to(device)
        for name, tensor in batch.items()
    }

    # Time one full step: reset gradients, forward, backward, update.
    t0 = time()

    optim.zero_grad()
    outputs = model(**batch_mps)
    loss = outputs[0]  # first element of the model output is the loss
    loss.backward()
    optim.step()

    loop_time.append(time() - t0)

    # Show the current loss next to the progress bar.
    loop.set_postfix(loss=loss.item())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/16 [00:00<?, ?it/s]

  'labels': torch.tensor(labels)


In [None]:
# Display the per-batch step durations (seconds) recorded by the training loop.
loop_time