In [1]:
from transformers import AutoModelForMaskedLM

model_ckpt = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_ckpt)

In [2]:
model.num_parameters()

66985530

In [3]:
from transformers import AutoTokenizer

text = "This is a great [MASK]."
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [4]:
import torch

inputs = tokenizer(text, return_tensors="pt")
logits = model(**inputs).logits

In [5]:
inputs['input_ids']

tensor([[ 101, 2023, 2003, 1037, 2307,  103, 1012,  102]])

In [6]:
inputs['input_ids'].shape, logits.shape

(torch.Size([1, 8]), torch.Size([1, 8, 30522]))

In [7]:
probs = torch.nn.functional.softmax(logits, dim=1)
probs

tensor([[[3.1452e-01, 2.7459e-01, 3.7539e-01,  ..., 3.3268e-01,
          5.3011e-01, 6.1277e-01],
         [5.6898e-04, 5.0407e-04, 5.8367e-04,  ..., 8.1463e-04,
          1.5591e-03, 2.1732e-03],
         [5.3732e-04, 3.8677e-04, 5.4660e-04,  ..., 2.0755e-03,
          1.1978e-02, 3.7115e-03],
         ...,
         [6.7616e-01, 7.1718e-01, 6.1386e-01,  ..., 6.4859e-01,
          4.3356e-01, 2.3645e-01],
         [1.0458e-03, 9.6402e-04, 1.1481e-03,  ..., 4.6911e-03,
          5.7508e-03, 2.5998e-02],
         [6.1588e-03, 5.6910e-03, 7.5507e-03,  ..., 8.1323e-03,
          1.3459e-02, 1.1197e-01]]], grad_fn=<SoftmaxBackward0>)

In [8]:
mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
mask_token_logits = logits[0, mask_token_index, :]

In [9]:
top_5_tokens = torch.topk(mask_token_logits, k=5, dim=1).indices[0].tolist()
top_5_tokens

[3066, 3112, 6172, 2801, 8658]

In [10]:
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


# IMDB dataset

In [11]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset

Found cached dataset imdb (/Users/swayam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [12]:
tokenizer.model_max_length

512

In [13]:
chunk_size = 128

def tokenize_and_chunk(batch):
    encodings = tokenizer(batch['text'])
    encodings['word_ids'] = [encodings.word_ids(idx) for idx in range(len(encodings['input_ids']))]

    concatenated = {
        k : sum(encodings[k], []) for k in encodings.keys()
        # tokenized input already contains [SEP] token after each individual example
    }


    result = {
        k : [t[i : i+chunk_size] for i in range(0, len(concatenated['input_ids']), chunk_size)]
        for k,t in concatenated.items()
    }

    result['labels'] = result['input_ids'].copy() # will be use to evaluate mask predictions

    return result


lm_dataset = imdb_dataset.map(tokenize_and_chunk, batched=True, remove_columns=['text', 'label'])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [14]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61314
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59929
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 123007
    })
})

In [93]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

In [18]:
samples = [lm_dataset["train"][i] for i in range(2)]

for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)['input_ids']:
    print(tokenizer.decode(chunk))

[CLS] i rented i am curious - yellow from [MASK] video store because of all the controversy that surrounded [MASK] when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan [MASK] films considered " ₈ " [MASK] [MASK] had to see [MASK] for [MASK]. < br / > < br / > the [MASK] is centered around a young swedish drama student named lena who [MASK] [MASK] learn everything she can about life. in particular she wants to focus [MASK] attention [MASK] to making some sort of documentary on what the average sw [MASK] [MASK] about certain political issues such
as the [MASK] war and race issues in the united states. in net [MASK] politicians and ordinary denizens of stockholm about [MASK] opinions on [MASK], [MASK] has sex with her drama teacher, classmates, and married [MASK]. < br / > < br / > what kills [MASK] [MASK] i am curious - yellow is that 40 years [MASK], this [MASK] considered pornographic

we need `whole word masking` to mask entire word not just one token, HF already had a `DataCollatorForWholeWordMask` but we are creating a new one from scratch

In [56]:
from collections import defaultdict
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2 # whole word masking probability

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = defaultdict(list)
        current_word_index = -1
        current_word = None

        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

    # randomly mask word
    mask = np.random.binomial(1, wwm_probability, (len(mapping),)) # similar to [1, 0, 0, 0, 1, 0, 0, 0, 1, 0]
    input_ids = feature["input_ids"]
    labels = feature["labels"]
    new_labels = [-100] * len(labels) # label only those indices where [MASK] is occuring
    for word_id in np.where(mask)[0]: # return index where mask is 1
        word_id = word_id.item()
        for idx in mapping[word_id]:
                new_labels[idx] = labels[idx] # give them right label
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [57]:
samples = [lm_dataset["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")

dict_keys(['input_ids', 'attention_mask', 'word_ids', 'labels'])
dict_keys(['input_ids', 'attention_mask', 'word_ids', 'labels'])

'>>> [CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " i really had to see this for myself. < br / > < br / > the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such'

'>>> ['[CLS]', 'i', 'rented', 'i', 'am', 'curious', '-', 'yellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', '.', 

In [58]:
from transformers import DataCollatorForWholeWordMask


data_collator = DataCollatorForWholeWordMask(tokenizer, mlm_probability=0.2)

In [59]:
samples = [lm_dataset["train"][i] for i in range(2)]
batch = data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")


'>>> [CLS] i rented [MASK] am curious - yellow from myct store because of all the controversy [MASK] surrounded it when it [MASK] first released in 1967. i also heard [MASK] at whisky it was seized [MASK] u. s. customs if it ever tried to [MASK] this country, therefore being a fan of [MASK] considered " controversial [MASK] [MASK] [MASK] had to [MASK] this evaluate myself. [MASK] br [MASK] > < br / > [MASK] plot [MASK] centered around a [MASK] swedish drama student named lena who wants to learn everything she [MASK] [MASK] [MASK] [MASK] in [MASK] she wants to focus her attentions to making some sort of documentary on what the average swede [MASK] about certain political issues such'

'>>> ['[CLS]', 'i', 'rented', '[MASK]', 'am', 'curious', '-', 'yellow', 'from', 'my', '##ct', 'store', 'because', 'of', 'all', 'the', 'controversy', '[MASK]', 'surrounded', 'it', 'when', 'it', '[MASK]', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', '[MASK]', 'at', 'whisky', 'it', 'was', 's



In [97]:
train_size = 10000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_dataset["train"].train_test_split(test_size, train_size, shuffle=True, seed=42)

Loading cached split indices for dataset at /Users/swayam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-e25b6f8f7c19cd58.arrow and /Users/swayam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-19539aea64e9d239.arrow


In [98]:
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [62]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_ckpt.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    logging_steps=logging_steps,
)

In [66]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [69]:
# perplexity
import math
eval_results = trainer.evaluate()
math.exp(eval_results['eval_loss'])



  0%|          | 0/16 [00:00<?, ?it/s]

40.60905919901439

In [71]:
# trainer.train()

# Acclererate

In [90]:
# explanation of below next code cell
batch = downsampled_dataset['train'][:3]
for t in zip(*batch.values()): # ([input_ids], [attention_mask], [word_ids], [labels])
    s = dict(zip(batch, t)) # {'input_ids':[input_ids], 'attention_mask': [attention_mask], ....}
    print(s)
    break

{'input_ids': [1996, 5016, 21843, 1998, 4165, 2010, 14315, 1011, 2007, 2010, 2300, 13077, 8840, 2378, 23095, 8134, 4634, 2125, 2010, 6700, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2009, 1005, 1055, 1037, 9467, 2008, 1996, 2058, 1011, 9474, 8292, 29577, 2015, 27604, 2091, 1996, 4639, 3267, 1997, 1996, 24566, 3152, 2044, 2023, 4443, 1012, 2348, 1996, 17889, 12274, 10820, 24566, 3152, 2052, 2145, 4013, 17668, 1999, 1996, 2086, 2000, 2272, 1010, 2027, 2052, 6524, 3921, 1996, 3348, 5574, 1997, 2023, 3185, 1012, 102, 101, 2066, 24566, 1996, 23957, 2158, 1006, 4673, 1007, 1010, 2069, 2062, 2061, 1012, 2045, 1005, 1055, 2062, 1997, 2673, 1010, 2062, 4176, 1010, 2062, 9426, 3060, 6946, 1010, 1998, 5019, 1999, 2029, 1996, 2245, 2442, 2022, 1010, 2065, 2023, 2001, 2204, 2007, 2093], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

However, we saw that `DataCollatorForLanguageModeling` also applies random masking with each evaluation, so we’ll see some fluctuations in our perplexity scores with each training run. One way to eliminate this source of randomness is to apply the masking once on the whole test set, and then use the default data collator in 🤗 Transformers to collect the batches during evaluation. To see how this works, let’s implement a simple function that applies the masking on a batch, similar to our first encounter with `DataCollatorForLanguageModeling`:

In [91]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [100]:
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

We can then set up the dataloaders as usual, but we’ll use the `default_data_collator` from 🤗 Transformers for the evaluation set:

In [101]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [111]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 16
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator # this is imp
)

In [112]:
model = AutoModelForMaskedLM.from_pretrained(model_ckpt)

In [113]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [114]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [115]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [116]:
model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"

In [None]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

In [None]:
# save and run
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(model_name, save_function=accelerator.save)
if accelerator.is_main_process:
    tokenizer.save_pretrained(model_name)