In [2]:
import os

# Set CUDA_VISIBLE_DEVICES to "0" for this process (presumably to pin the
# notebook to the first GPU — confirm for your machine).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
from transformers import AutoModelForMaskedLM

# Checkpoint identifier reused by the model- and tokenizer-loading cells.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint through the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [4]:
# Parameter count expressed in millions for display.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
# Fixed reference figure for comparison; nothing is interpolated, so a plain
# string literal is used instead of an f-string.
print("'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [5]:
# Example sentence containing a [MASK] placeholder for the model to fill in.
text = "This is a great [MASK]."

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
import torch

# Tokenize the example sentence into PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")
# NOTE(review): autograd is enabled for this forward pass (the logits printed in
# later cells carry a grad_fn); consider torch.no_grad() if gradients are not needed.
token_logits = model(**inputs).logits

In [8]:
# Column index (sequence position) of every mask token in the encoded input.
mask_token_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
# Logits of the first sequence at the masked position(s).
mask_token_logits = token_logits[0, mask_token_index, :]
# Vocabulary ids of the five highest-scoring candidates for the first mask.
top_5_tokens = mask_token_logits.topk(5, dim=1).indices[0].tolist()

In [9]:
# Inspect the position of the mask token within the sequence.
print(mask_token_index)

tensor([5])


In [10]:
# Inspect the full logits tensor returned by the model.
print(token_logits)

tensor([[[ -5.5882,  -5.5868,  -5.5958,  ...,  -4.9448,  -4.8174,  -2.9905],
         [-11.9031, -11.8872, -12.0623,  ..., -10.9570, -10.6464,  -8.6324],
         [-11.9604, -12.1520, -12.1279,  ..., -10.0218,  -8.6074,  -8.0971],
         ...,
         [ -4.8228,  -4.6268,  -5.1041,  ...,  -4.2771,  -5.0184,  -3.9428],
         [-11.2945, -11.2388, -11.3857,  ...,  -9.2063,  -9.3411,  -6.1505],
         [ -9.5213,  -9.4632,  -9.5022,  ...,  -8.6561,  -8.4908,  -4.6903]]],
       grad_fn=<ViewBackward0>)


In [11]:
# Inspect the logits selected at the masked position.
print(mask_token_logits)

tensor([[-4.8228, -4.6268, -5.1041,  ..., -4.2771, -5.0184, -3.9428]],
       grad_fn=<IndexBackward0>)


In [12]:
# Substitute each candidate token for the mask placeholder and show the sentence.
for token in top_5_tokens:
    candidate = tokenizer.decode([token])
    filled_text = text.replace(tokenizer.mask_token, candidate)
    print(f"'>>> {filled_text}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [13]:
from datasets import load_dataset

# Load the "imdb" dataset; the cell's last expression displays its splits.
imdb_dataset = load_dataset("imdb")
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [14]:
# Pick three training reviews using a fixed shuffle seed.
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

# Print each review's text and label.
for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [15]:
# Same inspection for the "unsupervised" split (the output below shows label -1
# for these rows).
unsupervised_sample = imdb_dataset["unsupervised"].shuffle(seed=42).select(range(3))

for row in unsupervised_sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: If you've seen the classic Roger Corman version starring Vincent Price it's hard to put it out of your head, but you probably should do because this one is totally different. Subtlety has been abandoned in favour of gross-out horror - nudity, gore and all-round unpleasantness. OK it's ridiculous, trashy, sensationalised and historically dubious (did any members of the Inquisition really wear horn-rimmed glasses?), but despite all this it is strangely compelling. I literally couldn't tear myself away from the screen until the end of the movie. If there's a bigger compliment you can pay to a film I don't know what it is.'
'>>> Label: -1'

'>>> Review: For me, this was the most moving film of the decade. Samira Makhmalbaf shows pure bravery and vision in the making. She has an intelligence and gift for speaking to the people, regardless of their nationality or beliefs. I am inspired and touched by her humanity and can only hope that she has touched many people the same way. 

In [16]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch.

    When ``tokenizer.is_fast`` is true, a ``"word_ids"`` entry is added that holds
    ``word_ids(i)`` for every encoded row ``i``; otherwise the tokenizer output is
    returned unchanged.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    # One word-id list per encoded row, in row order.
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded

In [17]:
# Tokenize every split in batches and drop the original "text" and "label" columns.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [18]:
# Display the tokenizer's configured maximum sequence length.
tokenizer.model_max_length

512

In [19]:
# Length of the fixed-size chunks built below; matches the model_max_length
# value displayed by the previous cell.
chunk_size = 512

In [20]:
# Take the first three tokenized training reviews and report their token counts.
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'


In [21]:
# Flatten each column of the sampled reviews into a single long list.
concatenated_examples = {
    k: [item for sequence in tokenized_samples[k] for item in sequence]
    for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 800'


In [22]:
# Slice every concatenated column into consecutive pieces of `chunk_size` items;
# the final piece keeps whatever is left over and may be shorter.
chunks = {
    k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 512'
'>>> Chunk length: 288'


In [23]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists. All
            columns are expected to have the same total number of items.
        block_size: Number of items per output chunk. When omitted, the
            notebook-level ``chunk_size`` is used, so existing
            ``map(group_texts, batched=True)`` calls behave as before.

    Returns:
        A dict with the same keys as ``examples`` plus ``"labels"``. Every value
        is a list of chunks of exactly ``block_size`` items; a trailing remainder
        shorter than ``block_size`` is dropped. ``"labels"`` holds independent
        copies of the ``"input_ids"`` chunks.
    """
    from itertools import chain

    size = chunk_size if block_size is None else block_size

    # chain.from_iterable flattens in linear time; sum(list_of_lists, []) re-copies
    # the growing accumulator on every addition and is quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of chunks so every chunk has the same length.
    total_length = (total_length // size) * size
    result = {
        k: [t[i: i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each chunk so later edits to input_ids cannot leak into labels
    # (a shallow copy of the outer list would still share the inner lists).
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [24]:
# Re-chunk every split with group_texts; row counts change because examples are
# concatenated and then re-split.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 15313
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 14966
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 30721
    })
})

In [25]:
# Decode the second training chunk back to text to inspect it.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'the brown bunny, in which we \' re treated to the site of vincent gallo \' s throbbing johnson, but not a trace of pink visible on chloe sevigny. before crying ( or implying ) " double - standard " in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women : there are no genitals on display when actresses appears nude, and the same cannot be said for a man. in fact, you generally won \' t see female genitals in an american film in anything short of porn or explicit erotica. this alleged double - standard is less a double standard than an admittedly depressing ability to come to terms culturally with the insides of women \' s bodies. [SEP] [CLS] if only to avoid making this type of film in the future. this film is interesting as an experiment but tells no cogent story. < br / > < br / > one might feel virtuous for sitting thru it because it touches on so many important issues but it does so without any discerna

In [26]:
from transformers import DataCollatorForLanguageModeling

# Language-modeling collator configured with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [27]:
# Take two training chunks and strip `word_ids` before handing them to the collator.
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

# Decode the collated input_ids to see where [MASK] tokens were inserted.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i am [MASK] - yellow from overgrown video [MASK] because of all the [MASK] that [MASK] it when it was vegetable released in 1967. i also [MASK] that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being [MASK] fan of films considered " controversial [MASK] [MASK] really had [MASK] see this for myself. < devils / > < br / > the plot is centered around a young [MASK] drama student named lena [MASK] wants to learn everything she can about life [MASK] in particular she wants to focus her [MASK]s [MASK] making some sort of documentary [MASK] [MASK] the average swede thought about certain [MASK] issues such as the vietnam war and race issues [MASK] the united states. in [MASK] asking politicians and ordinary [MASK]izens of stockholm about their opinions on politics, she has sex with [MASK] drama teacher, classmates, and married men. spells br / > < br / > [MASK] kills me about i am [MASK] - yellow is that 40 years [MASK], this [M

In [28]:
import collections
import numpy as np

from transformers import default_data_collator

# Per-word probability passed to np.random.binomial in
# whole_word_masking_data_collator.
wwm_probability = 0.2

In [29]:
def whole_word_masking_data_collator(features):
    """Collate ``features`` after masking randomly chosen whole words.

    Each feature must provide ``"word_ids"``, ``"input_ids"`` and ``"labels"``.
    Consecutive positions sharing the same non-``None`` word id are treated as one
    word. Every word is selected independently with probability
    ``wwm_probability``; all positions of a selected word are replaced by
    ``tokenizer.mask_token_id`` in ``input_ids`` and keep their original label,
    while every other position gets the label ``-100``.

    The caller's feature dicts and their lists are left untouched: masking is
    applied to copies, so the same samples can be collated again.

    Args:
        features: Iterable of per-example dicts as described above.

    Returns:
        The result of ``default_data_collator`` applied to the masked copies
        (``word_ids`` is not included).
    """
    masked_features = []
    for feature in features:
        # Shallow copy so popping `word_ids` does not alter the caller's dict.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Map a running word index to the positions of that word's tokens.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # One Bernoulli draw per word decides whether that word is masked.
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy before masking so the original input_ids list is preserved.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)
    return default_data_collator(masked_features)

In [30]:
# Fetch two training chunks (still carrying `word_ids`) and collate them with
# whole-word masking.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

In [31]:
# Decode the whole-word-masked input_ids for inspection.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i [MASK] [MASK] [MASK] curious [MASK] yellow from my video store because of all the controversy that surrounded [MASK] when it was first released in 1967. i also heard that [MASK] [MASK] it [MASK] [MASK] by u [MASK] s. customs [MASK] it ever tried to enter this country, therefore being [MASK] fan of [MASK] [MASK] " controversial " [MASK] really had to [MASK] [MASK] [MASK] myself. < [MASK] / > < br / > the plot is centered around a young swedish drama student named [MASK] who [MASK] [MASK] learn everything she [MASK] about [MASK]. [MASK] particular she wants to focus her [MASK] [MASK] to making some sort of documentary on [MASK] the average [MASK] [MASK] thought about certain political issues such as the vietnam war [MASK] race issues in the united states. in between [MASK] [MASK] and [MASK] denizens of stockholm about their opinions on politics [MASK] she [MASK] sex with her drama teacher [MASK] classmates, and married men. < [MASK] / > < br / > [MASK] kills me about i [MAS

In [32]:
# train_size = 10_000
# test_size = int(0.1 * train_size)
#
# downsampled_dataset = lm_datasets["train"].train_test_split(
#     train_size=train_size, test_size=test_size, seed=42
# )
# downsampled_dataset

In [55]:
# NOTE(review): `accept_access_request` is not referenced in the visible cells.
from huggingface_hub import notebook_login, accept_access_request

# Interactive Hugging Face Hub login (renders the widget shown below).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
from transformers import TrainingArguments

batch_size = 64
# Log the training loss once per epoch
logging_steps = len(lm_datasets["train"]) // batch_size
# Keep only the part of the checkpoint id after the last "/".
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): output_dir is an absolute, machine-specific path — adjust it
# before running elsewhere.
training_args = TrainingArguments(
    output_dir=f"/data/wzq/cache/huggingface/hub/{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)



In [35]:
from transformers import Trainer

# Train on the chunked "train" split and evaluate on the chunked "test" split,
# with masking applied by `data_collator` at batch time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [36]:
import math

# Perplexity before fine-tuning, computed as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 19.26


In [38]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,2.4931,2.300219,0.001
2,2.3977,2.257361,0.001
3,2.3616,2.239156,0.001


TrainOutput(global_step=720, training_loss=2.417264187335968, metrics={'train_runtime': 236.9259, 'train_samples_per_second': 193.896, 'train_steps_per_second': 3.039, 'total_flos': 6089726949894144.0, 'train_loss': 2.417264187335968, 'epoch': 3.0})

In [39]:
# Perplexity after fine-tuning, computed the same way as the baseline.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 9.39


In [40]:
# Upload the Trainer's output to the Hugging Face Hub.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/KinomotoMio/distilbert-base-uncased-finetuned-imdb/commit/804b57b894a378d99ca77fabe37b8d16eaceffb2', commit_message='End of training', commit_description='', oid='804b57b894a378d99ca77fabe37b8d16eaceffb2', pr_url=None, repo_url=RepoUrl('https://hf-mirror.com/KinomotoMio/distilbert-base-uncased-finetuned-imdb', endpoint='https://hf-mirror.com', repo_type='model', repo_id='KinomotoMio/distilbert-base-uncased-finetuned-imdb'), pr_revision=None, pr_num=None)

In [41]:
def insert_random_mask(batch):
    """Run ``data_collator`` over a column-wise batch and prefix the output keys.

    ``batch`` maps column names to lists; it is transposed into one dict per row,
    which is the layout handed to ``data_collator``. Every entry of the collator's
    output is converted with ``.numpy()`` and returned under ``"masked_" + key``.
    """
    rows = []
    for row_values in zip(*batch.values()):
        rows.append(dict(zip(batch, row_values)))
    collated = data_collator(rows)

    prefixed = {}
    for key, value in collated.items():
        prefixed["masked_" + key] = value.numpy()
    return prefixed

In [42]:
# Drop `word_ids` only while it is still present, so re-running this cell does
# not fail on an already-stripped dataset.
if "word_ids" in lm_datasets["test"].column_names:
    lm_datasets = lm_datasets.remove_columns(["word_ids"])
# Apply the random masking once and store it in the dataset, replacing the
# original columns with their "masked_" counterparts.
eval_dataset = lm_datasets["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=lm_datasets["test"].column_names,
)
# Restore the column names the model expects.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/14966 [00:00<?, ? examples/s]

In [43]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
# Training batches are shuffled and masked at batch time by `data_collator`.
train_dataloader = DataLoader(
    lm_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# eval_dataset already contains masked inputs, so only default collation is used.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [44]:
# Load the pretrained checkpoint again, rebinding `model` so the loop below does
# not continue from the Trainer-fine-tuned weights.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [45]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [47]:
from accelerate import Accelerator

accelerator = Accelerator()
# Hand the model, optimizer and both dataloaders to accelerate and rebind the
# names to the prepared objects it returns.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [48]:
from transformers import get_scheduler

num_train_epochs = 3
# The training loop performs one optimizer step per training batch.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup, sized for the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [59]:
from huggingface_hub import get_full_repo_name

# Repository name for the accelerate-trained model; get_full_repo_name returns
# the full repo id (displayed below).
model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'KinomotoMio/distilbert-base-uncased-finetuned-imdb-accelerate'

In [62]:
from huggingface_hub import create_repo

# Use the `repo_name` computed from `model_name` instead of a hard-coded id, so
# the repository created here is always the one cloned and pushed to afterwards.
# exist_ok=True makes the call safe to repeat.
create_repo(
    repo_id=repo_name,
    repo_type="model",
    private=False,
    exist_ok=True,
)

RepoUrl('https://hf-mirror.com/KinomotoMio/distilbert-base-uncased-finetuned-imdb-accelerate', endpoint='https://hf-mirror.com', repo_type='model', repo_id='KinomotoMio/distilbert-base-uncased-finetuned-imdb-accelerate')

In [63]:
from huggingface_hub import Repository

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
output_dir = "/data/wzq/cache/huggingface/hub/" + model_name
# Clone the Hub repository `repo_name` into output_dir.
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://hf-mirror.com/KinomotoMio/distilbert-base-uncased-finetuned-imdb-accelerate into local empty directory.


In [65]:
from tqdm.auto import tqdm
import torch
import math

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# NOTE(review): the recorded output reports the same perplexity for epochs 1 and 2.
# One possible cause is that `lr_scheduler` (built in an earlier cell) had already
# been stepped by a previous run of this cell; re-create the optimizer and
# scheduler before re-running — TODO confirm.
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss `batch_size` times so the gathered tensor has one
        # entry per nominal sample.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Trim the surplus entries so exactly len(eval_dataset) values are averaged.
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # math.exp overflowed: report infinite perplexity instead of failing.
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer and pushes to the Hub.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/720 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 9.045098986018697
>>> Epoch 1: Perplexity: 8.883463061453169
>>> Epoch 2: Perplexity: 8.883463061453169


In [66]:
from transformers import pipeline

# Build a fill-mask pipeline from the model repository pushed above.
mask_filler = pipeline(
    "fill-mask", model="KinomotoMio/distilbert-base-uncased-finetuned-imdb-accelerate"
)

config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


In [67]:
# Fill the mask in the example sentence and print each predicted sequence.
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> this is a great film.
>>> this is a great movie.
>>> this is a great idea.
>>> this is a great adventure.
>>> this is a great show.
