In [1]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "distilbert-base-cased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/338M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [2]:
model(model.dummy_inputs)  # Build the model
model.summary()

Model: "tf_distil_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 65190912  
 nLayer)                                                         
                                                                 
 vocab_transform (Dense)     multiple                  590592    
                                                                 
 vocab_layer_norm (LayerNorm  multiple                 1536      
 alization)                                                      
                                                                 
 vocab_projector (TFDistilBe  multiple                 22692676  
 rtLMHead)                                                       
                                                                 
Total params: 65,812,036
Trainable params: 65,812,036
Non-trainable params: 0
__________________________

In [3]:
text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [5]:
import numpy as np
import tensorflow as tf

inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> This is a great achievement.
>>> This is a great success.
>>> This is a great deal.
>>> This is a great feat.
>>> This is a great celebration.


In [6]:
import pandas as pd 
train_data = pd.read_csv('../data/processed/BERT_MLM_streamer_data.csv')

In [7]:
from datasets import Dataset 
bert_dataset = Dataset.from_pandas(train_data)

In [8]:
bert_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7292
})

In [9]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = bert_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

  0%|          | 0/8 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1343 > 512). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 7292
})

In [10]:
tokenizer.model_max_length

512

In [11]:
chunk_size = 128

In [12]:
# train_dataset = tokenized_datasets.shuffle(seed=42).select(range(6000))
# eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(1292))

In [13]:
# Slicing produces a list of lists for each feature
# tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_datasets[:3]["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 6'
'>>> Review 1 length: 5'
'>>> Review 2 length: 11'


In [14]:
tokenized_datasets[:1]

{'input_ids': [[101, 1249, 7130, 1708, 9717, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1]],
 'word_ids': [[None, 0, 0, 0, 0, None]]}

In [15]:
tokenized_samples = tokenized_datasets[:]

for k in tokenized_samples.keys():
    print(k)


concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

input_ids
attention_mask
word_ids
'>>> Concatenated reviews length: 95715'


In [16]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk lengt

In [17]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/8 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 744
})

In [18]:
lm_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 744
})

In [19]:
tokenizer.decode(lm_datasets[1]["input_ids"])

'GENSHIT 0 CONTENT KEKW [SEP] [CLS] DWINK [SEP] [CLS] genshin is a money machine for the devs to funnel into their actual passion products [SEP] [CLS] TRUE [SEP] [CLS] genshin has constant events and updates [SEP] [CLS] im a 40 year old balding lalafell [SEP] [CLS]????? [SEP] [CLS] GENSHIT KEKW [SEP] [CLS] mancaveDance [SEP] [CLS] TRUE [SEP] [CLS] Its a polished game TOO [SEP] [CLS] actual coke [SEP] [CLS] genshin also fun af [SEP] [CLS] They have a good roadmap'

### Fine-tuning DistilBERT with the Trainer API

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] AsmonSip [SEP] [CLS] coke [MASK] [SEP] [CLS] Silverhand HahaShrugRight [SEP] [CLS] I recommended your channel [MASK] my [MASK] br [MASK]! [MASK] your content [MASK] [SEP] [CLS] we just have to wait [MASK] [MASK] [SEP] [CLS] [MASK]monMALD Pepper [MASK] [MASK] [SEP] [CLS] asmon [MASK]GGERS [SEP] [CLS] P [MASK]S [MASK] [MASK] [SEP] [CLS] ASMON LET ME GIVE BIR [MASK] TO UR KID [MASK] [SEP] [CLS] parks and recreation [MASK] 3 [SEP] [CLS] PJ [MASK] [MASK]r [SEP] [CLS] [MASK] - [MASK] has [MASK]JSugar [SEP] [CLS] no i will not allow bench to take a drink [SEP] [CLS]'

'>>> geologistNSHIT 0 CONTENT K [MASK]KW [SEP] [CLS] D [MASK] [MASK]K [SEP] [CLS] genshin [MASK] a money [MASK] for the devs [MASK] funnel into their actual passion products [SEP] [CLS] TRUE [SEP] [CLS] genshin has constant events and updates [SEP] [CLS] im a 40 year old [MASK] [MASK] lalafell [SEP] [CLS]????? [SEP] [CLS] GENSHIT KEKW [SEP] [CLS] mancaveDance [SEP] [CLS] TRUE [SEP] [CLS] Its a polished [MASK] TOO [SE

In [22]:
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return tf_default_data_collator(features)

In [23]:
samples = [lm_datasets[i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] AsmonSip [SEP] [CLS] coke juice [SEP] [CLS] Silverhand HahaShrugRight [SEP] [CLS] [MASK] recommended your [MASK] to my friends bro! Love your content. [SEP] [CLS] we just have to wait 10 years [SEP] [CLS] asmonMALD PepperTime [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] LET ME [MASK] [MASK] BIRTH TO UR KIDS [SEP] [CLS] parks and recreation < [MASK] [SEP] [CLS] PJSugar [SEP] [CLS] Coca - [MASK] has [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] no i will not allow u to take a drink [SEP] [CLS]'

'>>> GENSHIT 0 CONTENT KEKW [SEP] [CLS] DWINK [SEP] [CLS] genshin is a money machine for the devs to funnel into their [MASK] passion products [SEP] [CLS] TRUE [SEP] [CLS] genshin has constant events [MASK] [MASK] [SEP] [CLS] im a 40 year old balding lalafell [SEP] [CLS]? [MASK]??? [SEP] [CLS] GENSHIT KEKW [SEP] [CLS] mancaveDance [SEP] [CLS] TRUE [SEP] [CLS] Its a [MASK] game TOO [SEP] [CLS] act

### Train Test split

In [24]:
train_size = 600
test_size = 144 #int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 600
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 144
    })
})

In [25]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /Users/Vaibhav_Beohar/.huggingface/token


In [26]:
lm_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 744
})

In [27]:
tf_train_dataset = downsampled_dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = downsampled_dataset["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Caus

### Set up our training hyperparameters and compile our model. We use the create_optimizer() function from the 🤗 Transformers library, which gives us an AdamW optimizer with linear learning rate decay.

### In addition, we set up a PushToHubCallback that will save the model to the Hub after each epoch.

In [28]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

model_name = 'veb/twitch-distilbert-base-cased'

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned", tokenizer=tokenizer
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


Cloning https://huggingface.co/veb/twitch-distilbert-base-cased-finetuned into local empty directory.


In [29]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the f

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Perplexity: 255.73


In [30]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file tf_model.h5:   0%|          | 32.0k/338M [00:00<?, ?B/s]

To https://huggingface.co/veb/twitch-distilbert-base-cased-finetuned
   1c56a46..ad4591e  main -> main



<keras.callbacks.History at 0x7fd7a9e91b90>

In [31]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 223.69


### Using our fine-tuned model

In [35]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="veb/twitch-distilbert-base-cased-finetuned"
)

All model checkpoint layers were used when initializing TFDistilBertForMaskedLM.

All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at veb/twitch-distilbert-base-cased-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [44]:
# text = "This is a great [MASK]."
text = "Bausen [MASK]."

preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> Bausen died.
>>> Bausenbach.
>>> Bausen v.
>>> Bausenmann.
>>> Bausen c.
