In [1]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "bert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [2]:
model(model.dummy_inputs)  # Build the model
model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109,514,298
Trainable params: 109,514,298
Non-trainable params: 0
_________________________________________________________________


In [3]:
text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
import numpy as np
import tensorflow as tf

inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> This is a great idea.
>>> This is a great day.
>>> This is a great place.
>>> This is a great time.
>>> This is a great thing.


In [6]:
import pandas as pd 
train_data = pd.read_csv('../data/processed/BERT_MLM_streamer_data.csv')

In [7]:
from datasets import Dataset 
bert_dataset = Dataset.from_pandas(train_data)

In [8]:
bert_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7292
})

In [9]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = bert_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

  0%|          | 0/8 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1343 > 512). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 7292
})

In [10]:
tokenizer.model_max_length

512

In [11]:
chunk_size = 128

In [12]:
# train_dataset = tokenized_datasets.shuffle(seed=42).select(range(6000))
# eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(1292))

In [13]:
# Slicing produces a list of lists for each feature
# tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_datasets[:3]["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 5'
'>>> Review 1 length: 4'
'>>> Review 2 length: 9'


In [14]:
tokenized_datasets[:1]

{'input_ids': [[101, 2004, 16563, 11514, 102]],
 'token_type_ids': [[0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1]],
 'word_ids': [[None, 0, 0, 0, None]]}

In [15]:
tokenized_samples = tokenized_datasets[:]

for k in tokenized_samples.keys():
    print(k)


concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

input_ids
token_type_ids
attention_mask
word_ids
'>>> Concatenated reviews length: 82089'


In [16]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk lengt

In [17]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/8 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 636
})

In [18]:
lm_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 636
})

In [19]:
tokenizer.decode(lm_datasets[1]["input_ids"])

'a money machine for the devs to funnel into their actual passion products [SEP] [CLS] true [SEP] [CLS] genshin has constant events and updates [SEP] [CLS] im a 40 year old balding lalafell [SEP] [CLS]????? [SEP] [CLS] genshit kekw [SEP] [CLS] mancavedance [SEP] [CLS] true [SEP] [CLS] its a polished game too [SEP] [CLS] actual coke [SEP] [CLS] genshin also fun af [SEP] [CLS] they have a good roadmap [SEP] [CLS] and all the content is free [SEP] [CLS] genshin content kekw [SEP] [CLS] true [SEP] [CLS] genshin is p2w? [SEP] [CLS] debt? the ceo is a fuckin millionare'

### Fine-tuning DistilBERT with the Trainer API

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] asmonsip [SEP] [CLS] coke juice [SEP] [CLS] silverhand hahashrugright [SEP] [CLS] [MASK] recommended your [MASK] to my friends bro! love your content. [SEP] [CLS] we just [MASK] [MASK] wait 10 years [SEP] [CLS] asmonmald peppertime [SEP] [CLS] as [MASK]moggers [SEP] [CLS] p stuffeduga [MASK] [SEP] [CLS] asmon let me give birth [MASK] ur kids [SEP] [CLS] [MASK] [MASK] recreation < 3 [SEP] [CLS] pjsugar [SEP] [CLS] coca - [MASK] has pjsugar [SEP] [CLS] no i [MASK] not allow u to take a drink [SEP] [CLS] genshit 0 content kekw [SEP] [CLS] dwink [SEP] [CLS] gen [MASK] is'

'>>> a money machine for the [MASK]s to [MASK] into their actual passion products [SEP] [CLS] [MASK] [SEP] [CLS] genshin has constant events and updates [SEP] [CLS] im a 40 [MASK] old balding lalafell [SEP] [CLS]????? [SEP] [CLS] genshit kekw [SEP] [CLS] man [MASK]dance [SEP] [CLS] true [SEP] [CLS] its a polished game too [SEP] [CLS] actual [MASK] [SEP] [CLS] genshin also fun af [SEP] [CLS] they have a good r

In [22]:
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return tf_default_data_collator(features)

In [23]:
samples = [lm_datasets[i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] asmonsip [SEP] [CLS] coke juice [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] i recommended your channel [MASK] my friends bro! love [MASK] content [MASK] [SEP] [CLS] we just have [MASK] [MASK] 10 years [SEP] [CLS] asmonmald peppertime [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] give birth to ur [MASK] [SEP] [CLS] parks and recreation [MASK] 3 [SEP] [CLS] pjsugar [SEP] [CLS] coca - cola has [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] no i [MASK] [MASK] allow u to [MASK] a drink [SEP] [CLS] genshit 0 [MASK] kekw [SEP] [CLS] dwink [SEP] [CLS] genshin is'

'>>> a [MASK] machine [MASK] the [MASK] [MASK] to funnel into their actual passion products [SEP] [CLS] true [SEP] [CLS] genshin has constant events [MASK] updates [SEP] [CLS] im a [MASK] year old [MASK] [MASK] lalafell [SEP] [CLS]?? [MASK]?? [SEP] [CLS] genshit kekw [SEP] [CLS] mancavedance [SEP] [CLS] true [SEP] [CLS] 

### Train Test split

In [24]:
train_size = 500
test_size = 136 #int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 136
    })
})

In [26]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /Users/Vaibhav_Beohar/.huggingface/token


In [27]:
lm_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 636
})

In [28]:
tf_train_dataset = downsampled_dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = downsampled_dataset["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Caus

### Set up our training hyperparameters and compile our model. We use the create_optimizer() function from the 🤗 Transformers library, which gives us an AdamW optimizer with linear learning rate decay.

### In addition, we set up a PushToHubCallback that will save the model to the Hub after each epoch.

In [29]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

model_name = 'veb/twitch-bert-base-uncased'

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned", tokenizer=tokenizer
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


Cloning https://huggingface.co/veb/twitch-bert-base-uncased-finetuned into local empty directory.


In [30]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the f

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) 

In [31]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file tf_model.h5:   0%|          | 32.0k/509M [00:00<?, ?B/s]

To https://huggingface.co/veb/twitch-bert-base-uncased-finetuned
   8ed10d7..d48fc05  main -> main



<keras.callbacks.History at 0x7f98aa59f610>

In [32]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 130.53


### Using our fine-tuned model

In [33]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="veb/twitch-bert-base-uncased-finetuned"
)

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at veb/twitch-bert-base-uncased-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [34]:
# text = "This is a great [MASK]."
text = "lolSinged [MASK]."

preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> lolsinged..
>>> lolsinged said.
>>> lolsinged it.
>>> lolsinged '.
>>> lolsinged ".
