In [16]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "bert-base-cased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [17]:
model(model.dummy_inputs)  # Build the model
model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  107719680 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  23286340  
                                                                 
Total params: 108,340,804
Trainable params: 108,340,804
Non-trainable params: 0
_________________________________________________________________


In [18]:
text = "This is a great [MASK]."

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [20]:
import numpy as np
import tensorflow as tf

inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> This is a great idea.
>>> This is a great deal.
>>> This is a great place.
>>> This is a great story.
>>> This is a great thing.


In [90]:
import pandas as pd 
# train_data = pd.read_csv('../data/processed/BERT_MLM_streamer_data.csv')
train_data = pd.read_csv('../data/processed/training_chats_2_20220625_vb.csv')

In [45]:
import pandas as pd 
# train_data = pd.read_csv('../data/raw/training_chats_2_20220625.csv')
train_data = pd.read_csv('../data/processed/BERT_MLM_streamer_data_2.csv')

In [91]:
# train_data = train_data[train_data['text'].notnull() & train_data['text']!=''][['text', 'label']]
train_data = train_data[['text','label']]
train_data = train_data[train_data.text.notnull()]

In [92]:
train_data

Unnamed: 0,text,label
0,jesus,
1,2 teams,
2,Clueless Missing vital information,
3,faide in his element,
4,HES GONE ROGUE,
...,...,...
161075,pcepce,
161076,Bedge,
161077,Doublelift uses the brand new U.GG Desktop App...,
161078,good night!,


In [93]:
from datasets import Dataset 
bert_dataset = Dataset.from_pandas(train_data, preserve_index=False)

In [94]:
bert_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 161030
})

In [95]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = bert_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

  0%|          | 0/162 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 161030
})

In [96]:
tokenizer.model_max_length

512

In [97]:
chunk_size = 128

In [98]:
# train_dataset = tokenized_datasets.shuffle(seed=42).select(range(6000))
# eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(1292))

In [99]:
# Slicing produces a list of lists for each feature
# tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_datasets[:3]["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 5'
'>>> Review 1 length: 4'
'>>> Review 2 length: 8'


In [100]:
tokenized_datasets[:1]

{'input_ids': [[101, 179, 1279, 1361, 102]],
 'token_type_ids': [[0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1]],
 'word_ids': [[None, 0, 0, 0, None]]}

In [101]:
tokenized_samples = tokenized_datasets[:]

for k in tokenized_samples.keys():
    print(k)


concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

input_ids
token_type_ids
attention_mask
word_ids


KeyboardInterrupt: 

In [56]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk lengt

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk lengt

In [57]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/35 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 2175
})

In [58]:
lm_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 2175
})

In [59]:
tokenizer.decode(lm_datasets[1]["input_ids"])

'[SEP] [CLS] Welp [SEP] [CLS] OMEGALUL [SEP] [CLS] KEKW ROZAAA [SEP] [CLS] - 10K [SEP] [CLS] Roza [SEP] [CLS] roza [SEP] [CLS] ROZA [SEP] [CLS] expensive round [SEP] [CLS] @ tarik why didnt you aim at the viper ult [SEP] [CLS] roxaaaaa [SEP] [CLS] intriguing decision making [SEP] [CLS] KEKW [SEP] [CLS] bro i spend 200k points on that lmao [SEP] [CLS] AYYY [SEP] [CLS] you didn\'t need the second satchel " [SEP] [CLS] lost on both bets [SEP] [CLS] All you had to'

### Fine-tuning DistilBERT with the Trainer API

In [60]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [61]:
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] ROZA [SEP] [CLS] NOOO [SEP] [CLS] R [MASK]ZA KEK [MASK] [SEP] [CLS] ROZ [MASK]AAAA [SEP] [CLS] RO [MASK]A ROZA RO [MASK]A ROZA [SEP] [CLS] [MASK]rik doing JETT Rocket COSPLAY [SEP] [CLS] SHOOT [SEP] [CLS] emotes cannot express KEKW [SEP] [CLS] Nooo [MASK]oooo [MASK] [SEP] [CLS] [MASK]rikRage _ TK tarikRage _ TK [SEP] [CLS] unused roza K [MASK] [MASK]W [SEP] [CLS] [MASK]Z P [MASK] [MASK]TS L [MASK]SGO NONBELEIVEr [MASK]'

'>>> [SEP] [CLS] Wel [MASK] [SEP] [CLS] OMEGA [MASK]UL [SEP] [CLS] KEKW R [MASK]ZAAA [SEP] [CLS] [MASK] 10K [SEP] [CLS] Roz [MASK] [SEP] [CLS] roza [SEP] [CLS] ROZ [MASK] [SEP] [CLS] expensive round [SEP] [CLS] @ tarik why didnt you aim at the v Oz ul [MASK] [SEP] [CLS] r [MASK]aaaaa [SEP] [CLS] intriguing decision making [SEP] [CLS] K [MASK] speakW [SEP] [CLS] br [MASK] i spend 200k points on that lmao [SEP] [CLS] AYYY [SEP] [CLS] [MASK] didn't need the second satchel [MASK] [SEP] [CLS] lost on both bets [SEP] [CLS] All you had to'


In [62]:
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return tf_default_data_collator(features)

In [63]:
samples = [lm_datasets[i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] KEKW [SEP] [CLS] ROZAAAAAA [SEP] [CLS] ROZA ROZA ROZA ROZA [SEP] [CLS] [MASK] [MASK] doing [MASK] [MASK] [MASK] Rocket COSPLAY [SEP] [CLS] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] cannot [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] Noooooooooo [SEP] [CLS] tarikRage _ TK tarikRage _ [MASK] [MASK] [SEP] [CLS] unused roza KEKW [SEP] [CLS] EZ POINTS LETSGO NONBELEIVErS'

'>>> [SEP] [CLS] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] ROZAAA [SEP] [CLS] [MASK] 10K [SEP] [CLS] Roza [SEP] [CLS] roza [SEP] [CLS] ROZA [SEP] [CLS] expensive round [SEP] [CLS] @ [MASK] [MASK] why didnt you aim [MASK] the viper ult [SEP] [CLS] roxaaaaa [SEP] [CLS] intriguing decision [MASK] [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] i spend 200k points on that [MASK] [MASK] [MASK] [SEP] [CLS] AYYY [SEP] [CL

### Train Test split

In [64]:
train_size = 1958
test_size = 217 #int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1958
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 217
    })
})

In [65]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /Users/Vaibhav_Beohar/.huggingface/token


In [66]:
lm_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 2175
})

In [67]:
tf_train_dataset = downsampled_dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = downsampled_dataset["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Caus

### Set up our training hyperparameters and compile our model. We use the create_optimizer() function from the 🤗 Transformers library, which gives us an AdamW optimizer with linear learning rate decay.

### In addition, we set up a PushToHubCallback that will save the model to the Hub after each epoch.

In [68]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

model_name = 'veb/twitch-bert-base-cased'

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned", tokenizer=tokenizer
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


/Users/Vaibhav_Beohar/Documents/VB_Mck_Docs/MIDS/W210/final_proj/Twitch-chat-pioneers/notebooks/veb/twitch-bert-base-cased-finetuned is already a clone of https://huggingface.co/veb/twitch-bert-base-cased-finetuned. Make sure you pull the latest changes with `repo.git_pull()`.


In [69]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the f

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) 

In [70]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file tf_model.h5:   0%|          | 32.0k/500M [00:00<?, ?B/s]

To https://huggingface.co/veb/twitch-bert-base-cased-finetuned
   5a34f6e..d3fef99  main -> main



<keras.callbacks.History at 0x7f86eaf4a950>

In [71]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 37.50


### Using our fine-tuned model

In [72]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="veb/twitch-bert-base-cased-finetuned"
)

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at veb/twitch-bert-base-cased-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [5]:
# text = "This is a great [MASK]."
# text = "lolSinged [MASK]."
text = "PogU [MASK]."

preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> PogUp.
>>> PogUs.
>>> PogUr.
>>> PogUps.
>>> PogUm.


In [73]:
# text = "This is a great [MASK]."
# text = "lolSinged [MASK]."
text = "PogU [MASK]."

preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> PogUp.
>>> PogUps.
>>> PogUr.
>>> PogUs.
>>> PogUm.
