In [1]:
!pip install transformers datasets -qqq

[K     |████████████████████████████████| 3.5 MB 4.6 MB/s 
[K     |████████████████████████████████| 311 kB 35.2 MB/s 
[K     |████████████████████████████████| 6.8 MB 13.2 MB/s 
[K     |████████████████████████████████| 67 kB 3.9 MB/s 
[K     |████████████████████████████████| 895 kB 41.7 MB/s 
[K     |████████████████████████████████| 596 kB 44.5 MB/s 
[K     |████████████████████████████████| 243 kB 51.4 MB/s 
[K     |████████████████████████████████| 133 kB 53.3 MB/s 
[K     |████████████████████████████████| 1.1 MB 53.6 MB/s 
[K     |████████████████████████████████| 271 kB 49.6 MB/s 
[K     |████████████████████████████████| 94 kB 3.0 MB/s 
[K     |████████████████████████████████| 144 kB 53.5 MB/s 
[?25h

## Let us first check out a Masked Language Model and see how it works


In [49]:
from transformers import TFAutoModelForMaskedLM
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
import tensorflow as tf

In [11]:
check_point = 'distilbert-base-uncased'

In [12]:
model = TFAutoModelForMaskedLM.from_pretrained(check_point)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [6]:
model(model.dummy_inputs)  # Build the model by running one forward pass on its dummy inputs

TFMaskedLMOutput([('logits',
                   <tf.Tensor: shape=(3, 5, 30522), dtype=float32, numpy=
                   array([[[-5.4966073, -5.707848 , -5.542335 , ..., -5.194432 ,
                            -5.4303055,  2.3678167],
                           [-5.0797343, -5.3039637, -5.145937 , ..., -5.020322 ,
                            -5.0973887,  2.3962438],
                           [-5.067375 , -5.291143 , -5.136543 , ..., -4.919059 ,
                            -5.0076246,  2.5367956],
                           [-5.0746536, -5.280475 , -5.137513 , ..., -4.8463535,
                            -4.9868646,  2.5833554],
                           [-5.0443606, -5.2255216, -5.0974364, ..., -4.780394 ,
                            -4.9612126,  2.5994534]],
                   
                          [[-5.4514475, -5.6473627, -5.458254 , ..., -5.2632766,
                            -5.25949  ,  3.2350695],
                           [-4.885667 , -5.1023827, -4.905625 , ..., -4.

In [8]:
model.summary()

Model: "tf_distil_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 vocab_transform (Dense)     multiple                  590592    
                                                                 
 vocab_layer_norm (LayerNorm  multiple                 1536      
 alization)                                                      
                                                                 
 vocab_projector (TFDistilBe  multiple                 23866170  
 rtLMHead)                                                       
                                                                 
Total params: 66,985,530
Trainable params: 66,985,530
Non-trainable params: 0
__________________________

In [13]:
tokenizer = AutoTokenizer.from_pretrained(check_point)

In [14]:
text = 'How was your [MASK]?'

In [20]:
inputs = tokenizer(text, return_tensors='np')

In [22]:
inputs 

{'input_ids': array([[ 101, 2129, 2001, 2115,  103, 1029,  102]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1]])}

In [27]:
model(**inputs).logits
# logits shape is (1, 7, 30522): one input sequence, 7 token positions,
# and 30522 scores per position (presumably one per vocabulary entry).

<tf.Tensor: shape=(1, 7, 30522), dtype=float32, numpy=
array([[[ -5.6271214,  -5.616682 ,  -5.6235204, ...,  -4.9783583,
          -4.8618355,  -3.006512 ],
        [-10.603564 , -10.766145 , -10.556137 , ...,  -8.6690445,
          -8.477757 ,  -5.939316 ],
        [-12.287144 , -12.502188 , -12.049969 , ...,  -9.707514 ,
          -8.84099  ,  -7.4124527],
        ...,
        [ -6.2848816,  -6.582689 ,  -6.174431 , ...,  -5.119264 ,
          -4.538795 ,  -4.1390276],
        [ -9.212305 ,  -9.108264 ,  -9.215045 , ...,  -7.937167 ,
          -9.390009 ,  -2.4328694],
        [-12.314569 , -12.345098 , -12.151514 , ...,  -9.835674 ,
         -10.64857  ,  -8.735081 ]]], dtype=float32)>

In [28]:
token_logits = model(**inputs).logits

In [29]:
tokenizer.mask_token_id

103

In [30]:
np.argwhere(inputs['input_ids'] == tokenizer.mask_token_id )

array([[0, 4]])

In [32]:
np.argwhere(inputs['input_ids'] == tokenizer.mask_token_id )[0, 1]

4

In [33]:
# argwhere yields one [row, column] pair per match; [0, 1] takes the column
# (token position) of the first mask token found in the input.
mask_token_index = np.argwhere(inputs['input_ids'] == tokenizer.mask_token_id )[0, 1]

In [36]:
token_logits[0, mask_token_index] #mask_token_index = 4

<tf.Tensor: shape=(30522,), dtype=float32, numpy=
array([-6.2848816, -6.582689 , -6.174431 , ..., -5.119264 , -4.538795 ,
       -4.1390276], dtype=float32)>

In [38]:
mask_token_logits = token_logits[0, mask_token_index, :]  # logits at the mask position (index 4 for this text)

In [40]:
# argsort sorts ascending, so negating the logits puts the highest-scoring
# token ids first; keep the first five.
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

In [41]:
top_5_tokens

[2154, 5798, 3462, 2166, 4926]

## TOP 5 MOST LIKELY TOKENS

In [42]:
# Substitute each candidate into the prompt. tokenizer.mask_token keeps the
# placeholder in sync with the tokenizer instead of hard-coding "[MASK]".
for token in top_5_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

How was your day?
How was your birthday?
How was your flight?
How was your life?
How was your accident?


## 5 LEAST LIKELY TOKENS

In [47]:
np.argsort(-mask_token_logits)[-5:]

array([22869, 17198, 18117, 21217, 28078])

In [48]:
# The negated logits sort best-to-worst, so the last five entries are the
# lowest-scoring token ids.
bottom_5_tokens = np.argsort(-mask_token_logits)[-5:].tolist()
# tokenizer.mask_token keeps the placeholder in sync with the tokenizer
# instead of hard-coding "[MASK]".
for token in bottom_5_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

How was your ##elles?
How was your ##bers?
How was your ##ards?
How was your ##rts?
How was your ##uously?


## IMDB data for our language modeling fine-tuning

In [50]:
imdb_dataset = load_dataset('imdb')

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [51]:
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Preprocessing the data for Language Modelling

Language modelling is a self-supervised learning task, so we don't need any labels. All we need are the `input_ids`, `word_ids` and `attention_mask`.

In [58]:
imdb_dataset['train'][0]['text'][:20]

'I rented I AM CURIOU'

In [60]:
tokenizer(imdb_dataset['train'][0]['text'][:20])

<bound method BatchEncoding.word_ids of {'input_ids': [101, 1045, 12524, 1045, 2572, 12731, 9488, 2226, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}>

In [64]:
tokenizer(imdb_dataset['train'][0]['text'][:20]).word_ids()

[None, 0, 1, 2, 3, 4, 4, 4, None]

In [65]:
tokenizer.is_fast

True

In [66]:
def tokenize_for_lm(example):
    """Tokenize the ``text`` field of a batch for language modelling.

    Args:
        example: Mapping with a ``'text'`` entry, passed straight to the
            notebook-level ``tokenizer``. It is used with ``batched=True``,
            so ``'text'`` holds a list of strings.

    Returns:
        The tokenizer's encoding. When the tokenizer is a fast tokenizer, the
        encoding also gets a ``'word_ids'`` entry holding, for each sequence
        in the batch, the result of ``encoding.word_ids(i)``.
    """
    encoding = tokenizer(example['text'])
    if not tokenizer.is_fast:
        # Without a fast tokenizer, return the plain encoding with no word ids.
        return encoding

    # Word ids are kept for the whole word masking done later on.
    num_sequences = len(encoding["input_ids"])
    encoding['word_ids'] = [encoding.word_ids(row) for row in range(num_sequences)]
    return encoding

In [67]:
# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_for_lm, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

  0%|          | 0/25 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [68]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})