<a href="https://colab.research.google.com/github/Taaniya/exploring-gpt2-language-model/blob/main/Exploring_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers
! pip install datasets

In [63]:
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
from datasets import load_dataset
from transformers import pipeline
import numpy as np
import tensorflow as tf

In [6]:
tokenizer = GPT2TokenizerFast.from_pretrained('distilgpt2')

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

#### Model's Vocab

In [7]:
tokenizer.vocab_size

50257

In [8]:
tokenizer

PreTrainedTokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [9]:
#### Special Tokens

In [10]:
tokenizer.all_special_tokens

['<|endoftext|>']

In [11]:
tokenizer.bos_token

'<|endoftext|>'

In [12]:
tokenizer.eos_token

'<|endoftext|>'

In [13]:
tokenizer.pad_token

Using pad_token, but it is not set yet.


GPT2 doesn't use padding. It's default mex length of sentence supported is 1024

In [14]:
tokenizer.max_model_input_sizes

{'distilgpt2': 1024,
 'gpt2': 1024,
 'gpt2-large': 1024,
 'gpt2-medium': 1024,
 'gpt2-xl': 1024}

In [15]:
tokenizer.model_max_length

1024

In [21]:
! ls sample_data/

anscombe.json		      mnist_test.csv	     string_theory.txt
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md


In [22]:
datasets = load_dataset("text", data_files={"train":["sample_data/string_theory.txt"]})

Using custom data configuration default-64e91c498735b760


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-64e91c498735b760/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-64e91c498735b760/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [27]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 7
    })
})

In [29]:
datasets['train']

Dataset({
    features: ['text'],
    num_rows: 7
})

In [28]:
datasets['train'][:2]

{'text': ['In physics, string theory is a theoretical framework in which the point-like particles of particle physics are replaced by one-dimensional objects called strings.',
  'String theory describes how these strings propagate through space and interact with each other. ']}

In [23]:
tokenized_datasets = datasets.map(lambda examples: tokenizer(examples['text']), batched=True, num_proc=4, remove_columns=['text'])



      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
outputs = tokenizer(datasets['train'][:2]['text'], 
                    truncation=True,
                    max_length=128, 
                    return_overflowing_tokens=True,
                    return_length=True)

In [31]:
outputs

{'input_ids': [[818, 11887, 11, 4731, 4583, 318, 257, 16200, 9355, 287, 543, 262, 966, 12, 2339, 13166, 286, 18758, 11887, 389, 6928, 416, 530, 12, 19577, 5563, 1444, 13042, 13], [10100, 4583, 8477, 703, 777, 13042, 47933, 832, 2272, 290, 9427, 351, 1123, 584, 13, 220]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'length': [29, 16], 'overflow_to_sample_mapping': [0, 1]}

In [33]:
print(f"input IDs length: {len(outputs['input_ids'])}")
print(f"input chunk length: {outputs['length']}")
print(f"chunk mapping: {outputs['overflow_to_sample_mapping']}")

input IDs length: 2
input chunk length: [29, 16]
chunk mapping: [0, 1]


Above 2 examples are of length 29 & 16 tokens respectively, without padding

#### Text generation

In [35]:
generator = pipeline(task='text-generation', model='distilgpt2')

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

This parameter is used as default value for generating sequence of max_length

In [37]:
generator.model.config.max_length

50

In [38]:
generator("how are")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'how are these kinds of services available because you need to set them apart. For example: you can configure a web browser with the command "http://www.google.com" and add/save the HTML file in the URL URL of your web'}]

In [39]:
tokenizer("which is")

{'input_ids': [4758, 318], 'attention_mask': [1, 1]}

In [41]:
tokenizer("hello<|endoftext|>")

{'input_ids': [31373, 50256], 'attention_mask': [1, 1]}

In [42]:
tokenizer.convert_ids_to_tokens(220)

'Ġ'

In [43]:
# 505256 is the last token ID in its vocab (vocab size=50257)

tokenizer.convert_ids_to_tokens(50256)

'<|endoftext|>'

In [47]:
tokenizer.tokenize("ambigram")

['amb', 'ig', 'ram']

In [48]:
tokenizer.tokenize("In physics, string theory is a theoretical framework in which the point-like particles of \
particle physics are replaced by one-dimensional objects called strings.")

['In',
 'Ġphysics',
 ',',
 'Ġstring',
 'Ġtheory',
 'Ġis',
 'Ġa',
 'Ġtheoretical',
 'Ġframework',
 'Ġin',
 'Ġwhich',
 'Ġthe',
 'Ġpoint',
 '-',
 'like',
 'Ġparticles',
 'Ġof',
 'Ġparticle',
 'Ġphysics',
 'Ġare',
 'Ġreplaced',
 'Ġby',
 'Ġone',
 '-',
 'dimensional',
 'Ġobjects',
 'Ġcalled',
 'Ġstrings',
 '.']

This tokenizer treats spaces like parts of tokens

In [70]:
tokenized = tokenizer("how are you doing today", return_tensors='tf')

In [71]:
tokenized

{'input_ids': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[4919,  389,  345, 1804, 1909]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[1, 1, 1, 1, 1]], dtype=int32)>}

In [84]:
lm_model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [73]:
# set the argument to return attentions
outputs = lm_model(tokenized, output_attentions=True)

In [74]:
type(outputs)

transformers.modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions

In [None]:
outputs.attentions

In [76]:
logits = outputs.logits

In [77]:
logits

<tf.Tensor: shape=(1, 5, 50257), dtype=float32, numpy=
array([[[-31.152225, -29.294825, -31.223726, ..., -41.802814,
         -41.63937 , -31.382223],
        [-56.62306 , -57.436695, -60.613277, ..., -60.590916,
         -62.180126, -58.705185],
        [-65.41144 , -67.153625, -70.582344, ..., -73.06749 ,
         -73.613815, -69.42238 ],
        [-55.675682, -56.63195 , -60.45554 , ..., -61.792995,
         -65.244385, -58.518982],
        [-50.46911 , -51.555832, -54.41272 , ..., -62.17403 ,
         -61.926445, -53.483616]]], dtype=float32)>

These logits are unnormalized scores returned by last layer of the model. Let's convert them to probabilities. When converted, the probability for every token in the input sequence, represent the probability of every word in the vocabulary that can appear following the current word. 

Let's understand this next

In [78]:
next_word_preds = tf.math.softmax(outputs.logits, axis=-1)
print(next_word_preds)

tf.Tensor(
[[[6.30076043e-04 4.03693691e-03 5.86598006e-04 ... 1.49245398e-08
   1.75745409e-08 5.00617782e-04]
  [1.56708789e-04 6.94601040e-05 2.89843274e-06 ... 2.96397707e-06
   6.04908166e-07 1.95360735e-05]
  [3.11427109e-04 5.45423718e-05 1.76872561e-06 ... 1.47358591e-07
   8.53314646e-08 5.64192169e-06]
  [2.03813121e-04 7.83303622e-05 1.71145848e-06 ... 4.49279582e-07
   1.42428647e-08 1.18686885e-05]
  [4.62914584e-04 1.56150345e-04 8.97041082e-06 ... 3.82048038e-09
   4.89376228e-09 2.27152213e-05]]], shape=(1, 5, 50257), dtype=float32)


In [79]:
# we obtain probabilities across the vocabulary for each of 8 tokens in the input sequence
next_word_preds.shape

TensorShape([1, 5, 50257])

In [80]:
# predicting 2nd word & its probability
word_idx = 2
vocab_id = np.argmax(next_word_preds[0][word_idx-1])
tokenizer.convert_ids_to_tokens([vocab_id]) , next_word_preds[0][word_idx-1].numpy()

(['Ġyou'],
 array([1.5670879e-04, 6.9460104e-05, 2.8984327e-06, ..., 2.9639771e-06,
        6.0490817e-07, 1.9536074e-05], dtype=float32))

In [81]:
# predicting 3rd word & its probability
word_idx = 3
vocab_id = np.argmax(next_word_preds[0][word_idx-1])
tokenizer.convert_ids_to_tokens([vocab_id]) , next_word_preds[0][word_idx-1].numpy()

(['Ġgoing'],
 array([3.1142711e-04, 5.4542372e-05, 1.7687256e-06, ..., 1.4735859e-07,
        8.5331465e-08, 5.6419217e-06], dtype=float32))

In [83]:
# predicting 4th word & its probability
word_idx = 4
vocab_id = np.argmax(next_word_preds[0][word_idx-1])
tokenizer.convert_ids_to_tokens([vocab_id]) , next_word_preds[0][word_idx-1].numpy()

(['Ġthis'], array([2.03813121e-04, 7.83303622e-05, 1.71145848e-06, ...,
        4.49279582e-07, 1.42428647e-08, 1.18686885e-05], dtype=float32))

### References
* [Language Models are unsupervised multitask learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) 
* https://huggingface.co/transformers/v3.0.2/model_doc/gpt2.html#gpt2tokenizerfast
* https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2LMHeadModel
* https://huggingface.co/docs/transformers/tokenizer_summary
* [Open AI GPT, 2018](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf)