In [1]:
!wget https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt

--2021-07-05 17:43:35--  https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 887071 (866K) [text/plain]
Saving to: ‘austen-emma.txt’


2021-07-05 17:43:36 (1.41 MB/s) - ‘austen-emma.txt’ saved [887071/887071]



In [2]:
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf
import numpy as np

In [3]:
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

In [4]:
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

In [5]:
trainer = BpeTrainer(vocab_size=50000, inital_alphabet=ByteLevel.alphabet(), special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ])
tokenizer.train(["austen-emma.txt"], trainer)

In [6]:
!mkdir tokenizer_gpt

In [7]:
tokenizer.save("tokenizer_gpt/tokenizer.json")

In [8]:
from transformers import GPT2TokenizerFast, GPT2Config, TFGPT2LMHeadModel

In [9]:
tokenizer_gpt = GPT2TokenizerFast.from_pretrained("tokenizer_gpt")

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [10]:
tokenizer_gpt.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

0

In [11]:
tokenizer_gpt.eos_token_id

2

In [12]:
tokenizer_gpt.encode("<s> this is </s>")

[0, 265, 157, 56, 2]

In [13]:
config = GPT2Config(
  vocab_size=tokenizer_gpt.vocab_size,
  bos_token_id=tokenizer_gpt.bos_token_id,
  eos_token_id=tokenizer_gpt.eos_token_id
)
model = TFGPT2LMHeadModel(config)

In [14]:
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.3.2",
  "use_cache": true,
  "vocab_size": 11750
}

In [15]:
with open("austen-emma.txt", "r", encoding='utf-8') as f:
    content = f.readlines()

In [16]:
content_p = []
for c in content:
    if len(c)>10:
        content_p.append(c.strip())

In [17]:
content_p = " ".join(content_p)+tokenizer_gpt.eos_token

In [18]:
tokenized_content = tokenizer_gpt.encode(content_p)

In [19]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(tokenized_content)):
    examples.append(tokenized_content[i:i + block_size])

In [20]:
train_data = [] 
labels = [] 
for example in examples: 
    train_data.append(example[:-1]) 
    labels.append(example[1:])

In [21]:
# change 1000 if you want to train on full data
dataset = tf.data.Dataset.from_tensor_slices((train_data[:1000], labels[:1000]))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [22]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)

In [23]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [24]:
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [25]:
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [26]:
# increase number of epochs for higher accuracy and lower loss
num_epoch = 1
history = model.fit(dataset, epochs=num_epoch)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method






In [27]:
def generate(start):  
    input_token_ids = tokenizer_gpt.encode(start, return_tensors='tf')  
    output = model.generate(  
        input_token_ids,  
        max_length = 10,  
        num_beams = 5,  
        temperature = 0.7,  
        no_repeat_ngram_size=2,  
        num_return_sequences=1  
    )  
    return tokenizer_gpt.decode(output[0])

In [28]:
generate(" ")

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


'  of her. the had, and the,'

In [29]:
generate("wetson was very good")

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


'wetson was very good, and the,'

In [30]:
!mkdir my_gpt-2

In [31]:
model.save_pretrained("my_gpt-2/")

In [32]:
model_reloaded = TFGPT2LMHeadModel.from_pretrained("my_gpt-2/")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at my_gpt-2/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [33]:
from transformers import WEIGHTS_NAME, CONFIG_NAME, TF2_WEIGHTS_NAME, AutoModel, AutoTokenizer

In [34]:
tokenizer_gpt.save_pretrained("tokenizer_gpt_auto/")

('tokenizer_gpt_auto/tokenizer_config.json',
 'tokenizer_gpt_auto/special_tokens_map.json',
 'tokenizer_gpt_auto/vocab.json',
 'tokenizer_gpt_auto/merges.txt',
 'tokenizer_gpt_auto/added_tokens.json')

In [35]:
model = AutoModel.from_pretrained("my_gpt-2/", from_tf = True) 
tokenizer = AutoTokenizer.from_pretrained("tokenizer_gpt_auto")

All TF 2.0 model weights were used when initializing GPT2Model.

Some weights of GPT2Model were not initialized from the TF 2.0 model and are newly initialized: ['h.0.attn.bias', 'h.0.attn.masked_bias', 'h.1.attn.bias', 'h.1.attn.masked_bias', 'h.2.attn.bias', 'h.2.attn.masked_bias', 'h.3.attn.bias', 'h.3.attn.masked_bias', 'h.4.attn.bias', 'h.4.attn.masked_bias', 'h.5.attn.bias', 'h.5.attn.masked_bias', 'h.6.attn.bias', 'h.6.attn.masked_bias', 'h.7.attn.bias', 'h.7.attn.masked_bias', 'h.8.attn.bias', 'h.8.attn.masked_bias', 'h.9.attn.bias', 'h.9.attn.masked_bias', 'h.10.attn.bias', 'h.10.attn.masked_bias', 'h.11.attn.bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
