# Download Data

In [1]:
!curl -L -o "news-headlines.tsv" "https://raw.githubusercontent.com/PacktPublishing/Advanced-Natural-Language-Processing-with-TensorFlow-2/master/chapter5-nlg-with-transformer-gpt/char-rnn/news-headlines.tsv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 69.8M  100 69.8M    0     0  19.1M      0  0:00:03  0:00:03 --:--:-- 19.0M


# Installing Libraries

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 7.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 58.0 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 71.2 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.2


# Importing Libraries

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import os
import datetime
import time
import matplotlib.pyplot as plt
from transformers import TFOpenAIGPTLMHeadModel, OpenAIGPTTokenizer
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer


# Based on RNN

## Preparing The Data

In [4]:
# Collect every distinct character found in the first column (the headline)
# of the TSV file, to inspect what the raw text contains.
chars = set()
with open('/content/news-headlines.tsv', 'r', encoding='utf-8') as file:
  lines = csv.reader(file,delimiter = '\t')
  for line in lines:
      if line:  # csv.reader yields an empty list for a blank line
          chars.update(line[0])
# No explicit close() is needed: the `with` block closes the file on exit.

In [5]:
print(chars)
# The raw headlines contain many characters (non-Latin scripts, emoji,
# control codes) that are not useful for this model.

{'ٹ', 'δ', '\x92', '⁸', '!', 'g', 'γ', '″', 'ï', ':', 'Y', '\xad', 'ç', 'ο', 'μ', 'D', 'S', 'É', 'y', 'Ã', 'ú', 'Å', 'v', 'j', '\x7f', 'Д', 'J', '\u200a', '丢', '🌮', 'Ñ', 'پ', 'т', '¿', 'ș', 'N', 'X', 'ᶅ', 'ں', 'ش', '¹', 'ھ', 'е', 'گ', '’', 'z', '︿', '0', 'q', 'ó', '}', 'а', 'д', '.', 'K', '¾', 'ￃ', 'm', 'Q', 'Ђ', 'о', '®', '”', '‘', 'ا', 'w', 'ᵒ', '💯', 'ί', '—', 'P', 'ά', 'Ü', "'", 'è', 'c', 'ä', '½', 'κ', 'ن', '¡', '기', '|', 'f', '1', 'á', 'L', '‐', 'H', 'ﬁ', 'ạ', '؛', 'M', '4', '%', 'ν', 'ʻ', 'œ', 'b', '°', '^', 'η', 'ε', 'α', 'ک', '~', '₂', 'ی', 'ë', 'ہ', 'ñ', '5', 'و', 'ج', 'ấ', '\n', 'T', '⊙', 'چ', 'ᴥ', '•', 'h', 'ρ', '닫', '】', 'Ç', '₹', 'ω', 'ئ', 'x', '巴', 'r', 'λ', 'n', 'ù', '\ufeff', 'Ÿ', '·', 'ξ', '™', 'W', 'к', '\x99', 'é', 'ύ', 'e', '`', '어', '/', 'U', 'o', 'ᶘ', '*', 'Č', 'l', '–', 'A', '×', 'O', 'd', '<', 'a', 'ώ', '―', '$', 'ž', 'ã', 'ö', 'ś', '?', 'ǒ', '\t', '이', '‹', 'ث', '【', 'k', 'ø', 'í', '©', '3', 'ط', 'V', '6', 'ü', '{', 'ے', 'н', ']', '🍂', 'Z', '2', 'ر', 'Κ', '@', 

In [6]:
# Build the character vocabulary from a fixed set of allowed characters.
# The backslash is written as "\\": a bare "\|" is an invalid escape sequence
# that Python only tolerates with a warning.
chars = sorted(set("abcdefghijklmnopqrstuvwxyz0123456789 -,;.!?:’’’/\\|_@#$%ˆ&*˜‘+-=()[]{}' ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
EOS = '<EOS>'  # end of sentence
UNK = "<UNK>"  # stands in for any character outside the vocabulary
PAD = "<PAD>"  # must sit at index 0 so the Embedding layer can mask it
# Final layout: PAD first (index 0), the sorted characters, then UNK and EOS.
chars = [PAD] + chars + [UNK, EOS]

In [7]:
# Two-way lookup between characters and integer ids:
#   char2idx: token -> position in `chars`
#   idx2char: array indexed by id, giving the token back
char2idx = dict(zip(chars, range(len(chars))))
idx2char = np.asarray(chars)


In [8]:
# Look up a character that is not in the vocabulary: dict.get returns the
# supplied fallback instead of raising KeyError.
# NOTE(review): the fallback here is the token string '<UNK>', whereas the
# vectorisation cell falls back to the index char2idx[UNK] - confirm which
# one this demonstration is meant to show.
char2idx.get('ا','<UNK>')

'<UNK>'

In [9]:
# Fixed length (in tokens) that every encoded headline is brought to.
MAX_LEN = 75
# Accumulator for the encoded headlines: one list of ids per headline.
data = []

In [10]:
# Padding and Truncation
def encode_headline(text, max_len=MAX_LEN):
  """Encode one headline as exactly `max_len` character ids.

  Layout of the result: the headline's ids (truncated to max_len - 1), then
  <EOS>, then <PAD> up to max_len. <EOS> comes directly after the text so the
  model learns to end a headline with <EOS> rather than with padding.

  Args:
    text: raw headline string. Its final character is dropped, as in the
      original cell.
      NOTE(review): presumably a trailing newline stored in the field -
      confirm against the data file.
    max_len: total length of the returned list.

  Returns:
    A list of `max_len` integer ids.
  """
  unk_id = char2idx[UNK]
  ids = [char2idx.get(c, unk_id) for c in text[:-1]]
  ids = ids[:max_len - 1]          # leave room for <EOS>
  ids.append(char2idx[EOS])
  ids.extend([char2idx[PAD]] * (max_len - len(ids)))
  return ids


# Start from an empty list so re-running this cell does not append the whole
# file a second time.
data = []
with open('/content/news-headlines.tsv', 'r', encoding='utf-8') as file:
  lines = csv.reader(file,delimiter = '\t')
  for line in lines:
    if not line:  # csv.reader yields an empty list for a blank line
      continue
    data.append(encode_headline(line[0]))
print("**** Data file loaded ****")





**** Data file loaded ****


In [11]:
# Turn the list of equal-length id lists into a single NumPy array.
data = np.array(data)

In [12]:
# Shape check: one row per headline, one column per token position.
data.shape

(623272, 75)

In [13]:
# Next-character prediction: inputs are every position but the last, targets
# are the same rows shifted left by one position.
data_in, data_out = data[:, :-1], data[:, 1:]

In [14]:
# Create a TensorFlow Dataset whose elements are (input, target) row pairs.
X = tf.data.Dataset.from_tensor_slices((data_in,data_out))

## Building The Model

In [15]:
# --- Training hyperparameters -------------------------------------------
EPOCHS=7                  # number of passes over the training set
BATCH_SIZE=256            # headlines per batch
embedding_dim = 256       # size of each character embedding vector
rnn_units = 1024          # number of units in the recurrent layer
vocab_size = len(chars)   # number of distinct tokens, special tokens included

# --- Input pipeline -----------------------------------------------------
# Shuffle with a 100000-element buffer (reshuffled every epoch), then form
# fixed-size batches, dropping the final incomplete batch.
x_train = (
    X.shuffle(100000, reshuffle_each_iteration=True)
     .batch(BATCH_SIZE, drop_remainder=True)
)

In [16]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the training network: Embedding -> GRU -> Dense.

  Args:
    vocab_size: number of tokens; sets both the embedding input size and the
      width of the output layer.
    embedding_dim: size of each embedding vector.
    rnn_units: number of GRU units.
    batch_size: fixed batch dimension of the model input.

  Returns:
    An uncompiled tf.keras.Sequential model. The Dense layer has no
    activation, so the outputs are raw scores per token.
  """
  # mask_zero=True makes id 0 a masked value in the embedding.
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      mask_zero=True,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding, recurrent, projection])

In [17]:
# Instantiate the training model with the hyperparameters defined above.
model = build_model(vocab_size,embedding_dim,rnn_units,BATCH_SIZE)

In [18]:
# Print layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (256, None, 256)          24576     
_________________________________________________________________
gru (GRU)                    (256, None, 1024)         3938304   
_________________________________________________________________
dense (Dense)                (256, None, 96)           98400     
Total params: 4,061,280
Trainable params: 4,061,280
Non-trainable params: 0
_________________________________________________________________


In [19]:
# from_logits=True because the final Dense layer has no softmax activation,
# so the model outputs raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

### Learning Rate Scheduler From TF

In [None]:
# Number of batches in the batched training dataset, i.e. optimizer steps
# per epoch.
STEPS_PER_EPOCH = len(x_train)

In [None]:
# Built-in inverse-time decay, starting at 0.001 and decaying continuously
# (staircase=False) with the step count.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=STEPS_PER_EPOCH*(EPOCHS/10),
    decay_rate=2,
    staircase=False,
)

In [None]:
# Adam driven by the decay schedule instead of a constant learning rate.
optimizer = tf.keras.optimizers.Adam(lr_schedule)

In [None]:
# Attach the scheduled optimizer and the logits-based loss to the model.
model.compile(optimizer = optimizer, loss = loss)

In [None]:
# Train for EPOCHS passes over the batched dataset; keep the History object.
history = model.fit(x_train, epochs=EPOCHS)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


### Custom Learning Rate Scheduler

In [20]:
# Custom Callback for Learning Rate Decay
class LearningRateScheduler(tf.keras.callbacks.Callback):
  """Keras callback that applies inverse-time decay once per epoch.

  From `start_epoch` onwards, at the beginning of each epoch the optimizer's
  learning rate is set to

      init_lr / (1 + decay * (epoch / steps))

  Epochs before `start_epoch` leave the learning rate untouched.

  Args:
    init_lr: initial learning rate used as the numerator of the schedule.
    decay: how sharply to decay.
    steps: total number of steps of decay (divides the epoch index).
    start_epoch: first epoch index at which decay is applied.
  """

  def __init__(self,init_lr,decay,steps, start_epoch):
    super().__init__()
    self.init_lr = init_lr  #initial learning rate
    self.decay = decay  # how sharply to decay
    self.steps = steps  # total number of steps of decay
    self.start_epoch = start_epoch  # which epoch to start decaying

  def on_epoch_begin(self, epoch, logs=None):
    """Set the scheduled learning rate on the optimizer for this epoch.

    Raises:
      ValueError: if the model's optimizer has no `learning_rate` attribute.
    """
    optimizer = self.model.optimizer
    # Check and write the same attribute name; the original checked `lr`,
    # read `learning_rate` and wrote `lr`.
    if not hasattr(optimizer, 'learning_rate'):
      raise ValueError('Optimizer must have a "learning_rate" attribute.')

    if epoch < self.start_epoch:
      return  # decay has not started yet

    scheduled_lr = self.init_lr / (1 + self.decay * (epoch / self.steps))
    # Set the value on the optimizer before this epoch starts.
    tf.keras.backend.set_value(optimizer.learning_rate, scheduled_lr)
    print('\nEpoch %05d: Learning rate is %6.8f.' % (epoch,
                                                     scheduled_lr))


In [21]:
# Checkpointing and learning-rate callbacks for the training run.

# Timestamp that names this run's checkpoint folder.
dt = datetime.datetime.today().strftime("%Y-%b-%d-%H-%M-%S")

# One folder per run; one weights file per epoch inside it.
checkpoint_dir = './training_checkpoints/'+ dt
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the full model) under the per-epoch filename.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

# NOTE(review): start_epoch=10 is beyond the epoch counts used in this
# notebook, so this callback never changes the learning rate here - confirm
# that is intended.
lr_decay = LearningRateScheduler(
    init_lr=0.001,
    decay=4.,
    steps=EPOCHS,
    start_epoch=10,
)


In [22]:
# Build a fresh, untrained model for the run that uses the custom callback.
model = build_model(vocab_size,embedding_dim,rnn_units,BATCH_SIZE)

In [23]:
# Plain 'adam' with its default settings; the learning rate is left for the
# callback to adjust.
model.compile(optimizer = 'adam', loss = loss)

In [24]:
# Single-epoch training run with checkpointing; wall-clock time is reported.
EPOCHS=1
start = time.time()
history2 = model.fit(
    x_train,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback, lr_decay],
)
print("**** End Training ****")
print("Training time: ", time.time()- start)

**** End Training ****
Training time:  301.4275789260864


## Generation 

In [25]:
# Settings for the generation model. The layer sizes repeat the training
# values; only the batch size differs.
BATCH_SIZE=1              # generate one sequence at a time
rnn_units = 1024          # number of RNN units
embedding_dim = 256       # embedding dimension
vocab_size = len(chars)   # length of the vocabulary in chars

In [26]:
def generate_text(model, start_string, temperature=0.7, num_generate=75,
                  stop_at_eos=False):
  """Sample a continuation of `start_string` one character at a time.

  Args:
    model: character-level model called on a tensor of ids with a leading
      batch dimension of 1, returning logits for every input position.
    start_string: non-empty seed text. Characters that are not in
      `char2idx` are mapped to the <UNK> id instead of raising KeyError.
    temperature: the logits are divided by this value before sampling.
      Low temperatures give more predictable text, higher temperatures
      give more surprising text.
    num_generate: maximum number of characters to sample.
    stop_at_eos: when True, stop as soon as <EOS> is sampled and leave it
      out of the result. The default (False) keeps the previous behaviour
      of always sampling `num_generate` tokens.

  Returns:
    `start_string` followed by the generated tokens joined into one string.

  Raises:
    ValueError: if `start_string` is empty.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character.')

  # A stateful RNN keeps its hidden state between calls; clear it so this
  # generation does not continue from a previous one.
  reset_states = getattr(model, 'reset_states', None)
  if callable(reset_states):
    reset_states()

  # Converting our start string to numbers (vectorizing)
  unk_id = char2idx[UNK]
  input_eval = [char2idx.get(s, unk_id) for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated tokens are collected here
  text_generated = []

  # Here batch size == 1
  for _ in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # sample the next character id from the temperature-scaled logits of
      # the last position
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      if stop_at_eos and idx2char[predicted_id] == EOS:
        break  # end of sentence reached

      # The sampled character becomes the next input to the model.
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [27]:
def build_gen_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the generation network: Embedding -> GRU -> Dense.

  Takes the same arguments as the training builder; unlike it, the
  Embedding layer here is created without `mask_zero`.

  Returns:
    An uncompiled tf.keras.Sequential model producing raw scores per token.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])


# Generation model built with the current (batch size 1) settings.
gen_model = build_gen_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [28]:
# Restore the trained weights into the generation model.
# Training writes each run into its own timestamped folder under
# ./training_checkpoints, so a hard-coded folder name only works for one
# specific run. Pick the most recently modified run folder instead.
checkpoint_root = './training_checkpoints'
run_dirs = []
if os.path.isdir(checkpoint_root):
  run_dirs = [os.path.join(checkpoint_root, name)
              for name in os.listdir(checkpoint_root)]
  run_dirs = [path for path in run_dirs if os.path.isdir(path)]
if not run_dirs:
  raise FileNotFoundError(
      'No training runs found under %s; run the training cell first.'
      % checkpoint_root)

# Directory the checkpoint is loaded from
checkpoint_dir = max(run_dirs, key=os.path.getmtime)

# latest_checkpoint returns None when the folder holds no checkpoint; fail
# with a clear message instead of passing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError('No checkpoint found in %s.' % checkpoint_dir)

gen_model.load_weights(latest_checkpoint)

gen_model.build(tf.TensorShape([1, None]))


In [29]:
# Generate a headline that starts with the seed "Obama".
print(generate_text(gen_model, start_string=u"Obama"))


Obamacare gives bird water main floodin<PAD> celebration in 20<EOS><PAD> at age 4<PAD>o<EOS>is<PAD>i<PAD> th


# GPT

In [None]:
# Load the pretrained 'openai-gpt' tokenizer and TensorFlow LM-head model.
gpttokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
gpt = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=815973.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=458495.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1272610.0, style=ProgressStyle(descript…

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=656.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466312920.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFOpenAIGPTLMHeadModel.

All the layers of TFOpenAIGPTLMHeadModel were initialized from the model checkpoint at openai-gpt.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFOpenAIGPTLMHeadModel for predictions without further training.


In [None]:
# Encode the prompt as a TensorFlow tensor of token ids and show it.
input_ids = gpttokenizer.encode('Robotics is the ', return_tensors='tf')
print(input_ids)

# Generate with default settings up to 100 tokens, prompt included.
greedy_output = gpt.generate(input_ids, max_length=100)

print("Output:\n" + '-' * 100)
print(gpttokenizer.decode(greedy_output[0], skip_special_tokens=True))

tf.Tensor([[5846 9259  544  481]], shape=(1, 4), dtype=int32)
Output:
----------------------------------------------------------------------------------------------------
robotics is the only way to get to the surface. " 
 " i'm not sure i understand. " 
 " the first thing we have to do is find a way to get to the surface. " 
 " but how? " 
 " we have to find a way to get to the surface. " 
 " but how? " 
 " we have to find a way to get to the surface. " 
 " but how? " 
 " we have to find a way to


# GPT2

In [None]:
# Load the pretrained "gpt2" tokenizer and TensorFlow LM-head model.
gpt2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Use the EOS token id as the PAD token id to avoid warnings.
gpt2 = TFGPT2LMHeadModel.from_pretrained("gpt2", 
                                         pad_token_id=gpt2tokenizer.eos_token_id)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=497933648.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Prompt that the generation is conditioned on, as token ids.
input_ids = gpt2tokenizer.encode('Robotics is the ', return_tensors='tf')

# Default decoding, stopping once the output (prompt included) reaches
# 50 tokens.
greedy_output = gpt2.generate(
    input_ids,
    max_length=50,
)

print("Output:\n" + '-' * 50)
print(gpt2tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
Robotics is the vernacular of the future.

The future is not a future where robots are going to be able to do anything. It's a future where robots are going to be able to do anything.

The future is


In [None]:
# Beam search: 20 beams with early stopping enabled, up to 51 tokens.
beam_output = gpt2.generate(
    input_ids,
    num_beams=20,
    early_stopping=True,
    max_length=51,
)

print("Output:\n" + '-' * 50)
print(gpt2tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
Robotics is the vernacular of science fiction and fantasy. It's a genre that has been around for a long time. It's a genre that has been around for a long time. It's a genre that has been around for a long time


In [None]:

# Beam search with 5 beams and no_repeat_ngram_size set to 3.
beam_output = gpt2.generate(
    input_ids,
    max_length=50,
    early_stopping=True,
    num_beams=5,
    no_repeat_ngram_size=3,
)

print("Output:\n" + '-' * 50)
print(gpt2tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
Robotics is the vernacular term for a new kind of robot. It's a robot that can do a lot of things, but it can't do them all. It can do things that other robots can't.

Advertisement




In [None]:
# Beam search returning the 3 best of 7 beams.
beam_outputs = gpt2.generate(
    input_ids,
    max_length=50,
    temperature=0.7,
    early_stopping=True,
    num_beams=7,
    num_return_sequences=3,
    no_repeat_ngram_size=3,
)

print("Output:\n" + '-' * 50)
# Print each returned sequence with its position in the result.
for i, beam_output in enumerate(beam_outputs):
  decoded_text = gpt2tokenizer.decode(beam_output, skip_special_tokens=True)
  print("\n{}: {}".format(i, decoded_text))

Output:
--------------------------------------------------

0: Robotics is the vernacular of the future.

The future of robotics is in the hands of a group of people who have been working on it for a long time. The group is called the Robotics Society of America, or RSA

1: Robotics is the vernacular of the future.

The future of robotics is in the hands of a group of people who have been working on it for a long time. The group is called the Robotics Institute, and it's led by

2: Robotics is the vernacular of the future.

The future of robotics is in the hands of a group of people who have been working on it for a long time. The group is called the Robotics Society of America (RSSA).


In [None]:
# Top-K sampling: sample with top_k=25 at temperature 2.
# (The result keeps the name `beam_output`, although this is sampling.)
tf.random.set_seed(42)  # for reproducible results
beam_output = gpt2.generate(
    input_ids,
    do_sample=True,
    top_k=25,
    temperature=2,
    max_length=50,
)

print("Output:\n" + '-' * 50)
print(gpt2tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
Robotics is the xtrapheatre equivalent. When I started to design drones with some basic knowledge in machine design—what is it, exactly--and the right software, a little bit about human movement and the ability of the person being able


In [None]:
# New, longer prompt for a story-style sample.
input_ids = gpt2tokenizer.encode(
    'In the dark of the night, there was a ',
    return_tensors='tf',
)

# Top-K sampling with top_k=50, up to 200 tokens.
tf.random.set_seed(42)  # for reproducible results
beam_output = gpt2.generate(
    input_ids,
    do_sample=True,
    top_k=50,
    max_length=200,
)

print("Output:\n" + '-' * 50)
print(gpt2tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
In the dark of the night, there was a urn with four thousand five hundred-year fragments. Here were scattered five thousand years in their fragments—what is not, you may not say, four different eras; and the three fragments of the same date, three hundred and seventy-three years, which I am sure of, were all separated into the one hundred and twenty-two pieces to the earth's circumference. It may then be said, therefore, to us that the period of the sixteenth earth-days is the seven hundredth annular year, and we shall learn from that, the twelveteenth is the last earth-year and the eighty-fifth is the last year. It is this day which we shall learn of; it should be, then, therefore, to the fourteenth, the fourteenth being the fourth and the fifty-first of all six hundred-years, the fourth to the twenty-third, the second to the twenty-fourth, the last to


In [None]:
# Another sample with a larger model
gpt2tok_l = GPT2Tokenizer.from_pretrained("gpt2-large")

# Use the EOS token id as the PAD token id to avoid warnings. The id comes
# from this model's own tokenizer, so the cell does not depend on the
# small-model tokenizer having been loaded first.
gpt2_l = TFGPT2LMHeadModel.from_pretrained("gpt2-large",
                                           pad_token_id=gpt2tok_l.eos_token_id)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=764.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3096618024.0, style=ProgressStyle(descr…




All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:

# Same story prompt, encoded with the large model's tokenizer.
input_ids = gpt2tok_l.encode(
    'In the dark of the night, there was a ',
    return_tensors='tf',
)

# Top-K sampling with top_k=25, up to 200 tokens.
tf.random.set_seed(42)  # for reproducible results
beam_output = gpt2_l.generate(
    input_ids,
    do_sample=True,
    top_k=25,
    max_length=200,
)

print("Output:\n" + '-' * 50)
print(gpt2tok_l.decode(beam_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
In the dark of the night, there was a ursine creature standing at the edge of a pond. Its face was as white as snow and it looked to be sleeping. It had a red nose, a nose so large that it was like it was made of the face of a dog. The water beneath its feet had a red colour and it smelled of blood."

The poem was written by Joseph Campbell and later published as The Hero With a Thousand Faces. Campbell's poem is known as the story of the wolf (as is the case for most of his other work). It begins, "You're walking along a path between the hills. In each direction you see another person or thing of interest." The person or thing of interest here being a wolf which had been feeding its young. The only problem with this story is that in the context of a poem about wolves, it's difficult to say what interest the wolf has. The poem does, however, offer a number of clues
