# NLP Text Generation

# How to Get Started

---

## Before Run
1. Upload the trained model weights `scifi_best_model.hdf5`.
2. Make sure every file path in the notebook points to the correct location.
3. If you want to use the weights uploaded in step 1 instead of retraining, skip (or delete) the `fit()` cell in the `LSTM - Final Model` section.
4. Run the code.

## Generating Text
1. In the `Text Generation` section, you can change `given_word`; the result is stored in `generated_text`.
2. Examples of `given_word`:
  * 'Time travel exists only in the'
  * 'I am not'
  * 'The car'
  
  However, you can type your own `given_word`.


# Load Data

In [None]:
from google.colab import drive
import numpy as np
import string

import gensim
from keras.preprocessing.text import Tokenizer

from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation, Dropout
from keras.callbacks import LambdaCallback

import re
import tensorflow as tf

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def get_txtfile(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, exactly as before.

    Returns:
        The complete file contents as a ``str``.
    """
    # The context manager closes the handle even if ``read`` raises, which
    # the explicit open()/close() pair did not guarantee.
    with open(filename, "r", encoding=encoding) as file:
        return file.read()

In [None]:
# Read the entire corpus into memory as a single string.
# NOTE: this path is hard-coded; change it to wherever your copy of the dataset lives.
text = get_txtfile("/content/drive/My Drive/Colab Notebooks/PGT/CSC8637_DeepLearning/Coursework/Dataset/SciFi/internet_archive_scifi_v3.txt")

Show the beginning of the loaded raw text.

In [None]:
# Preview the first 5000 characters of the raw corpus.
text[:5000]

'MARCH # All Stories New and Complete Publisher Editor IF is published bi-monthly by Quinn Publishing Company, Inc., Kingston, New York. Volume #, No. #. Copyright # by Quinn Publishing Company, Inc. Application for Entry\' as Second Class matter at Post Office, Buffalo, New York, pending. Subscription # for # issues in U.S. and Possessions: Canada # for # issues; elsewhere #. Aiiow four weeks for change of address. All stories appearing in this magazine are fiction. Any similarity to actual persons is coincidental. #c a fcopy. Printed ia U.S. A. A chat with the editor  i #  science fiction magazine called IF. The title was selected after much thought because of its brevity and on the theory it is indicative of the field and will be easy to remember. The tentative title that just morning and couldn\'t remember it until we\'d had a cup of coffee, it was summarily discarded. A great deal of thought and effort lias gone into the formation of this magazine. We have had the aid of several v

In [None]:
# Report the corpus size in characters.
print('text length: ', len(text))

text length:  149326361


# Preprocessing

In [None]:
# sample_text = """Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any case, this isn't true... Well, with a probability of .9 it isn't."""

def standardize_text(raw_text):
    """Split raw text into sentences and tokenise every sentence.

    A sentence boundary is a run of spaces that is followed by an upper-case
    ASCII letter and preceded by ``.`` or ``?``, provided the character two
    positions before that punctuation mark is not an upper-case ASCII letter
    (so a two-letter abbreviation such as "Mr." does not end a sentence).

    Args:
        raw_text: The text to split.

    Returns:
        A list with one entry per sentence; each entry is the list of tokens
        produced by ``text_to_word_sequence`` with its default settings.
    """
    sentence_boundary = re.compile(r'(?<=[^A-Z].[.?]) +(?=[A-Z])')
    pieces = sentence_boundary.split(raw_text)
    return list(map(text_to_word_sequence, pieces))

In [None]:
# Tokenise the whole corpus: one list of word tokens per sentence.
tokenized_sentences = standardize_text(text)

# Word2vec

In [None]:
# Train Word2Vec embeddings on the tokenised corpus.
# NOTE: `size` and `iter` are the gensim 3.x argument names; gensim 4.x
# renamed them to `vector_size` and `epochs`.
word_model = gensim.models.Word2Vec(tokenized_sentences,
                                    size=50, 
                                    min_count=1, 
                                    window=5,
                                    iter=3
                                    )

# `wv.vectors` is the supported name of the embedding matrix; the `wv.syn0`
# alias used previously is deprecated and is removed in gensim 4.
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)



Result embedding shape: (257011, 50)


  pretrained_weights = word_model.wv.syn0


In [None]:
# Sanity-check the embeddings: list the words the model ranks as most
# similar to a sample word.
word_sample = "king"
word_model.wv.most_similar(word_sample)

[('prince', 0.80077064037323),
 ('queen', 0.7969905734062195),
 ('murray', 0.7541114687919617),
 ('founder', 0.7420518398284912),
 ('dictator', 0.7419301867485046),
 ('emperor', 0.7377235889434814),
 ('alexander', 0.716799259185791),
 ('elder', 0.716194212436676),
 ('leinster', 0.7158246636390686),
 ('youngest', 0.7124871015548706)]

# Prepare Train Data

In [None]:
# Token count of the longest sentence in the corpus.
maxlen = max(map(len, tokenized_sentences))
print("Max Length:", maxlen)

Max Length: 404


In [None]:
def word2idx(word):
    """Return the integer index that the Word2Vec vocabulary assigns to ``word``.

    The lookup is a plain subscript on ``word_model.wv.vocab``, so a word
    that is not in the vocabulary makes it fail (presumably with a
    ``KeyError``; verify against the installed gensim version).
    """
    vocab_entry = word_model.wv.vocab[word]
    return vocab_entry.index
  
def idx2word(idx):
    """Return the vocabulary word stored at position ``idx`` (inverse of ``word2idx``)."""
    index_to_word = word_model.wv.index2word
    return index_to_word[idx]

In [None]:
# Build next-word training pairs: every word of a sentence except the last
# is the input context, and the last word is the target.
train_x = np.zeros([len(tokenized_sentences), maxlen], dtype=np.int32)
train_y = np.zeros([len(tokenized_sentences)], dtype=np.int32)

for i, sentence in enumerate(tokenized_sentences):
    context = [word2idx(word) for word in sentence[:-1]]
    if context:
        # Right-align the context (pad on the LEFT). The network predicts
        # from the state reached after the final timestep and has no
        # masking, so trailing padding would put up to `maxlen` filler steps
        # between the real words and the prediction, while text generation
        # feeds unpadded prompts that end on a real word.
        # NOTE(review): the pad value 0 is itself a valid vocabulary index,
        # so padding is still indistinguishable from that word. Reserving a
        # dedicated pad index needs a matching change to the Embedding layer.
        train_x[i, -len(context):] = context
    train_y[i] = word2idx(sentence[-1])

# LSTM

## Hyperparameter Tuning

In [None]:
# lstm_units = [64, 128, 256]
# dropout_rates = [0.2, 0.3, 0.4]
# learning_rates = [0.01, 0.001, 0.0001]

# for units in lstm_units:
#     for rate in dropout_rates:
#         for lr in learning_rates:
#             # Define checkpoint to save best model
            
#             # Train model
#             model = Sequential()
#             model.add(Embedding(
#                 input_dim=vocab_size, 
#                 output_dim=embedding_size, 
#                 weights=[pretrained_weights]
#                 )
#             )
#             model.add(LSTM(units))
#             model.add(Dropout(rate))
#             model.add(Dense(
#                 units=vocab_size, 
#                 activation="softmax"
#                 )
#             )
#             model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=lr), loss='sparse_categorical_crossentropy')
#             print("--------------------")
#             print('lstm_units:',units)
#             print('dropout_rates:',rate)
#             print('learning_rates:',lr)

#             model.fit(train_x, train_y,
#                     batch_size=128,
#                     epochs=3
#                     )

## Final Model

In [None]:
# Number of rows in the embedding matrix, i.e. the number of distinct words.
print("vocab_size:", vocab_size)

vocab_size: 257011


In [None]:
from keras.layers import Input, LSTM, RepeatVector, TimeDistributed
from keras.models import Model

# Hyperparameters of the final network.
num_units = 128
dropout_rate = 0.3
lr = 0.01

# Embedding initialised from the Word2Vec vectors, two stacked LSTM layers
# (each followed by dropout) and a softmax over the whole vocabulary.
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_size,
              weights=[pretrained_weights]),
    LSTM(units=num_units, return_sequences=True),
    Dropout(dropout_rate),
    LSTM(units=num_units),
    Dropout(dropout_rate),
    Dense(units=vocab_size, activation="softmax"),
])

# Checkpoint: store weights only, and only when the training loss reaches a
# new minimum.
checkpoint_filepath = 'scifi_best_model.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Targets are integer word indices, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=lr),
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          12850550  
                                                                 
 lstm (LSTM)                 (None, None, 128)         91648     
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 257011)            33154419  
                                                                 
Total params: 46,228,201
Trainable params: 46,228,201
No

In [None]:
# Train the network. The checkpoint callback passed here is what persists
# weights during training.
# Skip this cell to reuse previously saved weights instead of retraining.
model.fit(train_x, train_y,
          batch_size=2048,
          epochs=100,
          callbacks=[model_checkpoint_callback]
          )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f90cc1471f0>

In [None]:
# Load the weights stored at `checkpoint_filepath` into the model.
model.load_weights(checkpoint_filepath)

# Text Generation

In [None]:
def sample(preds, temperature=1.0):
    """Pick an index from a probability vector using temperature sampling.

    Args:
        preds: Array-like of probabilities (for example one softmax output row).
        temperature: Values below 1 sharpen the distribution and values
            above 1 flatten it. A value <= 0 disables sampling and returns
            the index of the largest entry.

    Returns:
        The selected index, as returned by ``np.argmax``.
    """
    if temperature <= 0:
        return np.argmax(preds)
    probs = np.asarray(preds).astype('float64')
    # log(0) is -inf, which is the correct logit for an impossible class, so
    # the divide-by-zero warning carries no information here.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift by the largest logit before exponentiating. Without the shift a
    # small temperature underflows every term to 0, the normalisation becomes
    # 0/0 and multinomial() rejects the resulting NaN probabilities.
    logits -= np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

def generate_next(text, num_generated=10, temperature=0.7):
    """Extend a prompt by repeatedly sampling the next word from the model.

    Args:
        text: Prompt to continue. It is tokenised with
            ``text_to_word_sequence`` rather than a plain ``split()`` so that
            punctuation attached to a word does not cause a vocabulary miss.
        num_generated: Number of words to append to the prompt.
        temperature: Sampling temperature forwarded to ``sample``.

    Returns:
        The prompt tokens followed by the generated words, joined by spaces.

    Raises:
        ValueError: If the prompt contains no tokens.
        KeyError: If a prompt word is not in the Word2Vec vocabulary.
    """
    words = text_to_word_sequence(text)
    if not words:
        raise ValueError("the prompt must contain at least one word")
    try:
        word_idxs = [word2idx(word) for word in words]
    except KeyError as err:
        raise KeyError(
            f"prompt word {err} is not in the Word2Vec vocabulary"
        ) from err

    for _ in range(num_generated):
        # The model expects a batch axis: shape (1, sequence_length).
        batch = np.expand_dims(np.array(word_idxs), axis=0)
        prediction = model.predict(batch, verbose=0)
        # The batch holds a single sequence, so the last row is its
        # next-word distribution.
        idx = sample(prediction[-1], temperature=temperature)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

In [None]:
# Prompt to continue; replace it with your own text.
given_word = "The car"
generated_text = generate_next(given_word)

[0, 452]
[[  0 452]]
[0, 452, 10]
[[  0 452  10]]
[0, 452, 10, 116]
[[  0 452  10 116]]
[0, 452, 10, 116, 2403]
[[   0  452   10  116 2403]]
[0, 452, 10, 116, 2403, 42]
[[   0  452   10  116 2403   42]]
[0, 452, 10, 116, 2403, 42, 8]
[[   0  452   10  116 2403   42    8]]
[0, 452, 10, 116, 2403, 42, 8, 10]
[[   0  452   10  116 2403   42    8   10]]
[0, 452, 10, 116, 2403, 42, 8, 10, 8]
[[   0  452   10  116 2403   42    8   10    8]]
[0, 452, 10, 116, 2403, 42, 8, 10, 8, 22]
[[   0  452   10  116 2403   42    8   10    8   22]]
[0, 452, 10, 116, 2403, 42, 8, 10, 8, 22, 8670]
[[   0  452   10  116 2403   42    8   10    8   22 8670]]


In [None]:
# Display the prompt together with the generated continuation.
generated_text

'the car that made empire up it that it said puzzling unnecessary'