## Neural Language Model

In [1]:
import string
import numpy as np
import pandas as pd
from random import randint
import nltk

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding
from tensorflow.keras.models import load_model

from pickle import dump, load

In [None]:
# import tensorflow as tf
# tf.__version__
# tf.config.list_physical_devices()


### Preprocess data

In [None]:
# Toy corpus: a short news excerpt that the tokenizer in the next cell is fitted on.
data = """
    The sixth European Union (EU) and African Union (AU) Summit taking place in Brussels this week could not have come at a more critical moment. 
    Of the 20 countries the International Rescue Committee (IRC) has identified as at greatest risk of a new, 
    or significantly worsened, humanitarian crisis in the year ahead, more than half are in Africa. 
    The African continent is also home to almost one-third of the world’s refugees. 
    Meanwhile, just 11 percent of Africa’s population is fully vaccinated from COVID-19, in stark contrast with 70 percent in the EU.
    Given that the pandemic has undermined years of hard-won progress by African communities, civil society and governments towards the Sustainable Development Goals, 
    both the EU and AU must urgently get this important work back on track – jointly driving progress towards a more resilient and sustainable future for the African continent.
    """

Assign a unique integer to each word in the text, then convert the sequence of words into a sequence of integers.

In [None]:
# Learn the word -> integer mapping from the corpus, then encode the corpus with it.
corpus = [data]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# A single text goes in, so the single encoded sequence is element 0 of the result.
encoded = tokenizer.texts_to_sequences(corpus)[0]

Get the size of the vocabulary to use later to determine the size of the embeddings.  
We add 1 because the tokenizer numbers words starting from 1 (index 0 is reserved), so the embedding and output layers need one more slot than there are distinct words.

In [None]:
# determine the vocabulary size
# word_index maps each distinct word to its integer id; the extra 1 makes room
# for index 0 in the embedding and output layers.
vocab_size = len(tokenizer.word_index) + 1 
print('Vocabulary Size: %d' % vocab_size)

Create sequences of words to fit the model with one word as input and one word as output.

In [None]:
# Build [current word, next word] pairs from every two consecutive encoded tokens.
sequences = [encoded[pos - 1:pos + 1] for pos in range(1, len(encoded))]

print('Total Sequences: %d' % len(sequences))

In [None]:
# Left-pad every sequence to the length of the longest one.
max_length = max(len(pair) for pair in sequences)
sequences = pad_sequences(sequences, padding='pre', maxlen=max_length)
print('Max Sequence Length: %d' % max_length)

In [None]:
# Column 0 is the input word, column 1 is the word the model should predict.
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [None]:
# Sanity check: X and y are both taken column-wise from `sequences`, so their lengths should match.
X.shape, y.shape

To fit a model that predicts a probability distribution across all words in the vocabulary, we need to turn the output element (y) from a single integer into a one-hot encoding, with a 0 for every word in the vocabulary and a 1 for the actual word.

In [None]:
# one hot encode outputs
# NOTE: `y` is rebound in place, so re-running this cell without re-running the
# split cell above would encode the already-encoded array again.
y = to_categorical(y, num_classes=vocab_size)

### Train neural language model (one word in one word out)
Model properties
- model has an embedding layer to learn the word embedding 
- the input sequence contains a single word therefore input_length = 1
- the model has a single LSTM layer with 50 units
- output layer has a softmax activation function and is comprised of one neuron for each word in the vocabulary

In [None]:
# define the model
def define_model(vocab_size, embedding_dim=10, lstm_units=50, plot_file='model.png'):
    """Build and compile the one-word-in / one-word-out language model.

    The network is Embedding -> LSTM -> Dense(softmax) and is compiled with
    categorical cross-entropy, the Adam optimizer and an accuracy metric.
    A summary is printed as a side effect.

    Args:
        vocab_size: Size of the embedding table and number of output classes.
        embedding_dim: Width of each word embedding vector.
        lstm_units: Number of units in the LSTM layer.
        plot_file: Path the architecture diagram is written to. Pass ``None``
            to skip plotting.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # input_length=1: each sample is a single word index.
    model.add(Embedding(vocab_size, embedding_dim, input_length=1))
    model.add(LSTM(lstm_units))
    # One output neuron per vocabulary entry; softmax turns them into probabilities.
    model.add(Dense(vocab_size, activation='softmax'))

    # compile network
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    if plot_file is not None:
        plot_model(model, to_file=plot_file, show_shapes=True)

    return model

In [None]:
# define model
# Build the network for this vocabulary, then train it on the (word, next word) pairs.
model = define_model(vocab_size)
model.fit(X, y, epochs=5, verbose=1)

In [None]:
# evaluate: predict the word most likely to follow a seed word
in_text = 'african'  # the seed must be a (lower-cased) word that occurs in the training text
print(in_text)

# Use a dedicated name so the corpus encoding held in `encoded` is not overwritten.
seed_encoded = tokenizer.texts_to_sequences([in_text])[0]
if not seed_encoded:
    # Words the tokenizer has never seen encode to an empty list, which cannot be fed to the model.
    print('%r is not in the vocabulary' % in_text)
else:
    # Reverse lookup: integer id -> word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # Shape (1, 1): one sample holding one word index, matching input_length=1.
    seed_batch = np.array([seed_encoded[:1]])
    yhat = int(np.argmax(model.predict(seed_batch), axis=-1)[0])
    if yhat in index_to_word:
        print(index_to_word[yhat])

## Language model multiple input 

### Read and clean document

In [5]:
import string

# turn a doc into clean tokens
def clean_doc(doc):
    """Split raw text into lower-cased, purely alphabetic word tokens.

    Double hyphens are treated as word separators, ASCII punctuation is
    stripped from every whitespace-delimited token, and any token that is
    not entirely alphabetic afterwards is dropped.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in doc.replace('--', ' ').split():
        word = raw.translate(strip_punctuation)
        # Drops numbers, mixed alphanumerics and tokens that were only punctuation.
        if word.isalpha():
            cleaned.append(word.lower())

    return cleaned
 
# save lines to file, one entry per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, separated by newlines.

    No trailing newline is written, so splitting the file content on '\\n'
    yields the original entries back.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)

In [3]:
filename = 'data/republic_clean.txt'
# The context manager closes the file even if read() raises.
with open(filename, 'r') as file:
    doc = file.read()

In [6]:
# Tokenize the raw text and report a preview plus corpus statistics.
tokens = clean_doc(doc)
preview = tokens[:20]
print(preview)
total_tokens, unique_tokens = len(tokens), len(set(tokens))
print('Total Tokens: %d' % total_tokens)
print('Unique Tokens: %d' % unique_tokens)

['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no']
Total Tokens: 216791
Unique Tokens: 10454


In [11]:
# tokens

In [9]:
# organize into sequences of tokens
length = 50 + 1  # 50 input words plus the 1 word to predict

# Slide a window of `length` tokens over the corpus and join each window into one line.
# NOTE: the range stops before len(tokens), so the window ending on the very last
# token is not emitted.
window_ends = range(length, len(tokens))
sequences = [' '.join(tokens[end - length:end]) for end in window_ends]

print('Total Sequences: %d' % len(sequences))

Total Sequences: 216740


In [10]:
# Show only the first few lines; displaying the full list would dump every sequence into the output.
sequences[:3]

['the project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online',
 'project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at',
 'gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at wwwgutenbergorg',
 'ebook of the republic by plato this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it giv

In [None]:
# save sequences to file
# One training line per file line; the training section reloads this same path.
out_filename = 'data/republic_sequences.txt'
save_doc(sequences, out_filename)

### Train model

In [None]:
# load doc into memory
def load_doc(filename):
    """Return the full text content of ``filename``.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
# load
# Read the saved training lines back; `doc` is rebound from the raw text to the file content.
in_filename = 'data/republic_sequences.txt'
doc = load_doc(in_filename)
# save_doc joined the lines with '\n', so splitting on it restores one sequence per element.
lines = doc.split('\n')

In [None]:
# integer encode sequences of words
# NOTE: this rebinds `tokenizer` and `sequences` from the first part of the notebook.
tokenizer = Tokenizer()
# Build the vocabulary from the training lines ...
tokenizer.fit_on_texts(lines)
# ... then map every line to its list of word indices.
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
# vocabulary size
# Number of distinct words plus one extra slot for index 0.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

In [None]:
# Split every encoded line into its leading words (input) and its final word (target).
sequences = np.array(sequences)
X = sequences[:, :-1]
# NOTE(review): the one-hot targets have vocab_size columns per sample — check memory use on the full corpus.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

# Number of input words per sample; the Embedding layer takes it as input_length.
seq_length = X.shape[1]

In [None]:
# Sanity check: X holds seq_length word indices per sample, y one one-hot row per sample.
X.shape, y.shape

In [None]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# return_sequences=True so the second LSTM receives the full sequence of hidden states.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
# One output neuron per vocabulary entry; softmax turns them into probabilities.
model.add(Dense(vocab_size, activation='softmax'))

# summary() prints the table itself and returns None, so wrapping it in print() also printed "None".
model.summary()

In [None]:
# compile and fit
# categorical_crossentropy matches the one-hot encoded targets in `y`.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=16, epochs=5)

In [None]:
# # save the model to file
# model.save('model.h5')

# # save the tokenizer
# dump(tokenizer, open('tokenizer.pkl', 'wb'))

### Generate text

In [None]:
# load doc into memory
def load_doc(filename):
    """Return the full text content of ``filename``.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words that continue ``seed_text``.

    Each step encodes the running text, keeps the last ``seq_length`` word
    indices (left-padding shorter inputs), asks the model for the most likely
    next word and appends that word to the running text.

    Args:
        model: Object whose ``predict`` returns one score row per input sample.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
        seq_length: Number of word indices the model expects per sample.
        seed_text: Text the generation starts from.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not included).
    """
    # Build the reverse lookup once instead of scanning word_index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer and truncate sequences to a fixed length
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # A single sample is predicted, so take element 0 as a plain int.
        yhat = int(np.argmax(model.predict(encoded), axis=-1)[0])

        # An index with no matching word yields '', as the original scan did.
        out_word = index_to_word.get(yhat, '')

        # append to input
        in_text += ' ' + out_word
        result.append(out_word)

    return ' '.join(result)

In [None]:
# # load the model
# model = load_model('model.h5')
 
# # load the tokenizer
# tokenizer = load(open('tokenizer.pkl', 'rb'))

In [None]:
# load cleaned text sequences
# Must be the same path the sequences were saved to earlier in the notebook.
in_filename = 'data/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# Each line holds the input words plus one target word, so the model input is one word shorter.
seq_length = len(lines[0].split()) - 1

In [None]:
# select a seed text
# random.randint includes both endpoints, so the upper bound must be the last valid index.
idx = randint(0, len(lines) - 1)
seed_text = lines[idx]
print(seed_text + '\n')

In [None]:
# generate new text
# Continue the seed with 50 predicted words, using the model and tokenizer from the training section.
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print('generated text')
print(generated)