In [10]:
import os
os.environ['KERAS_BACKEND']='tensorflow'
import nltk
nltk.download('stopwords')
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/r/rbond/jorlo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def tokenize_words(input):
    """Lowercase ``input``, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text. The name shadows the ``input`` builtin; it is kept so that
        existing keyword callers continue to work.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # lowercase everything to standardize it
    lowered = input.lower()

    # r'\w+' matches runs of word characters, so punctuation and whitespace
    # never end up inside a token
    tokens = RegexpTokenizer(r'\w+').tokenize(lowered)

    # Build the stop-word collection once, as a set. Previously the word list
    # was re-read for every single token and then scanned linearly.
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)

In [12]:
# Read the whole corpus into memory. The context manager closes the handle
# as soon as the read finishes, and the encoding is stated explicitly so the
# result does not depend on the platform's default locale.
with open('../../texts/communist-manifesto.txt', encoding='utf-8') as corpus_file:
    file = corpus_file.read()


In [13]:
# Normalise the corpus: lowercase, word-tokenise, remove stop words, and
# re-join into one space-separated string.
processed_inputs = tokenize_words(file)


In [14]:
# Character-level vocabulary: every distinct character of the cleaned text,
# in sorted order.
chars = sorted(set(processed_inputs))
# Each character is encoded as its position in that sorted vocabulary.
char_to_num = {ch: idx for idx, ch in enumerate(chars)}

input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 48620
Total vocab: 37


In [15]:
# Number of consecutive characters that make up one input window
seq_length = 100
# Containers for the integer-encoded input windows (x_data) and the
# integer-encoded character that follows each window (y_data)
x_data = []
y_data = []

In [16]:
# Slide a window of seq_length characters over the cleaned text. Each window
# is one input sample and the character immediately after it is its label.
#
# Both lists are rebuilt from scratch here instead of being appended to, so
# re-running this cell cannot silently duplicate the training patterns.

# Encode the text once rather than once per window.
encoded_inputs = [char_to_num[char] for char in processed_inputs]

# Last usable window start is the one whose label is the final character;
# clamp at zero so a text shorter than seq_length yields no samples.
n_windows = max(input_len - seq_length, 0)

x_data = [encoded_inputs[i:i + seq_length] for i in range(n_windows)]
y_data = encoded_inputs[seq_length:seq_length + n_windows]

In [17]:
# One pattern per input window collected in x_data
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 48520


In [18]:
# Arrange the windows as (patterns, seq_length, 1) and divide the integer
# codes by the vocabulary size, which scales them into [0, 1).
window_shape = (n_patterns, seq_length, 1)
X = numpy.reshape(x_data, window_shape) / float(vocab_len)

In [19]:
# One-hot encode the target characters. The width is pinned to the vocabulary
# size: left to infer it, to_categorical uses max(y_data) + 1, which would be
# too narrow whenever the highest-indexed character never occurs as a target.
y = utils.to_categorical(y_data, num_classes=vocab_len)


In [20]:
# Next-character classifier: four stacked LSTM layers, each followed by 20%
# dropout, ending in a softmax over the one-hot target width.
model = Sequential([
    # The first three recurrent layers return the full sequence so the next
    # LSTM layer receives one vector per time step.
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    # The last recurrent layer keeps only its final output.
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam')


In [21]:
# Checkpoint file; the same path is reused, so each save replaces the last.
filepath = "model_weights_saved.hdf5"
# Monitor the training loss and save only when it reaches a new minimum
# (save_best_only=True, mode='min'); verbose=1 logs each decision.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [22]:
# Train for 40 epochs in batches of 256 windows, checkpointing via the
# callbacks defined above. The returned History object is not stored, so it
# is echoed as the cell's output.
model.fit(X, y, epochs=40, batch_size=256, callbacks=desired_callbacks)


Train on 48520 samples
Epoch 1/40
Epoch 00001: loss improved from inf to 2.95219, saving model to model_weights_saved.hdf5
Epoch 2/40
Epoch 00002: loss improved from 2.95219 to 2.91141, saving model to model_weights_saved.hdf5
Epoch 3/40
Epoch 00003: loss improved from 2.91141 to 2.90474, saving model to model_weights_saved.hdf5
Epoch 4/40
Epoch 00004: loss improved from 2.90474 to 2.90176, saving model to model_weights_saved.hdf5
Epoch 5/40
Epoch 00005: loss improved from 2.90176 to 2.89857, saving model to model_weights_saved.hdf5
Epoch 6/40
Epoch 00006: loss did not improve from 2.89857
Epoch 7/40
Epoch 00007: loss improved from 2.89857 to 2.89651, saving model to model_weights_saved.hdf5
Epoch 8/40
Epoch 00008: loss improved from 2.89651 to 2.88610, saving model to model_weights_saved.hdf5
Epoch 9/40
Epoch 00009: loss improved from 2.88610 to 2.73012, saving model to model_weights_saved.hdf5
Epoch 10/40
Epoch 00010: loss improved from 2.73012 to 2.62342, saving model to model_weigh

Epoch 34/40
Epoch 00034: loss improved from 1.67382 to 1.66652, saving model to model_weights_saved.hdf5
Epoch 35/40
Epoch 00035: loss improved from 1.66652 to 1.64840, saving model to model_weights_saved.hdf5
Epoch 36/40
Epoch 00036: loss improved from 1.64840 to 1.64195, saving model to model_weights_saved.hdf5
Epoch 37/40
Epoch 00037: loss improved from 1.64195 to 1.62646, saving model to model_weights_saved.hdf5
Epoch 38/40
Epoch 00038: loss improved from 1.62646 to 1.60866, saving model to model_weights_saved.hdf5
Epoch 39/40
Epoch 00039: loss improved from 1.60866 to 1.59601, saving model to model_weights_saved.hdf5
Epoch 40/40
Epoch 00040: loss improved from 1.59601 to 1.59072, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7ffcb05321d0>

# TF Tutorial

In [232]:
import os
import tensorflow as tf
import numpy as np


In [233]:
def split_input_target(chunk):
    """Turn one sequence into an (input, target) pair offset by one position.

    The input is ``chunk`` without its last element and the target is
    ``chunk`` without its first element, so ``target[i]`` is the element that
    follows ``input[i]`` in the original sequence.

    Parameters
    ----------
    chunk : sliceable sequence
        Anything supporting ``[:-1]`` and ``[1:]`` slicing.

    Returns
    -------
    tuple
        ``(chunk[:-1], chunk[1:])``.
    """
    return chunk[:-1], chunk[1:]

In [234]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    Parameters
    ----------
    labels
        Integer class ids (the ground truth).
    logits
        Unnormalised model outputs; ``from_logits=True`` tells the loss to
        apply the softmax itself.

    Returns
    -------
    The value returned by ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return crossentropy

In [299]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the Embedding -> stateful LSTM -> Dense character model.

    Parameters
    ----------
    vocab_size : int
        Number of distinct character ids; used as both the embedding's input
        size and the width of the output layer.
    embedding_dim : int
        Size of each embedding vector.
    rnn_units : int
        Number of units in the LSTM layer.
    batch_size : int
        Fixed batch dimension baked into the model's input shape (the LSTM is
        created with ``stateful=True``).

    Returns
    -------
    tf.keras.Sequential
        The uncompiled model. The final Dense layer has no activation, so its
        outputs are raw logits.
    """
    # Sequence length is left as None so inputs of any length are accepted.
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    logits_head = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, logits_head])

In [300]:
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read completes instead of leaving it open.
with open('../../texts/communist-manifesto.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')


In [301]:
# Sorted list of the distinct characters that occur in the text
vocab = sorted(set(text))

In [302]:
# Forward lookup: character -> integer id (its position in the sorted vocab)
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: indexing this array with an id returns the character
idx2char = np.array(vocab)

In [303]:
# The whole text as an array of integer character ids
text_as_int = np.array([char2idx[char] for char in text])

In [304]:
# Length of each input sequence, in characters
seq_len = 100

# Number of whole (seq_len + 1)-character chunks in the text.
# NOTE(review): this value is not referenced again in the visible cells.
examples_per_epoch = len(text) // (seq_len + 1)

In [305]:
# Dataset that yields the text one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [306]:
# Group the character ids into chunks of seq_len + 1 (one extra so each chunk
# can be split into an input and a one-step-shifted target); a final short
# chunk is dropped.
sequences = char_dataset.batch(seq_len + 1, drop_remainder = True)


In [307]:
# Split every chunk into an (input, target) pair via split_input_target
dataset = sequences.map(split_input_target)

In [308]:
# Number of (input, target) pairs per training batch
BATCH_SIZE = 64
# Size of the buffer that shuffle() draws from
BUFFER_SIZE = 10000

# Build the training batches straight from `sequences` instead of from the
# previous value of `dataset`. The old self-referential assignment meant that
# re-running this cell shuffled and batched an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Number of distinct characters; sizes the model's embedding and output layers
vocab_size = len(vocab)

In [309]:
# Size of each character embedding vector
embedding_dim = 256
# Number of units in the LSTM layer
rnn_units = 1024

In [310]:
# Training model; its fixed batch dimension matches the dataset's BATCH_SIZE
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size = BATCH_SIZE)

In [311]:
# Adam optimizer with the logits-based sparse cross-entropy defined in loss()
model.compile(optimizer = 'adam', loss = loss)

In [312]:
# Directory that receives the training checkpoints
checkpoint_dir = './training_checkpoints'
# '{epoch}' is a placeholder left in the path for the checkpoint callback
checkpoint_prefix = os.path.join(checkpoint_dir, 'marx_{epoch}')
# Save weights only (not the full model) to the path above
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix, save_weights_only = True)

In [313]:
# Number of passes over the dataset during training
EPOCHS = 100

In [None]:
# Train the model, writing checkpoints through checkpoint_callback; the
# returned object is kept in `history`.
history = model.fit(dataset, epochs = EPOCHS, callbacks = [checkpoint_callback])

Train for 17 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [None]:
# Same architecture as the training model, rebuilt with batch_size = 1 so it
# can be fed a single sequence at a time when generating text.
generative_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size = 1)

In [None]:
# Restore the weights from the latest checkpoint found in checkpoint_dir.
# NOTE(review): presumably latest_checkpoint yields no usable path when the
# directory holds no checkpoint, so training must have run first; confirm.
generative_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
# Build with a batch dimension of 1 and an unspecified sequence length
generative_model.build(tf.TensorShape([1,None]))

In [None]:
def gen_text(model, start_string, num_gen=1000, temperature=1.0, max_overrun=1000):
    """Sample text from ``model`` one character at a time.

    At least ``num_gen`` characters are generated. Sampling then continues
    until the most recently generated character is ``'.'``, so the text stops
    at the end of a sentence, or until ``max_overrun`` additional characters
    have been produced.

    Changes relative to the previous version:
    * ``start_string`` is honoured instead of being overwritten with 'Workers';
    * the ``model`` argument is the one that is called, not the global
      ``generative_model``;
    * the two copy-pasted sampling loops are merged into one;
    * the run-to-end-of-sentence phase is bounded so it cannot loop forever;
    * ``num_gen=0`` no longer raises ``NameError``.

    Parameters
    ----------
    model
        Model to sample from. It is called with a ``(1, length)`` tensor of
        character ids and must expose ``reset_states()``.
    start_string : str
        Seed text. Every character must be a key of the module-level
        ``char2idx`` mapping, otherwise ``KeyError`` is raised.
    num_gen : int, optional
        Minimum number of characters to generate (default 1000).
    temperature : float, optional
        Divisor applied to the logits before sampling (default 1.0). Values
        below 1 sharpen the distribution; values above 1 flatten it.
    max_overrun : int, optional
        Upper bound on the extra characters generated while waiting for a
        ``'.'`` (default 1000).

    Returns
    -------
    str
        ``start_string`` followed by the generated characters.
    """
    # Encode the seed and add a leading batch dimension of 1.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    text_generated = []
    model.reset_states()

    for step in range(num_gen + max_overrun):
        # Once the requested length is reached, stop as soon as the last
        # character generated closes a sentence.
        if step >= num_gen and text_generated and text_generated[-1] == '.':
            break

        predictions = model(input_eval)
        # Drop the batch dimension, then apply the temperature.
        predictions = tf.squeeze(predictions, 0) / temperature
        # Sample from the distribution at the last time step.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Only the newly sampled character is fed back in; the reset_states()
        # call above suggests the model carries its own state between calls.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Generate sample text with the restored model, seeded with 'Workers'
gen_text(generative_model, 'Workers')

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[1]], dtype=int32)>