In [1]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Load the autoreload extension and reload all imported modules before each cell runs
%reload_ext autoreload
%autoreload 2
# Ask the inline backend for high-resolution ('retina') figures
%config InlineBackend.figure_format = 'retina'

In [2]:
import numpy as np
import pandas as pd
import pickle
from keras import Input, layers
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, Callback
from keras import backend as K
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

Using TensorFlow backend.


## Load processed data

In [3]:
# NOTE(security): pickle can execute arbitrary code while loading; only load
# files that were produced by this project's own preprocessing step.
# The context manager makes sure the file handle is closed after loading.
with open('data/works_proc.pkl', 'rb') as f:
    data = pickle.load(f)

# Tokenized texts and their index-encoded counterparts (train / validation)
toks_trn, toks_val, ixs_trn, ixs_val = data['toks_trn'], data['toks_val'], data['ixs_trn'], data['ixs_val']
# Vocabulary lookups: token -> index and index -> token
stoi, itos = data['stoi'], data['itos']
# Special tokens and the sequence length stored by preprocessing
UNK, PAD, BOS, EOS, max_len = data['UNK'], data['PAD'], data['BOS'], data['EOS'], data['max_len']

In [4]:
# Language-model pairs: the input drops the last token of every sequence,
# the target drops the first one (i.e. the target is the input shifted by one step)
X_trn, y_trn = ixs_trn[:, :-1], ixs_trn[:, 1:]
X_val, y_val = ixs_val[:, :-1], ixs_val[:, 1:]

X_trn.shape, y_trn.shape, X_val.shape, y_val.shape

((16196, 199), (16196, 199), (4050, 199), (4050, 199))

In [5]:
# sparse_categorical_crossentropy is fed targets with a trailing axis of size 1,
# so append that axis to the (n_obs, seq_len) target arrays
y_trn_rs = y_trn.reshape((y_trn.shape[0], y_trn.shape[1]) + (1,))
y_val_rs = y_val.reshape((y_val.shape[0], y_val.shape[1]) + (1,))

y_trn_rs.shape, y_val_rs.shape

((16196, 199, 1), (4050, 199, 1))

## Build model

In [309]:
# Clear the Keras backend session before (re)building the model below
K.clear_session()

In [6]:
# Define model attributes
vocab_size = len(itos)          # one entry per token in the index -> token lookup
embed_size = hidden_size = 256  # embedding width and LSTM state width
n_rnn_layers = 2                # number of stacked LSTM layers
dropout = 0.3                   # used for both input and recurrent dropout in the LSTMs

In [377]:
# Save meta data
# The model attributes are taken from the variables defined above instead of
# being repeated as literals, so the saved file always matches the model that
# is actually built.
meta_data = {
    'stoi': stoi,
    'itos': itos,
    'UNK': UNK,
    'PAD': PAD,
    'BOS': BOS,
    'EOS': EOS,
    'max_len': max_len,
    'embed_size': embed_size,
    'hidden_size': hidden_size,
    'n_rnn_layers': n_rnn_layers,
    'dropout': dropout
}

# The context manager flushes and closes the file once the dump is complete
with open('data/meta_data.pkl', 'wb') as f:
    pickle.dump(meta_data, f)

In [7]:
def create_model(vocab_size=vocab_size,
                 embed_size=embed_size,
                 hidden_size=hidden_size,
                 n_rnn_layers=n_rnn_layers,
                 dropout=dropout,
                 embed_layer=None,
                 rnn_layers=None,
                 dense_layer=None,
                 incl_states_in_output=False):
    """Build the language model: embedding -> stacked LSTMs -> softmax dense layer.

    Every LSTM takes its initial hidden and cell state as explicit model
    inputs, so the model's inputs are ``[X] + h0s + c0s``.

    Args:
        vocab_size: input size of the embedding and unit count of the dense layer.
        embed_size: output size of the embedding layer.
        hidden_size: unit count of each LSTM and width of each state input.
        n_rnn_layers: number of stacked LSTM layers.
        dropout: rate used for both `dropout` and `recurrent_dropout` of new LSTMs.
        embed_layer: existing embedding layer to reuse; a new one is created if None.
        rnn_layers: existing LSTM layers to reuse (indexed 0..n_rnn_layers-1);
            new ones are created if None or empty.
        dense_layer: existing dense layer to reuse; a new one is created if None.
        incl_states_in_output: if True the outputs are ``[y] + hs + cs`` (final
            hidden and cell states of every LSTM); otherwise only ``y``.

    Returns:
        The (uncompiled) Keras `Model`. Its summary is printed as a side effect.
    """
    # Define input tensor for X (variable sequence length)
    X = Input(shape=(None, ), name='X')

    # Embed
    if embed_layer is None:
        embed_layer = layers.Embedding(input_dim=vocab_size, output_dim=embed_size, mask_zero=True, name='embed_layer')
    y = embed_layer(X)

    # Feed into RNNs
    # Placeholders for initial-state inputs and final-state outputs
    h0s, c0s = [], []
    hs, cs = [], []
    for i in range(n_rnn_layers):
        # Define input tensors for this layer's initial states
        h0 = Input(shape=(hidden_size, ), name=f'h0_{i}')
        c0 = Input(shape=(hidden_size, ), name=f'c0_{i}')

        if not rnn_layers:
            rnn_layer = layers.LSTM(hidden_size, return_sequences=True, return_state=True, dropout=dropout, recurrent_dropout=dropout, name=f'rnn_layer_{i}')
        else:
            rnn_layer = rnn_layers[i]

        # return_state=True makes the layer return its final states next to the sequence
        y, h, c = rnn_layer(y, initial_state=[h0, c0])

        # Save states
        h0s.append(h0)
        c0s.append(c0)
        hs.append(h)
        cs.append(c)

    # Feed the output to the final dense layer
    if dense_layer is None:
        dense_layer = layers.Dense(vocab_size, activation='softmax', name='dense_layer')
    y = dense_layer(y)

    # Put together
    inputs = [X] + h0s + c0s
    if incl_states_in_output:
        outputs = [y] + hs + cs
    else:
        outputs = y

    model = Model(inputs, outputs)
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would add a stray "None" line)
    model.summary()

    return model

In [303]:
# Build a fresh model using the default attributes (all layers newly created)
model = create_model()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
X (InputLayer)                  (None, None)         0                                            
__________________________________________________________________________________________________
embed_layer (Embedding)         (None, None, 256)    2022912     X[0][0]                          
__________________________________________________________________________________________________
h0_0 (InputLayer)               (None, 256)          0                                            
__________________________________________________________________________________________________
c0_0 (InputLayer)               (None, 256)          0                                            
____________________________________________________________________________________________

In [304]:
# Compile: the sparse loss/metric variants take integer token indices as targets
optimizer, loss = 'adam', 'sparse_categorical_crossentropy'
metrics = ['sparse_categorical_accuracy']

model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

## Train model

In [8]:
# Initialize states with zeros
def init_states(n_obs, hidden_size=hidden_size, n_rnn_layers=n_rnn_layers):
    """Create zero initial hidden and cell states for every RNN layer.

    Args:
        n_obs: number of observations (rows) the states are created for.
        hidden_size: width of each state array.
        n_rnn_layers: number of RNN layers, i.e. number of arrays per list.

    Returns:
        Tuple ``(h0s, c0s)`` of two lists, each holding `n_rnn_layers`
        zero arrays of shape ``(n_obs, hidden_size)``.
    """
    state_shape = (n_obs, hidden_size)
    # Every state gets its own array: sharing one array between h0 and c0
    # would let an in-place write to one silently change the other.
    h0s = [np.zeros(shape=state_shape) for _ in range(n_rnn_layers)]
    c0s = [np.zeros(shape=state_shape) for _ in range(n_rnn_layers)]

    return h0s, c0s

# One row of zero states per training / validation observation
h0s_trn, c0s_trn = init_states(n_obs=len(X_trn))
h0s_val, c0s_val = init_states(n_obs=len(X_val))

In [306]:
# Add callbacks (all of them watch the validation loss)
# Multiply the learning rate by 0.1 after 5 epochs without improvement
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1)
# Stop training after 10 epochs without improvement
stopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
# Keep only the best model seen so far on disk
checkpoint = ModelCheckpoint(filepath='model/model.h5', monitor='val_loss', save_best_only=True, verbose=1)

callbacks = [reduce_lr, stopper, checkpoint]

In [307]:
# Other training attributes: mini-batch size and maximum number of epochs
batch_size, epochs = 64, 30

In [None]:
# Train
# Model inputs are ordered as [tokens, hidden states..., cell states...]
hist = model.fit(
    x=[X_trn, *h0s_trn, *c0s_trn],
    y=y_trn_rs,
    validation_data=([X_val, *h0s_val, *c0s_val], y_val_rs),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks)

## Evaluate model

In [None]:
# Reload the last checkpoint written by the ModelCheckpoint callback
# (save_best_only=True, so this is the model with the best val_loss)
model = load_model('model/model.h5')

In [21]:
# Extract trained layers from the reloaded checkpoint
# (`model` is the object returned by load_model above; the layer names are the
# ones assigned in create_model)
embed_layer = model.get_layer('embed_layer')
rnn_layers = [model.get_layer(f'rnn_layer_{i}') for i in range(n_rnn_layers)]
dense_layer = model.get_layer('dense_layer')

In [22]:
# Reconstruct the model around the trained layers, this time also returning
# the final LSTM states so generation can carry them from step to step
model_w_states = create_model(
    embed_layer=embed_layer,
    rnn_layers=rnn_layers,
    dense_layer=dense_layer,
    incl_states_in_output=True,
)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
X (InputLayer)                  (None, None)         0                                            
__________________________________________________________________________________________________
embed_layer (Embedding)         (None, None, 256)    2022912     X[0][0]                          
__________________________________________________________________________________________________
h0_0 (InputLayer)               (None, 256)          0                                            
__________________________________________________________________________________________________
c0_0 (InputLayer)               (None, 256)          0                                            
____________________________________________________________________________________________

In [12]:
# Detokenizer used to join generated tokens back into a single string
detokenizer = Detok()

In [13]:
# Predict with a seed phrase
def sample_w_pred_prob(pred_probs, temperature=.5, avoid_unk=False):
    """Pick a vocabulary index from a predicted probability distribution.

    Args:
        pred_probs: array of predicted probabilities; it is flattened, so it
            should describe a single distribution over the vocabulary.
        temperature: divisor applied to the log-probabilities before they are
            re-normalized and sampled from. 0 means greedy selection (argmax).
        avoid_unk: if True the UNK index is never returned, for any temperature.

    Returns:
        The selected index (a numpy integer).
    """
    # flatten() returns a copy, so the in-place edits below never touch the caller's array
    pred_probs = pred_probs.flatten().astype('float64')

    # Adjust probabilities with temperature
    # https://github.com/minimaxir/textgenrnn/blob/master/textgenrnn/utils.py#L16
    if temperature == 0:
        if avoid_unk:
            # -inf guarantees UNK loses the argmax even when it is the most likely token
            pred_probs[stoi[UNK]] = -np.inf
        return pred_probs.argmax()

    # epsilon keeps log() finite for zero probabilities
    log_pred_probs = np.log(pred_probs + K.epsilon()) / temperature
    pred_probs = np.exp(log_pred_probs)

    if avoid_unk:
        # Zero out UNK before normalizing so it can never be drawn
        pred_probs[stoi[UNK]] = 0

    pred_probs = pred_probs / pred_probs.sum()
    # One multinomial draw yields a one-hot vector; argmax recovers the drawn index
    sample_ix = np.random.multinomial(1, pred_probs).argmax()
    return sample_ix

def gen_seq_w_seed(seed_phrase, max_chars=280):
    """Generate a token sequence that continues `seed_phrase`.

    The seed is split on whitespace, prefixed with BOS and mapped to indices
    (tokens missing from `stoi` become UNK). It is run through
    `model_w_states` starting from zero states; afterwards one token at a
    time is sampled and fed back together with the returned LSTM states,
    until PAD or EOS is sampled or the character budget is exhausted.

    Args:
        seed_phrase: whitespace-separated text to start from.
        max_chars: approximate character budget; counts the seed phrase plus,
            for every generated token, its length and one separator character.

    Returns:
        List of tokens: the seed tokens (without BOS) followed by the
        generated tokens.
    """
    # Tokenize and map to indices
    seed_toks = [BOS] + seed_phrase.split()
    seed_ixs = [stoi[tok] if tok in stoi else stoi[UNK] for tok in seed_toks]
    next_ixs = np.array(seed_ixs).reshape(1, -1)

    # Initiate states; their count determines how the model outputs are split,
    # so this works for any number of RNN layers (not just two)
    hs, cs = init_states(1)
    n_layers = len(hs)

    # Generate new words
    gen_toks = []
    chars = len(seed_phrase)
    while True:
        # Model outputs are laid out as [probabilities] + hidden states + cell states
        preds = model_w_states.predict([next_ixs] + hs + cs)
        probs, hs, cs = preds[0], preds[1:1 + n_layers], preds[1 + n_layers:]

        # Only the distribution at the last time step predicts the next token
        next_ix = sample_w_pred_prob(probs[:, -1, :], avoid_unk=True)
        next_tok = itos[next_ix]

        # Count every generated token (including the first one) against the budget
        chars += len(next_tok) + 1
        if next_tok in [PAD, EOS] or chars > max_chars:
            break

        gen_toks.append(next_tok)
        # The sampled token becomes the single-step input for the next prediction
        next_ixs = np.array(next_ix).reshape(1, -1)

    return seed_toks[1:] + gen_toks

In [23]:
# Continue a seed phrase and join the resulting tokens into one string
seed_phrase = 'it is a truth universally acknowledged'
gen_toks = gen_seq_w_seed(seed_phrase=seed_phrase)
detokenizer.detokenize(gen_toks)

'it is a truth universally acknowledged,and that he was not at all ashamed of her,and he was preparing to be the earliest of the evening; and she was obliged to repeat the day before they came to the door,and mr . crawford was left to go out,and the more to be ready.'