In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

from nltk.stem import PorterStemmer
from autocorrect import spell

import os
from six.moves import cPickle
import re

In [2]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

sconfig = tf.ConfigProto()
# sconfig.gpu_options.per_process_gpu_memory_fraction = 0.45


Instructions for updating:
Use the retry module or similar alternatives.


In [3]:
MAX_LEN = 25
BATCH_SIZE = 64
NUM_EPOCHS = 10000

stemmer = PorterStemmer()
def process_str(string, bot_input=False, bot_output=False):
    string = string.strip().lower()
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`:]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = string.split(" ")
    string = [re.sub(r"[0-9]+", "NUM", token) for token in string]
    string = [stemmer.stem(re.sub(r'(.)\1+', r'\1\1', token)) for token in string]
    string = [spell(token).lower() for token in string]
    # Truncate string
    while True:
        try:
            string.remove("")
        except:
            break
    if(not bot_input and not bot_output):
        string = string[0:MAX_LEN]
    elif(bot_input):
        string = string[0:MAX_LEN-1]
        string.insert(0, "</start>")
    else:
        string = string[0:MAX_LEN-1]
        string.insert(len(string), "</end>")
    old_len = len(string)
    for i in range((MAX_LEN) - len(string)):
        string.append(" </pad> ")
    string = re.sub("\s+", " ", " ".join(string)).strip()
    return string, old_len

# Load data

In [4]:
data = cPickle.load(open("all_convos.pkl", "rb"))
print(len(data))
user = [item[0] for item in data]
bot = [item[1] for item in data]

10407


## Preprocess data

In [5]:
if(os.path.isfile("user_processed.pkl")):
    user = cPickle.load(open("user_processed.pkl", "rb"))
else:
    user = [process_str(item) for item in user]
    cPickle.dump(user, open("user_processed.pkl", "wb"))

if(os.path.isfile("bot_in_processed.pkl")):
    bot_inputs = cPickle.load(open("bot_in_processed.pkl", "rb"))
else:
    bot_inputs = [process_str(item, bot_input=True) for item in bot]
    cPickle.dump(bot_inputs, open("bot_in_processed.pkl", "wb"))

if(os.path.isfile("bot_out_processed.pkl")):
    bot_outputs = cPickle.load(open("bot_out_processed.pkl", "rb"))
else:
    bot_outputs = [process_str(item, bot_output=True) for item in bot]
    cPickle.dump(bot_outputs, open("bot_out_processed.pkl", "wb"))
    
    
user_lens = np.array([message[1] for message in user]).astype(np.int32)
user = np.array([message[0] for message in user])

bot_inp_lens = np.array([message[1] for message in bot_inputs]).astype(np.int32)
bot_out_lens = np.array([message[1] for message in bot_outputs]).astype(np.int32)

bot_inputs = np.array([message[0] for message in bot_inputs])
bot_outputs = np.array([message[0] for message in bot_outputs])

## Show statistics about length

In [6]:
print("Average user message: {}, average bot message: {}".format(np.mean(user_lens), np.mean(bot_inp_lens)))
print("80th percentile of user lengths: {}, 80th percentile of bot lengths: {}".format(np.percentile(user_lens, 80), np.percentile(bot_inp_lens, 80)))

Average user message: 10.602959546459115, average bot message: 13.784087633323724
80th percentile of user lengths: 17.0, 80th percentile of bot lengths: 25.0


## Extract vocabulary

In [7]:
bow = CountVectorizer()

bow.fit(user.tolist() + bot_inputs.tolist())
vocab = list(bow.vocabulary_.keys())
vocab.insert(0, "NUM")
vocab.insert(0, "UNK")
vocab.insert(0, "</end>")
vocab.insert(0, "</start>")
vocab.insert(0, "</pad>")
cPickle.dump(vocab, open("vocab", "wb"))

## Placeholders

In [8]:
user_ph = tf.placeholder(user.dtype, name="user_placeholder")
bot_inp_ph = tf.placeholder(bot_inputs.dtype, name="bot_inp_placeholder")
bot_out_ph = tf.placeholder(bot_outputs.dtype, name="bot_out_placeholder")

user_lens_ph = tf.placeholder(user_lens.dtype, shape=[None], name="user_len_placeholder")
bot_inp_lens_ph = tf.placeholder(bot_inp_lens.dtype, shape=[None], name="bot_inp_lens_placeholder")
bot_out_lens_ph = tf.placeholder(bot_out_lens.dtype, shape=[None], name="bot_out_lens_placeholder")

## Datasets

In [9]:
tf_user = tf.data.Dataset.from_tensor_slices(user_ph)
tf_bot_inp = tf.data.Dataset.from_tensor_slices(bot_inp_ph)
tf_bot_out = tf.data.Dataset.from_tensor_slices(bot_out_ph)

tf_user_lens = tf.data.Dataset.from_tensor_slices(user_lens_ph)
tf_bot_inp_lens = tf.data.Dataset.from_tensor_slices(bot_inp_lens_ph)
tf_bot_out_lens = tf.data.Dataset.from_tensor_slices(bot_out_lens_ph)


## Data/Iterators

In [10]:
with tf.device("/cpu:0"), tf.name_scope("data"):
    words = tf.contrib.lookup.index_table_from_tensor(mapping=tf.constant(vocab), default_value=3)
    inverse = tf.contrib.lookup.index_to_string_table_from_tensor(mapping=tf.constant(vocab), default_value="UNK", name="inverse_op")

    tf_user = tf_user.map(lambda string: tf.string_split([string])).map(lambda tokens: (words.lookup(tokens)))
    tf_bot_inp = tf_bot_inp.map(lambda string: tf.string_split([string])).map(lambda tokens: (words.lookup(tokens)))
    tf_bot_out = tf_bot_out.map(lambda string: tf.string_split([string])).map(lambda tokens: (words.lookup(tokens)))
    
    data = tf.data.Dataset.zip((tf_user, tf_bot_inp, tf_bot_out, tf_user_lens, tf_bot_inp_lens, tf_bot_out_lens))
    data = data.shuffle(buffer_size=256).batch(BATCH_SIZE)
    data = data.prefetch(10)
    data_iterator = tf.data.Iterator.from_structure(data.output_types, data.output_shapes,
                                                   None, data.output_classes)
    train_init_op = data_iterator.make_initializer(data, name='dataset_init')
    user_doc, bot_inp_doc, bot_out_doc, user_len, bot_inp_len, bot_out_len = data_iterator.get_next()
    user_doc = tf.sparse_tensor_to_dense(user_doc)
    bot_inp_doc = tf.sparse_tensor_to_dense(bot_inp_doc)
    bot_out_doc = tf.sparse_tensor_to_dense(bot_out_doc)

## Embedding

In [11]:
with tf.name_scope("embedding"):
    embedding = tf.get_variable("embedding", [len(vocab), 200], initializer=tf.glorot_uniform_initializer())
    
    embedded_user = tf.nn.embedding_lookup(embedding, user_doc)
    embedded_user_dropout = tf.nn.dropout(embedded_user, 0.7)
    
    embedded_bot_inp = tf.nn.embedding_lookup(embedding, bot_inp_doc)
    embedded_bot_inp_dropout = tf.nn.dropout(embedded_bot_inp, 0.7)
    
    embedded_user_dropout = tf.reshape(embedded_user_dropout, [-1, MAX_LEN, 200])
    embedded_bot_inp_dropout = tf.reshape(embedded_bot_inp_dropout, [-1, MAX_LEN, 200])

## Encoder

In [12]:
with tf.name_scope("encoder"):
    # Build RNN cell
    encoder_GRU = tf.nn.rnn_cell.GRUCell(128)
    encoder_cell_fw = tf.nn.rnn_cell.DropoutWrapper(encoder_GRU, input_keep_prob=0.7, 
                                                 output_keep_prob=0.7, state_keep_prob=0.9)
    
    encoder_cell_bw = tf.nn.rnn_cell.DropoutWrapper(encoder_GRU, input_keep_prob=0.7, 
                                                 output_keep_prob=0.7, state_keep_prob=0.9)
    encoder_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
        encoder_cell_fw, encoder_cell_bw, embedded_user_dropout,
        sequence_length=user_len, dtype=tf.float32)
    encoder_state = tf.concat(encoder_state, -1)

## Projection layer (Output of decoder)

In [13]:
with tf.name_scope("projection"):
    projection_layer = tf.layers.Dense(
    len(vocab), use_bias=False)

## Decoder

In [14]:
with tf.name_scope("decoder"):
    decoder_GRU = tf.nn.rnn_cell.GRUCell(256)
    decoder_cell = tf.nn.rnn_cell.DropoutWrapper(decoder_GRU, input_keep_prob=0.7, 
                                                 output_keep_prob=0.7, state_keep_prob=0.9)
    # Helper for use during training
    # During training we feed the decoder
    # the target sequence
    # However, during testing we use the decoder's
    # last output
    helper = tf.contrib.seq2seq.TrainingHelper(
        embedded_bot_inp_dropout, bot_inp_len)
    # Decoder
    decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, encoder_state,
        output_layer=projection_layer)
    # Dynamic decoding
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
    logits = outputs.rnn_output
    translations = outputs.sample_id

## Loss computation normalized by batch size

In [15]:
with tf.name_scope("loss"):
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(bot_out_doc,
                                                                    [-1, MAX_LEN]), logits=logits)
    mask = tf.sequence_mask(bot_out_len, dtype=tf.float32)
    train_loss = (tf.reduce_sum(loss * mask) / BATCH_SIZE)

## Adam with gradient clipping and learning rate scheduling using cosine decay + restarts

In [16]:
with tf.variable_scope('Adam'):
    global_step = tf.Variable(0, trainable=False)
    inc_gstep = tf.assign(global_step,global_step + 1)
    learning_rate = tf.train.cosine_decay_restarts(0.001, global_step, 550, t_mul=1.1)
    adam_optimizer = tf.train.AdamOptimizer(learning_rate)
    adam_gradients, v = zip(*adam_optimizer.compute_gradients(train_loss))
    adam_gradients, _ = tf.clip_by_global_norm(adam_gradients, 10.0)
    adam_optimize = adam_optimizer.apply_gradients(zip(adam_gradients, v))

## Inference nodes

In [17]:
with tf.variable_scope("inference"):
    # Helper
    # Start token is 1, which is the </start> token
    # End token is 2
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding,
        tf.fill([BATCH_SIZE], 1), 2)

    # Decoder
    decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, encoder_state,
        output_layer=projection_layer)
    # Dynamic decoding
    test_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=10)
    test_translations = tf.identity(test_outputs.sample_id, name="word_ids")
    test_words = tf.identity(inverse.lookup(tf.cast(test_translations, tf.int64)), name="words")

## A function for testing

In [18]:
def testBot(sess):
    text = ["Hello"] + [""] * (BATCH_SIZE - 1)
    num_text = len(text)
    text = [process_str(sentence) for sentence in text]
    text_len = np.array([item[1] for item in text]).astype(np.int32)
    text = np.array([item[0] for item in text])
    
    user_test_ph = tf.placeholder(text.dtype)
    user_test_lens_ph = tf.placeholder(text_len.dtype)
    
    tf_user_test = tf.data.Dataset.from_tensor_slices(user_test_ph).map(lambda string: tf.string_split([string])).map(lambda tokens: (words.lookup(tokens)))
    tf_user_test_lens = tf.data.Dataset.from_tensor_slices(user_test_lens_ph)
    
    test_data = tf.data.Dataset.zip((tf_user_test, tf_bot_inp, tf_bot_out,
                                     tf_user_test_lens, tf_bot_inp_lens, tf_bot_out_lens))
    
    test_data = test_data.batch(num_text).prefetch(1)
    test_init_op = data_iterator.make_initializer(test_data)
    
    sess.run(test_init_op, feed_dict={
        user_test_ph: user,
        bot_inp_ph: bot_inputs[0:num_text],
        bot_out_ph: bot_outputs[0:num_text],
        user_test_lens_ph: user_lens,
        bot_inp_lens_ph: bot_inp_lens[0:num_text],
        bot_out_lens_ph: bot_out_lens[0:num_text]
    })
    translations_text = sess.run(inverse.lookup(tf.cast(test_translations, tf.int64)))
    return translations_text

In [19]:
with tf.name_scope('summaries'):
    tf.summary.scalar('Loss', train_loss)
    tf.summary.scalar('LR', learning_rate)
    merged = tf.summary.merge_all()
    config = projector.ProjectorConfig()
    embedding_vis = config.embeddings.add()
    embedding_vis.tensor_name = embedding.name
    vocab_str = '\n'.join(vocab)
    metadata = pd.Series(vocab)
    metadata.name = "label"
    metadata.to_csv("checkpoints/metadata.tsv", sep="\t", header=True, index_label="index")
    embedding_vis.metadata_path = 'metadata.tsv'

In [21]:
losses = []
print("Started training")

saver = tf.train.Saver()
save_dir = 'checkpoints/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
save_path = os.path.join(save_dir, 'best_validation')

sess = tf.InteractiveSession(config=sconfig)

writer = tf.summary.FileWriter('./checkpoints', sess.graph)
projector.visualize_embeddings(writer, config)


sess.run([words.init, tf.global_variables_initializer(), inverse.init])
step = 0

for i in range(NUM_EPOCHS):
    if(i % 10 == 0):
        saver.save(sess=sess, save_path=save_path, write_meta_graph=True)
    sess.run(train_init_op, feed_dict={
        user_ph: user,
        bot_inp_ph: bot_inputs,
        bot_out_ph: bot_outputs,
        user_lens_ph: user_lens,
        bot_inp_lens_ph: bot_inp_lens,
        bot_out_lens_ph: bot_out_lens
    })

    while True:
        try:
            _, batch_loss, summary = sess.run([adam_optimize, train_loss, merged])
            writer.add_summary(summary, i)
            losses.append(batch_loss)
        except tf.errors.InvalidArgumentError:
            continue
        except tf.errors.OutOfRangeError:
            print("Epoch {}: Loss(Mean): {} Loss(Std): {}".format(i, np.mean(losses), np.std(losses)))
            losses = []
            break
        sess.run(inc_gstep)
        step += 1
    print(testBot(sess)[0])

Started training
Epoch 0: Loss(Mean): 69.88444519042969 Loss(Std): 15.55992603302002
[b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK']
Epoch 1: Loss(Mean): 63.92498016357422 Loss(Std): 6.699320316314697
[b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK']
Epoch 2: Loss(Mean): 62.49559020996094 Loss(Std): 7.132476329803467
[b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'have' b'UNK' b'UNK']
Epoch 3: Loss(Mean): 61.721195220947266 Loss(Std): 6.579246520996094
[b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'UNK' b'</end>' b'UNK' b'UNK']
Epoch 4: Loss(Mean): 58.166316986083984 Loss(Std): 6.813504695892334
[b'UNK' b'UNK' b'have' b'UNK' b'</end>' b'UNK' b'</end>' b'UNK' b'</end>'
 b'UNK']
Epoch 5: Loss(Mean): 54.21249008178711 Loss(Std): 6.547247409820557
[b'UNK' b'</end>' b'UNK' b'have' b'UNK' b'num' b'num' b'star' b'hotel'
 b'UNK']
Epoch 6: Loss(Mean): 52.51726531982422 Loss(Std): 6.151824474334717
[b'UNK' b'have' b'UNK' b'num' b'day' b'to' b'num' 

KeyboardInterrupt: 