In [1]:
import numpy as np
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
import gensim

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

Using TensorFlow backend.


TensorFlow Version: 1.1.0
Default GPU Device: /gpu:0


In [2]:
# Load word2vec model
w2v = gensim.models.KeyedVectors.load('data/w2v-773752559-1000000-300-5-5-OpenSubtitles2016.bin')

In [3]:
def get_inputs(output_dim=300):
    """
    Create TF Placeholders for input, targets, learning_rate and input_sequence_length.
    :return: Tuple (input_, targets, learning_rate, keep_prob, input_sequence_length)
    """

    input_ = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.float32, [None, output_dim])
    learning_rate = tf.placeholder(tf.float32)
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    input_sequence_length = tf.placeholder(tf.int32, [None], name="input_sequence_length")
    
    return (input_, targets, learning_rate, keep_prob, input_sequence_length)

In [4]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob, inputs, num_classes, input_sequence_length):
    ''' Build LSTM cell.
    
        Arguments
        ---------
        keep_prob: Scalar tensor (tf.placeholder) for the dropout keep probability
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        batch_size: Batch size

    '''
    
    # one_hot encode input
    x_one_hot = tf.one_hot(inputs, num_classes) # num_classes = len(vocab)
    
    def build_cell(rnn_size):
        cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.contrib.layers.xavier_initializer())
        return cell
    
    # Construct a stacked tf.contrib.rnn.LSTMCell...
    stacked_cell = tf.contrib.rnn.MultiRNNCell([build_cell(lstm_size) for _ in range(num_layers)])
    # ...wrapped in a tf.contrib.rnn.DropoutWrapper
    cell = tf.contrib.rnn.DropoutWrapper(stacked_cell, output_keep_prob=keep_prob)
    
    # Pass cell and embedded input to tf.nn.dynamic_rnn()
    rnn_output, rnn_state = tf.nn.dynamic_rnn(cell, x_one_hot, sequence_length=input_sequence_length, dtype=tf.float32)
    
    # Initial state
    initial_state = tf.identity(stacked_cell.zero_state(batch_size, tf.float32), name="initial_state")
    
    return rnn_output, rnn_state, initial_state

We only care about the final rnn cell output. So we need to grab it with outputs[:, -1].

In [5]:
def build_output(cell, output_dim):
    return tf.contrib.layers.fully_connected(inputs=cell[:, -1], num_outputs=output_dim, activation_fn=None)

In [6]:
def get_loss(pred, Y):
    pred=tf.nn.l2_normalize(pred,0)
    Y=tf.nn.l2_normalize(Y,0)
    
    return tf.losses.cosine_distance(pred, Y, dim=1)
    #return tf.reduce_sum(tf.multiply(pred,Y))

In [7]:
# build vocabulary
vocab = sorted(set(" ".join(w2v.wv.index2word)))
vocab_to_int = {c: i for i, c in enumerate(vocab,1 )}

In [22]:
batch_size=256
lstm_size=512
num_layers=2
keep_probability=0.8
num_classes=len(vocab)
output_dim=300
num_epochs=100
learning_rate=0.001
save_dir = './save'

# Create the graph object
graph = tf.Graph()

with graph.as_default():
    (input_, targets, lr, keep_prob, input_sequence_length) = get_inputs()
    rnn_output, rnn_state, initial_state = build_lstm(lstm_size, num_layers, batch_size, keep_prob, input_, num_classes, input_sequence_length)
    output = build_output(rnn_output, output_dim)
    loss = get_loss(output, targets)
    
    # Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    gradients = optimizer.compute_gradients(loss)
    # clip gradients
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    
        
#     # Optimizer for training, using gradient clipping to control exploding gradients
#     tvars = tf.trainable_variables()
#     grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
#     train_op = tf.train.AdamOptimizer(learning_rate)
#     optimizer = train_op.apply_gradients(zip(grads, tvars))

In [9]:
# def word2seq(word):
#     return np.array([vocab_to_int.get(c,0) for c in word])

# def get_train_subset(model=w2v, seed_words=500, topn=7):
#     top_words=model.wv.index2word[120:120+seed_words]
#     top_words=np.append(np.array(top_words),np.array([np.array(model.most_similar_cosmul(w, topn=topn))[:,0] for w in top_words]))
#     top_words=top_words.flatten()
#     top_words=set(top_words)
#     return top_words

# input_list=list(get_train_subset())

In [10]:
# def set_2_int(train_set=train_set):
#     return [[vocab_to_int[w] for w in l] for l in train_set]
#     pass


def get_padded_int_batch(input_batch, vocab_to_int):
    max_len = max([len(word) for word in input_batch])
    int_batch =  [[0] * (max_len - len(l)) + [vocab_to_int[w] for w in l] for l in input_batch]
    return int_batch


def get_batch(input_list=w2v.wv.index2word, batch_size=batch_size, vocab=vocab, vocab_to_int=vocab_to_int, model=w2v):
    """
    Batch generator.
    Input: train_set - list of words
    Returns touple:
    (pad_input_batch, pad_input_lengths, output_batch)
    """
    for batch_i in range(0, len(input_list)//batch_size):
        start_i = batch_i * batch_size

        # Slice the right amount for the batch
        input_batch = input_list[start_i:start_i + batch_size]
#         print(input_batch)

        # Pad
        pad_input_batch = np.array(get_padded_int_batch(input_batch, vocab_to_int))
#         print (pad_input_batch)

        # Need the lengths for the _lengths parameters
        pad_input_lengths = []
        for line in pad_input_batch:
            pad_input_lengths.append(len(line))

            
        # output batch
        output_batch=np.array([w2v.wv.word_vec(w) for w in input_batch])
        # print (output_batch)
        # print (pad_input_lengths)
        yield (pad_input_batch, pad_input_lengths, output_batch)


# for (batch_i, (pad_input_batch, pad_input_lengths, output)) in enumerate(get_batch(w2v.wv.index2word[:1000], batch_size=50)):
#     print (batch_i)
#     pass

train_input = w2v.wv.index2word[:10000]
valid_input = w2v.wv.index2word[10000:12000]

In [23]:
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        for batch_i, (pad_input_batch, pad_input_lengths, out_vec) in enumerate(get_batch(train_input)):
            # reset state
            state = sess.run(initial_state, {
                input_: pad_input_batch,
                input_sequence_length: pad_input_lengths
                                            })
            _, l = sess.run([train_op, loss], {
                input_: pad_input_batch,
                targets: out_vec,
                lr: learning_rate, 
                keep_prob: keep_probability,
                input_sequence_length: pad_input_lengths,  
            })
        print(l)
        


    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

0.687967
0.68533
0.679318
0.672336
0.668946
0.657595
0.636338
0.614889
0.598381
0.579278
0.568593
0.555491
0.545966
0.533061
0.521892
0.510227
0.497467
0.488603
0.476933
0.460777
0.447078
0.432701
0.420488
0.411255
0.399369
0.388296
0.371705
0.363467
0.349629
0.344336
0.335385
0.326176
0.313569
0.303563
0.29783
0.290841
0.279991
0.277437
0.278125
0.266666
0.261025
0.255475
0.253872
0.245984
0.244423
0.23966
0.233882
0.228223
0.221512
0.218173
0.217728
0.21241
0.207996
0.204128
0.200776
0.198734
0.192925
0.190086
0.184505
0.179385
0.17884
0.175199
0.17434
0.173626
0.171723
0.168765
0.164635
0.159555
0.156445
0.159174
0.15509
0.150107
0.150091
0.1499
0.144651
0.144699
0.142424
0.139775
0.136731
0.135755
0.134516
0.130349
0.131222
0.127474
0.124283
0.124255
0.120589
0.119806
0.1194
0.115225
0.117116
0.114234
0.113239
0.111028
0.109572
0.105979
0.104174
0.105864
0.102838
0.103081
Model Trained and Saved


In [24]:
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    (pad_input_batch, pad_input_lengths, out_vec) = next(get_batch(valid_input))
    state = sess.run(initial_state, {
        input_: pad_input_batch,
        input_sequence_length: pad_input_lengths
        })
    outputs = sess.run(output,{
        input_: pad_input_batch,
        keep_prob: 1.0,
        input_sequence_length: pad_input_lengths,
    })

In [26]:
#outputs[0]

#outputs[0] - w2v.wv.word_vec(valid_input[0])
print(valid_input[0])
w2v.wv.similar_by_vector(outputs[0])

namówić


[('sangrita', 0.18988117575645447),
 ('łuke', 0.18721675872802734),
 ('qingkong', 0.18496420979499817),
 ('kosmicznykowboj', 0.18381938338279724),
 ('zioberek', 0.1813872754573822),
 ('śnieżek', 0.1782398372888565),
 ('naprałdę', 0.17739275097846985),
 ('pielucho-majtki', 0.17712533473968506),
 ('a.d.d.a.d.d.a.d.d.a.d.d.', 0.17548315227031708),
 ('fioletowego', 0.17520149052143097)]