In [3]:
# dependencies
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import cPickle as pickle
from collections import Counter


In [6]:
#dataRead

def read_setntences(filepath):
    """Read a text corpus and split every line of it on '.'.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One entry per line of the file, in file order. Each entry is the
        result of ``line.split('.')``: the pieces keep their surrounding
        whitespace, and the last piece of a line keeps its newline.

    NOTE(review): create_dataset calls ``.split()`` on every sentence it is
    given, so this nested result has to be flattened into plain strings
    before it can be passed there.
    """
    # One consistent accumulator name, and the result is handed back to the
    # caller instead of being dropped when the function ends.
    sentences = []
    with open(filepath, 'r') as corpus:
        for line in corpus:
            sentences.append(line.split('.'))
    return sentences

In [11]:
#DataPreparation
# Characters stripped from both ends of every whitespace-separated token.
_TOKEN_STRIP_CHARS = ',." ;:)(][?!'


def _tokenize(sentence):
    """Split `sentence` on whitespace and strip edge punctuation from each piece."""
    return [word.strip(_TOKEN_STRIP_CHARS) for word in sentence.split()]


def _ranked_vocab(sentences, limit):
    """Return at most `limit` distinct tokens of `sentences`, most frequent first."""
    counts = Counter(word for sentence in sentences for word in _tokenize(sentence))
    # sorted() is stable, so equally frequent tokens keep the Counter's own order.
    ranked = sorted(counts.items(), key=lambda item: -item[1])
    return [word for word, _ in ranked[:limit]]


def _index_maps(vocab, special_tokens):
    """Build word->id and id->word dicts; special tokens take ids 0..k-1."""
    first_id = len(special_tokens)
    word2idx = dict((word, idx + first_id) for idx, word in enumerate(vocab))
    # Assigned after the vocabulary so a special token always wins a name clash.
    for idx, token in enumerate(special_tokens):
        word2idx[token] = idx
    idx2word = dict((idx, word) for word, idx in word2idx.items())
    return word2idx, idx2word


def create_dataset(source_sentences, target_sentences,
                   source_vocab_size=20000, target_vocab_size=30000,
                   max_len=15, max_len_diff_ratio=0.3):
    """Encode a parallel corpus as integer id sequences and filter the pairs.

    Sentences are split on whitespace and each token has the characters
    ``,." ;:)(][?!`` stripped from both ends. A token made only of those
    characters becomes the empty string and is treated like any other word.

    Parameters
    ----------
    source_sentences, target_sentences : iterable of str
        Aligned sentences: item i of one is paired with item i of the other.
    source_vocab_size, target_vocab_size : int, optional
        Keep only this many of the most frequent tokens on each side
        (defaults 20000 and 30000). Any other token is encoded as 0.
    max_len : int, optional
        A pair is kept only if both encoded sentences have at most this many
        tokens (default 15).
    max_len_diff_ratio : float, optional
        A pair is kept only if the two lengths differ by at most this
        fraction of the shorter length (default 0.3).

    Returns
    -------
    tuple
        ``(X, Y, source_word2idx, source_idx2word, source_vocab,
        target_word2idx, target_idx2word, target_vocab)``. ``X`` and ``Y``
        are parallel lists of id lists for the pairs that passed the filters;
        the vocabularies are lists ordered from most to least frequent.
        Source ids: ``'<ukn>'`` = 0, ``'<pad>'`` = 1, words from 2.
        Target ids: ``'<ukn>'`` = 0, ``'<go>'`` = 1, ``'<eos>'`` = 2,
        ``'<pad>'`` = 3, words from 4.

    Raises
    ------
    ValueError
        If the two corpora do not contain the same number of sentences.
    """
    # Materialise once: both corpora are traversed twice and need a length.
    source_sentences = list(source_sentences)
    target_sentences = list(target_sentences)
    if len(source_sentences) != len(target_sentences):
        # Pairing is positional, so a length mismatch means misaligned data.
        raise ValueError(
            'source and target corpora must have the same number of sentences '
            '(got %d and %d)' % (len(source_sentences), len(target_sentences)))

    source_vocab = _ranked_vocab(source_sentences, source_vocab_size)
    target_vocab = _ranked_vocab(target_sentences, target_vocab_size)

    source_word2idx, source_idx2word = _index_maps(
        source_vocab, ['<ukn>', '<pad>'])
    target_word2idx, target_idx2word = _index_maps(
        target_vocab, ['<ukn>', '<go>', '<eos>', '<pad>'])

    X = []
    Y = []
    for source, target in zip(source_sentences, target_sentences):
        # Out-of-vocabulary tokens fall back to id 0 ('<ukn>').
        source_ids = [source_word2idx.get(word, 0) for word in _tokenize(source)]
        target_ids = [target_word2idx.get(word, 0) for word in _tokenize(target)]

        n_source = len(source_ids)
        n_target = len(target_ids)
        if n_source > max_len or n_target > max_len:
            continue
        # Drop pairs whose lengths are too different to be plausible translations.
        if abs(n_source - n_target) <= max_len_diff_ratio * min(n_source, n_target):
            X.append(source_ids)
            Y.append(target_ids)

    return (X, Y, source_word2idx, source_idx2word, source_vocab,
            target_word2idx, target_idx2word, target_vocab)

def save_dataset(file_path, obj):
    """Serialize `obj` to `file_path` with pickle's highest protocol.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(file_path, 'wb') as out_file:
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)

def read_dataset(file_path):
    """Load and return the object pickled in `file_path`.

    NOTE(review): unpickling can run arbitrary code embedded in the file, so
    only use this on pickle files from a trusted source.
    """
    with open(file_path, 'rb') as in_file:
        loaded = pickle.load(in_file)
    return loaded

In [14]:
# Toy parallel corpus: entry i of one list is paired with entry i of the other.
en_sentences = [
    'hello my friend',
    'we need to reboot the server',
]
es_sentences = [
    'hola mi amigo',
    'necesitamos reiniciar el servidor',
]

# Encode the pairs and pickle the whole result of create_dataset to disk.
save_dataset('./data.pkl', create_dataset(en_sentences, es_sentences))

In [21]:
# Load the pickled dataset and unpack its eight parts: the encoded sentence
# pairs first, then the English and the Spanish lookup tables and vocabulary.
(X, Y,
 en_word2idx, en_idx2word, en_vocab,
 es_word2idx, es_idx2word, es_vocab) = read_dataset('data.pkl')

In [22]:
# Show the encoded source sentences that were loaded from the dataset.
print(X)

[[9, 8, 10]]


In [23]:
#CHECK THAT WORKs
print 'Sentence in English - encoded:', X[0]
print 'Sentence in Spanish - encoded:', Y[0]
print 'Decoded:\n------------------------'

for i in range(len(X[0])):
    print en_idx2word[X[0][i]],
    
print '\n'

for i in range(len(Y[0])):
    print es_idx2word[Y[0][i]],

Sentence in English - encoded: [9, 8, 10]
Sentence in Spanish - encoded: [10, 7, 4]
Decoded:
------------------------
hello my friend 

hola mi amigo


In [26]:
# Build the seq2seq graph: per-time-step placeholders, an output projection
# and an embedding + attention encoder/decoder with an LSTM cell.

# Number of encoder steps; same value as the 15-token cap that create_dataset
# applies to both sides of every kept pair.
input_seq_len = 15
# Number of decoder steps.
# NOTE(review): presumably 15 tokens plus <go> and <eos> — confirm against
# the code that builds the decoder feed.
output_seq_len = 17
en_vocab_size = len(en_vocab) + 2 # + <pad>, <ukn>
es_vocab_size = len(es_vocab) + 4 # + <pad>, <ukn>, <eos>, <go>

# placeholders: one int32 placeholder of shape [None] per time step
# (presumably holding the token ids of that step for a whole batch).
encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]

# targets[i] is decoder_inputs[i+1], i.e. the decoder inputs shifted by one step.
targets = [decoder_inputs[i+1] for i in range(output_seq_len-1)]
# add one more target: the last decoder step has no following decoder input,
# so it gets a placeholder of its own
targets.append(tf.placeholder(dtype = tf.int32, shape = [None], name = 'last_target'))
# One float32 placeholder per decoder step.
# NOTE(review): presumably per-step loss weights (e.g. to ignore padding);
# the loss is not defined in this cell — confirm where these are consumed.
target_weights = [tf.placeholder(dtype = tf.float32, shape = [None], name = 'target_w{}'.format(i)) for i in range(output_seq_len)]

# output projection
# `size` is used both as the projection width and as the LSTM cell size below.
size = 512
# proj_w is created as [es_vocab_size, size]; its transpose `w` is
# [size, es_vocab_size], and together with the bias `b` it forms the (W, B)
# pair passed as output_projection.
w_t = tf.get_variable('proj_w', [es_vocab_size, size], tf.float32)
b = tf.get_variable('proj_b', [es_vocab_size], tf.float32)
w = tf.transpose(w_t)
output_projection = (w, b)

# NOTE(review): with feed_previous = False the legacy_seq2seq docs describe
# the decoder as reading every entry of decoder_inputs instead of its own
# previous output — confirm this cell is meant to be the training graph.
# NOTE(review): because output_projection is supplied, `outputs` are
# presumably un-projected decoder outputs that still need (w, b) applied —
# verify against the API reference.
outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                                            encoder_inputs,
                                            decoder_inputs,
                                            tf.contrib.rnn.BasicLSTMCell(size),
                                            num_encoder_symbols = en_vocab_size,
                                            num_decoder_symbols = es_vocab_size,
                                            embedding_size = 100,
                                            feed_previous = False,
                                            output_projection = output_projection,
                                            dtype = tf.float32)