In [37]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving glove.6B.100d.txt to glove.6B.100d.txt
Saving tur.txt to tur (1).txt
User uploaded file "glove.6B.100d.txt" with length 347116733 bytes
User uploaded file "tur (1).txt" with length 33268135 bytes


In [40]:
import tensorflow as tf
import numpy as np

In [41]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding, CuDNNGRU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [42]:
mark_start = 'ssss '
mark_end = ' eeee'

In [43]:
data_src = []
data_dest = []

In [44]:
for line in open('tur.txt', encoding='UTF-8'):
    en_text, tr_text = line.rstrip().split('\t')

    tr_text = mark_start + tr_text + mark_end

    data_src.append(en_text)
    data_dest.append(tr_text)


In [45]:
data_src[80]

'Goodbye!'

In [47]:
data_dest[80]

'ssss Hoşça kalın. eeee'

In [46]:
data_src[300000]

'Tom will certainly try to do that.'

In [48]:
data_dest[300000]

'ssss Tom kesinlikle onu yapmaya çalışacak. eeee'

In [49]:
class TokenizerWrap(Tokenizer):
    def __init__(self, texts, padding, reverse=False, num_words=None):
        Tokenizer.__init__(self, num_words=num_words)

        self.fit_on_texts(texts)

        self.index_to_word = dict(zip(self.word_index.values(), self.word_index.keys()))

        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = 'pre'
        else:
            truncating = 'post'

        self.num_tokens = [len(x) for x in self.tokens]
        self.max_tokens = np.mean(self.num_tokens) + 2 * np.std(self.num_tokens)
        self.max_tokens = int(self.max_tokens)

        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        word = ' ' if token == 0 else self.index_to_word[token]
        return word

    def tokens_to_string(self, tokens):
        words = [self.index_to_word[token] for token in tokens if token != 0]
        text = ' '.join(words)
        return text

    def text_to_tokens(self, text, padding, reverse=False):
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)

        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'

        tokens = pad_sequences(tokens,
                               maxlen=self.max_tokens,
                               padding=padding,
                               truncating=truncating)

        return tokens

In [50]:
tokenizer_src = TokenizerWrap(texts = data_src,
                              padding = 'pre',
                              reverse = True,
                              num_words = None)

In [51]:
tokenizer_dest = TokenizerWrap(texts=data_dest,
                              padding='post',
                              reverse=False,
                              num_words=None)

In [52]:
tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded
print(tokens_src.shape)
print(tokens_dest.shape)

(473035, 11)
(473035, 10)


In [53]:
tokens_dest[20000]

array([    1,    59, 52107,   529,   498,     2,     0,     0,     0,
           0], dtype=int32)

In [54]:
tokens_src[20000]

array([  0,   0,   0,   0,   0,   0,   0,  57, 474,  21,  39], dtype=int32)

In [55]:
tokenizer_src.tokens_to_string(tokens_src[20000])

'him trust we can'

In [56]:
tokenizer_dest.tokens_to_string(tokens_dest[20000])

'ssss ona itimat edebilir miyiz eeee'

In [57]:
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_start

1

In [58]:
token_end = tokenizer_dest.word_index[mark_end.strip()]
token_end

2

In [59]:
encoder_input_data = tokens_src

In [60]:
decoder_input_data = tokens_dest[:, :-1]
decoder_output_data = tokens_dest[:, 1:]

In [61]:
encoder_input_data[200000]

array([   0,    0,    0,    0,    0,    0, 1028,  113,   95,    5,   39],
      dtype=int32)

In [62]:
decoder_input_data[200000]

array([   1, 2391,    4,   18, 4127,   48,    2,    0,    0], dtype=int32)

In [63]:
decoder_output_data[200000]

array([2391,    4,   18, 4127,   48,    2,    0,    0,    0], dtype=int32)

In [64]:
tokenizer_dest.tokens_to_string(decoder_input_data[200000])

'ssss eksik bir şey görebiliyor musun eeee'

In [65]:
tokenizer_dest.tokens_to_string(decoder_output_data[200000])

'eksik bir şey görebiliyor musun eeee'

In [66]:
num_encoder_words = len(tokenizer_src.word_index)
num_decoder_words = len(tokenizer_dest.word_index)

In [67]:
embedding_size = 100

In [77]:
word2vec = {}

with open('glove.6B.100d.txt', encoding = 'UTF-8') as f:

    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype = 'float32')
        word2vec[word] = vec

In [78]:
embedding_matrix = np.random.uniform(-1, 1, (num_encoder_words, embedding_size))
for word, i in tokenizer_src.word_index.items():
    if i < num_encoder_words:
      embedding_vector = word2vec.get(word)
      if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [81]:
from os import name
encoder_input = Input(shape = (None ,), name = 'encoder_input')

In [83]:
encoder_embedding = Embedding(input_dim = num_encoder_words,
                              output_dim = embedding_size,
                               weights = [embedding_matrix],
                              trainable = True,
                              name = 'encoder_embedding')

In [108]:
state_size = 256

In [105]:
encoder_gru1= CuDNNGRU(state_size, name='encoder_gru1', return_sequences = True)
encoder_gru2= CuDNNGRU(state_size, name='encoder_gru2', return_sequences = True )
encoder_gru3= CuDNNGRU(state_size, name='encoder_gru3', return_sequences = True )

In [106]:
def connect_encoder():

    net = encoder_input

    net = encoder_embedding(net)

    net= encoder_gru1(net)
    net= encoder_gru2(net)
    net= encoder_gru3(net)

    encoder_output = net

    return encoder_output

In [87]:
encoder_output = connect_encoder()

In [88]:
decoder_initial_state = Input(shape = (state_size,), name= 'decoder_initial_state')

In [89]:
decoder_input = Input(shape = (None,),name = 'decoder_input')

In [100]:
decoder_embedding = Embedding(input_dim = num_decoder_words,
                               output_dim = embedding_size,
                               name = 'decoder_embedding')

In [101]:
decoder_gru1 = CuDNNGRU(state_size, name = 'decoder_gru1', return_sequences = True)
decoder_gru2 = CuDNNGRU(state_size, name = 'decoder_gru2', return_sequences = True)
decoder_gru3 = CuDNNGRU(state_size, name = 'decoder_gru3', return_sequences = True)

In [102]:
def connect_decoder(initial_state):
    net = decoder_input

    net = decoder_embedding(net)

    net = decoder_gru1(net, initial_state = initial_state)
    net = decoder_gru2(net, initial_state = initial_state)
    net = decoder_gru3(net, initial_state = initial_state)

    decoder_output = decoder_dense(net)

    return decoder_output

In [107]:
decoder_output = connect_decoder(initial_state = encoder_output)

ValueError: ignored