In [25]:
import os, sys

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd

In [2]:
BATCH_SIZE = 64
EPOCHS = 20
LSTM_NODES =256
NUM_SENTENCES = 5000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

In [3]:
lines = pd.read_csv('turkish.tsv', sep = '\t', header = None)
lines.rename(columns={0:'English',
                          1:'Turkish',
                          2:'Drop'}, 
                 inplace=True)
lines = lines.drop('Drop', axis =1)
lines = lines.values.tolist()

In [4]:
lines

[['Hi.', 'Merhaba.'],
 ['Run!', 'Kaç!'],
 ['Run!', 'Koş!'],
 ['Run.', 'Kaç!'],
 ['Run.', 'Koş!'],
 ['Who?', 'Kim?'],
 ['Wow!', 'Vay canına!'],
 ['Fire!', 'Ateş!'],
 ['Fire!', 'Yangın!'],
 ['Fire!', 'Ateş et!'],
 ['Help!', 'İmdat!'],
 ['Jump!', 'Atla!'],
 ['Jump.', 'Git!'],
 ['Jump.', 'Atla!'],
 ['Wait.', 'Bekle.'],
 ['Do it.', 'Onu yap.'],
 ['Go on.', 'Devam edin!'],
 ['I ran.', 'Koştum.'],
 ['I see.', 'Anlıyorum.'],
 ['I see.', 'Görüyorum.'],
 ['Oh no!', 'Oh hayır!'],
 ['Shoot!', 'Ateş!'],
 ['Shoot!', 'Ateş et!'],
 ['Shoot!', 'Vur!'],
 ['Attack!', 'Saldır!'],
 ['Attack!', 'Hücum!'],
 ['Cheers!', 'Yarasın!'],
 ['Cheers!', 'Şerefe.'],
 ['Freeze!', 'Kımıldama!'],
 ['Freeze!', 'Olduğun yerde kal!'],
 ['Go now.', 'Haydi git.'],
 ['Go now.', 'Git artık.'],
 ['Go now.', 'Gidin artık.'],
 ['Got it?', 'Anladın mı?'],
 ["I'm 19.", 'Ben 19 yaşındayım.'],
 ['No way!', 'Mümkün değil!'],
 ['No way!', 'Asla!'],
 ['Really?', 'Gerçekten mi?'],
 ['Really?', 'Valla mı?'],
 ['Really?', 'Esas mı?'],
 ['Re

In [5]:
input_sentences = []
output_sentences = []
output_sentences_inputs = []

count = 0
for line in lines:
    count += 1

    if count > NUM_SENTENCES:
        break

    input_sentence, output = line[0], line[1]

    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output

    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 5000
num samples output: 5000
num samples output input: 5000


In [6]:
print(input_sentences[172])
print(output_sentences[172])
print(output_sentences_inputs[172])

Follow us.
Bizi izle. <eos>
<sos> Bizi izle.


In [7]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 2022
Length of longest sentence in input: 6


In [8]:
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 4568
Length of longest sentence in the output: 7


In [9]:
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (5000, 6)
encoder_input_sequences[172]: [  0   0   0   0 400  48]


In [10]:
print(word2idx_inputs["follow"])
print(word2idx_inputs["us"])

400
48


In [11]:
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (5000, 7)
decoder_input_sequences[172]: [  2  63 380   0   0   0   0]


In [21]:
type(decoder_input_sequences)

numpy.ndarray

In [12]:
print(word2idx_outputs["<sos>"])
print(word2idx_outputs["bizi"])
print(word2idx_outputs["izle."])

2
63
380


In [13]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open(r'C:\\Users\\Ufuk Altan\\Downloads\\Downloads\\Data Science\\glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [14]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [15]:
print(embeddings_dictionary["ill"])

[ 0.12648    0.1366     0.22192   -0.025204  -0.7197     0.66147
  0.48509    0.057223   0.13829   -0.26375   -0.23647    0.74349
  0.46737   -0.462      0.20031   -0.26302    0.093948  -0.61756
 -0.28213    0.1353     0.28213    0.21813    0.16418    0.22547
 -0.98945    0.29624   -0.62476   -0.29535    0.21534    0.92274
  0.38388    0.55744   -0.14628   -0.15674   -0.51941    0.25629
 -0.0079678  0.12998   -0.029192   0.20868   -0.55127    0.075353
  0.44746   -0.71046    0.75562    0.010378   0.095229   0.16673
  0.22073   -0.46562   -0.10199   -0.80386    0.45162    0.45183
  0.19869   -1.6571     0.7584    -0.40298    0.82426   -0.386
  0.0039546  0.61318    0.02701   -0.3308    -0.095652  -0.082164
  0.7858     0.13394   -0.32715   -0.31371   -0.20247   -0.73001
 -0.49343    0.56445    0.61038    0.36777   -0.070182   0.44859
 -0.61774   -0.18849    0.65592    0.44797   -0.10469    0.62512
 -1.9474    -0.60622    0.073874   0.50013   -1.1278    -0.42066
 -0.37322   -0.50538    0

In [16]:
print(embedding_matrix[539])

[-0.024843    0.47766     0.32437    -0.054239   -0.47622001  1.10430002
  0.014733    0.92413002 -0.0036772  -0.29596999  0.54781997 -0.081146
 -0.021779    0.51880002  0.14273    -0.019756   -0.4429      0.45526001
 -0.34454    -0.81260002 -0.35356     0.019425   -0.26789001 -1.06159997
  0.27105001 -0.70291001 -0.14556    -0.46669    -0.38155001 -0.11217
  0.14527     0.047139    0.076051    0.64837998 -0.051128    1.01119995
 -0.40836999  0.34013     0.75625998 -0.70703     0.23114    -1.01240003
  0.16597    -0.33013001  0.43122     0.12475    -0.87905997  0.45133001
  0.74589002 -0.52077001  0.062784   -0.55373001 -0.17152999  1.02119994
 -0.020486   -3.03209996 -0.26379001 -0.41382     1.19669998  0.45789999
 -0.37781     0.85339999  0.15135001  0.28586    -0.04119    -0.057052
  1.32219994 -0.17764001 -0.21249001 -0.069602    0.17002    -0.30949
  0.51332003 -0.28586999 -0.076631    0.66819     0.11987    -0.17851999
 -0.067827    0.14388999 -0.024034    0.057917    0.062507   

In [17]:
embedding_layer = tf.keras.layers.Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

In [18]:
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)

In [19]:
decoder_targets_one_hot.shape

(5000, 7, 4569)

In [20]:
decoder_targets_one_hot

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [22]:
for i, d in enumerate(decoder_input_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [27]:
encoder_inputs_placeholder = Input((max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_sequences=True)
encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

TypeError: Cannot iterate over a tensor with unknown first dimension.

In [None]:
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [None]:
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
from keras.utils import plot_model
plot_model(model, to_file='model_plot4a.png', show_shapes=True, show_layer_names=True)

In [None]:
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)