In [1]:
# = Libraries to read in
import os, sys

# === key keras parts to create model ===

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer # use this instead of nltk
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# put in matrix format and plot networks
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# read in data

dothraki_dict_source = 'dothraki_dict.csv'

# will have issue with text after if to not specify string type
os.chdir('/Users/MStamp/Documents/AIA Workshop/NLP Local Programme Setup/Code Structure/Attempts of Translation')

dothraki_dict = pd.read_csv(dothraki_dict_source, dtype = 'str')


Using TensorFlow backend.


In [2]:

# apply input and output sentences
inputs = list(dothraki_dict['english'])
outputs = [str(word) + ' <eos>' for word in dothraki_dict['dothraki']]
output_sent_inputs = ['<sos> ' + str(word) for word in dothraki_dict['dothraki']]


print(str(len(inputs)) + ' ' + str(len(outputs)) + ' ' + str(len(output_sent_inputs)))

ex_val = 546
print(inputs[ex_val] + '\n' + outputs[ex_val] + '\n' + output_sent_inputs[ex_val])


# ==== Tokenization & Padding 

Total_inputs = len(inputs)

input_tokenizer = Tokenizer(num_words = Total_inputs)

# have to change code type
inputs = [str(word) for word in inputs]

input_tokenizer.fit_on_texts(inputs) # apply with inputs in English - issue as float object has no attribute

input_int_seq = input_tokenizer.texts_to_sequences(inputs)
input_int_seq[ex_val]


word2idx_inputs = input_tokenizer.word_index
# - word2idx_inputs[ex_val]

word2idx_inputs[inputs[ex_val].lower().split(' ')[0]]

# == apply same to output

Total_outputs = len(outputs)

output_tokenizer = Tokenizer(num_words=Total_outputs, filters='') #
output_tokenizer.fit_on_texts(outputs + output_sent_inputs)
# should be here as combi


output_int_seq = output_tokenizer.texts_to_sequences(outputs)
output_input_int_seq = output_tokenizer.texts_to_sequences(output_sent_inputs)

word2idx_outputs = output_tokenizer.word_index

# === Padding need to get max lens in order to correctly pad sequences
max_input_len = max(len(sen) for sen in input_int_seq)
max_output_len = max(len(sen) for sen in output_int_seq) # not output - input - would expect to be longer as have <eos> at end
# smaller in this case - as just word for word changes


encoder_input_seq = pad_sequences(input_int_seq, maxlen = max_input_len)
encoder_input_seq.shape
encoder_input_seq[ex_val]
# as only two words - see three output
decoder_input_seq = pad_sequences(output_input_int_seq, maxlen = max_output_len, padding = 'post')
decoder_input_seq[ex_val] # only two seen in any output



1361 1361 1361
giddy up
hosh <eos>
<sos> hosh


array([  2, 530], dtype=int32)

In [3]:


# == get glove file
# from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_source = '/Users/MStamp/Documents/AIA Workshop/NLP Local Programme Setup/Code Structure/Pre Made Translation Corpus/glove.42B.300d.txt'

# open
glove_file =  open(glove_source, encoding="utf8")

print_out_vals = True
ind = 0
# get dim output
for line in glove_file:
    if print_out_vals:
        if ind % 10000000 == 0: # starts with comma
            print(line)
            
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
    ind += 1
glove_file.close()


, 0.18378 -0.12123 -0.11987 0.015227 -0.19121 -0.066074 -2.9876 0.80795 0.067338 -0.13184 -0.5274 0.44521 0.12982 -0.21823 -0.4508 -0.22478 -0.30766 -0.11137 -0.162 -0.21294 -0.46022 -0.086593 -0.24902 0.46729 -0.6023 -0.44972 0.43946 0.014738 0.27498 -0.078421 0.36009 0.12172 0.4298 -0.055345 0.4495 -0.74444 -0.26702 0.16431 -0.19335 0.13468 0.2887 0.23924 -0.23579 -0.28972 0.20149 0.048135 -0.18322 -0.15492 -0.19255 0.40271 0.16051 0.17721 0.32557 0.011625 -0.42572 0.34205 -0.45865 -0.2486 0.034128 0.03306 -0.057065 0.18136 -0.43638 0.0005709 -0.11935 -0.2195 0.16429 -0.18119 -0.19145 -0.081672 -0.2962 0.25803 0.073848 0.54213 -0.15405 -0.49256 0.091719 0.13329 -0.05253 -0.20518 0.34576 -1.0449 0.072779 -0.0003453 -0.16926 0.051019 -0.14753 0.23848 -0.40749 -0.58278 -0.48695 0.25863 -0.20531 -0.4775 0.40645 -0.038512 -2.403 -0.12421 0.63149 0.089419 0.08557 -0.20757 -0.1617 -0.29506 -0.13948 0.14202 -0.30138 -0.15806 0.52984 0.24229 0.075169 0.13792 0.90416 -0.23647 0.027788 0.099915

In [4]:
num_words = min(Total_inputs, len(word2idx_inputs) + 1)

emb_size = 300

embedding_matrix = zeros((num_words, emb_size))


for word,index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None: # is there some instant of no words having an output
        embedding_matrix[index] = embedding_vector # vector does not match output

example_text = 'fridge'

print(embeddings_dictionary[example_text])

embedding_layer = Embedding(num_words, emb_size , weights=[embedding_matrix], input_length=max_input_len)



[-2.2880e-01 -6.7593e-03 -2.4340e-01  4.9482e-02  5.3523e-01 -4.3863e-02
 -1.7705e+00 -3.4968e-02  3.1769e-01 -5.8027e-01  8.8206e-02 -3.6729e-01
 -5.2296e-02  6.7268e-01 -2.0580e-01 -2.3138e-01 -2.4324e-01 -4.3787e-02
 -1.6773e-01  9.5614e-02  6.2529e-02 -4.7919e-01 -1.9585e-01  2.8838e-01
 -4.2570e-01 -2.1769e-01  3.2729e-01  5.4468e-01  9.3767e-01 -4.7373e-01
 -2.0551e-01 -8.0361e-02 -4.8719e-01  5.2029e-02  5.6219e-01  2.5820e-01
 -1.5082e-01  4.0425e-01 -6.1659e-01 -2.7654e-01 -3.8402e-01 -3.1740e-01
 -2.2097e-01 -4.1795e-01 -1.4965e-01 -4.4497e-02  4.9798e-01 -3.7579e-02
  7.2490e-02 -6.9846e-01 -6.3166e-01  2.8624e-01  6.7615e-02 -5.2663e-02
  2.1006e-01  1.8085e-01  3.5130e-01 -1.3700e-01 -2.7910e-01 -2.1344e-01
  2.4892e-01  5.4115e-02 -1.2008e-02  1.4010e-01  3.4760e-02 -6.4579e-01
  3.2657e-01  1.9191e-01  3.1804e-01  2.8933e-01 -1.7503e-01 -2.1462e-01
 -2.0943e-01 -3.5404e-01  1.5801e-01  3.3117e-01  1.1115e-01 -3.7901e-01
  5.8188e-01  1.6978e-01  1.6933e-01  9.9021e-01 -1

In [5]:
# ==== develop model - additional structure featurea ====

num_words_output = len(word2idx_outputs) + 1

decoder_targets_one_hot = zeros((
        len(inputs),
        max_output_len,
        num_words_output
    ),
    dtype='float32'
)
# to fill with training output

for i, d in enumerate(decoder_input_seq):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1
        
decoder_targets_one_hot[ex_val] 

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [6]:
# === encoder input 

nodes = 256 # unsure of impact to change this

encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(nodes, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

# ==== define decoder

decoder_inputs_placeholder = Input(shape=(max_output_len,))

decoder_embedding = Embedding(num_words_output, nodes)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(nodes, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)


# ==== Combine for Prediction and Error 

decoder_dense = Dense(num_words_output, activation = 'softmax')

decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs_placeholder, 
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [7]:
print(model.summary())
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 3)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 2)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 3, 300)       333000      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 2, 256)       341760      input_2[0][0]                    
____________________________________________________________________________________________

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [9]:
import pydot as pyd
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

keras.utils.vis_utils.pydot = pyd

#Visualize Model

def visualize_model(model):
    return SVG(model_to_dot(model).create(prog='dot', format='svg'))
#create your model
#then call the function on your model
visualize_model(model)

ModuleNotFoundError: No module named 'pydot'

In [1]:

batch_size = 64
epochs = 1

r = model.fit(
    [encoder_input_seq, decoder_input_seq],
    decoder_targets_one_hot,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

NameError: name 'model' is not defined