In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import re, os, sys
from sklearn.preprocessing import LabelEncoder

PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
sys.path.append(os.path.join(PROJ_ROOT, 'src'))
import word_utils

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras.models import model_from_json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
df = word_utils.load_dictionary()

Size of word dictionary: 117414


### Create a dictionary of all inputs and outputs

In [5]:
# Create a blank list
letr_lst = []

# Options for maximum words
inp, oup = 0, 0

for index, row in df.iterrows():
    # Append each phoneme and letter to a list
    letr_lst.extend(row['phonetics'].split(' '))
    letr_lst.extend(list(row['word']))
    
    if len(row['phonetics'].split(' ')) > oup:
        oup = len(row['phonetics'].split(' '))
    if len(list(row['word'])) > inp:
        inp = len(list(row['word']))

letr_lst.extend(['<PAD>'])
letr_lst = list(set(letr_lst))
print(letr_lst)

['W', 'E', '<PAD>', 'B', 'OW0', 'OY2', 'AO1', 'IY0', 'TH', 'SH', 'F', 'X', 'OY1', 'EY2', 'A', 'HH', 'ER0', 'AA2', 'AH0', 'Y', 'UH1', 'AA0', 'Z', 'EH2', 'L', 'IH1', 'UW1', 'AW0', 'G', 'Q', 'EH1', 'AA1', 'AO2', 'U', 'UH0', 'O', 'T', 'M', 'OW2', 'IH0', 'D', 'IY1', 'CH', 'R', 'N', 'I', 'AW2', 'S', 'ER1', 'AH2', 'UH2', '-', 'OW1', 'IY2', 'AE2', 'NG', 'JH', 'EY0', 'AW1', 'AE1', 'K', 'V', 'P', 'IH2', 'UW0', 'AE0', 'UW2', 'EH0', 'AO0', 'ER2', 'H', 'DH', 'OY0', 'AH1', 'J', 'C', 'AY2', 'AY1', 'AY0', 'ZH', 'EY1']


In [8]:
# Label encode our text to integers
lbl_enc = LabelEncoder()
lbl_enc.fit_transform(letr_lst)
np.save('../model/encoder.npy', lbl_enc.classes_)
lbl_enc.classes_

array(['-', '<PAD>', 'A', 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0',
       'AH1', 'AH2', 'AO0', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0',
       'AY1', 'AY2', 'B', 'C', 'CH', 'D', 'DH', 'E', 'EH0', 'EH1', 'EH2',
       'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'H', 'HH', 'I',
       'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'J', 'JH', 'K', 'L', 'M',
       'N', 'NG', 'O', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'Q',
       'R', 'S', 'SH', 'T', 'TH', 'U', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1',
       'UW2', 'V', 'W', 'X', 'Y', 'Z', 'ZH'], 
      dtype='<U5')

In [5]:
# Test the conversion on a single word
word_utils.single_word_conversion(df, 'Christopher', lbl_enc)

Source: ['C', 'H', 'R', 'I', 'S', 'T', 'O', 'P', 'H', 'E', 'R']
Target: ['K', 'R', 'IH1', 'S', 'T', 'AH0', 'F', 'ER0']
#### TOKENISING ####
Source: [22 38 63 40 64 66 54 61 38 26 63]
Target: [49 63 42 64 66  9 36 30]


### Create test and train datasets

In [6]:
# Maximum length of input and output
# Alphabet size
n_chr = len(lbl_enc.classes_)
print('Max input:', inp, 'Max output:', oup)
print('Size of alphabet:', n_chr)

Max input: 34 Max output: 32
Size of alphabet: 81


In [7]:
X, y = word_utils.generate_sample(df, inp, oup, lbl_enc, n_chr)
print(X[0])
print(y[0])

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]]
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]]


In [8]:
# define LSTM configuration
n_batch = 50
n_epoch = 5

# create LSTM
model = Sequential()
model.add(LSTM(300, input_shape=(inp, n_chr)))
model.add(RepeatVector(oup))
model.add(LSTM(300, return_sequences=True))
model.add(TimeDistributed(Dense(n_chr, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# train LSTM
for i in range(n_epoch):
    X, y = word_utils.generate_sample(df, inp, oup, lbl_enc, n_chr)
    model.fit(X, y, epochs=1, batch_size=n_batch)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 300)               458400    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 32, 300)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 32, 300)           721200    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 32, 81)            24381     
Total params: 1,203,981
Trainable params: 1,203,981
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [11]:
for i in range(n_epoch):
    X, y = word_utils.generate_sample(df, inp, oup, lbl_enc, n_chr)
    model.fit(X, y, epochs=1, batch_size=n_batch)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [13]:
for i in range(10):
    X, y = word_utils.generate_sample(df, inp, oup, lbl_enc, n_chr)
    model.fit(X, y, epochs=1, batch_size=n_batch)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [20]:
for i in range(10):
    X, y = word_utils.generate_sample(df, inp, oup, lbl_enc, n_chr)
    model.fit(X, y, epochs=1, batch_size=n_batch)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [25]:
for i in range(5):
    X, y = word_utils.generate_sample(df, inp, oup, lbl_enc, n_chr)
    model.fit(X, y, epochs=1, batch_size=n_batch)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [27]:
# serialize model to JSON
model_json = model.to_json()
with open("../model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("../model/model.h5")
print("Saved model to disk")

Saved model to disk


In [28]:
# load json and create model
json_file = open('../model/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("../model/model.h5")
print("Loaded model from disk")

Loaded model from disk


In [32]:
# evaluate on some new patterns
input_word = 'cereal'
X = word_utils.new_word(input_word, inp, lbl_enc, n_chr)

result = loaded_model.predict(X, batch_size=1, verbose=0)

output = []
for i in result:
    for a in i:
        a = a.tolist() 
        ind = a.index(max(a))
        output.append(ind)
        
word = lbl_enc.inverse_transform(output)
print(word)
# ['K', 'R', 'IH1', 'S', 'T', 'AH0', 'F', 'ER0']

['S' 'IH1' 'R' 'AH0' 'AH0' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
 '<PAD>' '<PAD>' '<PAD>']
