# Grapheme to Phoneme

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
from keras.models import Sequential
from keras.layers.core import Masking, Dense
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import RMSprop

Using TensorFlow backend.


##### reading the train

In [15]:
train = pd.read_csv('train.csv', dtype={'Word': 'str', 'Transcription': 'str', 'Id': 'str'})
print(train.head())
train.dropna(inplace=True)
X_raw, y_raw, s2id = train.Word.head(1000), train.Transcription.head(1000), train.Id.head(1000)
X_raw, y_raw, s2id = np.array(X_raw), np.array(y_raw), np.array(s2id)

  Id            Word                  Transcription
0  1       KNOXVILLE                N AA K S V IH L
1  2      MOVIEGOING           M UW V IY G OW IH NG
2  3  PHOTOSYNTHESIS  F OW T OW S IH N TH AH S IH S
3  4           DELIO                   D EY L IY OW
4  5        SWIVELED                S W IH V AH L D


In [16]:
phoneme_data = pd.read_csv('phones.txt', sep = ' ')
phoneme_data.head()
phoneme_enc = defaultdict()
phoneme_dec = defaultdict()
for ph, code in zip(phoneme_data.phoneme, phoneme_data.code):
    phoneme_enc[ph] = code
    phoneme_dec[code] = ph

In [17]:
phoneme_enc['AA']

'a'

##### data prepare

In [18]:
MAX_WORD_LEN = max(map(len, X_raw)) + 2
x_preproc = lambda x: ''.join(['@', x, '#' * (MAX_WORD_LEN - 2 - len(x))])[::-1]
X_raw = map(x_preproc, X_raw)

y_raw = map(lambda z: ''.join(map(lambda zz: phoneme_enc[zz], z.strip().split(' '))), y_raw)
MAX_PHONE_LEN = max(map(len, y_raw)) + 2
y_preproc = lambda y: ''.join(['$', y, '%' * (MAX_PHONE_LEN - 2 - len(y))])
y_raw = map(y_preproc, y_raw)

In [19]:
X_raw[:4]

['########################ELLIVXONK@',
 '#######################GNIOGEIVOM@',
 '###################SISEHTNYSOTOHP@',
 '############################OILED@']

In [20]:
y_raw[:4]

['$watCIqu%%%%%%%%%%%%%%%%%%%%%%%%%',
 '$vHIroyqx%%%%%%%%%%%%%%%%%%%%%%%%',
 '$nyEyCqwFcCqC%%%%%%%%%%%%%%%%%%%%',
 '$imury%%%%%%%%%%%%%%%%%%%%%%%%%%%']

In [21]:
X_chars = sorted(list(set(''.join(X_raw))))
y_chars = sorted(list(set(''.join(y_raw))))

xchar_indices = dict((c, i) for i, c in enumerate(X_chars))
xindices_char = dict((i, c) for i, c in enumerate(X_chars))

ychar_indices = dict((c, i) for i, c in enumerate(y_chars))
yindices_char = dict((i, c) for i, c in enumerate(y_chars))

##### Vectorization...

In [22]:
X = np.zeros((len(X_raw), MAX_WORD_LEN, len(X_chars)), dtype=np.bool)
y = np.zeros((len(y_raw), MAX_PHONE_LEN, len(y_chars)), dtype=np.bool)

for i, word in enumerate(X_raw):
    for t, char in enumerate(word):
        X[i, t, xchar_indices[char]] = 1

for i, word in enumerate(y_raw):
    for t, char in enumerate(word):
        y[i, t, ychar_indices[char]] = 1

In [23]:
X

array([[[False, False,  True, ..., False, False, False],
        [False, False,  True, ..., False, False, False],
        [False, False,  True, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False,  True, ..., False, False, False],
        [False, False,  True, ..., False, False, False],
        [False, False,  True, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False,  True, ..., False, False, False],
        [False, False,  True, ..., False, False, False],
        [False, False,  True, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, 

##### building the model

In [24]:
print(X.shape, y.shape)
max_len = X.shape[1]
n_feats = X.shape[2]
n_outs = y.shape[2]
n_hidden = 32
 
print("Compiling the model ...")
model = Sequential([
    Masking(mask_value=0., input_shape=(max_len, n_feats)),
    LSTM(n_hidden, return_sequences=True),
    LSTM(n_hidden, return_sequences=True),
    LSTM(n_hidden, return_sequences=True),
    TimeDistributed(Dense(n_outs, activation='softmax')),
])
optimizer = RMSprop(lr=0.001, decay=1e-6)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

((106422, 35, 53), (106422, 34, 41))
Compiling the model ...


##### training

In [None]:
print("Training the model ...")
hist = model.fit(X, y, batch_size=16, nb_epoch=10)

In [256]:
print("Training the model ...")
hist = model.fit(X, y, batch_size=5, nb_epoch=5)

Training the model ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [260]:
print("Training the model ...")
hist = model.fit(X, y, batch_size=1, nb_epoch=1)

Training the model ...
Epoch 1/1


##### getting first results

In [290]:
def sample(preds, temperature=0.5):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    #probas = np.random.multinomial(1, preds, 1)
    return yindices_char[np.argmax(preds)]

In [294]:
def ydec(s):
    res = ''
    for t, char in enumerate(s):
        if char != '$' and char != '%':
            res += phoneme_dec[char]
        else:
            res += char
    return res
    
word = X_raw[10]
print("Let's try")
print(word[::-1])
x = np.zeros((1, MAX_WORD_LEN, len(X_chars)))
for t, char in enumerate(word):
    #print char
    x[0, t, xchar_indices[char]] = 1.
#print x

preds = model.predict(x, verbose=0)[0]
print ydec(''.join(map(sample, preds)))

Let's try
@RADY#############
$KRNAH%%%%%%%%%%%%%%


In [None]:
'''train = aux.read_train('train.csv', nlines=1000)
### Approach 1. Joint encoder-decoder
print("Creating train data ...")

print(X.shape, y.shape)
max_len = X.shape[1]
n_feats = X.shape[2]
n_outs = y.shape[2] # In fact it equals X.shape[2]
n_hidden = 32
 
print("Compiling the model ...")
model = Sequential([
    Masking(mask_value=0., input_shape=(max_len, n_feats)),
    LSTM(n_hidden, return_sequences=True),
    LSTM(n_hidden, return_sequences=True),
    LSTM(n_hidden, return_sequences=True),
    TimeDistributed(Dense(n_outs, activation='softmax')),
])
optimizer = RMSprop(lr=0.001, decay=1e-6)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
 
print("Training the model ...")
hist = model.fit(X, y, batch_size=16, nb_epoch=1)
'''