In [1]:
import string
import re
from pickle import dump,load
from unicodedata import normalize
from numpy import array

In [2]:
def load_file(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even when
    # read() raises (e.g. on a decoding error).
    with open(filename, mode="rt", encoding="utf-8") as file:
        return file.read()

In [3]:

def get_pairs(doc):
    """Split a tab-separated document into one list of fields per line.

    Surrounding whitespace of the whole document is stripped first, then
    each line is cut on tab characters. The first ten rows are printed as
    a quick preview.

    Args:
        doc: The full text, one record per line.

    Returns:
        A list with one list of string fields per line.
    """
    pairs = []
    for row in doc.strip().split("\n"):
        pairs.append(row.split('\t'))
    print(pairs[:10])
    return pairs

In [4]:
# clean a list of sentence groups (e.g. [english, french] pairs)
def clean_pairs(lines):
    """Normalise every sentence in ``lines`` and return them as a numpy array.

    Each sentence is reduced to ASCII, split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and finally
    stripped of any token that is not purely alphabetic. The first ten
    cleaned rows are printed as a preview.

    Args:
        lines: Iterable of sentence groups, each an iterable of strings.

    Returns:
        A numpy array built from the cleaned, space-joined sentences.
    """
    # matches anything outside string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean_sentence(sentence):
        # decompose accented characters, then drop whatever is not ASCII
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('UTF-8').split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            # discard empty tokens and tokens containing digits
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    cleaned = [[_clean_sentence(sentence) for sentence in pair] for pair in lines]
    print(cleaned[:10])
    return array(cleaned)

In [5]:
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print the path written.

    Args:
        sentences: Any picklable object (here, the cleaned sentence array).
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    # Use a context manager so the file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)
    


In [6]:
# Read the raw tab-separated corpus, clean it and persist the cleaned pairs.
filename = "/home/payas/dl_box/data/names_languages/eng-fra.txt"
doc = load_file(filename)
pairs = get_pairs(doc)
cleaned_pairs = clean_pairs(pairs)
save_clean_data(
    cleaned_pairs,
    '/home/payas/dl_box/data/names_languages/english-french.pkl',
)

# Work on the first n_sentences rows only: rows 0..8999 are used for
# training, the remainder for validation (no shuffling is applied).
n_sentences = 10000
raw_dataset = cleaned_pairs
dataset = raw_dataset[:n_sentences, :]
train, test = dataset[:9000], dataset[9000:]

[['Go.', 'Va !'], ['Run!', 'Cours\u202f!'], ['Run!', 'Courez\u202f!'], ['Wow!', 'Ça alors\u202f!'], ['Fire!', 'Au feu !'], ['Help!', "À l'aide\u202f!"], ['Jump.', 'Saute.'], ['Stop!', 'Ça suffit\u202f!'], ['Stop!', 'Stop\u202f!'], ['Stop!', 'Arrête-toi !']]
[['go', 'va'], ['run', 'cours'], ['run', 'courez'], ['wow', 'ca alors'], ['fire', 'au feu'], ['help', 'a laide'], ['jump', 'saute'], ['stop', 'ca suffit'], ['stop', 'stop'], ['stop', 'arretetoi']]
Saved: /home/payas/dl_box/data/names_languages/english-french.pkl


In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on the given sentences.

    Args:
        lines: The texts to fit the tokenizer on.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
 
# English side (column 0): tokenizer, vocabulary size and longest sentence.
# The +1 leaves room for index 0 (presumably the padding id -- confirm).
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max([len(sentence.split()) for sentence in dataset[:, 0]])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# French side (column 1): same three quantities.
fr_tokenizer = create_tokenizer(dataset[:, 1])
fr_vocab_size = 1 + len(fr_tokenizer.word_index)
fr_length = max([len(sentence.split()) for sentence in dataset[:, 1]])
print('french Vocabulary Size: %d' % fr_vocab_size)
print('French Max Length: %d' % (fr_length))

Using TensorFlow backend.


English Vocabulary Size: 2221
English Max Length: 5
french Vocabulary Size: 4476
French Max Length: 10


In [11]:
from keras.utils import to_categorical

def encode_input(tokenizer,length,lines):
    """Convert sentences to integer sequences padded at the end to ``length``.

    Args:
        tokenizer: A fitted tokenizer exposing ``texts_to_sequences``.
        length: Target sequence length passed to ``pad_sequences`` as ``maxlen``.
        lines: The sentences to encode.

    Returns:
        The padded integer sequences produced by ``pad_sequences``.
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')

def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence over ``vocab_size`` classes.

    Args:
        sequences: 2-D array of integer token ids (rows are sentences).
        vocab_size: Number of classes for the one-hot encoding.

    Returns:
        An array reshaped to
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot = array(
        [to_categorical(row, num_classes=vocab_size) for row in sequences]
    )
    return one_hot.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

In [24]:
# Training data: French sentences (column 1) are the model input,
# English sentences (column 0) are one-hot encoded as the target.
trainX = encode_input(fr_tokenizer, fr_length, train[:, 1])
trainY = encode_output(
    encode_input(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

# Validation data: encoded the same way from the held-out rows.
testX = encode_input(fr_tokenizer, fr_length, test[:, 1])
testY = encode_output(
    encode_input(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

# Show the first pair next to its encoded input and expected output.
print("Word-translation \n",train[:1],"\n Input \n ",trainX[:1]," \n Expected output \n",trainY[:1])


Word-translation 
 [['go' 'va']] 
 Input 
  [[54  0  0  0  0  0  0  0  0  0]]  
 Expected output 
 [[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]]]


In [28]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.utils.vis_utils import plot_model

def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) encoder-decoder translation network.

    Args:
        src_vocab: Size of the source vocabulary (embedding input dimension).
        tar_vocab: Size of the target vocabulary (softmax output dimension).
        src_timesteps: Length of the padded source sequences.
        tar_timesteps: Number of output steps the decoder produces.
        n_units: Width of the embedding and of both LSTM layers.

    Returns:
        A ``Sequential`` model; the caller is expected to compile it.
    """
    return Sequential([
        # encoder: embed the source tokens and compress them into one vector
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        # decoder: repeat that vector once per target step and unroll it
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        # per-step distribution over the target vocabulary
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ])

In [33]:
# Build the French -> English model with 256 units and compile it.
model = define_model(fr_vocab_size, eng_vocab_size, fr_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line after the table).
model.summary()
#plot_model(model,show_shapes=True)
# Train, writing the weights to disk every time the validation loss improves.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 10, 256)           1145856   
_________________________________________________________________
lstm_9 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_5 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 5, 2221)           570797    
Total params: 2,767,277
Trainable params: 2,767,277
Non-trainable params: 0
_________________________________________________________________
None
Train on 9000 samples, validate on 1000 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 4.21251, saving model to 

<keras.callbacks.History at 0x7ff092dd2dd8>

In [41]:
from numpy import argmax
# reverse lookup: integer id -> word
def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` is ``integer``.

    Args:
        integer: The token id to look up.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The first matching word, or ``None`` when no word has that id.
    """
    matches = (
        word for word, index in tokenizer.word_index.items() if index == integer
    )
    return next(matches, None)
 
# decode the model's prediction for the first row of `source` into text
def predict_sequence(model, tokenizer, source):
    """Translate the first sample in ``source`` into a space-joined sentence.

    Only element ``[0]`` of ``model.predict(source)`` is used. For each
    output step the highest-scoring id is mapped back to a word; decoding
    stops at the first id that has no word in the tokenizer.

    Args:
        model: Object exposing ``predict(source, verbose=0)``.
        tokenizer: Tokenizer used to map ids back to target words.
        source: Batch of encoded source sequences.

    Returns:
        The decoded words joined by single spaces.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for token_id in [argmax(scores) for scores in step_scores]:
        decoded = word_for_id(token_id, tokenizer)
        if decoded is None:
            # an id without a word ends the sentence
            break
        words.append(decoded)
    return ' '.join(words)

In [70]:
# predict_sequence decodes only the first row of the batch it is given, so
# pass just row 3 instead of running the model on every row from 3 onward.
predict_sequence(model,eng_tokenizer,trainX[3:4])

'wow'

In [65]:
# Print the training pairs from row 3 onward.
# NOTE(review): the saved output under this cell starts with the 'go'/'va'
# pair, which is row 0, so it looks stale -- re-run the cell to refresh it.
print(train[3:])

[['go' 'va']
 ['run' 'cours']
 ['run' 'courez']
 ..., 
 ['i cant fake it' 'je ne peux pas le simuler']
 ['i cant feel it' 'je ne le ressens pas']
 ['i cant feel it' 'je ne parviens pas a le percevoir']]
