### Neural Machine Translation

In [1]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [2]:
# load dataset
filename = 'cmn.txt'
doc = load_doc(filename)

In [3]:
doc



In [4]:
# cmn -> eng
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n') #' abc' -> 'abc'
    pairs = [line.split('\t') for line in lines]
    return pairs

In [5]:
# split into english-chinese pairs
pairs = to_pairs(doc)
print(pairs[0])

['Hi.', '嗨。']


In [6]:
# clean sentences
from numpy import array
# lower case
def clean_pairs(lines):
    cleaned = list()
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # stopwords
            # punctution
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [7]:
cleaned_pairs = clean_pairs(pairs)


In [8]:
print("cleaned_pairs.shape:", cleaned_pairs.shape)


cleaned_pairs.shape: (20133, 2)


In [9]:
for i in range(10):
    print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

[hi.] => [嗨。]
[hi.] => [你好。]
[run.] => [你用跑的。]
[wait!] => [等等！]
[hello!] => [你好。]
[i try.] => [让我来。]
[i won!] => [我赢了。]
[oh no!] => [不会吧。]
[cheers!] => [乾杯!]
[he ran.] => [他跑了。]


In [10]:
# reduce dataset size
n_sentences = 1000
cleaned_pairs = cleaned_pairs[:n_sentences, :]

In [11]:
# random shuffle
from numpy.random import shuffle
shuffle(cleaned_pairs)

In [12]:
# split into train/test
n_train = 900
train, test = cleaned_pairs[:n_train], cleaned_pairs[n_train:]
print(train.shape) #900
print(test.shape) #100

(900, 2)
(100, 2)


#### 处理英文

In [13]:
cleaned_pairs[:, 0]

array(['what do you do?', "i'm on my way.", 'we laughed.',
       'you work hard.', 'take me home.', 'what about you?', 'i know him.',
       'the birds sang.', 'of course!', 'open the box.', 'i like fish.',
       'she smiled.', 'i like tea.', 'i can see tom.', 'ok. i agree.',
       'read this book.', 'he looks young.', 'follow him.', 'come home.',
       "aren't you tom?", 'get out!', 'keep reading.', 'just say no.',
       'stop yelling!', 'boil the water.', 'drive carefully.',
       "he's a comedian.", 'he will survive.', "i'm thirsty.",
       'i have hiccups.', "it's improved.", "it's my job.", 'was i wrong?',
       'let me do that.', 'tom frowned.', 'turn right.', 'i had a vision.',
       'let him in.', 'stay sharp.', "well, let's go.", "it's ok.",
       'try hard.', 'science is fun.', 'he likes tea.', 'wipe your eyes.',
       'i am short.', 'he runs.', "he's not stupid.", "i'll get in.",
       'she cried.', 'i hate mondays.', "i'm very happy.",
       'i go to school.', 

In [14]:
# X?? Y??
#(1) word-RNN, not char-RNN
# eng?  vocab? 分词结果? 向量长度(input_shape)?
# X: [hi.]-> 希望Embedding层（Conv1D）,其输入是[hi,.]->[0,1]
from nltk.tokenize import TreebankWordTokenizer
def tokenize(lines, lang):
    tokenized_sents = []
    vocab = [] #append:[[a]].append([b])=[[a],[b]]  ->[a,b]
    tokenized_sent_lens = []
    tokenizer = TreebankWordTokenizer()
    for line in lines:
        tokenized_sent = tokenizer.tokenize(line)
        tokenized_sents.append(tokenized_sent)
        vocab.extend(tokenized_sent)
        tokenized_sent_lens.append(len(tokenized_sent))
    #vocab: list -> set(去重)
    vocab = ['None'] + list(set(vocab))
    max_sent_len = max(tokenized_sent_lens)
    return tokenized_sents, vocab, max_sent_len

In [15]:
# fit
eng_tokenized_sents, eng_vocab, eng_len = tokenize(
    cleaned_pairs[:, 0], 'eng') # first colume: eng
# TFIDFVectorizer: fit + transform
# fit : vocab -> vector dims
# transform: text-> vector
# fit: train+test
# transform: 需要什么transform什么。trainX-》输入train[:,0]

# train: fit_transform
# test: transform

In [16]:
eng_vocab_size = len(eng_vocab)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Sequence Length: %d' % eng_len)

English Vocabulary Size: 702
English Max Sequence Length: 6


In [17]:
print(eng_vocab)
# X: cmn sequence, cmn_len, 
#    [嗨,。] =[wordindex[嗨],wordindex[。],0,0,0,..]，整个向量长度是22 
#           padding
# Y: eng sequence, eng_len
#     [hi,.]= [wordindex[hi],wordindex[.],0,0,0,..]，整个向量长度是22
# vocab[0] = 'None'
# [hi, ., None, None, ...] 否则[hi, ., at, at, at, ...]

['None', 'it', 'hot', 'said', 'aside', 'coat', 'believe', 'deal', 'not', 'wife', 'happens', 'japanese', 'hurt', 'sells', 'thief', 'blushed', 'guy', 'lost', 'the', 'hiccups', 'cry', 'definitely', 'cook', 'sorry', 'close', 'really', 'happy', 'count', 'lie', 'tall', 'watch', 'thirty', 'fooled', 'shooting', 'wake', 'bread', 'hope', 'law', 'mad', 'welcome', 'humor', 'book', 'kept', 'korean', 'business', 'admire', 'skip', 'look', 'bother', 'security', 'resisting', 'dumb', '.', 'behave', 'understand', 'let', 'starving', 'sports', 'we', 'acts', "'ve", 'married', 'drowned', 'stamp', 'beside', 'strong', 'in', 'treat', 'ok.', 'had', 'greedy', 'water', 'will', 'does', 'us', 'evening', 'well', 'blame', 'china', 'loser', 'rabbi', 'followed', 'helps', 'trying', 'circle', 'listening', 'upset', 'might', 'start', 'suits', 'appeared', 'early', 'large', 'how', 'refused', 'glue', 'forgive', 'coming', 'live', 'i', 'waved', 'these', 'patio', 'hang', 'diet', 'where', 'who', 'as', 'dieting', 'walked', 'change'

In [18]:
eng_word_indices = dict((word, i) for i, word in enumerate(eng_vocab))
eng_indices_word = dict((i, word) for i, word in enumerate(eng_vocab))

In [19]:
# 把输入输出向量化，就是变成X
# X: [hi.]-> 希望Embedding层（Conv1D）,其输入是[hi,.]->[wordindex[hi],wordindex[.],0,0,0,..]
def encode_sequences(sents, lang, max_sent_len, word_indices, vocab):
    # X: 第一维度：集合sentences，维度是有多少sentence，所以是len(sentences)
    #    第二维度：一个sentence，维度是每个sentence的长度，所以是sentlen，每个维度是词的index
    tokenized_sents, _, _ = tokenize(sents, lang) #transform
    X = np.zeros((len(tokenized_sents), max_sent_len), dtype='float32')
    for i, sentence in enumerate(tokenized_sents):
        for t, word in enumerate(sentence):
            if word in vocab:
                X[i, t] = word_indices[word]
    return X

In [20]:
# y: 对应X，每个sentence都有一个translated_sentence
#    第一维度：集合translated_sentences，所以是len(sentences)
#    第二维度：translated_sentence，维度是每个translated_sentence的长度
#    第三维度：每个词，onehot向量，translated_vocab_size
from keras.utils import to_categorical
def encode_output(sentences, vocab_size):
    ylist = list()
    for sentence in sentences:
        encoded = to_categorical(sentence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sentences.shape[0], sentences.shape[1], vocab_size)
    return y

Using TensorFlow backend.


In [21]:
import numpy as np
# cmn -> eng
trainY = encode_sequences(train[:, 0], "eng", eng_len, eng_word_indices, eng_vocab)
print(trainY.shape)

(900, 6)


In [22]:
print(eng_indices_word[28])
print(eng_indices_word[91])
print(eng_indices_word[0])
print(eng_indices_word[0])
print(eng_indices_word[0])
print(eng_indices_word[0])

lie
early
None
None
None
None


In [23]:
trainY = encode_output(trainY, eng_vocab_size)

In [24]:
print(trainY.shape)

(900, 6, 702)


In [25]:
print("trainY after one hot, ", trainY[0].shape)

trainY after one hot,  (6, 702)


#### 处理中文

In [26]:
# X?? Y??
#(1) word-RNN, not char-RNN
# eng?  vocab? 分词结果? 向量长度(input_shape)?
# X: [hi.]-> 希望Embedding层（Conv1D）,其输入是[hi,.]->[0,1]
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer
def tokenize(lines, lang):
    tokenized_sents = []
    vocab = [] #append:[[a]].append([b])=[[a],[b]]  ->[a,b]
    tokenized_sent_lens = []
    if lang == "eng":
        tokenizer = TreebankWordTokenizer()
    #elif lang == "cmn": 
    #    tokenizer = ?
    else:
        tokenizer = WhitespaceTokenizer()
    for line in lines:
        if lang == "cmn":
            tokenized_sent = list(jieba.cut(line))
        else: 
            tokenized_sent = tokenizer.tokenize(line)
        tokenized_sents.append(tokenized_sent)
        vocab.extend(tokenized_sent)
        tokenized_sent_lens.append(len(tokenized_sent))
    #vocab: list -> set(去重)
    vocab = ['None'] + list(set(vocab))
    max_sent_len = max(tokenized_sent_lens)
    return tokenized_sents, vocab, max_sent_len

In [27]:
# encoding=utf-8
import jieba #pip install jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("全模式: ", list(seg_list))  # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("精确模式: ", list(seg_list))  # 精确模式

seg_list = jieba.cut("我来到北京清华大学")  # 默认是精确模式
print("默认模式: ", list(seg_list))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/xx/q7b13_rx6dlf7gjdm_4tsjsw0000gp/T/jieba.cache
Loading model cost 1.012 seconds.
Prefix dict has been built succesfully.


全模式:  ['我', '来到', '北京', '清华', '清华大学', '华大', '大学']
精确模式:  ['我', '来到', '北京', '清华大学']
默认模式:  ['我', '来到', '北京', '清华大学']


In [28]:
cmn_tokenized_sents, cmn_vocab, cmn_len = tokenize(
    cleaned_pairs[:, 1], 'cmn') # second colume: cmn

In [29]:
cmn_vocab_size = len(cmn_vocab)
cmn_word_indices = dict((word, i) for i, word in enumerate(cmn_vocab))
cmn_indices_word = dict((i, word) for i, word in enumerate(cmn_vocab))
print('Chinese Vocabulary Size: %d' % cmn_vocab_size)
print('Chinese Max Length: %d' % cmn_len)

Chinese Vocabulary Size: 964
Chinese Max Length: 9


In [30]:
### 生成中文X，英文Y都生成
# cmn -> eng
trainX = encode_sequences(train[:, 1], "cmn", cmn_len, cmn_word_indices, cmn_vocab)
trainX = encode_output(trainX, cmn_vocab_size)

trainY = encode_sequences(train[:, 0], "eng", eng_len, eng_word_indices, eng_vocab)
trainY = encode_output(trainY, eng_vocab_size)


In [31]:
print(trainX.shape)
print(trainY.shape)

(900, 9, 964)
(900, 6, 702)


In [32]:
# prepare validation data
testX = encode_sequences(test[:, 1], "cmn", cmn_len, cmn_word_indices, cmn_vocab)
print(testX.shape)
testY = encode_sequences(test[:, 0], "eng", eng_len, eng_word_indices, eng_vocab)
print(testY.shape)
testY = encode_output(testY, eng_vocab_size)
print(testY.shape)

(100, 9)
(100, 6)
(100, 6, 702)


#### Seq2Seq Model

In [33]:
Tx = trainX.shape[1]
Ty = trainY.shape[1]

In [36]:
print(Tx, Ty)

9 6


In [37]:
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np

In [39]:
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation('softmax', name='attention_weights') 
dotor = Dot(axes = 1)

In [40]:
def one_step_attention(a, s_prev):
    # a -- hidden state output of the Bi-LSTM, (# of input data points, Tx, 2*n_a)
    # s_prev -- previous hidden state of the (post-attention) LSTM (# of input data points, n_s)
    
    s_prev = repeator(s_prev)
    concat = concatenator([a, s_prev])
    # dense layer
    e1 = densor1(concat)
    e2 = densor2(e1)
    # alphas
    alphas = activator(e2)
    context = dotor([alphas, a])
    
    return context

In [41]:
n_a = 32
n_s = 64
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(eng_vocab_size, activation='softmax')

In [42]:
def define_model(Tx, Ty, n_a, n_s, cmn_vocab_size, eng_vocab_size):
    # Tx, length of the input sequence
    # Ty, length of the output sequence
    # n_a, hidden state size of the Bi-LSTM
    # n_s, hidden state size of the post-attention LSTM
    
    X = Input(shape=(Tx, cmn_vocab_size))
    # s0 and c0, initial hidden state for the decoder LSTM of shape (n_s,)
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    
    s = s0
    c = c0
    
    outputs = []
    
    a = Bidirectional(LSTM(n_a, return_sequences=True))(X)
    
    for t in range(Ty):
        context = one_step_attention(a, s)
        s, _, c = post_activation_LSTM_cell(context, initial_state = [s, c])
        out = output_layer(s)
        outputs.append(out)
    
    model = Model(inputs = [X, s0, c0], outputs = outputs)
    return model

In [43]:
model = define_model(Tx, Ty, n_a, n_s, cmn_vocab_size, eng_vocab_size)

In [44]:
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [45]:
s0 = np.zeros((trainX.shape[0], n_s))
c0 = np.zeros((trainX.shape[0], n_s))

In [46]:
tY = list(trainY.swapaxes(0,1))

In [47]:
trainY.shape

(900, 6, 702)

In [48]:
len(tY)

6

In [49]:
tY[0].shape

(900, 702)

In [50]:
model.fit([trainX, s0, c0], tY, epochs=5, batch_size=100)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x131dcb240>

In [51]:
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 9, 964)        0                                            
____________________________________________________________________________________________________
s0 (InputLayer)                  (None, 64)            0                                            
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 9, 64)         255232      input_1[0][0]                    
____________________________________________________________________________________________________
repeat_vector_2 (RepeatVector)   (None, 9, 64)         0           s0[0][0]                         
                                                                   lstm_1[0][0]            

In [52]:
# input word sequence -> X
#encode_sequences
# X -> Y -> output word sequence
def predict_sequence(source_X, model, vocab, vocab_size, indices_word):
    prediction = model.predict(source_X)[0] #(1, 6, 702)->(6, 702)
    target = list()
    for y in prediction:
        word_index = y.argmax()
        if word_index in indices_word.keys():
            target.append(indices_word[word_index])
    return ' '.join(target)  

In [53]:
sources = ['嗨！','等等。']
for i, source in enumerate(sources):
    source = [source] #(9,) -> (1,9) (len(sentences), max_len_size)
    print(source)
    source_X = encode_sequences(source, "cmn", cmn_len, cmn_word_indices, cmn_vocab)
    source_X = encode_output(source_X, cmn_vocab_size)
    
    print (source_X)
    target = predict_sequence([source_X,s0,c0], model, eng_vocab, eng_vocab_size, eng_indices_word)
    print ("translated sentence:", target)

['嗨！']
[[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]]]
translated sentence: None
['等等。']
[[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]]]
translated sentence: None
