In [1]:
import sys
import gc
import os
# Make the project packages (models/, utils/) importable.
# The location can be overridden with the JP_EN_TRANSLATION_ROOT environment
# variable so the notebook also runs outside the original author's machine;
# without the variable the original hard-coded path is used unchanged.
sys.path.append(os.environ.get("JP_EN_TRANSLATION_ROOT", "/Users/ueki/Desktop/work/jp_en_translation"))

In [2]:
from models.Seq2Seq_1 import build_model
from utils.LangEn import LangEn
from utils.LangJa import LangJa
from utils.preprocess import loadLangs
import numpy as np
from keras.utils import np_utils
import matplotlib.pyplot as plt
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

Using TensorFlow backend.


In [3]:
# Experiment settings shared by every cell below and handed to build_model,
# the Lang* vocabularies, Trainer and Translator.
config = {
    # --- corpus ---
    "corpus_file": "../data/jpn.txt",
    "en_col": "description_en",
    "jp_col": "description_jp",
    # --- special token ids ---
    "SOS_token": 1,  # first decoder input when translating
    "EOS_token": 0,  # decoding stops when this id is predicted
    "UNK_token": 2,
    # --- vocabulary / sequence sizes ---
    "max_features": 5000,  # upper bound on the embedding-matrix vocabulary size
    "MAX_LENGTH": 20,  # pad/truncate length used with pad_sequences
    # --- data split ---
    "train_size": 15000,
    "val_size": 100,
    # --- training ---
    "batch_size": 128,
    "epochs": 20,
    # --- model shape (not read in the cells below; presumably consumed by build_model) ---
    "maxlen_enc": 20,
    "maxlen_dec": 20,
    "n_hidden": 400,
    "input_dim": 5000,
    "output_dim": 5000,  # also the one-hot width used by Trainer
    "emb_dim": 300,
    "use_enc_emb": False,
    "use_dec_emb": False,
    "validation_split": 0.1,
    # --- persistence / inference ---
    "trained_param_dir": "../trained_models/1_lstm_ja_en_01.hdf5",
    "translate_length": 25,  # maximum number of decoding steps
    # --- pretrained word vectors ---
    "en_W2V_FILE": "../data/GoogleNews-vectors-negative300.bin.gz",
    "jp_W2V_FILE": "../data/ja_data/ja.bin",
    # --- translation direction ---
    "src": "en",
    "trg": "jp",
}

In [4]:
class Trainer:
    """Build the Seq2Seq model, fit it on padded id sequences and save the weights.

    The constructor reads "batch_size", "epochs", "validation_split",
    "trained_param_dir" and "output_dim" from ``config`` and keeps the whole
    config so that ``train`` can pass it to ``build_model``.
    """

    def __init__(self, config):
        self.config = config
        self.batch_size = config["batch_size"]
        self.epochs = config["epochs"]
        self.validation_split = config["validation_split"]
        self.trained_param_dir = config["trained_param_dir"]
        self.output_dim = config["output_dim"]
        # Return value of model.fit from the most recent train() call.
        self.hist = None

    def train(self, e_input, d_input, target, resume=False):
        """Fit a freshly built model and write its weights to ``trained_param_dir``.

        Args:
            e_input: encoder input passed to ``model.fit``.
            d_input: decoder input passed to ``model.fit``.
            target: integer class ids; one-hot encoded to ``output_dim`` classes
                before fitting.
            resume: when True and the weights file already exists, load it
                before training. Defaults to False, which matches the previous
                behaviour (weight loading was hard-disabled).

        Returns:
            None. The fit result is stored on ``self.hist``.
        """
        print("#1 train procedure start")
        # Use the config given to the constructor, not a notebook-level global.
        model, _, _ = build_model(self.config)
        model.summary()

        if resume and os.path.isfile(self.trained_param_dir):  # trained model parameters
            print("2-1? load param")
            model.load_weights(self.trained_param_dir)
        else:
            print("no_emb")
        print("#6 start training")

        # One-hot encode the *argument*; the previous code silently ignored it
        # and read the notebook global ``output_target_padded`` instead.
        target_categorical = np_utils.to_categorical(target, self.output_dim)

        self.hist = model.fit(
            [e_input, d_input],
            target_categorical,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_split=self.validation_split,
        )
        print("#9 save_param")
        model.save_weights(self.trained_param_dir)

In [5]:
class Translator:
    """Greedy decoder around the inference models returned by ``build_model``.

    The constructor builds the models with ``test=True``, loads the weights
    stored at ``config["trained_param_dir"]`` and remembers the token ids and
    padding length from the same ``config`` so that decoding does not depend
    on a notebook-level global.
    """

    def __init__(self, config):
        self.config = config
        self.translate_length = config["translate_length"]
        self.trained_param_dir = config["trained_param_dir"]
        self.sos_token = config["SOS_token"]
        self.eos_token = config["EOS_token"]
        self.max_length = config["MAX_LENGTH"]
        self.model, self.encoder, self.decoder = build_model(config, test=True)
        self.model.load_weights(self.trained_param_dir)

    # Generate the translated sentence.
    def _translate(self, e_input):
        """Greedily decode one padded source sequence into a list of target ids.

        Decoding starts from the SOS id, feeds each predicted id back into the
        decoder and stops after the EOS id is produced (EOS is included in the
        result) or after ``translate_length`` steps.
        """
        # The first encoder output is not needed; everything after it is the
        # recurrent state that seeds the decoder.
        _, *states_values = self.encoder.predict(e_input)

        # First decoder input: the start-of-sentence token.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = self.sos_token

        decoded_sentence = []
        for _ in range(self.translate_length):
            # The decoder returns the token scores followed by its new state.
            output_tokens, *states_values = self.decoder.predict([target_seq] + states_values)

            sampled_token_index = np.argmax(output_tokens[0, 0, :])
            if sampled_token_index == self.eos_token:
                decoded_sentence.append(self.eos_token)
                break
            target_seq[0, 0] = sampled_token_index
            decoded_sentence.append(sampled_token_index)
        return decoded_sentence

    def translate_demo(self, src_data_id_seq):
        """Translate every id sequence in ``src_data_id_seq``.

        Each sequence is post-padded/truncated to ``MAX_LENGTH`` and decoded
        on its own; returns the list of decoded id lists in input order.
        """
        ret = []
        for src in src_data_id_seq:
            id_seq_mat = np.array([src])
            pred_id_padded = sequence.pad_sequences(
                id_seq_mat, maxlen=self.max_length, padding="post", truncating="post"
            )
            ret.append(self._translate(pred_id_padded))
        return ret

In [6]:
def build_en_emb(config, lang=None):
    """Build the English embedding matrix from pretrained word2vec vectors.

    Args:
        config: reads "en_W2V_FILE" (binary word2vec file), "emb_dim" and
            "max_features".
        lang: vocabulary object exposing ``n_words`` and ``word2index``.
            Defaults to the notebook-level ``EN_lang``.

    Returns:
        Array of shape (min(lang.n_words, max_features), emb_dim). Rows 0, 1
        and 2 stay all-zero, words found in the pretrained vectors get their
        vector, and the remaining words get a random normal vector
        (mean 0, variance 0.25).
    """
    if lang is None:
        lang = EN_lang
    en_word2vec = KeyedVectors.load_word2vec_format(config["en_W2V_FILE"], binary=True)
    en_EMBEDDING_DIM = config["emb_dim"]
    vocabulary_size = min(lang.n_words, config["max_features"])
    en_embedding_matrix = np.zeros((vocabulary_size, en_EMBEDDING_DIM))
    print("voc->", vocabulary_size)
    cnt = 0  # words missing from the pretrained vectors
    looked_up = 0  # words actually placed in the matrix
    for word, i in lang.word2index.items():
        # Ids 0, 1, 2 are skipped (left as zero rows). Ids at or beyond the
        # capped vocabulary size have no row and used to raise IndexError.
        if i in (0, 1, 2) or i >= vocabulary_size:
            continue
        looked_up += 1
        try:
            en_embedding_matrix[i] = en_word2vec[word]
        except KeyError:
            cnt += 1
            en_embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), en_EMBEDDING_DIM)
    # Divide by the number of words looked up, not by the last index seen
    # (which was undefined for an empty vocabulary).
    print("UNK_rate", cnt / looked_up if looked_up else 0.0)
    del en_word2vec
    gc.collect()
    return en_embedding_matrix


In [7]:
def build_jp_emb(config, lang=None):
    """Build the Japanese embedding matrix from a saved gensim Word2Vec model.

    Args:
        config: reads "jp_W2V_FILE" (gensim Word2Vec save file), "emb_dim"
            and "max_features".
        lang: vocabulary object exposing ``n_words`` and ``word2index``.
            Defaults to the notebook-level ``JP_lang``.

    Returns:
        Array of shape (min(lang.n_words, max_features), emb_dim). Rows 0, 1
        and 2 stay all-zero, words found in the pretrained vectors get their
        vector, and the remaining words get a random normal vector
        (mean 0, variance 0.25).
    """
    if lang is None:
        lang = JP_lang
    jp_model = Word2Vec.load(config["jp_W2V_FILE"])
    # Look vectors up through the model's KeyedVectors (``.wv``); indexing the
    # Word2Vec model directly is deprecated in gensim. Fall back to the
    # loaded object itself if it has no ``wv`` attribute.
    jp_word2vec = getattr(jp_model, "wv", jp_model)
    jp_EMBEDDING_DIM = config["emb_dim"]
    vocabulary_size = min(lang.n_words, config["max_features"])
    jp_embedding_matrix = np.zeros((vocabulary_size, jp_EMBEDDING_DIM))
    print("voc->", vocabulary_size)
    cnt = 0  # words missing from the pretrained vectors
    looked_up = 0  # words actually placed in the matrix
    for word, i in lang.word2index.items():
        # Ids 0, 1, 2 are skipped (left as zero rows). Ids at or beyond the
        # capped vocabulary size have no row and used to raise IndexError.
        if i in (0, 1, 2) or i >= vocabulary_size:
            continue
        looked_up += 1
        try:
            jp_embedding_matrix[i] = jp_word2vec[word]
        except KeyError:
            cnt += 1
            jp_embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), jp_EMBEDDING_DIM)
    # Divide by the number of words looked up, not by the last index seen
    # (which was undefined for an empty vocabulary).
    print("UNK/rate->", cnt / looked_up if looked_up else 0.0)

    del jp_word2vec, jp_model
    gc.collect()
    return jp_embedding_matrix

# train

In [8]:
# Load the parallel corpus described by config (loadLangs comes from utils.preprocess).
# NOTE(review): later cells slice it by row and index it by column name, so it
# is presumably a pandas DataFrame — confirm against utils.preprocess.
data=loadLangs(config)

reading lines


In [9]:
# Hold out the val_size rows that follow the first train_size rows, then keep
# only the first train_size rows as training data.
# NOTE: `data` is rebound here, so re-running this cell without re-running the
# load cell slices the already-truncated data.
val_data, data = (
    data[config["train_size"] : config["train_size"] + config["val_size"]],
    data[: config["train_size"]],
)

In [10]:
# One vocabulary object per language; both are filled from the training
# sentences in the following cells.
EN_lang, JP_lang = LangEn(config), LangJa(config)

W0802 14:31:17.561068 4685845952 toolwrapper.py:77] stdbuf was not found; communication with perl may hang due to stdio buffering.


In [11]:
# Register every English training sentence with the English vocabulary.
for s in data[config["en_col"]]:
    EN_lang.addSentence(s)

In [12]:
# Register every Japanese training sentence with the Japanese vocabulary.
for s in data[config["jp_col"]]:
    JP_lang.addSentence(s)

## Input preprocessing

In [13]:
# Choose source/target column names and vocabularies from config["src"]:
# "jp" means Japanese -> English, any other value means English -> Japanese.
src_col, trg_col = (
    (config["jp_col"], config["en_col"])
    if config["src"] == "jp"
    else (config["en_col"], config["jp_col"])
)
Langs = (
    {"src": JP_lang, "trg": EN_lang}
    if config["src"] == "jp"
    else {"trg": JP_lang, "src": EN_lang}
)

In [14]:
# Source-side column of the training data.
# NOTE(review): despite the name, this is the Japanese column when
# config["src"] == "jp"; it is not referenced again in the visible cells.
input_en = data[src_col]

In [15]:
# Convert sentences to id sequences with the matching vocabulary.
# The target side is encoded twice: once with target=True for the decoder
# input and once without for the expected decoder output.
# NOTE(review): target=True presumably prepends the SOS id — confirm in the Lang classes.
input_source_lang = data[src_col].apply(lambda sent: Langs["src"].word2id(sent))
input_target_lang = data[trg_col].apply(lambda sent: Langs["trg"].word2id(sent, target=True))
output_target_lang = data[trg_col].apply(lambda sent: Langs["trg"].word2id(sent))

In [16]:
# Post-pad / post-truncate all three id-sequence collections to MAX_LENGTH.
input_source_padded, input_target_padded, output_target_padded = (
    sequence.pad_sequences(
        id_sequences,
        maxlen=config["MAX_LENGTH"],
        padding="post",
        truncating="post",
    )
    for id_sequences in (input_source_lang, input_target_lang, output_target_lang)
)

In [17]:
# Create the trainer; this only copies hyper-parameters out of config.
trainer = Trainer(config)

In [18]:
# Build the model, fit it on (encoder input, decoder input) -> target ids, and
# save the weights to config["trained_param_dir"].
trainer.train(input_source_padded,input_target_padded,output_target_padded)

W0802 14:31:20.242176 4685845952 deprecation_wrapper.py:119] From /Users/ueki/.pyenv/versions/3.7.3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0802 14:31:20.263305 4685845952 deprecation_wrapper.py:119] From /Users/ueki/.pyenv/versions/3.7.3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0802 14:31:20.265338 4685845952 deprecation_wrapper.py:119] From /Users/ueki/.pyenv/versions/3.7.3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0802 14:31:20.383045 4685845952 deprecation_wrapper.py:119] From /Users/ueki/.pyenv/versions/3.7.3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please us

#1 train procedure start
#3 encoder
#4 decoder


W0802 14:31:21.650373 4685845952 deprecation_wrapper.py:119] From /Users/ueki/.pyenv/versions/3.7.3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0802 14:31:21.771257 4685845952 deprecation_wrapper.py:119] From /Users/ueki/.pyenv/versions/3.7.3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



#5
#6
#7
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 300)      1500000     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 20, 300)      1200        embedding_1[0][0]                
__________________________________________________________________________________________________
e

W0802 14:31:22.597099 4685845952 deprecation.py:323] From /Users/ueki/.pyenv/versions/3.7.3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 13500 samples, validate on 1500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
#9 save_param


In [19]:
# Drop the trainer (and the fit history it holds) before the inference models
# are built, then force a garbage-collection pass.
del trainer
gc.collect()

188

# test

In [20]:
# Encode the held-out source sentences with the source vocabulary.
val_data_id = val_data[src_col].apply(lambda x:Langs["src"].word2id(x))

In [21]:
# Peek at the first ten encoded validation sentences.
val_data_id[:10]

15043                        [206, 20, 560, 8, 1643, 5, 0]
15044                    [25, 52, 8, 205, 358, 4270, 5, 0]
15045        [10, 512, 351, 215, 3569, 32, 27, 2130, 5, 0]
15046                       [18, 56, 1740, 397, 765, 5, 0]
15047    [111, 44, 753, 65, 1793, 50, 455, 79, 2360, 5, 0]
15048                    [175, 1875, 27, 2983, 2363, 5, 0]
15049                    [170, 75, 44, 2, 614, 1219, 5, 0]
15050             [490, 22, 44, 2, 2, 12, 144, 1109, 5, 0]
15051                           [4, 56, 2, 111, 121, 5, 0]
15052        [63, 792, 7, 584, 256, 16, 44, 591, 37, 5, 0]
Name: description_en, dtype: object

In [22]:
# Build the inference models (build_model(..., test=True)) and load the
# weights stored at config["trained_param_dir"].
translator = Translator(config)

#3 encoder
#4 decoder
#5
#6
#7


In [23]:
# Greedy-decode each validation sentence; ret is a list of target-id lists.
ret = translator.translate_demo(val_data_id)

In [24]:
# Print each source sentence with the decoded prediction and the reference.
for src, pred, target in zip(val_data[src_col], ret, val_data[trg_col]):
    # The extra newline reproduces the blank line after the source sentence.
    print("src->", src, end="\n\n")
    print("pred->", " ".join(Langs["trg"].id2word(pred)))
    print("ans->", target)
    print("------------------")

src-> theres no need to hurry.

pred-> 急が ない と 遅れる よ 。 EOS
ans-> 急ぐ 必要 は あり ませ ん 。
------------------
src-> i want to join your band.

pred-> UNK の 方 が UNK さ れ てる よ 。 EOS
ans-> あなた の バンド に 入り たい な 。
------------------
src-> you must keep an eye on the child.

pred-> その 本 は UNK に なり たい 。 EOS
ans-> その 子 から 目 を 離さ ない よう に し なけれ ば いけ ない 。
------------------
src-> he is mad about music.

pred-> 彼 は 音楽 が 好き で ある 。 EOS
ans-> 彼 は 音楽 狂 だ 。
------------------
src-> with a little more patience she would have succeeded.

pred-> 彼女 が そんな こと を し た ので 、 私 は 何 か 知っ て い まし た 。 EOS
ans-> もし 彼女 が もう少し 我慢強かっ たら 、 成功 し て い た だろ う に 。
------------------
src-> they painted the fence green.

pred-> 彼ら は UNK で UNK し て いる 。 EOS
ans-> 彼ら は フェンス を 緑色 に 塗っ た 。
------------------
src-> there was a convention last month.

pred-> UNK が UNK た 。 EOS
ans-> 先月 、 集会 が あっ た 。
------------------
src-> shes not a fulltime employee of this company.

pred-> 彼女 は UNK UNK も 全く 速く UNK 。 EOS
ans-> 彼女 は この 会社 の 正社員 で は あり ませ ん 。
--