In [1]:
import os
import time
import pandas as pd
import numpy as np

# # install these dependencies and then comment them out
# !pip install tensorflow
# !pip install keras
# !pip install -U spacy
# !python -m spacy download ja_core_news_md


import tensorflow as tf
from tensorflow.keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Enable on-demand GPU memory allocation on every detected GPU.
# Iterating over the device list (instead of indexing cuda[0]) keeps this
# cell runnable on CPU-only machines, where the list is empty, and applies
# the same setting to all GPUs when more than one is present.
cuda = tf.config.list_physical_devices('GPU')
for gpu_device in cuda:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

# --- Data locations (resolved relative to the notebook's working directory) ---
root_folder='.'
data_folder_name='datafiles'
train_filename='standford_raw'
test_filename='test'

DATA_PATH = os.path.abspath(os.path.join(root_folder, data_folder_name))
train_filenamepath = os.path.abspath(os.path.join(DATA_PATH, train_filename))
test_filenamepath = os.path.abspath(os.path.join(DATA_PATH, test_filename))
train_path = DATA_PATH
test_path = DATA_PATH

# --- Column names given to the two columns read from the data files ---
INPUT_COLUMN = 'input'
TARGET_COLUMN = 'target'
TARGET_FOR_INPUT = 'target_for_input'

# --- Dataset size limits (used as `nrows` when reading the files) ---
NUM_SAMPLES = 20000
NUM_TEST_SAMPLES = 15

# --- Model dimensions ---
MAX_VOCAB_SIZE = 20000   # `num_words` cap handed to both Keras tokenizers
EMBEDDING_DIM = 128      # embedding width for encoder and decoder
HIDDEN_DIM = 1024        # LSTM state size for encoder and decoder

# Not referenced in the visible cells; presumably training hyperparameters.
BATCH_SIZE = 32
EPOCHS = 100

# Attention score function name passed to the Decoder.
ATTENTION_FUNC='general'
print("complete")

complete


In [2]:
# The training and test files share one layout: tab-separated, no header row,
# column 0 named as the target sentence and column 1 as the input sentence.
# They are read in order (train first, then test), each capped at its row limit.
df, test_df = (
    pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=[TARGET_COLUMN, INPUT_COLUMN],
        usecols=[0, 1],
        nrows=row_limit,
    )
    for path, row_limit in (
        (train_filenamepath, NUM_SAMPLES),
        (test_filenamepath, NUM_TEST_SAMPLES),
    )
)

import spacy
# spaCy pipeline whose tokenizer is used to split the Japanese sentences.
spacy_japanese = spacy.load("ja_core_news_md")
def tokenize_japanese(text):
    """Split Japanese text into space-separated tokens.

    Runs the tokenizer of the module-level ``spacy_japanese`` pipeline over
    ``text`` and joins the resulting token texts with single spaces.

    Args:
        text: Sentence to tokenize. Non-string values (for example ``None``
            or a float NaN from an empty cell) are treated as missing.

    Returns:
        The space-joined tokens, or ``""`` when ``text`` is not a string or
        tokenization fails.
    """
    # Explicit guard for missing / non-text cells instead of relying on the
    # tokenizer raising and the error being swallowed.
    if not isinstance(text, str):
        return ""
    try:
        return " ".join(token.text for token in spacy_japanese.tokenizer(text))
    except Exception:
        # Deliberately best-effort: one bad sentence must not abort the whole
        # preprocessing pass. Catching Exception (not a bare except) still
        # lets KeyboardInterrupt / SystemExit propagate.
        return ""

# --- Training text ---
# Source sentences are tokenized; target sentences get an end marker (labels)
# or a start marker (decoder inputs).
input_data = [tokenize_japanese(sentence) for sentence in df[INPUT_COLUMN]]
target_data = [sentence + ' <eos>' for sentence in df[TARGET_COLUMN]]
target_input_data = ['<sos> ' + sentence for sentence in df[TARGET_COLUMN]]

# --- Test text, prepared the same way ---
test_input_data = [tokenize_japanese(sentence) for sentence in test_df[INPUT_COLUMN]]
test_target_data = [sentence + ' <eos>' for sentence in test_df[TARGET_COLUMN]]
test_target_input_data = ['<sos> ' + sentence for sentence in test_df[TARGET_COLUMN]]

# --- Source-side vocabulary ---
# filters='' disables the tokenizer's character filtering.
tokenizer_inputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer_inputs.fit_on_texts(input_data)
input_sequences = tokenizer_inputs.texts_to_sequences(input_data)
input_max_len = max(map(len, input_sequences))
print('Max Input Length: ', input_max_len)
# Spot-check one sample: its text and the matching id sequence.
print(input_data[1000])
print(input_sequences[1000])

# --- Target-side vocabulary ---
# Fitted on both target variants so that '<eos>' and '<sos>' both get an id.
tokenizer_outputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer_outputs.fit_on_texts(target_data)
tokenizer_outputs.fit_on_texts(target_input_data)
target_sequences = tokenizer_outputs.texts_to_sequences(target_data)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_input_data)
target_max_len = max(map(len, target_sequences))
print('Max Target Length: ', target_max_len)

# --- Word <-> id lookup tables ---
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))
word2idx_outputs = tokenizer_outputs.word_index
print('Found %s unique output tokens.' % len(word2idx_outputs))

# Vocabulary sizes are one larger than the number of distinct words.
num_words_output = 1 + len(word2idx_outputs)
num_words_inputs = 1 + len(word2idx_inputs)
idx2word_inputs = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_outputs = dict(zip(word2idx_outputs.values(), word2idx_outputs.keys()))



Max Input Length:  52
昨日 の 写真 の 男
[365, 1, 258, 1, 152]
Max Target Length:  44
Found 15664 unique input tokens.
Found 13529 unique output tokens.


In [3]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by an LSTM that returns sequences and state."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding and LSTM layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector.
            hidden_dim: Number of LSTM units.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim)
        self.lstm = tf.keras.layers.LSTM(
            units=hidden_dim, return_sequences=True, return_state=True)

    def call(self, input_sequence, states):
        """Embed ``input_sequence`` and run it through the LSTM.

        Args:
            input_sequence: Token ids to encode.
            states: Initial LSTM state, e.g. the pair from ``init_states``.

        Returns:
            Tuple ``(output, state_h, state_c)`` as produced by the LSTM.
        """
        embedded = self.embedding(input_sequence)
        sequence_output, final_h, final_c = self.lstm(
            embedded, initial_state=states)
        return sequence_output, final_h, final_c

    def init_states(self, batch_size):
        """Return a pair of all-zero ``[batch_size, hidden_dim]`` tensors."""
        state_shape = [batch_size, self.hidden_dim]
        return (tf.zeros(state_shape), tf.zeros(state_shape))
    
class LuongAttention(tf.keras.Model):
    """Attention layer supporting the 'dot', 'general' and 'concat' scores.

    The layer scores a decoder output against every encoder output, turns the
    scores into weights with a softmax over the last axis, and returns the
    weighted sum of the encoder outputs as the context.
    """

    def __init__(self, rnn_size, attention_func):
        """Create the layers required by the selected score function.

        Args:
            rnn_size: Output width of the ``wa`` projection layer.
            attention_func: One of 'dot', 'general' or 'concat'.

        Raises:
            ValueError: If ``attention_func`` is not a supported name.
        """
        super(LuongAttention, self).__init__()
        self.attention_func = attention_func

        if attention_func not in ['dot', 'general', 'concat']:
            raise ValueError(
                'Attention score must be either dot, general or concat.')

        if attention_func == 'general':
            self.wa = tf.keras.layers.Dense(rnn_size)
        elif attention_func == 'concat':
            # 'concat' passed validation but previously created no layers, so
            # call() failed with AttributeError on self.wa / self.va.
            # score = va(tanh(wa([decoder_output; encoder_output])))
            self.wa = tf.keras.layers.Dense(rnn_size, activation='tanh')
            self.va = tf.keras.layers.Dense(1)

    def call(self, decoder_output, encoder_output):
        """Compute the context and the attention weights.

        Args:
            decoder_output: Decoder LSTM output. Assumed to be
                (batch, 1, hidden) - the 'concat' branch tiles it along
                axis 1 to the encoder length; TODO confirm with callers.
            encoder_output: Encoder LSTM outputs, assumed
                (batch, source_len, hidden).

        Returns:
            Tuple ``(context, alignment)``: the attention-weighted sum of
            ``encoder_output`` and the softmax-normalised scores.
        """
        if self.attention_func == 'dot':
            score = tf.matmul(decoder_output, encoder_output, transpose_b=True)
        elif self.attention_func == 'general':
            score = tf.matmul(decoder_output, self.wa(
                encoder_output), transpose_b=True)
        elif self.attention_func == 'concat':
            # Repeat the decoder output once per encoder position so the two
            # can be concatenated along the feature axis.
            decoder_output = tf.tile(
                decoder_output, [1, encoder_output.shape[1], 1])
            score = self.va(
                self.wa(tf.concat((decoder_output, encoder_output), axis=-1)))

            # Move the encoder-position axis last, matching the other branches.
            score = tf.transpose(score, [0, 2, 1])

        alignment = tf.keras.activations.softmax(score, axis=-1)

        context = tf.matmul(alignment, encoder_output)

        return context, alignment


class Decoder(tf.keras.Model):
    """LSTM decoder that attends over the encoder outputs at each call."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, attention_func):
        """Create the attention, embedding, LSTM and projection layers.

        Args:
            vocab_size: Rows in the embedding table and width of the logits.
            embedding_dim: Width of each embedding vector.
            hidden_dim: LSTM units; also the width of the ``wc`` layer.
            attention_func: Score function name given to ``LuongAttention``.
        """
        super().__init__()
        self.attention = LuongAttention(hidden_dim, attention_func)
        self.hidden_dim = hidden_dim
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim)
        self.lstm = tf.keras.layers.LSTM(
            units=hidden_dim, return_sequences=True, return_state=True)
        self.wc = tf.keras.layers.Dense(units=hidden_dim, activation='tanh')
        self.ws = tf.keras.layers.Dense(units=vocab_size)

    def call(self, input_sequence, state, encoder_output):
        """Run the decoder on ``input_sequence`` and produce vocabulary logits.

        Axis 1 of both the LSTM output and the context is squeezed away, so
        that axis must have length 1.

        Args:
            input_sequence: Token ids fed to the decoder.
            state: Initial LSTM state for this call.
            encoder_output: Encoder outputs to attend over.

        Returns:
            Tuple ``(logits, state_h, state_c, alignment)``.
        """
        embedded = self.embedding(input_sequence)
        rnn_output, state_h, state_c = self.lstm(embedded, initial_state=state)

        context, alignment = self.attention(rnn_output, encoder_output)

        # Join context and LSTM output along the feature axis.
        context_flat = tf.squeeze(context, 1)
        rnn_flat = tf.squeeze(rnn_output, 1)
        combined = tf.concat([context_flat, rnn_flat], 1)

        attentional = self.wc(combined)
        logits = self.ws(attentional)

        return logits, state_h, state_c, alignment
    
    
# Instantiate both halves of the model with the shared embedding/hidden sizes;
# each side gets the vocabulary size of its own language.
encoder = Encoder(
    vocab_size=num_words_inputs,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM)
decoder = Decoder(
    vocab_size=num_words_output,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    attention_func=ATTENTION_FUNC)
# Disabled: a single-token forward pass through encoder and decoder.
# initial_state = encoder.init_states(1)
# encoder_outputs = encoder(tf.constant([[1]]), initial_state)
# decoder_outputs = decoder(tf.constant(
#     [[1]]), encoder_outputs[1:], encoder_outputs[0])

print("complete")

complete


In [4]:
# Optimizer (gradient norm clipped at 5.0); tracked by the checkpoint object
# together with the encoder and decoder.
optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)
checkpoint_dir = './training_ckpt_seq2seq_att'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint. Restoring None loads nothing, so say so explicitly instead of
# silently running inference with freshly initialised weights.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
    print("WARNING: no checkpoint found in", checkpoint_dir,
          "- encoder/decoder keep their initial (untrained) weights.")
else:
    checkpoint.restore(latest_checkpoint_path)

def predict_seq2seq_att(input_text, input_max_len, tokenizer_inputs, word2idx_outputs, idx2word_outputs,
                        max_output_len=20):
    """Greedily decode a translation for one sentence with the global models.

    Uses the module-level ``encoder`` and ``decoder``. The input text and the
    decoded sentence are printed as a side effect.

    Args:
        input_text: Space-separated source tokens. If None, a random entry of
            the module-level ``input_data`` list is used.
        input_max_len: Length the source id sequence is post-padded to.
        tokenizer_inputs: Fitted tokenizer that maps source text to ids.
        word2idx_outputs: Target word -> id mapping; must contain '<sos>'.
        idx2word_outputs: Target id -> word mapping.
        max_output_len: Decoding stops once this many words were produced
            (default 20, the previously hard-coded limit). At least one word
            is always generated.

    Returns:
        Tuple ``(alignments, source_tokens, out_words)``: the per-step
        attention weights stacked into a numpy array, ``input_text`` split on
        single spaces, and the list of generated words.
    """
    if input_text is None:
        input_text = input_data[np.random.choice(len(input_data))]
    print(input_text)

    # Encode the (padded) source sentence as a batch of one.
    input_seq = tokenizer_inputs.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=input_max_len, padding='post')
    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(input_seq), en_initial_states)

    # The decoder starts from '<sos>' and the encoder's final LSTM state.
    de_input = tf.constant([[word2idx_outputs['<sos>']]])
    de_state_h, de_state_c = en_outputs[1:]

    out_words = []
    alignments = []

    while True:
        de_output, de_state_h, de_state_c, alignment = decoder(de_input, (de_state_h, de_state_c), en_outputs[0])
        # Greedy choice: the highest-scoring id becomes the next decoder input.
        de_input = tf.expand_dims(tf.argmax(de_output, -1), 0)
        predicted_index = int(de_input.numpy()[0][0])
        # The logits also cover ids that have no word in idx2word_outputs
        # (e.g. id 0); use a placeholder instead of raising KeyError.
        out_words.append(idx2word_outputs.get(predicted_index, '<unk>'))
        alignments.append(alignment.numpy())

        if out_words[-1] == '<eos>' or len(out_words) >= max_output_len:
            break

    print(' '.join(out_words))
    return np.array(alignments), input_text.split(' '), out_words

# Translate every test sentence and print it next to its reference target.
for i, (test_sent, reference) in enumerate(zip(test_input_data, test_target_data)):
    alignments, source, prediction = predict_seq2seq_att(
        test_sent, input_max_len, tokenizer_inputs, word2idx_outputs, idx2word_outputs)
    print("source:", source)
    print("prediction:", prediction)
    print("actual:", reference)
    print()

その 二人 の 先生 は 、 同じ 数 の 生徒 を 受け 持っ て い た 。
and the same ones ive seen it influenced is my favorite power of the same <eos>
source: ['その', '二人', 'の', '先生', 'は', '、', '同じ', '数', 'の', '生徒', 'を', '受け', '持っ', 'て', 'い', 'た', '。']
prediction: ['and', 'the', 'same', 'ones', 'ive', 'seen', 'it', 'influenced', 'is', 'my', 'favorite', 'power', 'of', 'the', 'same', '<eos>']
actual: The two teachers had an equal number of students. <eos>

ウェイトレス は ジュース を 私 の 前 に 置い た 。
i bought the pictures of the vending team on the cause i bought a threat of the physical <eos>
source: ['ウェイトレス', 'は', 'ジュース', 'を', '私', 'の', '前', 'に', '置い', 'た', '。']
prediction: ['i', 'bought', 'the', 'pictures', 'of', 'the', 'vending', 'team', 'on', 'the', 'cause', 'i', 'bought', 'a', 'threat', 'of', 'the', 'physical', '<eos>']
actual: The waitress set a glass of juice in front of me. <eos>

両者 の 間 に は 著しい 違い が ある 。
of them not with systems <eos>
source: ['両者', 'の', '間', 'に', 'は', '著しい', '違い', 'が', 'ある', '。']
prediction: ['of', 'the