<a href="https://colab.research.google.com/github/Taishi-N324/CPP/blob/main/rnn_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

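**ランタイムを GPU 使用に指定してください** (Set the Colab runtime type to GPU before running: Runtime → Change runtime type → GPU.)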

In [1]:
! git clone https://github.com/Taishi-N324/light_enja2.git

Cloning into 'light_enja2'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 56 (delta 22), reused 33 (delta 8), pack-reused 0[K
Unpacking objects: 100% (56/56), done.


In [2]:
import random
import tensorflow as tf
import string
import re
from tensorflow import keras
import datetime, os
import csv
import numpy as np
import random
import pickle
import sys
from tensorflow.keras import layers
from google.colab import files
from __future__ import absolute_import, division, print_function, unicode_literals

# Best-effort request for TensorFlow 2.x: any exception raised while running
# the magic is swallowed so the rest of the notebook still runs.
# NOTE(review): this executes after `import tensorflow as tf` earlier in this
# cell; presumably the magic has to run before TensorFlow is imported to have
# any effect -- confirm, and move it above the import if so.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Deliberately ignored: outside Colab the magic is not available.
  pass


In [3]:
# Print the working directory; the relative corpus path used in the next cell
# ("light_enja2/corpus/") is resolved against it.
! pwd

/content


In [4]:
file_dir = "light_enja2/corpus/"


def read_corpus_lines(path):
    """Return the lines of a UTF-8 text file without their trailing newline.

    Iterating over the file keeps the last sentence even when the file does
    not end with a newline, and yields no spurious empty entry when it does.
    The encoding is explicit so that reading the Japanese text does not depend
    on the platform's default locale.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one string per line of the file.
    """
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


# Line i of train.en is paired with line i of train.ja further down.
lines_en = read_corpus_lines(file_dir + "train.en")
lines_ja = read_corpus_lines(file_dir + "train.ja")

In [5]:
# Upper bound on the number of sentence pairs taken from the corpus.
max_pairs = 100000

# Each pair is (English sentence, Japanese sentence wrapped in the "[start]" /
# "[end]" markers). zip() stops at the shorter input, so a corpus with fewer
# than max_pairs lines yields fewer pairs instead of raising IndexError.
text_pairs = [
    (en, "[start] " + ja + " [end]")
    for en, ja in zip(lines_en[:max_pairs], lines_ja[:max_pairs])
]

In [6]:
# A dedicated, seeded generator makes the split identical on every
# "Restart & Run All" without touching the global `random` state.
split_seed = 1234
random.Random(split_seed).shuffle(text_pairs)

# 15% validation, 15% test, and the remainder for training.
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
val_end = num_train_samples + num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:val_end]
test_pairs = text_pairs[val_end:]

In [7]:
# --- Text preprocessing ---
# max_tokens of both TextVectorization layers; also the input size of both
# Embedding layers and the width of the output softmax.
vocab_size = 20_000
# Token length of the vectorized source; the target is vectorized to
# sequence_length + 1.
sequence_length = 30

# --- Input pipeline ---
batch_size = 64      # sentence pairs per dataset batch

# --- Model dimensions ---
embed_dim = 256      # output size of the Embedding layers
latent_dim = 1024    # units of the encoder and decoder GRUs

In [8]:
# ASCII punctuation to delete from the target text. The square brackets are
# kept out of the set so the "[start]" / "[end]" markers survive
# standardization.
strip_chars = string.punctuation.translate(str.maketrans("", "", "[]"))

def custom_standardization(input_string):
    """Lowercase the text and delete every character listed in ``strip_chars``.

    Passed as the ``standardize`` callable of the target ``TextVectorization``
    layer.

    Args:
        input_string: String tensor to standardize.

    Returns:
        The result of ``tf.strings.regex_replace`` on the lowercased input.
    """
    # Character class matching any single character of strip_chars.
    strip_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")

# English side: integer token ids, padded/truncated to sequence_length, using
# the layer's default standardization.
source_vectorization = layers.TextVectorization(
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)

# Japanese side: one extra position, because the sequence is later split into
# a decoder input (all but the last token) and a target (all but the first).
target_vectorization = layers.TextVectorization(
    standardize=custom_standardization,
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=sequence_length + 1,
)

# Both vocabularies are learned from the training split only.
train_english_texts = [en_text for en_text, _ in train_pairs]
train_japanese_texts = [ja_text for _, ja_text in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_japanese_texts)

In [9]:
def format_dataset(eng, ja):
    """Vectorize a batch of sentence pairs into ``(inputs, targets)``.

    Args:
        eng: Batch of English sentences.
        ja: Batch of Japanese sentences.

    Returns:
        A tuple ``(features, next_tokens)`` where ``features`` is a dict with
        the keys ``"english"`` (vectorized source) and ``"japanese"``
        (vectorized target without its last position), and ``next_tokens`` is
        the vectorized target without its first position.
    """
    source_ids = source_vectorization(eng)
    target_ids = target_vectorization(ja)
    # The decoder reads tokens 0..n-1 and is trained to emit tokens 1..n.
    decoder_inputs = target_ids[:, :-1]
    next_tokens = target_ids[:, 1:]
    features = {"english": source_ids, "japanese": decoder_inputs}
    return features, next_tokens

def make_dataset(pairs, shuffle=True):
    """Build a batched, vectorized ``tf.data`` pipeline from sentence pairs.

    Args:
        pairs: Non-empty sequence of ``(english_text, japanese_text)`` tuples.
        shuffle: When True (the default), batches are reshuffled on every pass
            over the dataset. Pass False where iteration order is irrelevant.

    Returns:
        A dataset yielding the output of ``format_dataset`` for each batch of
        ``batch_size`` pairs.
    """
    eng_texts, ja_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(ja_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # cache() replays exactly the elements it saw on the first pass, so it has
    # to come before shuffle(); otherwise every epoch would reuse the first
    # epoch's order. Vectorization is thereby also computed only once.
    dataset = dataset.cache()
    if shuffle:
        # Batching happened above, so this shuffles whole batches.
        dataset = dataset.shuffle(2048)
    # prefetch() goes last so it overlaps training with producing the batches
    # that are actually consumed.
    return dataset.prefetch(16)


train_ds = make_dataset(train_pairs)
# Validation metrics do not depend on batch order, so skip the shuffle buffer.
val_ds = make_dataset(val_pairs, shuffle=False)

In [10]:
# Sanity check: pull a single batch and print the shapes of the tensors that
# will be fed to the model.
for inputs, targets in train_ds.take(1):
    for feature_name in ("english", "japanese"):
        print(f"inputs['{feature_name}'].shape: {inputs[feature_name].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (64, 30)
inputs['japanese'].shape: (64, 30)
targets.shape: (64, 30)


In [11]:
# Encoder: variable-length English token ids -> one vector per sentence.
source = keras.Input(shape=(None,), dtype="int64", name="english")
embedded_source = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
# merge_mode="sum" combines the forward and backward GRU outputs by summing
# them; the result is used as the decoder GRU's initial state.
encoder_gru = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")
encoded_source = encoder_gru(embedded_source)

In [12]:
# Decoder: reads the Japanese tokens produced so far and, starting from the
# encoder's sentence vector, predicts a distribution over the next token at
# every position.
past_target = keras.Input(shape=(None,), dtype="int64", name="japanese")
embedded_target = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
decoder_outputs = decoder_gru(embedded_target, initial_state=encoded_source)
regularized_outputs = layers.Dropout(0.5)(decoder_outputs)
target_next_step = layers.Dense(vocab_size, activation="softmax")(regularized_outputs)

# End-to-end model: (English ids, previous Japanese ids) -> next-token probabilities.
seq2seq_rnn = keras.Model(inputs=[source, past_target], outputs=target_next_step)

In [None]:
seq2seq_rnn.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
log = seq2seq_rnn.fit(train_ds, 
                      epochs=25, 
                      validation_data=val_ds, 
                      callbacks=[tensorboard_callback])

%tensorboard --logdir logs/fit

Epoch 1/25
Epoch 2/25
 192/1094 [====>.........................] - ETA: 1:34 - loss: 1.5196 - accuracy: 0.3146

In [None]:
# Token id -> vocabulary string, for turning predicted ids back into text.
ja_vocab = target_vectorization.get_vocabulary()
ja_index_lookup = dict(enumerate(ja_vocab))

# Upper bound on the number of tokens generated for one sentence.
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    """Greedily translate one English sentence with ``seq2seq_rnn``.

    Starting from ``"[start]"``, the text decoded so far is re-vectorized and
    fed back into the model on every step. The most probable token at
    position ``i`` is appended, until ``"[end]"`` is produced or
    ``max_decoded_sentence_length`` tokens have been generated.

    Args:
        input_sentence: English sentence to translate.

    Returns:
        The decoded string, including the leading ``"[start]"`` and, if it was
        generated, the trailing ``"[end]"``.
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        # verbose=0: predict() is called once per generated token, and its
        # default progress bar would bury the translations in log lines.
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
        # Output position i holds the prediction for the token that follows
        # the i-th token of the text decoded so far.
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = ja_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

# Spot check: translate ten randomly chosen English sentences from the
# held-out test split and print each source next to its decoded output.
test_eng_texts = [en_text for en_text, _ja_text in test_pairs]
for _ in range(10):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))