In [None]:
import string
import re
import os
import sys
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model, Model
from keras.losses import sparse_categorical_crossentropy
from keras import optimizers
from transformers import MarianTokenizer

import tensorflow_datasets as tfds
from datasets import Dataset, DatasetDict, load_dataset


import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

Fetch dataset

In [None]:
# Read at most 10,000 rows from each IWSLT2017 en-zh split, in the order
# train -> validation -> test.
train_ds, val_ds, test_ds = (
    pd.read_csv(csv_path, nrows=10000)
    for csv_path in (
        "/kaggle/input/iwslt2017-en-zh/train.csv",
        "/kaggle/input/iwslt2017-en-zh/validation.csv",
        "/kaggle/input/iwslt2017-en-zh/test.csv",
    )
)

In [None]:
# (train_ds, test_ds, val_ds), metadata = tfds.load('huggingface:iwslt2017/iwslt2017-en-zh', split=['train[:10000]', 'test', 'validation'], with_info=True)

# print("Dataset Structure:")
# print(metadata.features)

In [None]:
# print("Train: %s \nTest: %s \nValidation: %s" % (train_ds.cardinality().numpy(), test_ds.cardinality().numpy(), val_ds.cardinality().numpy()))

In [None]:
# Split en and zh
# texts_en = list()
# texts_zh = list()
# for elm in train_ds:
#     texts_en.append(elm['translation']['en'].numpy().decode("utf-8"))
#     texts_zh.append(elm['translation']['zh'].numpy().decode("utf-8"))

# print("Train Lists created")
# print("%s%s" % ("Size: ", len(texts_zh)))

In [None]:
# Pull the parallel training sentences out of the frame as plain Python lists.
texts_en, texts_zh = (train_ds[lang].tolist() for lang in ('en', 'zh'))
print("Size: " + str(len(texts_zh)))

Size: 10000


In [None]:
# Longest English training sentence, measured in characters (len of the str).
# `max` keeps the first sentence among ties; an empty list yields "" / 0.
longest_en = max(texts_en, key=len, default="")
max_len_en = len(longest_en)

print("Max en sen len: " + str(max_len_en))
print(longest_en)

Max en sen len: 507
And these couple clips take you inside of two of the most difficult conflicts that we're faced with today. [The last 48 hours of two Palestinian suicide bombers.] [Paradise Now] [Man: As long as there is injustice, someone must make a sacrifice!] [Woman: That's no sacrifice, that's revenge!] [If you kill, there's no difference between victim and occupier.] [Man: If we had airplanes, we wouldn't need martyrs, that's the difference.] [Woman: The difference is that the Israeli military is still stronger.]


In [None]:
# Longest Chinese training sentence, measured in characters (len of the str).
# `max` keeps the first sentence among ties; an empty list yields "" / 0.
longest_zh = max(texts_zh, key=len, default="")
max_len_zh = len(longest_zh)

print("Max zh sen len: " + str(max_len_zh))
print(longest_zh)

Max zh sen len: 296
其它叫法还有powder-box, derriere, pooky, poochy, poopy poopaloo, pooninana, padepachetchki, pow, peach  另外还可以别称作toadie, dee dee, nishi, dignity, coochie, snocher, cooter labi, gladis siegelman, va, wee-wee, whore-spot, nappy dugout mungo, ghoulie, powder-box, 在迈阿密叫mimi 在费城叫split knish, 在布朗克斯区叫schmende


In [None]:
# Split en and zh
# test_texts_en = list()
# test_texts_zh = list()
# for elm in test_ds:
#     test_texts_en.append(elm['translation']['en'].numpy().decode("utf-8"))
#     test_texts_zh.append(elm['translation']['zh'].numpy().decode("utf-8"))

# print("Test Lists created")
# print("%s%s" % ("Size: ", len(test_texts_en)))

In [None]:
# Pull the parallel test sentences out of the frame as plain Python lists.
test_texts_en, test_texts_zh = (test_ds[lang].tolist() for lang in ('en', 'zh'))
print("Test Lists created")
print("Size: " + str(len(test_texts_en)))

Test Lists created
Size: 8549


Tokenize

In [None]:
def tokenization(lines, is_char_level):
    """Create a Keras ``Tokenizer`` and fit its vocabulary on ``lines``.

    Args:
        lines: texts handed to ``Tokenizer.fit_on_texts``.
        is_char_level: forwarded as the tokenizer's ``char_level`` flag.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer(char_level=is_char_level)
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# prepare en tokenizer (word level: char_level=False)
en_tokenizer = tokenization(texts_en, False)

# NOTE(review): max_len_en is a character count, while this tokenizer works on
# words, so encoded English rows are padded well past the longest tokenized
# sentence. Left unchanged because the model input shape depends on it —
# confirm before shrinking.
en_length = max_len_en

# One extra slot on top of the fitted vocabulary for index 0, which is used as
# the padding value when sequences are encoded.
en_vocab_size = 1 + len(en_tokenizer.word_index)
print('en Vocabulary Size: %d' % en_vocab_size)

en Vocabulary Size: 12214


In [None]:
# prepare zh tokenizer (character level: char_level=True)
zh_tokenizer = tokenization(texts_zh, True)

# Target sequence length: character count of the longest Chinese sentence.
zh_length = max_len_zh

# One extra slot on top of the fitted vocabulary for index 0, which is used as
# the padding value when sequences are encoded.
zh_vocab_size = 1 + len(zh_tokenizer.word_index)
print('zh Vocabulary Size: %d' % zh_vocab_size)

zh Vocabulary Size: 3044


In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Convert texts to integer id sequences of a fixed length.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: target sequence length, passed to ``pad_sequences`` as ``maxlen``.
        lines: texts to encode.

    Returns:
        The result of ``pad_sequences``: shorter sequences are filled with 0 at
        the end (``padding='post'``); longer ones are cut using the function's
        default truncation mode.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [None]:
from sklearn.model_selection import train_test_split
# prepare training data: English ids are the model input, Chinese ids the target
trainX, trainY = (
    encode_sequences(en_tokenizer, en_length, texts_en),
    encode_sequences(zh_tokenizer, zh_length, texts_zh),
)


In [None]:
# Length of the first encoded training row, i.e. the padded sequence length
# produced by encode_sequences for the English side.
len(trainX[0])

507

In [None]:
# prepare test data, using the tokenizers and lengths fitted on the training set
testX, testY = (
    encode_sequences(en_tokenizer, en_length, test_texts_en),
    encode_sequences(zh_tokenizer, zh_length, test_texts_zh),
)


Build model

In [None]:
# # build NMT model
# def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
#     model = Sequential()
#     model.add(Embedding(in_vocab, units, mask_zero=True))
#     model.add(LSTM(units))
#     model.add(RepeatVector(out_timesteps))
#     model.add(LSTM(units, return_sequences=True))
#     model.add(Dense(out_vocab, activation='softmax'))
#     return model

In [None]:
# model compilation
# model = define_model(en_vocab_size, zh_vocab_size, en_length, zh_length, 512)

In [None]:
# rms = optimizers.RMSprop(lr=0.001)
# model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

Build model (tutorial)

In [None]:
# Encoder-decoder graph built with the Keras functional API.
# Input: one row of en_length integer token ids per sample.
input_sequence = Input(shape=(en_length,))
# Map each token id to a 128-dimensional embedding vector.
# NOTE(review): mask_zero is not set, so padding ids (0) are presumably
# processed like real tokens — confirm this is intended.
embedding = Embedding(input_dim=en_vocab_size, output_dim=128,)(input_sequence)
# Encoder LSTM (64 units); return_sequences=False keeps only the final output.
encoder = LSTM(64, return_sequences=False)(embedding)
# Repeat the encoder output zh_length times so the decoder gets one copy per
# target timestep.
r_vec = RepeatVector(zh_length)(encoder)
# Decoder LSTM (64 units) emitting an output at every target timestep.
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
# Per-timestep dense projection to zh_vocab_size scores; softmax is applied
# where the Model is assembled.
logits = TimeDistributed(Dense(zh_vocab_size))(decoder)

In [None]:
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam

# Wrap the graph in a Model whose output is the softmax over the per-timestep
# vocabulary scores, then compile it with integer-label cross-entropy.
model = Model(
    inputs=input_sequence,
    outputs=Activation('softmax')(logits),
)
model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
model.summary()

Train the model

In [None]:
import time

# The current timestamp is part of the checkpoint file name, so it differs
# from run to run.
filename = ''.join(['/kaggle/working/', str(time.time()), "_model.l5.07.keras"])

# Write the model to `filename` only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# train model
# Targets get a trailing axis of size 1: (rows, timesteps) -> (rows, timesteps, 1).
history = model.fit(
    trainX,
    trainY.reshape((*trainY.shape[:2], 1)),
    batch_size=512,
    epochs=30,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

In [None]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.legend(['train','validation'])
# plt.show()

Load Model

In [None]:
import shutil

# Source: previously saved model file under the notebook's input directory.
input_file_path = '../input/model-kerastokenizer-v1/1712642755.4982576_model.l5.07.keras'
# Destination: the same file name inside the working directory.
output_file_path = '/kaggle/working/1712642755.4982576_model.l5.07.keras'

# Duplicate the model file so it can be loaded from the working directory.
shutil.copyfile(src=input_file_path, dst=output_file_path)


In [None]:
from keras.models import load_model

# Load the previously saved Keras model from the working-directory copy.
# NOTE(review): this is a stored artifact, not the `model` trained in this
# session — presumably it was trained with the same tokenizers and sequence
# lengths; confirm before comparing predictions.
trainedModel = load_model("/kaggle/working/1712642755.4982576_model.l5.07.keras")

In [None]:
# check translation
# Decode one padded training row (index 40) back to text as a sanity check of
# the tokenizer round trip. The former bare `trainX[40:41]` statement was
# dropped: its value was discarded, because a cell only displays its last
# expression.
en_tokenizer.sequences_to_texts(trainX[40:41])

['every time the tide comes in and out you find some more shells']

In [None]:
def logits_to_sentence(logits, tokenizer, skip_empty=False):
    """Decode per-timestep class scores into a space-separated string.

    Args:
        logits: 2-D array-like of scores with one row per timestep and one
            column per vocabulary index; the arg-max is taken along axis 1.
        tokenizer: object exposing ``word_index``, a mapping of token -> int id.
        skip_empty: when True, timesteps whose predicted index is 0 (the
            padding slot, rendered as ``'<empty>'``) are left out of the
            result. Defaults to False, which keeps every timestep.

    Returns:
        The decoded tokens joined by single spaces. An index that has no entry
        in ``tokenizer.word_index`` is rendered as ``'<unk>'`` instead of
        raising ``KeyError``; this can happen when the model was trained with
        a larger vocabulary than the tokenizer passed in.
    """
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>'

    words = []
    for prediction in argmax(logits, 1):
        prediction = int(prediction)  # plain int key, independent of the numpy dtype
        if skip_empty and prediction == 0:
            continue
        words.append(index_to_words.get(prediction, '<unk>'))
    return ' '.join(words)

# Qualitative check on a single test example: print the source sentence, the
# reference translation, and the loaded model's decoded prediction.
index = 800
print("The english sentence is: {}".format(test_texts_en[index]))
print("The chinese sentence is: {}".format(test_texts_zh[index]))
print('The predicted sentence is :')
# predict() receives a batch holding one row; [0] selects that row's
# per-timestep scores for decoding.
print(logits_to_sentence(trainedModel.predict(testX[index:index+1])[0], zh_tokenizer))

The english sentence is: Can I be honest?
The chinese sentence is: 我可以坦诚点吗？
The predicted sentence is :
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
我 <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empt

In [None]:
# a = trainedModel.predict(trainX[index:index+1])[0]
# a