In [9]:
import string
import re
import os
import sys
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model, Model
from keras.losses import sparse_categorical_crossentropy
from keras import optimizers
from transformers import MarianTokenizer

# import tensorflow_datasets as tfds
from datasets import Dataset, DatasetDict, load_dataset


import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

Fetch dataset (Kaggle)

In [10]:
# train_ds = pd.read_csv("/kaggle/input/iwslt2017-en-zh/train.csv")
# val_ds = pd.read_csv("/kaggle/input/iwslt2017-en-zh/validation.csv")
# test_ds  = pd.read_csv("/kaggle/input/iwslt2017-en-zh/test.csv")

Fetch dataset (Local)

In [11]:
dataset = load_dataset("iwslt2017", "iwslt2017-en-zh")
train_ds, valid_ds, test_ds = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

# first 10k rows
train_ds = train_ds.select(range(10000))
train_ds

Dataset({
    features: ['translation'],
    num_rows: 10000
})

In [12]:
# (train_ds, test_ds, val_ds), metadata = tfds.load('huggingface:iwslt2017/iwslt2017-en-zh', split=['train[:10000]', 'test', 'validation'], with_info=True)

# print("Dataset Structure:")
# print(metadata.features)

In [13]:
# print("Train: %s \nTest: %s \nValidation: %s" % (train_ds.cardinality().numpy(), test_ds.cardinality().numpy(), val_ds.cardinality().numpy()))

In [14]:
# Split en and zh
# texts_en = list()
# texts_zh = list()
# for elm in train_ds:
#     texts_en.append(elm['translation']['en'].numpy().decode("utf-8"))
#     texts_zh.append(elm['translation']['zh'].numpy().decode("utf-8"))

# print("Train Lists created")
# print("%s%s" % ("Size: ", len(texts_zh)))

In [15]:
texts_en = train_ds['en'].tolist()
texts_zh = train_ds['zh'].tolist()
print("%s%s" % ("Size: ", len(texts_zh)))

KeyError: "Column en not in the dataset. Current columns in the dataset: ['translation']"

In [None]:
# max_len_en = 0
# longest_en = ""
# for text in texts_en:
#     if len(text) > max_len_en:
#         max_len_en = len(text)
#         longest_en = text

# print("%s%s" % ("Max en sen len: ", max_len_en))
# print(longest_en)

In [None]:
# max_len_zh = 0
# longest_zh = ""
# for text in texts_zh:
#     if len(text) > max_len_zh:
#         max_len_zh = len(text)
#         longest_zh = text

# print("%s%s" % ("Max zh sen len: ", max_len_zh))
# print(longest_zh)

In [None]:
# Split en and zh
# test_texts_en = list()
# test_texts_zh = list()
# for elm in test_ds:
#     test_texts_en.append(elm['translation']['en'].numpy().decode("utf-8"))
#     test_texts_zh.append(elm['translation']['zh'].numpy().decode("utf-8"))

# print("Test Lists created")
# print("%s%s" % ("Size: ", len(test_texts_en)))

In [None]:
test_texts_en = test_ds['en'].tolist()
test_texts_zh = test_ds['zh'].tolist()
print("Test Lists created")
print("%s%s" % ("Size: ", len(test_texts_en)))

Test Lists created
Size: 8549


Tokenize

In [None]:
import sentencepiece as spm
from typing import List, Union

class LangTokeniser(object):
    PAD_ID = 3  # Defined as sentencepiece custom token

    def __init__(self, lang: str, model_file=None):
        self.model = spm.SentencePieceProcessor(model_file=model_file or f"./{lang}.model")
        self.special_ids = (
            self.model.unk_id(),
            LangTokeniser.PAD_ID,  # self.model.pad_id(), # this is -1 and may give errors.
            self.model.bos_id(),
            self.model.eos_id(),
        )
    
    def __len__(self):
        return len(self.model)
    
    def encode_no_padding(self, sent: Union[str, List[str]], max_len=None):
        ids = self.model.encode(sent)
        if max_len is not None and len(ids) > max_len:
            ids = ids[:max_len]
        return ids

    def encode_batch(self, sents: List[str], max_len=None):
        return [self.encode(sent, max_len) for sent in sents]

    def encode(self, sent: Union[str, List[str]], max_len=None):
        if isinstance(sent, list):
            return self.encode_batch(sent, max_len)
        ids = self.model.encode(sent)
        if max_len is not None:
            if len(ids) < max_len:
                ids.extend([LangTokeniser.PAD_ID] * (max_len - len(ids)))
            elif len(ids) > max_len:
                ids = ids[:max_len]
        return ids

    def decode(self, ids: List[int]):
        return self.model.decode([id for id in ids if 0 <= id < len(self) and id != LangTokeniser.PAD_ID])

    def decode_batch(self, ids: List[List[int]]):
        return [self.decode(id) for id in ids]

    def get_special_ids(self):
        UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = self.special_ids
        return UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX


class BaseBPETokeniser(object):
    """
    The class to tokenise input English sentences, and decode output Chinese Vocab IDs.

    Examples:
    ```py
    from tokenisation.sentencepiece_custom import BaseBPETokeniser

    tokeniser = BaseBPETokeniser()
    # or initialise with the model files in a separate path:
    tokeniser = BaseBPETokeniser(en_model_file="/path/to/en.model", zh_model_file="/path/to/zh.model")

    row = dataset[0]['translation']

    # Tokenise and truncate to max length of 512 for both.
    inputs = tokeniser(row['en'], text_target=row['zh'], max_len=512)
    # {
    #     'input_ids': [...],       # The English IDs
    #     'attention_mask': [...],
    #     'labels': [...]           # The Chinese IDs
    # }

    # should generate the Chinese tokens output.
    translated = tokeniser.decode(ids)

    ```
    """

    def __init__(self, en_model_file=None, zh_model_file=None):
        self.en_model = LangTokeniser("en", model_file=en_model_file)
        self.zh_model = LangTokeniser("zh", model_file=zh_model_file)

    def __len__(self):
        """
        Both the english and chinese tokenisers have the same length.
        """
        return len(self.en_model)

    def __call__(self, sent: str, text_target=None, max_len=128, max_zh_len=None):
        out = {
            "input_ids": self.en_model.encode(sent, max_len=max_len),
            "attention_mask": [1] * max_len,
        }
        if text_target:
            out["labels"] = self.zh_model.encode(
                text_target, max_len=max_zh_len or max_len
            )
        return out

    def encode_zh(self, sent: str, max_len=128):
        return self.zh_model.encode(sent, max_len=max_len)

    def encode_en(self, sent: str, max_len=128):
        return self.en_model.encode(sent, max_len=max_len)
    
    def decode_zh(self, labels: list[int]):
        return self.zh_model.decode(labels)

    def decode_zh_batch(self, labels: List[List[int]]):
        return self.zh_model.decode_batch(labels)
    
    def decode_en(self, labels: list[int]):
        return self.en_model.decode(labels)

    def decode_en_batch(self, labels: list[int]):
        return self.en_model.decode_batch(labels)
    
    def get_special_ids(self, lang: str):
        if lang == "en":
            return self.en_model.get_special_ids()
        elif lang == "zh":
            return self.zh_model.get_special_ids()

    def encode_en_no_padding(self, sent: str, max_len=None):
        return self.en_model.encode_no_padding(sent, max_len=max_len)

    def encode_zh_no_padding(self, sent: str, max_len=None):
        return self.zh_model.encode_no_padding(sent, max_len=max_len)

In [None]:
# sys.path.append(os.path.abspath('../../tokenisation/sentencepiece_custom'))
en_model_absolute_path = os.path.abspath('/kaggle/input/sentencepiece-models/en.model')
zh_model_absolute_path = os.path.abspath('/kaggle/input/sentencepiece-models/zh.model')

In [None]:
tokenizer = BaseBPETokeniser(en_model_file=en_model_absolute_path, zh_model_file=zh_model_absolute_path)

# If without padding
# encoded_en_no_padding = tokenizer.encode_en_no_padding("This is", max_len=3)
# print("Encoded en:", encoded_en_no_padding)
# encoded_zh_no_padding = tokenizer.encode_zh_no_padding("这是", max_len=3)
# print("Encoded zh:", encoded_zh_no_padding)

# # Decoding (same) 
# decoded_en = tokenizer.decode_src(encoded_en_no_padding)
# decoded_zh = tokenizer.decode(encoded_zh_no_padding)

# decoded_en, decoded_zh

In [None]:
# def tokenization(lines, is_char_level):
#     tokenizer = Tokenizer(char_level = is_char_level)
#     tokenizer.fit_on_texts(lines)
#     return tokenizer

Retrieve vocab from EN and CN files

In [None]:
# get the en vocab size
def read_vocab_file(vocab_file_path):
    vocab = {}
    with open(vocab_file_path, 'r', encoding='utf-8') as f:
        index = 0
        for line in f:
            token, ignore = line.strip().split()  # Assuming tokens and indices are separated by space
            vocab[token] = index 
            index += 1
    return vocab
# retrieve en vocab
en_vocab_file = "/kaggle/input/sentencepiece-vocabs/en.vocab"
en_vocab = read_vocab_file(en_vocab_file)
# retrieve zh vocab
zh_vocab_file = "/kaggle/input/sentencepiece-vocabs/zh.vocab"
zh_vocab = read_vocab_file(zh_vocab_file)

In [None]:
# encode and pad sequences
# def encode_sequences(tokenizer, length, lines):
#     seq = tokenizer.texts_to_sequences(lines)
#     # pad sequences with 0 values
#     seq = pad_sequences(seq, maxlen=length, padding='post')
#     return seq

In [None]:
# encode english
max_len_en = len(max(tokenizer.encode_en_no_padding(texts_en), key=len))
# encode chinese
max_len_zh = len(max(tokenizer.encode_zh_no_padding(texts_zh), key=len))
max_len_en, max_len_zh

(211, 285)

In [None]:
# encode english
en_outputs = tokenizer.encode_en(texts_en, max_len=max_len_en)
# encode chinese
zh_outputs = tokenizer.encode_zh(texts_en, max_len=max_len_zh)
# prepare training data
# trainX = encode_sequences(en_tokenizer, en_length, texts_en)
# trainY = encode_sequences(zh_tokenizer, zh_length, texts_zh)


In [None]:
# len(trainX[0])

In [None]:
# prepare test data
# testX = encode_sequences(en_tokenizer, en_length, test_texts_en)
# testY = encode_sequences(zh_tokenizer, zh_length, test_texts_zh)


Build model

In [None]:
# build NMT model
# def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
#     model = Sequential()
#     model.add(Embedding(in_vocab, units, mask_zero=True))
#     model.add(LSTM(units))
#     model.add(RepeatVector(out_timesteps))
#     model.add(LSTM(units, return_sequences=True))
#     model.add(Dense(out_vocab, activation='softmax'))
#     return model

In [None]:
# model compilation
# model = define_model(len(en_vocab), len(zh_vocab), max_len_en, max_len_zh, 512)

In [None]:
# rms = optimizers.RMSprop(learning_rate=0.001)
# model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')
# model.summary()

Build model (tutorial)

In [None]:
input_sequence = Input(shape=(max_len_en,))
embedding = Embedding(input_dim=len(en_vocab), output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_len_zh)(encoder)
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(Dense(len(zh_vocab)))(decoder)

In [None]:
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam

model = Model(input_sequence, Activation('softmax')(logits))
model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(1e-3),
              metrics=['accuracy'])
model.summary()

Train the model

In [None]:
en_outputs = array(en_outputs)
zh_outputs = array(zh_outputs)

In [None]:
en_outputs.shape, zh_outputs.shape

In [None]:
import time
filename = '/kaggle/working/' + str(time.time()) + "_model.l5.07.keras"

checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# train model
history = model.fit(en_outputs, zh_outputs.reshape(zh_outputs.shape[0], zh_outputs.shape[1], 1),
                    epochs=15, batch_size=32, validation_split = 0.2,callbacks=[checkpoint],
                    verbose=1)

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train','validation'])
plt.show()

Load Model

In [None]:
import shutil

# Specify the paths to the input and working directories
input_file_path = '../input/200k-trained-model/1712672287.5377033_model.l5.07.keras'
output_file_path = '/kaggle/working/200k_model.l5.07.keras'

# Copy the file from the input directory to the working directory
shutil.copyfile(input_file_path, output_file_path)


In [None]:
# from keras.models import load_model

trainedModel = load_model("/kaggle/working/200k_model.l5.07.keras")

In [None]:
# en_outputs_list = en_outputs.tolist()
# zh_outputs_list = zh_outputs.tolist()
# check translation
example = en_outputs[40:41]
print(example)
print(tokenizer.decode_en_batch(example))
print(texts_en[40])

In [None]:
list(zh_vocab.items())[:5]

In [None]:
def logits_to_sentence(logits, vocab):

    index_to_words = {idx: word for word, idx in vocab.items()}
#     index_to_words[0] = '<empty>' 

    return ' '.join([index_to_words[prediction] for prediction in argmax(logits, 1)])

index = 221000
print("The english sentence is: {}".format(texts_en[index]))
print("The chinese sentence is: {}".format(texts_zh[index]))
print('The predicted sentence is :')
print(logits_to_sentence(trainedModel.predict(array(en_outputs[index:index+1]))[0], zh_vocab))

Generate JSON of predictions

In [None]:
# import jieba
# import json
# import torch
# from bert_score import score
# from rouge_chinese import Rouge
# from sacrebleu.metrics import BLEU, CHRF, TER

In [None]:
data = {}
# loop through first 10 test set
results = logits_to_sentence(trainedModel.predict(array(en_outputs[index:index+10]))[0], zh_vocab)
print(results)