In [2]:
import string
import re
import os
import sys
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model, Model
from keras.losses import sparse_categorical_crossentropy
from keras import optimizers
from transformers import MarianTokenizer

# import tensorflow_datasets as tfds
from datasets import Dataset, DatasetDict, load_dataset


import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("iwslt2017", "iwslt2017-en-zh")
train_ds, valid_ds, test_ds = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

# first 10k rows
train_ds = train_ds.select(range(10000))
train_ds

Dataset({
    features: ['translation'],
    num_rows: 10000
})

In [4]:
en_zh_test = [[data['translation']['en'], data['translation']['zh']] for data in test_ds]
en_test = [pair[0] for pair in en_zh_test]
zh_test = [pair[1] for pair in en_zh_test]

Encode test set

In [5]:
import sentencepiece as spm
from typing import List, Union

class LangTokeniser(object):
    PAD_ID = 3  # Defined as sentencepiece custom token

    def __init__(self, lang: str, model_file=None):
        self.model = spm.SentencePieceProcessor(model_file=model_file or f"./{lang}.model")
        self.special_ids = (
            self.model.unk_id(),
            LangTokeniser.PAD_ID,  # self.model.pad_id(), # this is -1 and may give errors.
            self.model.bos_id(),
            self.model.eos_id(),
        )
    
    def __len__(self):
        return len(self.model)
    
    def encode_no_padding(self, sent: Union[str, List[str]], max_len=None):
        ids = self.model.encode(sent)
        if max_len is not None and len(ids) > max_len:
            ids = ids[:max_len]
        return ids

    def encode_batch(self, sents: List[str], max_len=None):
        return [self.encode(sent, max_len) for sent in sents]

    def encode(self, sent: Union[str, List[str]], max_len=None):
        if isinstance(sent, list):
            return self.encode_batch(sent, max_len)
        ids = self.model.encode(sent)
        if max_len is not None:
            if len(ids) < max_len:
                ids.extend([LangTokeniser.PAD_ID] * (max_len - len(ids)))
            elif len(ids) > max_len:
                ids = ids[:max_len]
        return ids

    def decode(self, ids: List[int]):
        return self.model.decode([id for id in ids if 0 <= id < len(self) and id != LangTokeniser.PAD_ID])

    def decode_batch(self, ids: List[List[int]]):
        return [self.decode(id) for id in ids]

    def get_special_ids(self):
        UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = self.special_ids
        return UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX


class BaseBPETokeniser(object):
    """
    The class to tokenise input English sentences, and decode output Chinese Vocab IDs.

    Examples:
    ```py
    from tokenisation.sentencepiece_custom import BaseBPETokeniser

    tokeniser = BaseBPETokeniser()
    # or initialise with the model files in a separate path:
    tokeniser = BaseBPETokeniser(en_model_file="/path/to/en.model", zh_model_file="/path/to/zh.model")

    row = dataset[0]['translation']

    # Tokenise and truncate to max length of 512 for both.
    inputs = tokeniser(row['en'], text_target=row['zh'], max_len=512)
    # {
    #     'input_ids': [...],       # The English IDs
    #     'attention_mask': [...],
    #     'labels': [...]           # The Chinese IDs
    # }

    # should generate the Chinese tokens output.
    translated = tokeniser.decode(ids)

    ```
    """

    def __init__(self, en_model_file=None, zh_model_file=None):
        self.en_model = LangTokeniser("en", model_file=en_model_file)
        self.zh_model = LangTokeniser("zh", model_file=zh_model_file)

    def __len__(self):
        """
        Both the english and chinese tokenisers have the same length.
        """
        return len(self.en_model)

    def __call__(self, sent: str, text_target=None, max_len=128, max_zh_len=None):
        out = {
            "input_ids": self.en_model.encode(sent, max_len=max_len),
            "attention_mask": [1] * max_len,
        }
        if text_target:
            out["labels"] = self.zh_model.encode(
                text_target, max_len=max_zh_len or max_len
            )
        return out

    def encode_zh(self, sent: str, max_len=128):
        return self.zh_model.encode(sent, max_len=max_len)

    def encode_en(self, sent: str, max_len=128):
        return self.en_model.encode(sent, max_len=max_len)
    
    def decode_zh(self, labels: list[int]):
        return self.zh_model.decode(labels)

    def decode_zh_batch(self, labels: List[List[int]]):
        return self.zh_model.decode_batch(labels)
    
    def decode_en(self, labels: list[int]):
        return self.en_model.decode(labels)

    def decode_en_batch(self, labels: list[int]):
        return self.en_model.decode_batch(labels)
    
    def get_special_ids(self, lang: str):
        if lang == "en":
            return self.en_model.get_special_ids()
        elif lang == "zh":
            return self.zh_model.get_special_ids()

    def encode_en_no_padding(self, sent: str, max_len=None):
        return self.en_model.encode_no_padding(sent, max_len=max_len)

    def encode_zh_no_padding(self, sent: str, max_len=None):
        return self.zh_model.encode_no_padding(sent, max_len=max_len)

In [6]:
en_model_absolute_path = os.path.abspath('../../tokenisation/sentencepiece_custom/en.model')
zh_model_absolute_path = os.path.abspath('../../tokenisation/sentencepiece_custom/zh.model')

In [7]:
tokenizer = BaseBPETokeniser(en_model_file=en_model_absolute_path, zh_model_file=zh_model_absolute_path)

In [8]:
# get the en vocab size
def read_vocab_file(vocab_file_path):
    vocab = {}
    with open(vocab_file_path, 'r', encoding='utf-8') as f:
        index = 0
        for line in f:
            token, ignore = line.strip().split()  # Assuming tokens and indices are separated by space
            vocab[token] = index 
            index += 1
    return vocab
# retrieve en vocab
en_vocab_file = "../../tokenisation/sentencepiece_custom/en.vocab"
en_vocab = read_vocab_file(en_vocab_file)
# retrieve zh vocab
zh_vocab_file = "../../tokenisation/sentencepiece_custom/zh.vocab"
zh_vocab = read_vocab_file(zh_vocab_file)

In [9]:
# encode english
max_len_en = len(max(tokenizer.encode_en_no_padding(en_test), key=len))
# encode chinese
max_len_zh = len(max(tokenizer.encode_zh_no_padding(zh_test), key=len))
max_len_en, max_len_zh

(113, 133)

In [10]:
# encode english
en_outputs = tokenizer.encode_en(en_test, max_len=211)
# encode chinese
zh_outputs = tokenizer.encode_zh(zh_test, max_len=285)

In [11]:
import keras
print(keras.__version__)

3.1.1


Load model

In [12]:
trainedModel = load_model(r"C:\Users\glenl\OneDrive - National University of Singapore\Documents\NUS\Current semester\CS4248\4248project\models\lstm\savedModels\1712672287.5377033_model.l5.07.keras")

In [None]:
# zh_vocab

In [15]:
type(en_outputs)

list

In [29]:
a = trainedModel.predict(array(en_outputs[index:index+5]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


In [24]:
trainedModel.predict(array(en_outputs[index:index+2]))[0].shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step


(285, 16384)

In [30]:
b = argmax(a, 2).shape

(5, 285)

In [None]:
from numpy import vectorize

def logits_to_sentence(logits, vocab):
    index_to_words = {idx: word for word, idx in vocab.items()}
#     index_to_words[0] = '<empty>' 
    bestIndices = argmax(logits, 2)
    chars_arr = vectorize(index_to_words.get)(bestIndices)
    sents_arr = array([' '.join(sublist) for sublist in chars_arr])
    return sents_arr.tolist()

# index = 0
# print("The english sentence is: {}".format(en_test[index]))
# print("The chinese sentence is: {}".format(zh_test[index]))
# print('The predicted sentence is :')
# print(logits_to_sentence(trainedModel.predict(array(en_outputs[index:index+1])), zh_vocab))


In [38]:
len(en_outputs[0:len(en_outputs)])

8549

Churn out predictions from test set

In [35]:
index = 0
results = logits_to_sentence(trainedModel.predict(array(en_outputs[index:len(en_outputs)])), zh_vocab)

[1m 49/268[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m12:24[0m 3s/step

ResourceExhaustedError: Graph execution error:

Detected at node functional_1_1/time_distributed_1/transpose_1 defined at (most recent call last):
  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\runpy.py", line 197, in _run_module_as_main

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\runpy.py", line 87, in _run_code

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\traitlets\config\application.py", line 976, in launch_instance

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel\kernelapp.py", line 712, in start

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\asyncio\base_events.py", line 596, in run_forever

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\asyncio\base_events.py", line 1890, in _run_once

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\asyncio\events.py", line 80, in _run

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel\ipkernel.py", line 383, in do_execute

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_cell

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\IPython\core\interactiveshell.py", line 2936, in _run_cell

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\IPython\core\interactiveshell.py", line 3135, in run_cell_async

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\IPython\core\interactiveshell.py", line 3338, in run_ast_nodes

  File "c:\Users\glenl\anaconda3\envs\CS2109S\lib\site-packages\IPython\core\interactiveshell.py", line 3398, in run_code

  File "C:\Users\glenl\AppData\Local\Temp\ipykernel_11232\584290947.py", line 1, in <cell line: 1>

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\backend\tensorflow\trainer.py", line 515, in predict

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\backend\tensorflow\trainer.py", line 213, in one_step_on_data_distributed

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\backend\tensorflow\trainer.py", line 202, in one_step_on_data

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\backend\tensorflow\trainer.py", line 94, in predict_step

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\layers\layer.py", line 814, in __call__

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\models\functional.py", line 194, in call

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\ops\function.py", line 151, in _run_through_graph

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\models\functional.py", line 578, in call

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\layers\layer.py", line 814, in __call__

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\layers\rnn\time_distributed.py", line 110, in call

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\layers\rnn\time_distributed.py", line 90, in time_distributed_transpose

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\ops\numpy.py", line 5809, in transpose

  File "C:\Users\glenl\AppData\Roaming\Python\Python39\site-packages\keras\src\backend\tensorflow\numpy.py", line 1946, in transpose

OOM when allocating tensor with shape[32,285,16384] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node functional_1_1/time_distributed_1/transpose_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_one_step_on_data_distributed_5965]