In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

import string
from string import digits
import re
import time
import numpy as np
import pandas as pd
import collections
import unicodedata

import os
import io


from bert import BertModelLayer
from bert.loader import StockBertConfig, load_stock_weights

In [2]:
path_to_zip = 'C:\\Users\\user\\Project From Alex NLP\\chi-eng.zip'
path_to_file = os.path.dirname(path_to_zip)+"\\chi-eng\\cmn.txt"
print(path_to_file)

C:\Users\user\Project From Alex NLP\chi-eng\cmn.txt


In [3]:
def preprocess_sentence(sentence):
  sentence = sentence.lower().strip()
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
  sentence = sentence.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
#   sentence = '<start> ' + sentence + ' <end>'
  return sentence

In [45]:
def create_dataset(path, num1,num2):
  english = []
  chinese = []
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

#   word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
  for l in lines[num1:num2]:
      eng = preprocess_sentence(l.split('\t')[0])
      chi = l.split('\t')[1]
      english.append(eng)
      chinese.append(chi)
        
  return english, chinese

In [5]:
en_tr, chi_tr = create_dataset(path_to_file, 0,20000)
print(en_tr[-1])
print(chi_tr[-1])

one man s meat is another man s poison .
甲之蜜糖，乙之砒霜。


In [46]:
path_to_zip = tf.keras.utils.get_file(
    'cornell_movie_dialogs.zip',
    origin=
    'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip',
    extract=True)

path_to_dataset = os.path.join(
    os.path.dirname(path_to_zip), "cornell movie-dialogs corpus")

path_to_movie_lines = os.path.join(path_to_dataset, 'movie_lines.txt')
path_to_movie_conversations = os.path.join(path_to_dataset,
                                           'movie_conversations.txt')

In [47]:
def load_conversations(num1,num2):
  # dictionary of line id to text
  id2line = {}
  with open(path_to_movie_lines, errors='ignore') as file:
    lines = file.readlines()
  for line in lines:
    parts = line.replace('\n', '').split(' +++$+++ ')
    id2line[parts[0]] = parts[4]

  inputs, outputs = [], []
  with open(path_to_movie_conversations, 'r') as file:
    lines = file.readlines()
  for line in lines[num1:num2]:
    parts = line.replace('\n', '').split(' +++$+++ ')
    # get conversation in a list of line ID
    conversation = [line[1:-1] for line in parts[3][1:-1].split(', ')]
    for i in range(len(conversation) - 1):
      inputs.append(preprocess_sentence(id2line[conversation[i]]))
      outputs.append(preprocess_sentence(id2line[conversation[i + 1]]))
#       if len(inputs) >= MAX_SAMPLES:
#         return inputs, outputs
  return inputs, outputs

In [48]:
questions, answers = load_conversations(0,20000)
len(questions),len(answers)

(53430, 53430)

In [8]:
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output

class FullTokenizer(object):
    """Runs end-to-end tokenziation."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens
    
    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.
    
        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
                (cp >= 0x3400 and cp <= 0x4DBF) or  #
                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or  #
                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenziation."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.
    
        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.
    
        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]
    
        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer.
    
        Returns:
          A list of wordpiece tokens.
        """

        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens

def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

In [9]:
tokenizer_chi_1 = FullTokenizer(
    vocab_file= 'chinese_L-12_H-768_A-12/vocab.txt',
    do_lower_case=True)

tokenizer_en_1 = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    en_tr, target_vocab_size=2**13)

In [49]:
tokenizer_ques = FullTokenizer(
    vocab_file= 'uncased_L-12_H-768_A-12/vocab.txt',
    do_lower_case=True)

tokenizer_ans = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    answers, target_vocab_size=2**13)

In [69]:
tokenizer_en_2 = FullTokenizer(
    vocab_file= 'uncased_L-12_H-768_A-12/vocab.txt',
    do_lower_case=True)

tokenizer_chi_2 = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    chi_tr, target_vocab_size=2**13)

In [12]:
MAX_SEQ_LENGTH = 40
BUFFER_SIZE = 20000
BATCH_SIZE = 64


# POSITIONAL ENCODING

In [17]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
  return pos * angle_rates

In [18]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    sines = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angle_rads[:, 1::2])

    pos_encoding = np.concatenate([sines, cosines], axis=-1)

    pos_encoding = pos_encoding[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [19]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions so that we can add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [20]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

In [21]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead) 
    but it must be broadcastable for addition.
    
    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable 
            to (..., seq_len_q, seq_len_k). Defaults to None.
      
    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

        # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [22]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention,
                                        perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

In [23]:
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 768))  # (batch_size, encoder_sequence, d_model)
q = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=q, mask=None)
out.shape, attn.shape

(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

In [24]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

In [25]:
def build_encoder(config_file):
    with tf.io.gfile.GFile(config_file, "r") as reader:
        stock_params = StockBertConfig.from_json_string(reader.read())
        bert_params = stock_params.to_bert_model_layer_params()

    return BertModelLayer.from_params(bert_params, name="bert")

In [26]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

In [27]:
sample_decoder_layer = DecoderLayer(512, 8, 2048)
sample_encoder_output = tf.random.uniform((64, 128, 768))

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_output,
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

TensorShape([64, 50, 512])

In [28]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [29]:
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048, target_vocab_size=8000)

output, attn = sample_decoder(tf.random.uniform((64, 26)), 
                              enc_output=sample_encoder_output, 
                              training=False, look_ahead_mask=None, 
                              padding_mask=None)

output.shape, attn['decoder_layer2_block2'].shape

(TensorShape([64, 26, 512]), TensorShape([64, 8, 26, 128]))

In [30]:
class Config(object):
  def __init__(self, num_layers, d_model, dff, num_heads):
    self.num_layers = num_layers
    self.d_model = d_model
    self.dff = dff
    self.num_heads= num_heads

In [31]:
from bert.loader import map_to_stock_variable_name
# /content/drive/My Drive/machine translation/transformer/bert
class Transformer(tf.keras.Model):
  def __init__(self, config,
               target_vocab_size, 
               bert_config_file,
               bert_training=False, 
               rate=0.1,
               name='transformer'):
      super(Transformer, self).__init__(name=name)

      self.encoder = build_encoder(config_file=bert_config_file)
      self.encoder.trainable = bert_training

      self.decoder = Decoder(config.num_layers, config.d_model, 
                             config.num_heads, config.dff, target_vocab_size, rate)

      self.final_layer = tf.keras.layers.Dense(target_vocab_size)


  def load_stock_weights(self, bert: BertModelLayer, ckpt_file):
      assert isinstance(bert, BertModelLayer), "Expecting a BertModelLayer instance as first argument"
      assert tf.compat.v1.train.checkpoint_exists(ckpt_file), "Checkpoint does not exist: {}".format(ckpt_file)
      ckpt_reader = tf.train.load_checkpoint(ckpt_file)

      bert_prefix = 'transformer/bert'

      weights = []
      for weight in bert.weights:
          stock_name = map_to_stock_variable_name(weight.name, bert_prefix)
          if ckpt_reader.has_tensor(stock_name):
              value = ckpt_reader.get_tensor(stock_name)
              weights.append(value)
          else:
              raise ValueError("No value for:[{}], i.e.:[{}] in:[{}]".format(weight.name, stock_name, ckpt_file))
      bert.set_weights(weights)
      print("Done loading {} BERT weights from: {} into {} (prefix:{})".format(
          len(weights), ckpt_file, bert, bert_prefix))

  def restore_encoder(self, bert_ckpt_file):
      # loading the original pre-trained weights into the BERT layer:
      self.load_stock_weights(self.encoder, bert_ckpt_file)

  def call(self, inp, tar, training, look_ahead_mask, dec_padding_mask):
      enc_output = self.encoder(inp, training=self.encoder.trainable)  # (batch_size, inp_seq_len, d_model)

      # dec_output.shape == (batch_size, tar_seq_len, d_model)
      dec_output, attention_weights = self.decoder(
          tar, enc_output, training, look_ahead_mask, dec_padding_mask)

      final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

      return final_output, attention_weights

In [71]:
target_vocab_size_1 = tokenizer_en_1.vocab_size + 2
target_vocab_size_2 = tokenizer_ans.vocab_size + 2
target_vocab_size_3 = tokenizer_chi_2.vocab_size + 2
dropout_rate = 0.1
config = Config(num_layers=6, d_model=512, dff=1024, num_heads=8)

In [54]:
# gs_folder_bert
# uncased_L-12_H-768_A-12
MODEL_DIR = "uncased_L-12_H-768_A-12"
bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
bert_ckpt_file = os.path.join(MODEL_DIR, 'bert_model.ckpt')

# with tpu_strategy.scope():
transformer_1 = Transformer(config=config,
                          target_vocab_size=target_vocab_size_1,
                          bert_config_file=bert_config_file)
  
inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer_1(inp, tar_inp, 
                        True,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape) # (batch_size, tar_seq_len) 
print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size) 

# init bert pre-trained weights
transformer_1.restore_encoder(bert_ckpt_file)

(64, 40)
(64, 40, 7475)
Done loading 196 BERT weights from: uncased_L-12_H-768_A-12\bert_model.ckpt into <bert.model.BertModelLayer object at 0x0000023CC8508548> (prefix:transformer/bert)


In [55]:
transformer_1.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (BertModelLayer)        multiple                  108890112 
_________________________________________________________________
decoder_3 (Decoder)          multiple                  24326656  
_________________________________________________________________
dense_216 (Dense)            multiple                  3834675   
Total params: 137,051,443
Trainable params: 28,161,331
Non-trainable params: 108,890,112
_________________________________________________________________


In [56]:
# gs_folder_bert
# uncased_L-12_H-768_A-12
# MODEL_DIR = "uncased_L-12_H-768_A-12"
# bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
# bert_ckpt_file = os.path.join(MODEL_DIR, 'bert_model.ckpt')

# with tpu_strategy.scope():
transformer_2 = Transformer(config=config,
                          target_vocab_size=target_vocab_size_2,
                          bert_config_file=bert_config_file)
  
inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer_2(inp, tar_inp, 
                        True,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape) # (batch_size, tar_seq_len) 
print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size) 

# init bert pre-trained weights
transformer_2.restore_encoder(bert_ckpt_file)

(64, 40)
(64, 40, 8027)
Done loading 196 BERT weights from: uncased_L-12_H-768_A-12\bert_model.ckpt into <bert.model.BertModelLayer object at 0x0000023A842A3CC8> (prefix:transformer/bert)


In [57]:
transformer_2.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (BertModelLayer)        multiple                  108890112 
_________________________________________________________________
decoder_4 (Decoder)          multiple                  24609280  
_________________________________________________________________
dense_277 (Dense)            multiple                  4117851   
Total params: 137,617,243
Trainable params: 28,727,131
Non-trainable params: 108,890,112
_________________________________________________________________


In [72]:
# gs_folder_bert
# uncased_L-12_H-768_A-12
# MODEL_DIR = "uncased_L-12_H-768_A-12"
# bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
# bert_ckpt_file = os.path.join(MODEL_DIR, 'bert_model.ckpt')

# with tpu_strategy.scope():
transformer_3 = Transformer(config=config,
                          target_vocab_size=target_vocab_size_3,
                          bert_config_file=bert_config_file)
  
inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer_3(inp, tar_inp, 
                        True,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape) # (batch_size, tar_seq_len) 
print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size) 

# init bert pre-trained weights
transformer_3.restore_encoder(bert_ckpt_file)

(64, 40)
(64, 40, 7320)
Done loading 196 BERT weights from: uncased_L-12_H-768_A-12\bert_model.ckpt into <bert.model.BertModelLayer object at 0x0000023CD12E9988> (prefix:transformer/bert)


In [73]:
transformer_3.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (BertModelLayer)        multiple                  108890112 
_________________________________________________________________
decoder_6 (Decoder)          multiple                  24247296  
_________________________________________________________________
dense_399 (Dense)            multiple                  3755160   
Total params: 136,892,568
Trainable params: 28,002,456
Non-trainable params: 108,890,112
_________________________________________________________________


In [35]:
def create_masks(inp, tar):
    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return combined_mask, dec_padding_mask

# Evaluate chi to eng

In [36]:
def encode_chi(chi):
    tokens_chi = tokenizer_chi_1.tokenize(chi)
    lang1 = tokenizer_chi_1.convert_tokens_to_ids(['[CLS]'] + tokens_chi + ['[SEP]'])
    return lang1

In [37]:
def evaluate_chi_to_eng(transformer, inp_sentence):
    # normalize input sentence
    inp_sentence = encode_chi(inp_sentence)
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [tokenizer_en_1.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(MAX_SEQ_LENGTH):
        combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, tokenizer_en_1.vocab_size + 1):
            return tf.squeeze(output, axis=0), attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [38]:
evaluate_chi_to_eng(transformer_1, "你好嗎?")

(<tf.Tensor: shape=(41,), dtype=int32, numpy=
 array([7473,   12, 4915, 1968, 6062, 6062, 6062,   12, 6097, 6097, 6097,
        6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097,
        6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097,
        6097, 6097, 6097, 6097, 6097, 6097, 6097, 6097])>,
 {'decoder_layer1_block1': <tf.Tensor: shape=(1, 8, 40, 40), dtype=float32, numpy=
  array([[[[1.        , 0.        , 0.        , ..., 0.        ,
            0.        , 0.        ],
           [0.28630912, 0.7136908 , 0.        , ..., 0.        ,
            0.        , 0.        ],
           [0.4789284 , 0.18223774, 0.33883384, ..., 0.        ,
            0.        , 0.        ],
           ...,
           [0.06341842, 0.03623977, 0.01090709, ..., 0.01626615,
            0.        , 0.        ],
           [0.05919779, 0.03797729, 0.01341668, ..., 0.0168429 ,
            0.02031065, 0.        ],
           [0.0574108 , 0.03650632, 0.01572925, ..., 0.01649372,

In [95]:
def translate_chi_to_eng(transformer, sentence):
    result, attention_weights = evaluate_chi_to_eng(transformer, sentence)

    predicted_sentence = tokenizer_en_1.decode([i for i in result
                                              if i < tokenizer_en_1.vocab_size])

#     print('Input: {}'.format(sentence))
#     print('Predicted Translation: {}'.format(predicted_sentence))

    
    return predicted_sentence

In [96]:
translate_chi_to_eng(transformer_1, "你好嗎?")

'carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried carried '

In [41]:
transformer_chi_to_eng = Transformer(config=config,
                          target_vocab_size=target_vocab_size_1,
                          bert_config_file=bert_config_file)
  
fn_out, _ = transformer_chi_to_eng(inp, tar_inp, 
                        True,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
transformer_chi_to_eng.load_weights('/weights/nmt_chi_2_en_bert_ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x23cc00f0048>

In [97]:
translate_chi_to_eng(transformer_chi_to_eng, "你好嗎?")

'were you done ?'

In [98]:
translate_chi_to_eng(transformer_chi_to_eng,  '你要去哪裡？')

'where are you going to go ?'

In [99]:
translate_chi_to_eng(transformer_chi_to_eng, '很高興認識你')

'mary knew mary s happy .'

# Evaluate chatbot

In [51]:
def encode_ques(en):
    tokens_en = tokenizer_ques.tokenize(en)
    lang1 = tokenizer_ques.convert_tokens_to_ids(['[CLS]'] + tokens_en + ['[SEP]'])
    return lang1

In [52]:
def evaluate_chatbot(transformer, inp_sentence):
    # normalize input sentence
    inp_sentence = encode_ques(inp_sentence)
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [tokenizer_ans.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(MAX_SEQ_LENGTH):
        combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, tokenizer_ans.vocab_size + 1):
            return tf.squeeze(output, axis=0), attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [58]:
evaluate_chatbot(transformer_2, "how are you?")

(<tf.Tensor: shape=(41,), dtype=int32, numpy=
 array([8025, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413,
        2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413,
        2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413,
        2413, 2413, 2413, 2413, 2413, 2413, 2413, 2413])>,
 {'decoder_layer1_block1': <tf.Tensor: shape=(1, 8, 40, 40), dtype=float32, numpy=
  array([[[[1.        , 0.        , 0.        , ..., 0.        ,
            0.        , 0.        ],
           [0.92084986, 0.0791501 , 0.        , ..., 0.        ,
            0.        , 0.        ],
           [0.8569223 , 0.0721079 , 0.07096978, ..., 0.        ,
            0.        , 0.        ],
           ...,
           [0.5055334 , 0.02431858, 0.02304769, ..., 0.01422325,
            0.        , 0.        ],
           [0.4639414 , 0.02467237, 0.02462771, ..., 0.01487065,
            0.01812332, 0.        ],
           [0.4304367 , 0.02425041, 0.02474623, ..., 0.01604094,

In [100]:
def chatbot(transformer, sentence):
    result, attention_weights = evaluate_chatbot(transformer, sentence)

    predicted_sentence = tokenizer_ans.decode([i for i in result
                                              if i < tokenizer_ans.vocab_size])

#     print('Input: {}'.format(sentence))
#     print('Predicted Translation: {}'.format(predicted_sentence))
    
    return predicted_sentence

In [101]:
chatbot(transformer_2, "how are you?")

'rmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrmrm'

In [102]:
transformer_chatbot = Transformer(config=config,
                          target_vocab_size=target_vocab_size_2,
                          bert_config_file=bert_config_file)
  
# fn_out, _ = transformer_chatbot(inp, tar_inp, 
#                         True,
#                         look_ahead_mask=None,
#                         dec_padding_mask=None)
transformer_chatbot.load_weights('/weights/chatbot_bert_ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x23cd1166a48>

In [103]:
chatbot(transformer_chatbot, "how are you doing?")

'nice very uh , thank you . thank you .'

# Evaluate eng to chi

In [74]:
def encode_en(en):
    tokens_en = tokenizer_en_2.tokenize(en)
    lang1 = tokenizer_en_2.convert_tokens_to_ids(['[CLS]'] + tokens_en + ['[SEP]'])
    return lang1

In [75]:
def evaluate_eng_to_chi(transformer, inp_sentence):
    # normalize input sentence
    inp_sentence = encode_en(inp_sentence)
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [tokenizer_chi_2.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(MAX_SEQ_LENGTH):
        combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, tokenizer_chi_2.vocab_size + 1):
            return tf.squeeze(output, axis=0), attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [77]:
evaluate_eng_to_chi(transformer_3, "how are you?")

(<tf.Tensor: shape=(41,), dtype=int32, numpy=
 array([7318, 4084, 4084, 5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153,
        5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153,
        5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153,
        5153, 5153, 5153, 5153, 5153, 5153, 5153, 5153])>,
 {'decoder_layer1_block1': <tf.Tensor: shape=(1, 8, 40, 40), dtype=float32, numpy=
  array([[[[1.        , 0.        , 0.        , ..., 0.        ,
            0.        , 0.        ],
           [0.47573826, 0.5242618 , 0.        , ..., 0.        ,
            0.        , 0.        ],
           [0.27310273, 0.29794422, 0.42895308, ..., 0.        ,
            0.        , 0.        ],
           ...,
           [0.00719214, 0.01817749, 0.0278219 , ..., 0.02544523,
            0.        , 0.        ],
           [0.00589785, 0.01770682, 0.02639655, ..., 0.02191273,
            0.02513941, 0.        ],
           [0.00597074, 0.01850038, 0.02735454, ..., 0.01867624,

In [104]:
def translate_eng_to_chi(transformer, sentence):
    result, attention_weights = evaluate_eng_to_chi(transformer, sentence)

    predicted_sentence = tokenizer_chi_2.decode([i for i in result
                                              if i < tokenizer_chi_2.vocab_size])

#     print('Input: {}'.format(sentence))
#     print('Predicted Translation: {}'.format(predicted_sentence))
    
    return predicted_sentence

In [105]:
translate_eng_to_chi(transformer_3, "how are you?")

'讓我們放讓我們放這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆這是湯姆'

In [106]:
transformer_eng_to_chi = Transformer(config=config,
                          target_vocab_size=target_vocab_size_3,
                          bert_config_file=bert_config_file)
  
# fn_out, _ = transformer_eng_to_chi(inp, tar_inp, 
#                         True,
#                         look_ahead_mask=None,
#                         dec_padding_mask=None)
transformer_eng_to_chi.load_weights('/weights/nmt_en_2_chi_bert_ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x23cd27c8708>

In [107]:
translate_eng_to_chi(transformer_eng_to_chi, "where are you going?")

'你要去哪裡哪儿，你的哪处。'

# Combine

In [108]:
sentence = '你要去哪裡？'

chi_to_eng = translate_chi_to_eng(transformer_chi_to_eng, sentence)
convo = chatbot(transformer_chatbot, chi_to_eng)
translate_eng_to_chi(transformer_eng_to_chi, convo)

'我會去去路。'

In [114]:
def nmt_and_chatbot(sentence):
    chi_to_eng = translate_chi_to_eng(transformer_chi_to_eng, sentence)
    convo = chatbot(transformer_chatbot, chi_to_eng)
    eng_to_chi = translate_eng_to_chi(transformer_eng_to_chi, convo)
    
    print('Input: {}'.format(sentence))
    print('Predicted Translation (chi to eng): {}'.format(chi_to_eng))
    print('Predicted Response (chatbot): {}'.format(convo))
    print('Predicted Translation (eng to chi): {}'.format(eng_to_chi))

In [115]:
nmt_and_chatbot('你要去哪裡？')

Input: 你要去哪裡？
Predicted Translation (chi to eng): where are you going to go ?
Predicted Response (chatbot): i ll go .
Predicted Translation (eng to chi): 我會去去路。


In [116]:
nmt_and_chatbot('很高興認識你')

Input: 很高興認識你
Predicted Translation (chi to eng): mary knew mary s happy .
Predicted Response (chatbot): he was dead .
Predicted Translation (eng to chi): 他在死死了。


In [117]:
nmt_and_chatbot('我飽了')

Input: 我飽了
Predicted Translation (chi to eng): i m wrong .
Predicted Response (chatbot): what ?
Predicted Translation (eng to chi): “什麼，甚麼？


In [118]:
nmt_and_chatbot('我愛你')

Input: 我愛你
Predicted Translation (chi to eng): i love the much .
Predicted Response (chatbot): it s a bit of men for men for men for men for men for men for men for men for men for men for men for men for men for men for men for men for men for men 
Predicted Translation (eng to chi): 男人是男人是當男人。
