In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

import string
from string import digits
import re
import time
import numpy as np
import pandas as pd
import collections
import unicodedata

import os
import io


from bert import BertModelLayer
from bert.loader import StockBertConfig, load_stock_weights

from snownlp import SnowNLP

import  nltk.translate.bleu_score as bleu
import pynlpir

In [2]:
tf.config.list_physical_devices('GPU')
print("GPU Available: ", tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Available:  True


In [3]:
pwd

'C:\\Users\\user\\Project From Alex NLP'

In [4]:
path_to_zip = 'C:\\Users\\user\\Project From Alex NLP\\chi-eng.zip'
path_to_file = os.path.dirname(path_to_zip)+"\\chi-eng\\cmn.txt"
print(path_to_file)

C:\Users\user\Project From Alex NLP\chi-eng\cmn.txt


In [5]:
def preprocess_sentence(sentence):
  sentence = sentence.lower().strip()
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
  sentence = sentence.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
#   sentence = '<start> ' + sentence + ' <end>'
  return sentence

In [6]:
# lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\n')

In [7]:
# for l in lines[0:4]:
#     eng = preprocess_sentence(l.split('\t')[0])
#     chi = l.split('\t')[1]

In [8]:
# (eng,chi)

In [9]:
# preprocess_sentence(lines[100].split('\t')[0])

In [10]:
# for l in lines[0:4]:
#     for w in l.split('\t'):
#         preprocess_sentence(w)
#         print(w)

In [6]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]

def create_dataset(path, num1,num2):
  english = []
  chinese = []
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

#   word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
  for l in lines[num1:num2]:
      eng = preprocess_sentence(l.split('\t')[0])
      chi = l.split('\t')[1]
      english.append(eng)
      chinese.append(chi)
        
  return english, chinese

In [7]:
en_tr, chi_tr = create_dataset(path_to_file, 0,20000)
print(en_tr[-1])
print(chi_tr[-1])

one man s meat is another man s poison .
甲之蜜糖，乙之砒霜。


In [8]:
len(en_tr),len(chi_tr)

(20000, 20000)

In [14]:
# !pip install snownlp

In [15]:
# s = SnowNLP(chi_tr[-1])
# s.words

In [9]:
en_val, chi_val = create_dataset(path_to_file,23000,23500)
len(en_val),len(chi_val)

(500, 500)

In [10]:
train_examples = tf.data.Dataset.from_tensor_slices((en_tr, chi_tr))
val_examples = tf.data.Dataset.from_tensor_slices((en_val, chi_val))

In [11]:
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output

class FullTokenizer(object):
    """Runs end-to-end tokenziation."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens
    
    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.
    
        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
                (cp >= 0x3400 and cp <= 0x4DBF) or  #
                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or  #
                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenziation."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.
    
        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.
    
        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]
    
        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer.
    
        Returns:
          A list of wordpiece tokens.
        """

        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens

def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

In [19]:
# tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
#     en_tr, target_vocab_size=2**13)
# tokenizer_en.save_to_file('vocab_english')

# sample_string = 'Transformer is awesome.'
# tokenized_string = tokenizer_en.encode(sample_string)
# for ts in tokenized_string:
#   print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

In [12]:
tokenizer_chin = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    chi_tr, target_vocab_size=2**13)
tokenizer_chin.save_to_file('vocab_chinese')

In [13]:
sample_string = '很高興認識你'
tokenized_string = tokenizer_chin.encode(sample_string)
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_chin.decode([ts])))

4573 ----> 很高興
575 ----> 認識
9 ----> 你


In [14]:
tokenizer_eng = FullTokenizer(
    vocab_file= 'uncased_L-12_H-768_A-12/vocab.txt',
    do_lower_case=True)

test_tokens = tokenizer_eng.tokenize(en_tr[100])
test_ids = tokenizer_eng.convert_tokens_to_ids(['[CLS]'] + test_tokens + ['[SEP]'])
print(test_ids)
print(tokenizer_eng.convert_ids_to_tokens(test_ids))

[101, 9241, 2039, 1012, 102]
['[CLS]', 'hurry', 'up', '.', '[SEP]']


In [15]:
MAX_SEQ_LENGTH = 40


def encode(en, chi, seq_length=MAX_SEQ_LENGTH):
  tokens_eng = tokenizer_eng.tokenize(tf.compat.as_text(en.numpy()))
  lang1 = tokenizer_eng.convert_tokens_to_ids(['[CLS]'] + tokens_eng + ['[SEP]'])
  if len(lang1)<seq_length:
    lang1 = lang1 + list(np.zeros(seq_length - len(lang1), 'int32'))

  lang2 = [tokenizer_chin.vocab_size] + tokenizer_chin.encode(tf.compat.as_text(chi.numpy())) + [tokenizer_chin.vocab_size + 1]
  if len(lang2)<seq_length:
    lang2 = lang2 + list(np.zeros(seq_length - len(lang2), 'int32'))

  return lang1, lang2

In [16]:
def tf_encode(en, chi):
  result_en, result_chi = tf.py_function(encode, [en, chi], [tf.int32, tf.int32])
  result_chi.set_shape([None])
  result_en.set_shape([None])

  return result_en, result_chi

In [17]:
def filter_max_length(x, y, max_length=MAX_SEQ_LENGTH):
  return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [18]:
print(len(chi_tr))

20000


In [19]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

train_dataset = train_examples.map(tf_encode)
# train_dataset = tf.io.decode_raw(train_dataset, tf.int32)
train_dataset = train_dataset.filter(filter_max_length)

# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]), drop_remainder=True)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_examples.map(
    lambda en, chi: tf.py_function(encode, [en, chi], [tf.int32, tf.int32]))
val_dataset = val_dataset.filter(filter_max_length)
val_dataset = val_dataset.padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))

# Positional Encoding

In [20]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
  return pos * angle_rates

In [21]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    sines = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angle_rads[:, 1::2])

    pos_encoding = np.concatenate([sines, cosines], axis=-1)

    pos_encoding = pos_encoding[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [22]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions so that we can add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [23]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

In [24]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead) 
    but it must be broadcastable for addition.
    
    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable 
            to (..., seq_len_q, seq_len_k). Defaults to None.
      
    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

        # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [25]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention,
                                        perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

In [26]:
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 768))  # (batch_size, encoder_sequence, d_model)
q = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=q, mask=None)
out.shape, attn.shape

(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

In [27]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

In [28]:
def build_encoder(config_file):
    with tf.io.gfile.GFile(config_file, "r") as reader:
        stock_params = StockBertConfig.from_json_string(reader.read())
        bert_params = stock_params.to_bert_model_layer_params()

    return BertModelLayer.from_params(bert_params, name="bert")

In [29]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

In [30]:
sample_decoder_layer = DecoderLayer(512, 8, 2048)
sample_encoder_output = tf.random.uniform((64, 128, 768))

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_output,
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

TensorShape([64, 50, 512])

In [31]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [32]:
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048, target_vocab_size=8000)

output, attn = sample_decoder(tf.random.uniform((64, 26)), 
                              enc_output=sample_encoder_output, 
                              training=False, look_ahead_mask=None, 
                              padding_mask=None)

output.shape, attn['decoder_layer2_block2'].shape

(TensorShape([64, 26, 512]), TensorShape([64, 8, 26, 128]))

In [33]:
class Config(object):
  def __init__(self, num_layers, d_model, dff, num_heads):
    self.num_layers = num_layers
    self.d_model = d_model
    self.dff = dff
    self.num_heads= num_heads

In [34]:
from bert.loader import map_to_stock_variable_name
# /content/drive/My Drive/machine translation/transformer/bert
class Transformer(tf.keras.Model):
  def __init__(self, config,
               target_vocab_size, 
               bert_config_file,
               bert_training=False, 
               rate=0.1,
               name='transformer'):
      super(Transformer, self).__init__(name=name)

      self.encoder = build_encoder(config_file=bert_config_file)
      self.encoder.trainable = bert_training

      self.decoder = Decoder(config.num_layers, config.d_model, 
                             config.num_heads, config.dff, target_vocab_size, rate)

      self.final_layer = tf.keras.layers.Dense(target_vocab_size)


  def load_stock_weights(self, bert: BertModelLayer, ckpt_file):
      assert isinstance(bert, BertModelLayer), "Expecting a BertModelLayer instance as first argument"
      assert tf.compat.v1.train.checkpoint_exists(ckpt_file), "Checkpoint does not exist: {}".format(ckpt_file)
      ckpt_reader = tf.train.load_checkpoint(ckpt_file)

      bert_prefix = 'transformer/bert'

      weights = []
      for weight in bert.weights:
          stock_name = map_to_stock_variable_name(weight.name, bert_prefix)
          if ckpt_reader.has_tensor(stock_name):
              value = ckpt_reader.get_tensor(stock_name)
              weights.append(value)
          else:
              raise ValueError("No value for:[{}], i.e.:[{}] in:[{}]".format(weight.name, stock_name, ckpt_file))
      bert.set_weights(weights)
      print("Done loading {} BERT weights from: {} into {} (prefix:{})".format(
          len(weights), ckpt_file, bert, bert_prefix))

  def restore_encoder(self, bert_ckpt_file):
      # loading the original pre-trained weights into the BERT layer:
      self.load_stock_weights(self.encoder, bert_ckpt_file)

  def call(self, inp, tar, training, look_ahead_mask, dec_padding_mask):
      enc_output = self.encoder(inp, training=self.encoder.trainable)  # (batch_size, inp_seq_len, d_model)

      # dec_output.shape == (batch_size, tar_seq_len, d_model)
      dec_output, attention_weights = self.decoder(
          tar, enc_output, training, look_ahead_mask, dec_padding_mask)

      final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

      return final_output, attention_weights

In [35]:
target_vocab_size = tokenizer_chin.vocab_size + 2
dropout_rate = 0.1
config = Config(num_layers=6, d_model=512, dff=1024, num_heads=8)

In [36]:
# gs_folder_bert
# uncased_L-12_H-768_A-12
MODEL_DIR = "uncased_L-12_H-768_A-12"
bert_config_file = os.path.join(MODEL_DIR, "bert_config.json")
bert_ckpt_file = os.path.join(MODEL_DIR, 'bert_model.ckpt')

# with tpu_strategy.scope():
transformer = Transformer(config=config,
                          target_vocab_size=target_vocab_size,
                          bert_config_file=bert_config_file)
  
inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
fn_out, _ = transformer(inp, tar_inp, 
                        True,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
print(tar_inp.shape) # (batch_size, tar_seq_len) 
print(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size) 

# init bert pre-trained weights
transformer.restore_encoder(bert_ckpt_file)

(64, 40)
(64, 40, 7320)
Instructions for updating:
Use standard file APIs to check for files with this prefix.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


Done loading 196 BERT weights from: uncased_L-12_H-768_A-12\bert_model.ckpt into <bert.model.BertModelLayer object at 0x000001F224065CC8> (prefix:transformer/bert)


In [37]:
transformer.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (BertModelLayer)        multiple                  108890112 
_________________________________________________________________
decoder_1 (Decoder)          multiple                  24247296  
_________________________________________________________________
dense_94 (Dense)             multiple                  3755160   
Total params: 136,892,568
Trainable params: 28,002,456
Non-trainable params: 108,890,112
_________________________________________________________________


In [38]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [39]:
learning_rate = CustomSchedule(config.d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
temp_learning_rate_schedule = CustomSchedule(config.d_model)
import matplotlib.pyplot as plt

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [43]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [44]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [45]:
checkpoint_path = "/checkpoints/train_nmt_eng_to_chi_bert"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

In [40]:
def create_masks(inp, tar):
    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return combined_mask, dec_padding_mask

In [47]:
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, 
                                     True,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [48]:
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> chinese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 500 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.5124 Accuracy 0.0000
Saving checkpoint for epoch 1 at /checkpoints/train_nmt_eng_to_chi_bert\ckpt-1
Epoch 1 Loss 1.1780 Accuracy 0.0358
Time taken for 1 epoch: 88.95984721183777 secs

Epoch 2 Batch 0 Loss 0.9579 Accuracy 0.0517
Saving checkpoint for epoch 2 at /checkpoints/train_nmt_eng_to_chi_bert\ckpt-2
Epoch 2 Loss 0.9586 Accuracy 0.0523
Time taken for 1 epoch: 63.46627998352051 secs

Epoch 3 Batch 0 Loss 0.9504 Accuracy 0.0537
Saving checkpoint for epoch 3 at /checkpoints/train_nmt_eng_to_chi_bert\ckpt-3
Epoch 3 Loss 0.8610 Accuracy 0.0551
Time taken for 1 epoch: 63.559361696243286 secs

Epoch 4 Batch 0 Loss 0.8376 Accuracy 0.0573
Saving checkpoint for epoch 4 at /checkpoints/train_nmt_eng_to_chi_bert\ckpt-4
Epoch 4 Loss 0.7559 Accuracy 0.0584
Time taken for 1 epoch: 63.441171407699585 secs

Epoch 5 Batch 0 Loss 0.6925 Accuracy 0.0593
Saving checkpoint for epoch 5 at /checkpoints/train_nmt_eng_to_chi_bert\ckpt-5
Epoch 5 Loss 0.6566 Accuracy 0.0628
Time taken 

# Evaluate

In [41]:
def encode_en(en):
    tokens_en = tokenizer_eng.tokenize(en)
    lang1 = tokenizer_eng.convert_tokens_to_ids(['[CLS]'] + tokens_en + ['[SEP]'])
    return lang1

In [42]:
def evaluate(transformer, inp_sentence):
    # normalize input sentence
    inp_sentence = encode_en(inp_sentence)
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [tokenizer_chin.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(MAX_SEQ_LENGTH):
        combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, tokenizer_chin.vocab_size + 1):
            return tf.squeeze(output, axis=0), attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [43]:
def plot_attention_weights(attention, sentence, result, layer):
  fig = plt.figure(figsize=(16, 8))
  
  test_tokens = tokenizer_eng.tokenize(sentence)
  sentence = tokenizer_eng.convert_tokens_to_ids(test_tokens)

#   sentence = tokenizer_ques.tokenize(sentence) #.encode(sentence)
  
  attention = tf.squeeze(attention[layer], axis=0)
  
  for head in range(attention.shape[0]):
    ax = fig.add_subplot(2, 4, head+1)
    
    # plot the attention weights
    ax.matshow(attention[head][:-1, :], cmap='viridis')

    fontdict = {'fontsize': 10}
    
    ax.set_xticks(range(len(sentence)+2))
    ax.set_yticks(range(len(result)))
    
    ax.set_ylim(len(result)-1.5, -0.5)
        
    ax.set_xticklabels(
        ['<start>']+[tokenizer_eng.convert_ids_to_tokens([i]) for i in sentence]+['<end>'], 
        fontdict=fontdict, rotation=90)
    
    ax.set_yticklabels([tokenizer_eng.decode([i]) for i in result 
                        if i < tokenizer_chin.vocab_size], 
                       fontdict=fontdict)
    
    ax.set_xlabel('Head {}'.format(head+1))
  
  plt.tight_layout()
  plt.show()

In [44]:
def translate(transformer, sentence, plot=''):
    result, attention_weights = evaluate(transformer, sentence)

    predicted_sentence = tokenizer_chin.decode([i for i in result
                                              if i < tokenizer_chin.vocab_size])

    print('Input: {}'.format(sentence))
    print('Predicted Translation: {}'.format(predicted_sentence))
    
    if plot:
        plot_attention_weights(attention_weights, sentence, result, plot)
    
    return predicted_sentence

In [43]:
translate(transformer, 'How are you?')

Input: How are you?
Predicted Translation: 机场机场机场机场机场机场机场机场机场机场机场机场走的會發生我弟弟會發生我弟弟會發生我弟弟會發生我弟弟會發生我弟弟會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生


'机场机场机场机场机场机场机场机场机场机场机场机场走的會發生我弟弟會發生我弟弟會發生我弟弟會發生我弟弟會發生我弟弟會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生會發生'

In [44]:
translate(transformer, 'Where are you going?')

Input: Where are you going?
Predicted Translation: 机场机场句話句話句話句話失敗句話失敗句話失敗句話失敗句話失敗句話失敗失敗失敗失敗失敗失敗失敗句話失敗句話失敗句話失敗失敗失敗失敗失敗失敗失敗失敗失敗失敗句話失敗


'机场机场句話句話句話句話失敗句話失敗句話失敗句話失敗句話失敗句話失敗失敗失敗失敗失敗失敗失敗句話失敗句話失敗句話失敗失敗失敗失敗失敗失敗失敗失敗失敗失敗句話失敗'

In [56]:
translate(transformer, 'Nice to meet you')

Input: Nice to meet you
Predicted Translation: 對不起認識你認識你。


'對不起認識你認識你。'

# Save Weights

In [57]:
transformer.save_weights('/weights/nmt_en_2_chi_bert_ckpt')

In [45]:
new_transformer = Transformer(config=config,
                          target_vocab_size=target_vocab_size,
                          bert_config_file=bert_config_file)
  
fn_out, _ = new_transformer(inp, tar_inp, 
                        True,
                        look_ahead_mask=None,
                        dec_padding_mask=None)
new_transformer.load_weights('/weights/nmt_en_2_chi_bert_ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1f224070bc8>

In [46]:
translate(new_transformer, "I am full")

Input: I am full
Predicted Translation: “饱了。


'“饱了。'

In [47]:
translate(new_transformer, 'How are you?')

Input: How are you?
Predicted Translation: 你怎么们么？


'你怎么们么？'

In [48]:
translate(new_transformer, 'Where are you going?')

Input: Where are you going?
Predicted Translation: 你要去哪裡哪儿，你的哪处。


'你要去哪裡哪儿，你的哪处。'

In [49]:
translate(new_transformer, 'Nice to meet you')

Input: Nice to meet you
Predicted Translation: 對不起認識你認識你。


'對不起認識你認識你。'

In [50]:
translate(new_transformer, 'I love you')

Input: I love you
Predicted Translation: 我爱你是你是愛你。


'我爱你是你是愛你。'

In [47]:
translate(new_transformer, 'did you finish your homework?')

Input: did you finish your homework?
Predicted Translation: 你打著作业了嗎？


'你打著作业了嗎？'

In [48]:
translate(new_transformer, 'what did you eat for breakfast?')

Input: what did you eat for breakfast?
Predicted Translation: “一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个


'“一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个'

In [49]:
translate(new_transformer, 'i am hungry right now')

Input: i am hungry right now
Predicted Translation: 我現在是饿了。


'我現在是饿了。'

In [90]:
# del en_val[36]
# # del chi_val[36]
# del en_val[122]
# del chi_val[122]

In [91]:
len(en_val)

498

In [92]:
pynlpir.open()
bleuplus = 0
i = 0

for en_, chi_ in zip(en_val,chi_val):
    print(i)
    predicted = translate(new_transformer, en_)
    two_ = pynlpir.segment(predicted, pos_tagging=False)
    one_ = pynlpir.segment(chi_, pos_tagging=False)
    print('Real Translation: {}'.format(chi_))
    bleu_ = bleu.sentence_bleu([one_], two_)
    print("BLEU Score: ", bleu_)
    print("----" *20)

    bleuplus += bleu_
    i += 1

0
Input: what would you do if you had ten thousand dollars ?
Predicted Translation: 如果我个二里做什麼的?
Real Translation: 如果你有一万美元，你想做什么呢？
BLEU Score:  9.97486269044271e-232
--------------------------------------------------------------------------------
1
Input: when i hear that song , i remember my younger days .
Predicted Translation: 我在这首歌，深歌。
Real Translation: 每次听到这首歌，都会让我回忆起自己年轻的时候。
BLEU Score:  0.12151662434083678
--------------------------------------------------------------------------------
2
Input: when was the last time you spent time on facebook ?
Predicted Translation: 在用用用用用用用用�上在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook在Facebook
Real Translation: 你上一次用Facebook是什麼時候？
BLEU Score:  8.726094729337945e-232
--------------------------------------------------------------------------------
3
Input: you are no better at remembering things than i am .
Predicted Translation: 你羾������������������������������������
Re

Input: everybody in the room was stunned by what happened .
Predicted Translation: 沒有人聚一遍一遍倒了。
Real Translation: 屋裡的每個人都被發生的事驚住了。
BLEU Score:  3.8476696299631795e-155
--------------------------------------------------------------------------------
32
Input: everyone except tom knew he didn t need to do that .
Predicted Translation: 除了除了每个人都，除了除了。
Real Translation: 除了汤姆以外任何人都知道他没必要这么做。
BLEU Score:  9.918892480173173e-232
--------------------------------------------------------------------------------
33
Input: everyone ought to be the master of his own destiny .
Predicted Translation: 每個人都应该命的主人。
Real Translation: 每個人都應該做自己命運的主人。
BLEU Score:  0.28617394805234075
--------------------------------------------------------------------------------
34
Input: for some reason the microphone didn t work earlier .
Predicted Translation: 在的個�台沒有沒有开台機機機機機機機機機機機機機機機機機機機機機機機機機機機機機機
Real Translation: 剛才我的麥克風沒起作用，不知道為什麼。
BLEU Score:  8.614911585158347e-232
-----------------------------------------------

Input: i ve done bad things that i should be punished for .
Predicted Translation: 我我有个事沒做种事情的事。
Real Translation: 我做了該受罰的壞事。
BLEU Score:  6.5806869883189804e-155
--------------------------------------------------------------------------------
65
Input: if there was no sun , all the animals would be dead .
Predicted Translation: 在動物，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都，我都
Real Translation: 如果没有太阳，那所有生物都会死。
BLEU Score:  7.784451369270533e-232
--------------------------------------------------------------------------------
66
Input: if you can t have children , you could always adopt .
Predicted Translation: 如果你不事，无法養孩子。
Real Translation: 如果你不能有孩子，你总能领养。
BLEU Score:  3.4376286321877657e-78
--------------------------------------------------------------------------------
67
Input: is it true that tom wants to paint his house green ?
Predicted Translation: 汤姆是湯姆不想六改吗？
Real Translation: 汤姆想把房子漆成绿色，是真的吗？
BLEU Score:  5.364950815988815e-155
------------------------------------

Input: stop getting yourself worked up over little things .
Predicted Translation: 记事情在一切吵心。
Real Translation: 不要让自己因为一些小事就方寸大乱了。
BLEU Score:  5.483065946899793e-232
--------------------------------------------------------------------------------
96
Input: thanks for accepting my friend request on facebook .
Predicted Translation: 用用用�在你的號的在這裡的的的的的的的的。
Real Translation: 謝謝你接受我Facebook的交友邀請。
BLEU Score:  1.1337861261109773e-231
--------------------------------------------------------------------------------
97
Input: that child may have been kidnapped on his way home .
Predicted Translation: 那孩子可能一個小行李。
Real Translation: 那个孩子可能在回家的路上被绑架了。
BLEU Score:  4.677275655524805e-155
--------------------------------------------------------------------------------
98
Input: that was the first time that i d seen tom so angry .
Predicted Translation: 難來聚覺得湯姆不說話。
Real Translation: 那是我第一次见到汤姆如此生气。
BLEU Score:  1.0003688322288243e-231
--------------------------------------------------------------------

Input: tom was advised by mary not to go there by himself .
Predicted Translation: 汤姆被對對對玛丽該得不獨想去那裡。
Real Translation: 玛丽建议汤姆不要独自去。
BLEU Score:  1.3091834502273125e-231
--------------------------------------------------------------------------------
128
Input: up to now , how many books do you think you ve read ?
Predicted Translation: 你现在多多少书，不是嗎？
Real Translation: 到目前为止，你认为你阅读过多少书籍？
BLEU Score:  1.0658543616184898e-231
--------------------------------------------------------------------------------
129
Input: visiting all the tourist sights really wore me out .
Predicted Translation: 观地游�的�的的的的的的的的的的。
Real Translation: 參觀所有觀光勝地累壞了我。
BLEU Score:  1.1008876702055895e-231
--------------------------------------------------------------------------------
130
Input: water is liquid . when it freezes , it becomes solid .
Predicted Translation: 水结体，我们廟，我们���������������
Real Translation: 水是液体。冻起来就成了固体。
BLEU Score:  8.422437779564611e-232
-------------------------------------------------------

Input: flies and mosquitoes interfered with his meditation .
Predicted Translation: 一隻阘���，他的生。
Real Translation: 蒼蠅和蚊子干擾了他的冥想。
BLEU Score:  3.645525559050358e-155
--------------------------------------------------------------------------------
159
Input: fluency in english is a very marketable skill today .
Predicted Translation: 現在性的性的工利。
Real Translation: 流利的英語在今天是一種十分搶手的技能。
BLEU Score:  7.63718022611286e-232
--------------------------------------------------------------------------------
160
Input: foreign investors withdrew their money from america .
Predicted Translation: 油中從大油錢。
Real Translation: 外国投资者从美国收回他们的钱。
BLEU Score:  7.060301868108111e-232
--------------------------------------------------------------------------------
161
Input: he called in to say he could not attend the meeting .
Predicted Translation: 他必須在在在在會議。
Real Translation: 他打電話來說他不會參加會議了。
BLEU Score:  4.154122940232254e-155
--------------------------------------------------------------------------------
162
In

Input: if i were invisible i wouldn t have to dress myself .
Predicted Translation: 如果我不穿不可穿住的自己。
Real Translation: 若我是隱形的，就不用穿衣服了。
BLEU Score:  1.1795395762188e-231
--------------------------------------------------------------------------------
191
Input: if you need a dictionary , i can lend you my old one .
Predicted Translation: 如果你需要的话，你能把字典。
Real Translation: 如果你需要字典，我就把我旧的借给你。
BLEU Score:  3.117598395941269e-78
--------------------------------------------------------------------------------
192
Input: in case of an emergency , get in touch with my agent .
Predicted Translation: 在他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的他的
Real Translation: 万一有紧急情况，联系我的代理人。
BLEU Score:  6.110848977001762e-232
--------------------------------------------------------------------------------
193
Input: is it necessary for me to explain the reason to him ?
Predicted Translation: 让我不解釋是什么理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由理由
Real T

Input: the board unanimously decided to appoint her as ceo .
Predicted Translation: 选一個為員什麼時候理。
Real Translation: 董事会一致决定任命她为执行总裁。
BLEU Score:  1.0518351895246305e-231
--------------------------------------------------------------------------------
223
Input: the company invested a lot of money in this project .
Predicted Translation: 这的的人的小這目上上。
Real Translation: 該公司在這個企劃中投入了很多錢。
BLEU Score:  9.918706012922318e-232
--------------------------------------------------------------------------------
224
Input: the doctor advised my father to cut down on smoking .
Predicted Translation: 醫生的的的的的的的的的的的建议他儿子的按的加吸煙的吸煙的吸煙的吸煙的吸煙的吸煙的吸煙的吸煙的吸煙的吸煙的吸煙
Real Translation: 医生建议我父亲减少吸烟。
BLEU Score:  6.784338172413661e-232
--------------------------------------------------------------------------------
225
Input: the ice on the lake is too thin to bear your weight .
Predicted Translation: 在著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著著
Real Translation: 湖上的冰太薄了，承受不了你的重量。
BLEU Score:  0
----------------------------

Input: artificial leather can t compare with the real thing .
Predicted Translation: 廑��和��廑�����������������������������
Real Translation: 人造皮革是比不上真皮的。
BLEU Score:  0
--------------------------------------------------------------------------------
254
Input: australia is the world s fifth largest coal producer .
Predicted Translation: “的世界上的氣的氣的为的为油。
Real Translation: 澳大利亚是世界第五大煤炭产地。
BLEU Score:  1.1200407237786664e-231
--------------------------------------------------------------------------------
255
Input: can you wake me up at seven o clock tomorrow morning ?
Predicted Translation: 你能在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在
Real Translation: 你明天早上七點可不可以叫我起床。
BLEU Score:  7.199666163340923e-232
--------------------------------------------------------------------------------
256
Input: cutting a cake into equal pieces is rather difficult .
Predicted Translation: 猄�������������������������������������
Real Translation: 把蛋糕等分切開更難。
BLEU Score:  0
---------------------------------------

Input: it is said that his father died in a foreign country .
Predicted Translation: 据报告在一国家的国家。
Real Translation: 据说他爸爸在外国去世了。
BLEU Score:  1.258141043412406e-231
--------------------------------------------------------------------------------
286
Input: it looks like tom will do what we ve asked him to do .
Predicted Translation: 看起来們們們做讓我們做。
Real Translation: 看來湯姆會按我們要求的去做。
BLEU Score:  6.033867102888587e-155
--------------------------------------------------------------------------------
287
Input: it makes no difference to me whether he comes or not .
Predicted Translation: 他這個氣不取決於他來。
Real Translation: 他来不来对我来说没区别。
BLEU Score:  1.3165594234639305e-231
--------------------------------------------------------------------------------
288
Input: it seems to me that she has a tendency to exaggerate .
Predicted Translation: 深她因地地�地地地地地地地地地地地地地地地地地地地地地地地地地地地地地地地地地地
Real Translation: 在我看來，她有一種誇張的傾向。
BLEU Score:  7.290245807398516e-232
-----------------------------------------------------

Input: the teacher demonstrated the idea with an experiment .
Predicted Translation: 教師想一個實验實验。
Real Translation: 这位老师用试验论证了这个想法。
BLEU Score:  8.436497969708995e-232
--------------------------------------------------------------------------------
318
Input: the teacher lined the children up in order of height .
Predicted Translation: 老師的倀�的高的。
Real Translation: 老师按照身高给孩子们排队。
BLEU Score:  1.0832677820940877e-231
--------------------------------------------------------------------------------
319
Input: there are many beautiful castles in northern germany .
Predicted Translation: 在的德国的許多�的。
Real Translation: 在德國北部有很多美麗的城堡？
BLEU Score:  8.853864984883467e-232
--------------------------------------------------------------------------------
320
Input: there s nothing more painful than losing one s child .
Predicted Translation: 一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个一个
Real Translation: 沒有比失去孩子更讓人悲傷的事。
BLEU Score:  0
-----------------------------------

Input: a foreign language cannot be mastered in a year or so .
Predicted Translation: 並不個並不年能能能能能能能能能能能能能能能能能能能能能能能能能能能能能能能能能能能能
Real Translation: 一個外國語言無法在一年左右就被掌握。
BLEU Score:  8.510469113101058e-232
--------------------------------------------------------------------------------
350
Input: after a couple of drinks , the guy was feeling no pain .
Predicted Translation: 一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一一
Real Translation: 喝了幾杯酒後，這個傢伙就感覺不痛了。
BLEU Score:  0
--------------------------------------------------------------------------------
351
Input: an argument may be logically sound without being true .
Predicted Translation: 想是话是法基�是法是法是法法法是法是法是法法法法法是法法法法法法法法法法法法法
Real Translation: 不正确的命题听起来可能符合逻辑。
BLEU Score:  0
--------------------------------------------------------------------------------
352
Input: apparently , there is nothing that cannot happen today .
Predicted Translation: 今天这天不會發生甚麼。
Real Translation: 显然，今天什么事都有可能发生。
BLEU Score:  1.2183324802375697e-231
-----------------

Input: i m going to run a couple of errands . wanna tag along ?
Predicted Translation: 我打算去現在所有的中风地地知道地。
Real Translation: 我要去购物。你要跟着来吗？
BLEU Score:  1.2627076138080564e-231
--------------------------------------------------------------------------------
382
Input: i m pretty sure tom is the only one who can t do that .
Predicted Translation: 我确定不會能做到。
Real Translation: 我确信汤姆是唯一不会那么做的人。
BLEU Score:  7.216120604020078e-232
--------------------------------------------------------------------------------
383
Input: i m sure you ll love what we have on the menu tonight .
Predicted Translation: 今晚这什麼，布满排布满所碟。
Real Translation: 我肯定你会喜欢我们今晚的菜肴。
BLEU Score:  1.1896457329133973e-231
--------------------------------------------------------------------------------
384
Input: if i lost my key , i wouldn t be able to lock the door .
Predicted Translation: 我�把門把門，把门把门把门的时候把门门门的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候的时候把门
Real Translation: 如果我弄丟了我的鑰匙，我就無法鎖門了。
BLEU Score:  9.336117803

Input: the bus is full . you ll have to wait for the next one .
Predicted Translation: 这公交车會圗來說，您要。
Real Translation: 這班公車客滿了。你必須等下一班。
BLEU Score:  6.3497053018839554e-232
--------------------------------------------------------------------------------
413
Input: the policeman told me that the last bus leaves at ten .
Predicted Translation: 警察對我九点站一軧。
Real Translation: 警察告诉我巴士的末班车在十点出发。
BLEU Score:  1.1103055738259992e-231
--------------------------------------------------------------------------------
414
Input: the professor spoke too fast for anyone to understand .
Predicted Translation: 医生太理解。
Real Translation: 那个教授讲得太快了，没有一个人听得懂。
BLEU Score:  9.793569269297845e-233
--------------------------------------------------------------------------------
415
Input: their contract is to run out at the end of this month .
Predicted Translation: 他们的約會約中會下。
Real Translation: 他们的合同在这个月底到期。
BLEU Score:  6.7393716283177006e-155
----------------------------------------------------------------------

Input: you should ve done it earlier . it can t be helped now .
Predicted Translation: 你们现在可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以可以
Real Translation: 你本应该早点做的。现在已经没有任何办法了。
BLEU Score:  7.244248269687037e-232
--------------------------------------------------------------------------------
444
Input: you re the one who suggested that we do that together .
Predicted Translation: 这一起去這屃我什么手，人去你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来你看起来我什么湯姆說他
Real Translation: 你是那個建議我們一起做那件事的人。
BLEU Score:  8.127238000397563e-232
--------------------------------------------------------------------------------
445
Input: he d like to have a coffee after work . i would too .
Predicted Translation: 他在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在在
Real Translation: "他想在下班後喝杯咖啡。" "我也想。"
BLEU Score:  8.614911585158347e-232
--------------------------------------------------------------------------------
446
Input: how

Input: i don t want to go there . he doesn t want to go either .
Predicted Translation: 不知道他想去那裡。
Real Translation: 我不想去那儿，他也不想。
BLEU Score:  5.60411810495621e-155
--------------------------------------------------------------------------------
474
Input: i felt so sleepy that i could hardly keep my eyes open .
Predicted Translation: 我悎��，我，我还��著我还著。
Real Translation: 我感觉困得几乎不能睁开我的眼睛了。
BLEU Score:  1.090462944153118e-231
--------------------------------------------------------------------------------
475
Input: i found a good place to buy fruit a couple of days ago .
Predicted Translation: 我在给一條買了一條商店。
Real Translation: 前几天我发现了一个买水果的好地方。
BLEU Score:  8.875814970513353e-232
--------------------------------------------------------------------------------
476
Input: i know that it is highly unlikely that anyone knows me .
Predicted Translation: 我知道个人都知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的知道我的
Real Translation: 我知道有人认识我的可能性微乎其微。
BLEU Score:  3.3084075264006686

In [93]:
bleuplus/498

0.002440708527898148

In [63]:
# !pip install pynlpir

Collecting pynlpir
  Downloading PyNLPIR-0.6.0-py2.py3-none-any.whl (13.1 MB)
Installing collected packages: pynlpir
Successfully installed pynlpir-0.6.0


In [49]:
# pynlpir.open()
# mystring = "你汉语说的很好！"
# tokenized_string = pynlpir.segment(mystring, pos_tagging=False)

In [50]:
# tokenized_string

['你', '汉语', '说', '的', '很', '好', '！']

In [51]:
# predicted = translate(new_transformer, en_val[0])

Input: what would you do if you had ten thousand dollars ?
Predicted Translation: 如果我个二里做什麼的?


In [55]:
# two_ = pynlpir.segment(predicted, pos_tagging=False)

In [56]:
# one_ = pynlpir.segment(chi_val[0], pos_tagging=False)

In [60]:
# bleu.sentence_bleu([one_], two_)

9.97486269044271e-232

In [69]:
# pynlpir.segment("他不法來我裡關�利。", pos_tagging=False)

['他', '不法', '來', '我', '裡', '關', '�利', '。']

In [81]:
# s = SnowNLP(他頭地尕����������������������������������)
# s.words

SyntaxError: invalid character in identifier (<ipython-input-81-9e7aba872beb>, line 1)