In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import  nltk.translate.bleu_score as bleu
import pynlpir

pynlpir.open()

In [2]:
path_to_zip = 'C:\\Users\\user\\Project From Alex NLP\\chi-eng.zip'
path_to_file = os.path.dirname(path_to_zip)+"\\chi-eng\\cmn.txt"
print(path_to_file)

C:\Users\user\Project From Alex NLP\chi-eng\cmn.txt


In [3]:
def preprocess_sentence(sentence):
  sentence = sentence.lower().strip()
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
  sentence = sentence.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
#   sentence = '<start> ' + sentence + ' <end>'
  return sentence

In [4]:
def create_dataset(path, num1,num2):
  english = []
  chinese = []
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

#   word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
  for l in lines[num1:num2]:
      eng = preprocess_sentence(l.split('\t')[0])
      chi = l.split('\t')[1]
      english.append(eng)
      chinese.append(chi)
        
  return english, chinese

In [5]:
eng_tr, chin_tr = create_dataset(path_to_file, 0,20000)
print(eng_tr[-1])
print(chin_tr[-1])

one man s meat is another man s poison .
甲之蜜糖，乙之砒霜。


In [6]:
len(eng_tr),len(chin_tr)

(20000, 20000)

In [7]:
eng_val, chin_val = create_dataset(path_to_file,23000,23500)
len(eng_val),len(chin_val)

(500, 500)

In [8]:
tokenizer_eng = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    eng_tr, target_vocab_size=2**13)

tokenizer_chin = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    chin_tr, target_vocab_size=2**13)

In [9]:
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_eng.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_eng.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [7301, 5149, 7332, 6165, 8, 3017, 7263]
The original string: Transformer is awesome.


In [10]:
sample_string = eng_tr[5000]

tokenized_string = tokenizer_eng.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_eng.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [14, 567, 3, 990, 320, 1]
The original string: she hit the ball hard .


In [11]:
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_eng.decode([ts])))

14 ----> she 
567 ----> hit 
3 ----> the 
990 ----> ball 
320 ----> hard
1 ---->  .


In [12]:
sample_string = '很高興認識你'
tokenized_string = tokenizer_chin.encode(sample_string)
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_chin.decode([ts])))

4573 ----> 很高興
575 ----> 認識
9 ----> 你


In [13]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [14]:
def encode(lang1, lang2):
  lang1 = [tokenizer_chin.vocab_size] + tokenizer_chin.encode(
      lang1.numpy()) + [tokenizer_chin.vocab_size+1]

  lang2 = [tokenizer_eng.vocab_size] + tokenizer_eng.encode(
      lang2.numpy()) + [tokenizer_eng.vocab_size+1]
  
  return lang1, lang2

In [15]:
def tf_encode(chi, en):
  result_chi, result_en = tf.py_function(encode, [chi, en], [tf.int64, tf.int64])
  result_en.set_shape([None])
  result_chi.set_shape([None])

  return result_chi, result_en

In [16]:
# train_examples = tf.data.Dataset.from_tensor_slices((chin_tr, eng_tr))
# val_examples = tf.data.Dataset.from_tensor_slices((chin_val, eng_val))

In [17]:
MAX_LENGTH = 40

In [18]:
def filter_max_length(x, y, max_length=MAX_LENGTH):
  return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [19]:
train_preprocessed = (
    tf.data.Dataset.from_tensor_slices((chin_tr, eng_tr))
    .map(tf_encode) 
    .filter(filter_max_length)
    # cache the dataset to memory to get a speedup while reading from it.
    .cache()
    .shuffle(BUFFER_SIZE))

In [20]:
train_dataset = (train_preprocessed
                 .padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
                 .prefetch(tf.data.experimental.AUTOTUNE))

In [21]:
chin_batch, eng_batch = next(iter(train_dataset))
chin_batch

<tf.Tensor: shape=(64, 14), dtype=int64, numpy=
array([[7318, 5113, 5059,  647,  779,   75,    4,    1, 7319,    0,    0,
           0,    0,    0],
       [7318, 2636, 5708, 7125, 7319,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [7318, 4463, 1024,   26,  479,    1, 7319,    0,    0,    0,    0,
           0,    0,    0],
       [7318, 3547,  245, 2369,    1, 7319,    0,    0,    0,    0,    0,
           0,    0,    0],
       [7318, 2755, 7295, 7250, 7209, 7291, 7201, 7243,    1, 7319,    0,
           0,    0,    0],
       [7318, 5114,    2, 7319,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [7318,   88,  763,  415,    1, 7319,    0,    0,    0,    0,    0,
           0,    0,    0],
       [7318,  240,  260, 4030,    7, 7125, 7319,    0,    0,    0,    0,
           0,    0,    0],
       [7318,  255,    5,  271,   75,  168,    2, 7319,    0,    0,    0,
           0,    0,    0],
       [7318,  115,  136,   19, 1971,    2,

In [22]:
eng_batch

<tf.Tensor: shape=(64, 13), dtype=int64, numpy=
array([[7473,   27, 1773,    8,   26,  279,    1, 7474,    0,    0,    0,
           0,    0],
       [7473,  496,    8,   20,  118,    6, 7474,    0,    0,    0,    0,
           0,    0],
       [7473,   13, 1540,  155, 1302,    1, 7474,    0,    0,    0,    0,
           0,    0],
       [7473,    2,  172,   26,   13,  137,   12,  219,    1, 7474,    0,
           0,    0],
       [7473,   11,  623, 6343,    1, 7474,    0,    0,    0,    0,    0,
           0,    0],
       [7473,    8,  329,  161,    6, 7474,    0,    0,    0,    0,    0,
           0,    0],
       [7473,   48,   12,   59,  929,    4,  693, 4347,    1, 7474,    0,
           0,    0],
       [7473,    8,   48,  102, 1334,   27,    9,   46, 1905,    6, 7474,
           0,    0],
       [7473,   86,   32,   24,   18,    5,   43,    4,  897,    6, 7474,
           0,    0],
       [7473,   24,  140,    9,  191,   15,   33,  319,    6, 7474,    0,
           0,    0],
  

In [23]:
tokenizer_chin.vocab_size

7318

In [24]:
# BUFFER_SIZE = 20000 #len(input_tensor_train)
# BATCH_SIZE = 4 #64
steps_per_epoch = 20 #tokenizer_en.vocab_size//BATCH_SIZE
embedding_dim = 512 #4
units =  1024 #64
vocab_inp_size = tokenizer_chin.vocab_size + 2
vocab_tar_size = tokenizer_eng.vocab_size + 2

# dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [25]:
# Write the encoder and decoder model

In [26]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [27]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(chin_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 14, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [28]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [29]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 14, 1)


In [30]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [31]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 7475)


In [32]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [33]:
checkpoint_dir = './training_checkpoints_nmt_chi_to_eng_attention'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [36]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([tokenizer_eng.vocab_size] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [37]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 5.9579




































Epoch 1 Loss 4.3790
Time taken for 1 epoch 119.12664413452148 sec

Epoch 2 Batch 0 Loss 3.7765




















Epoch 2 Loss 3.4269
Time taken for 1 epoch 68.42726492881775 sec

Epoch 3 Batch 0 Loss 3.2545
Epoch 3 Loss 3.2319
Time taken for 1 epoch 29.776427268981934 sec

Epoch 4 Batch 0 Loss 3.1695




Epoch 4 Loss 3.1399
Time taken for 1 epoch 34.55469751358032 sec

Epoch 5 Batch 0 Loss 2.9081




Epoch 5 Loss 2.9876
Time taken for 1 epoch 40.00048470497131 sec

Epoch 6 Batch 0 Loss 2.8323
Epoch 6 Loss 2.8288
Time taken for 1 epoch 10.021930456161499 sec

Epoch 7 Batch 0 Loss 2.8971
Epoch 7 Loss 2.7561
Time taken for 1 epoch 19.246695518493652 sec

Epoch 8 Batch 0 Loss 2.6042
Epoch 8 Loss 2.7088
Time taken for 1 epoch 37.14300298690796 sec

Epoch 9 Batch 0 Loss 2.4809
Epoch 9 Loss 2.6570
Time taken for 1 epoch 13.914393901824951 sec

Epoch 10 Batch 0 Loss 2.2716
Epoch 10 Loss 2.5429
Time taken for 1 epoch 29.62283229827881 sec



In [43]:
max_length_inp = chin_batch.shape[1]
max_length_targ = eng_batch.shape[1]

In [64]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

#   sentence = preprocess_sentence(sentence)

  start_token = [tokenizer_chin.vocab_size]
  end_token = [tokenizer_chin.vocab_size + 1]

  inputs = start_token + tokenizer_chin.encode(sentence) + end_token 
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([tokenizer_eng.vocab_size], 0)


  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()


    if predicted_id == tokenizer_eng.vocab_size+1:
      return result, sentence, attention_plot
    else:
      result += tokenizer_eng.decode([predicted_id])


    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)
    

  return result, sentence, attention_plot

In [65]:
evaluate('你要去哪裡?')

('i m a good  ?',
 '你要去哪裡?',
 array([[5.69968272e-17, 2.57003958e-16, 1.15028795e-08, 1.19025015e-06,
         4.95649365e-05, 3.86518077e-04, 1.38678530e-03, 3.77945392e-03,
         9.96315852e-03, 2.62539815e-02, 6.50045797e-02, 1.43420160e-01,
         2.77107924e-01, 4.72646743e-01],
        [8.40060785e-02, 8.58576000e-02, 1.15843400e-01, 1.13658696e-01,
         1.11236490e-01, 1.06640778e-01, 9.72775370e-02, 8.14318880e-02,
         6.17200211e-02, 4.43273224e-02, 3.24991681e-02, 2.54179239e-02,
         2.12829281e-02, 1.88001096e-02],
        [3.03276032e-01, 2.70313561e-01, 1.97644189e-01, 1.16445728e-01,
         5.38801923e-02, 2.51927152e-02, 1.18928654e-02, 6.16725488e-03,
         3.81829380e-03, 2.83040153e-03, 2.38129217e-03, 2.15745717e-03,
         2.03574356e-03, 1.96427479e-03],
        [2.00592831e-01, 1.98927447e-01, 2.22675160e-01, 1.79474115e-01,
         1.12512723e-01, 5.05158417e-02, 1.81262959e-02, 6.92183524e-03,
         3.31929629e-03, 2.03402829e-03, 1

In [201]:
# # function for plotting the attention weights
# def plot_attention(attention, sentence, predicted_sentence):
#   fig = plt.figure(figsize=(10,10))
#   ax = fig.add_subplot(1, 1, 1)
#   ax.matshow(attention, cmap='viridis')

#   fontdict = {'fontsize': 14}

#   ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
#   ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

#   ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
#   ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

#   plt.show()

In [66]:
def translate(sentence):
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: {}'.format(sentence))
  print('Predicted translation: {}'.format(result))

#   segmentation = pynlpir.segment(result, pos_tagging=False)

#   attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
#   plot_attention(attention_plot, sentence.split(' '), result.split(' '))
#   if plot:
#       attention_plot = attention_plot[:len(segmentation), :len(sentence.split(' '))]
#       plot_attention(attention_plot, sentence.split(' '), segmentation)

  return result

In [67]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x21a1bc12ec8>

In [51]:
# pynlpir.segment("我想鋼琴", pos_tagging=False)

In [68]:
translate('你好嗎？')

Input: 你好嗎？
Predicted translation: i m a good  ?


'i m a good  ?'

In [69]:
translate('你要去哪裡？')

Input: 你要去哪裡？
Predicted translation: i m a good  ?


'i m a good  ?'

In [70]:
translate('很高興認識你')

Input: 很高興認識你
Predicted translation: i m a good a good a good a good a good a 


'i m a good a good a good a good a good a '

In [60]:
# del eng_val[36]
# del chin_val[36]
# del eng_val[123]
# del chin_val[123]
# del eng_val[149]
# del chin_val[149]

In [62]:
len(eng_val)

497

In [72]:
bleuplus = 0
i = 0

for chi_, en_ in zip(chin_val,eng_val):
    print(i)
    predicted = translate(chi_)
    print('Real Translation: {}'.format(en_))
    bleu_ = bleu.sentence_bleu([en_.split()], predicted.split())
    print("BLEU Score: ", bleu_)
    print("----" *20)

    bleuplus += bleu_
    i += 1

0
Input: 如果你有一万美元，你想做什么呢？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: what would you do if you had ten thousand dollars ?
BLEU Score:  0
--------------------------------------------------------------------------------
1
Input: 每次听到这首歌，都会让我回忆起自己年轻的时候。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: when i hear that song , i remember my younger days .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
2
Input: 你上一次用Facebook是什麼時候？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: when was the last time you spent time on facebook ?
BLEU Score:  0
--------------------------------------------------------------------------------
3
Input: 你记事情的能力并不比我好多少。
Predicted translation: i m a good is a good is a good is a good 
Real Translation: you are no better at remembering things than i am .
BLEU Score:  9.594503055152632e-232
-

Input: 屋裡的每個人都被發生的事驚住了。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: everybody in the room was stunned by what happened .
BLEU Score:  0
--------------------------------------------------------------------------------
32
Input: 除了汤姆以外任何人都知道他没必要这么做。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: everyone except tom knew he didn t need to do that .
BLEU Score:  0
--------------------------------------------------------------------------------
33
Input: 每個人都應該做自己命運的主人。
Predicted translation: i m a good is a good is a good is a good 
Real Translation: everyone ought to be the master of his own destiny .
BLEU Score:  0
--------------------------------------------------------------------------------
34
Input: 剛才我的麥克風沒起作用，不知道為什麼。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: for some reason the microphone didn t work earlier .
BLEU Score:  0
------------------------------------------

Input: 我做了該受罰的壞事。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i ve done bad things that i should be punished for .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
65
Input: 如果没有太阳，那所有生物都会死。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: if there was no sun , all the animals would be dead .
BLEU Score:  0
--------------------------------------------------------------------------------
66
Input: 如果你不能有孩子，你总能领养。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: if you can t have children , you could always adopt .
BLEU Score:  0
--------------------------------------------------------------------------------
67
Input: 汤姆想把房子漆成绿色，是真的吗？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: is it true that tom wants to paint his house green ?
BLEU Score:  0
--------------------------------

Input: 謝謝你接受我Facebook的交友邀請。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: thanks for accepting my friend request on facebook .
BLEU Score:  0
--------------------------------------------------------------------------------
97
Input: 那个孩子可能在回家的路上被绑架了。
Predicted translation: he is a good a good a good a good a good a 
Real Translation: that child may have been kidnapped on his way home .
BLEU Score:  0
--------------------------------------------------------------------------------
98
Input: 那是我第一次见到汤姆如此生气。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: that was the first time that i d seen tom so angry .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
99
Input: 亚马逊河有很多支流。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: the amazon is fed by a large number of tributaries .
BLEU Score:  9.594503055152632e-232
------

Input: 到目前为止，你认为你阅读过多少书籍？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: up to now , how many books do you think you ve read ?
BLEU Score:  0
--------------------------------------------------------------------------------
129
Input: 參觀所有觀光勝地累壞了我。
Predicted translation: the father is a good a good a good a good a good 
Real Translation: visiting all the tourist sights really wore me out .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
130
Input: 水是液体。冻起来就成了固体。
Predicted translation: the father is a good a good a good a good a good 
Real Translation: water is liquid . when it freezes , it becomes solid .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
131
Input: 我們都想知道她為什麼甩了這麼好的一個男人。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: we all wondered why she had dumped such a nice man

Input: 外国投资者从美国收回他们的钱。
Predicted translation: i m piece 
Real Translation: foreign investors withdrew their money from america .
BLEU Score:  0
--------------------------------------------------------------------------------
160
Input: 他打電話來說他不會參加會議了。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: he called in to say he could not attend the meeting .
BLEU Score:  0
--------------------------------------------------------------------------------
161
Input: 他没有接电话，所以我给他发了封邮件。
Predicted translation: i m a good is a good a good is a good a 
Real Translation: he did not answer the phone , so i sent him an email .
BLEU Score:  8.884136397986129e-232
--------------------------------------------------------------------------------
162
Input: 他有一个妻子和两个孩子要养活。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: he has a wife and two young children to provide for .
BLEU Score:  9.594503055152632e-232
-------------------------------

Input: 万一有紧急情况，联系我的代理人。
Predicted translation: the father is a good a good a good a good a good 
Real Translation: in case of an emergency , get in touch with my agent .
BLEU Score:  0
--------------------------------------------------------------------------------
192
Input: 我有必要向他解释原因吗？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: is it necessary for me to explain the reason to him ?
BLEU Score:  0
--------------------------------------------------------------------------------
193
Input: 空手赤拳抓野兔並不容易。
Predicted translation: please ?
Real Translation: it is not easy to catch a hare with your bare hands .
BLEU Score:  0
--------------------------------------------------------------------------------
194
Input: 在你侍酒前最好先將白葡萄酒冷卻一下。
Predicted translation: please .
Real Translation: it s better to chill white wine before you serve it .
BLEU Score:  1.0322346066768973e-233
--------------------------------------------------------------------------------
1

Input: 該公司在這個企劃中投入了很多錢。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: the company invested a lot of money in this project .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
223
Input: 医生建议我父亲减少吸烟。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: the doctor advised my father to cut down on smoking .
BLEU Score:  0
--------------------------------------------------------------------------------
224
Input: 湖上的冰太薄了，承受不了你的重量。
Predicted translation: i m good bathroom ?
Real Translation: the ice on the lake is too thin to bear your weight .
BLEU Score:  0
--------------------------------------------------------------------------------
225
Input: 這艘船逆風而行緩慢前進。
Predicted translation: i m piece 
Real Translation: the ship made slow progress against the strong wind .
BLEU Score:  0
--------------------------------------------------------------------------------

Input: 把蛋糕等分切開更難。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: cutting a cake into equal pieces is rather difficult .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
256
Input: 你知道春卷为什么叫春卷吗？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: do you know why spring rolls are called spring rolls ?
BLEU Score:  0
--------------------------------------------------------------------------------
257
Input: 这本杂志不要扔。我还没看呢。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: don t throw out this magazine . i haven t read it yet .
BLEU Score:  8.884136397986129e-232
--------------------------------------------------------------------------------
258
Input: 全世界許多國家的人說英語。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: english is spoken in many countries around the world .
BLEU Score:  0
--------

Input: 看來湯姆會按我們要求的去做。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: it looks like tom will do what we ve asked him to do .
BLEU Score:  0
--------------------------------------------------------------------------------
286
Input: 他来不来对我来说没区别。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: it makes no difference to me whether he comes or not .
BLEU Score:  0
--------------------------------------------------------------------------------
287
Input: 在我看來，她有一種誇張的傾向。
Predicted translation: the father is a good a good a good a good a good 
Real Translation: it seems to me that she has a tendency to exaggerate .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
288
Input: 安靜到連針掉在地上的聲音你都會聽得到。
Predicted translation: i m good bathroom .
Real Translation: it was so still that you would have heard a pin drop .
BLEU Score:  2.459770855197576e-232
-------------

Input: 这位老师用试验论证了这个想法。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: the teacher demonstrated the idea with an experiment .
BLEU Score:  0
--------------------------------------------------------------------------------
317
Input: 老师按照身高给孩子们排队。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: the teacher lined the children up in order of height .
BLEU Score:  0
--------------------------------------------------------------------------------
318
Input: 在德國北部有很多美麗的城堡？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: there are many beautiful castles in northern germany .
BLEU Score:  0
--------------------------------------------------------------------------------
319
Input: 沒有比失去孩子更讓人悲傷的事。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: there s nothing more painful than losing one s child .
BLEU Score:  0
--------------------------------------------

Input: 一個外國語言無法在一年左右就被掌握。
Predicted translation: i m piece 
Real Translation: a foreign language cannot be mastered in a year or so .
BLEU Score:  0
--------------------------------------------------------------------------------
349
Input: 喝了幾杯酒後，這個傢伙就感覺不痛了。
Predicted translation: the father is a good a good a good a good a good 
Real Translation: after a couple of drinks , the guy was feeling no pain .
BLEU Score:  1.1409851298103347e-231
--------------------------------------------------------------------------------
350
Input: 不正确的命题听起来可能符合逻辑。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: an argument may be logically sound without being true .
BLEU Score:  0
--------------------------------------------------------------------------------
351
Input: 显然，今天什么事都有可能发生。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: apparently , there is nothing that cannot happen today .
BLEU Score:  0
-----------------------------

Input: 我一把这本书读完了就借给你。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i ll lend you the book as soon as i m done reading it .
BLEU Score:  4.303621112174457e-155
--------------------------------------------------------------------------------
379
Input: 我不會告訴任何人我藏宝的地方。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i ll never tell anyone where i ve hidden the treasure .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
380
Input: 我要去购物。你要跟着来吗？
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i m going to run a couple of errands . wanna tag along ?
BLEU Score:  5.143563572960599e-155
--------------------------------------------------------------------------------
381
Input: 我确信汤姆是唯一不会那么做的人。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i m pretty sure tom is the only one who can t 

Input: 那是人类第一次在月球上行走。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: that was the first time that a man walked on the moon .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
410
Input: 这不是汤姆要做那件事的主要原因。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: that wasn t the main reason why tom needed to do that .
BLEU Score:  0
--------------------------------------------------------------------------------
411
Input: 這班公車客滿了。你必須等下一班。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: the bus is full . you ll have to wait for the next one .
BLEU Score:  0
--------------------------------------------------------------------------------
412
Input: 警察告诉我巴士的末班车在十点出发。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: the policeman told me that the last bus leaves at ten .
BLEU Score:  0
------------

Input: 你本应该早点做的。现在已经没有任何办法了。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: you should ve done it earlier . it can t be helped now .
BLEU Score:  0
--------------------------------------------------------------------------------
443
Input: 你是那個建議我們一起做那件事的人。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: you re the one who suggested that we do that together .
BLEU Score:  0
--------------------------------------------------------------------------------
444
Input: "他想在下班後喝杯咖啡。" "我也想。"
Predicted translation: i m a good a good a good a good a good a 
Real Translation: he d like to have a coffee after work . i would too .
BLEU Score:  1.0565078215140205e-231
--------------------------------------------------------------------------------
445
Input: “你怎么知道我的电话号码的？” “汤姆告诉我的。”
Predicted translation: i m a good a good a good a good a good a 
Real Translation: how did you get my phone number ? tom gave it to me .
BLEU Score

Input: 我感觉困得几乎不能睁开我的眼睛了。
Predicted translation: the father is a good a good a good a good a good 
Real Translation: i felt so sleepy that i could hardly keep my eyes open .
BLEU Score:  0
--------------------------------------------------------------------------------
474
Input: 前几天我发现了一个买水果的好地方。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i found a good place to buy fruit a couple of days ago .
BLEU Score:  5.527118757926412e-155
--------------------------------------------------------------------------------
475
Input: 我知道有人认识我的可能性微乎其微。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i know that it is highly unlikely that anyone knows me .
BLEU Score:  9.594503055152632e-232
--------------------------------------------------------------------------------
476
Input: 我期待听到你在这件事上的想法 。
Predicted translation: i m a good a good a good a good a good a 
Real Translation: i look forward to hearing your thoughts on this 

In [73]:
bleuplus/len(eng_val)

1.956246064697348e-156