In [204]:
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import  nltk.translate.bleu_score as bleu
import pynlpir

pynlpir.open()

In [2]:
path_to_zip = 'C:\\Users\\user\\Project From Alex NLP\\chi-eng.zip'
path_to_file = os.path.dirname(path_to_zip)+"\\chi-eng\\cmn.txt"
print(path_to_file)

C:\Users\user\Project From Alex NLP\chi-eng\cmn.txt


In [3]:
def preprocess_sentence(sentence):
  sentence = sentence.lower().strip()
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
  sentence = sentence.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
#   sentence = '<start> ' + sentence + ' <end>'
  return sentence

In [4]:
def create_dataset(path, num1,num2):
  english = []
  chinese = []
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

#   word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
  for l in lines[num1:num2]:
      eng = preprocess_sentence(l.split('\t')[0])
      chi = l.split('\t')[1]
      english.append(eng)
      chinese.append(chi)
        
  return english, chinese

In [5]:
en_tr, chi_tr = create_dataset(path_to_file, 0,20000)
print(en_tr[-1])
print(chi_tr[-1])

one man s meat is another man s poison .
甲之蜜糖，乙之砒霜。


In [6]:
len(en_tr),len(chi_tr)

(20000, 20000)

In [7]:
en_val, chi_val = create_dataset(path_to_file,23000,23500)
len(en_val),len(chi_val)

(500, 500)

In [8]:
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    en_tr, target_vocab_size=2**13)

tokenizer_chi = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    chi_tr, target_vocab_size=2**13)

In [9]:
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [7301, 5149, 7332, 6165, 8, 3017, 7263]
The original string: Transformer is awesome.


In [10]:
sample_string = en_tr[5000]

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [14, 567, 3, 990, 320, 1]
The original string: she hit the ball hard .


In [11]:
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

14 ----> she 
567 ----> hit 
3 ----> the 
990 ----> ball 
320 ----> hard
1 ---->  .


In [12]:
sample_string = '很高興認識你'
tokenized_string = tokenizer_chi.encode(sample_string)
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_chi.decode([ts])))

4573 ----> 很高興
575 ----> 認識
9 ----> 你


In [13]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [14]:
def encode(lang1, lang2):
  lang1 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
      lang1.numpy()) + [tokenizer_en.vocab_size+1]

  lang2 = [tokenizer_chi.vocab_size] + tokenizer_chi.encode(
      lang2.numpy()) + [tokenizer_chi.vocab_size+1]
  
  return lang1, lang2

In [15]:
def tf_encode(en, chi):
  result_en, result_chi = tf.py_function(encode, [en, chi], [tf.int64, tf.int64])
  result_en.set_shape([None])
  result_chi.set_shape([None])

  return result_en, result_chi

In [16]:
train_examples = tf.data.Dataset.from_tensor_slices((en_tr, chi_tr))
val_examples = tf.data.Dataset.from_tensor_slices((en_val, chi_val))

In [17]:
MAX_LENGTH = 40

In [18]:
def filter_max_length(x, y, max_length=MAX_LENGTH):
  return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [19]:
train_preprocessed = (
    tf.data.Dataset.from_tensor_slices((en_tr, chi_tr))
    .map(tf_encode) 
    .filter(filter_max_length)
    # cache the dataset to memory to get a speedup while reading from it.
    .cache()
    .shuffle(BUFFER_SIZE))

In [20]:
train_dataset = (train_preprocessed
                 .padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
                 .prefetch(tf.data.experimental.AUTOTUNE))

In [21]:
en_batch, chi_batch = next(iter(train_dataset))
en_batch

<tf.Tensor: shape=(64, 11), dtype=int64, numpy=
array([[7473,    2,  146,  681,  113,  178,    1, 7474,    0,    0,    0],
       [7473,    5,   74,  100,    3, 2713,  369,    1, 7474,    0,    0],
       [7473,    8,   14,  767,    6, 7474,    0,    0,    0,    0,    0],
       [7473,    2,   28,   10,  497,   20, 1935,  102,  467,    1, 7474],
       [7473,    5,   19,    4,  250,  467,    1, 7474,    0,    0,    0],
       [7473,    5,   54,   39, 1504,   32,   29,   38,    6, 7474,    0],
       [7473,    2,   19,    7,  580,    1, 7474,    0,    0,    0,    0],
       [7473,    9,   12,    7, 5211,   17, 4086,   34, 1749,    1, 7474],
       [7473,   87,   12,   25,  393, 2907,    6, 7474,    0,    0,    0],
       [7473,   24,   18,    5,   43,    4,  112,   45,    6, 7474,    0],
       [7473,    4,   24, 6278,   28,    9,   37, 4426,    6, 7474,    0],
       [7473,   21,  116,   85, 3266,    1, 7474,    0,    0,    0,    0],
       [7473,   52,    9,  157,   36,    7,  345,   

In [22]:
chi_batch

<tf.Tensor: shape=(64, 15), dtype=int64, numpy=
array([[7318, 4479, 2511,    1, 7319,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [7318,   55,  357,  680, 7115,  149,  137,    1, 7319,    0,    0,
           0,    0,    0,    0],
       [7318,  732, 2397,    7,    2, 7319,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [7318, 4532, 1614, 4001, 2342,    3,    1, 7319,    0,    0,    0,
           0,    0,    0,    0],
       [7318, 1101,  189, 2901,   81,    1, 7319,    0,    0,    0,    0,
           0,    0,    0,    0],
       [7318,    9, 3610, 2353,    5,  359,   39,    2, 7319,    0,    0,
           0,    0,    0,    0],
       [7318,  822,  693, 1531,    1, 7319,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [7318,  164, 2973, 2142,    4, 5588,  189, 2810,    1, 7319,    0,
           0,    0,    0,    0],
       [7318, 5260, 5817, 7094, 7130, 7136, 5014, 7319,    0,    0,    0,
           0,    0,   

In [23]:
tokenizer_en.vocab_size

7473

In [166]:
# BUFFER_SIZE = 20000 #len(input_tensor_train)
# BATCH_SIZE = 4 #64
steps_per_epoch = 20 #tokenizer_en.vocab_size//BATCH_SIZE
embedding_dim = 512 #4
units =  1024 #64
vocab_inp_size = tokenizer_en.vocab_size + 2
vocab_tar_size = tokenizer_chi.vocab_size + 2

# dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Write the encoder and decoder model

In [25]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [167]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(en_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 11, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [27]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [28]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 11, 1)


In [29]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [30]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 7320)


In [31]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [32]:
checkpoint_dir = './training_checkpoints_nmt_eng_to_chi_attention2'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [33]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([tokenizer_chi.vocab_size] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [34]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.0716












































Epoch 1 Loss 3.2995
Time taken for 1 epoch 167.59760069847107 sec





Epoch 2 Batch 0 Loss 2.9814




Epoch 2 Loss 2.9445
Time taken for 1 epoch 50.488362073898315 sec

Epoch 3 Batch 0 Loss 3.4319




Epoch 3 Loss 2.8228
Time taken for 1 epoch 34.71366786956787 sec

Epoch 4 Batch 0 Loss 2.8427
Epoch 4 Loss 2.7369
Time taken for 1 epoch 37.8781955242157 sec

Epoch 5 Batch 0 Loss 2.6136
Epoch 5 Loss 2.7232
Time taken for 1 epoch 14.747559309005737 sec

Epoch 6 Batch 0 Loss 2.7153
Epoch 6 Loss 2.5724
Time taken for 1 epoch 15.434094667434692 sec

Epoch 7 Batch 0 Loss 1.6461
Epoch 7 Loss 2.5624
Time taken for 1 epoch 54.325352907180786 sec

Epoch 8 Batch 0 Loss 2.2682
Epoch 8 Loss 2.6224
Time taken for 1 epoch 2.8304975032806396 sec

Epoch 9 Batch 0 Loss 2.3209
Epoch 9 Loss 2.4120
Time taken for 1 epoch 16.987568855285645 sec

Epoch 10 Batch 0 Loss 2.7229
Epoch 10 Loss 2.4676
Time taken for 1 epoch 10.2025625705719 sec



In [None]:
max_length_inp = en_batch.shape[1]
max_length_targ = chi_batch.shape[1]

In [216]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  start_token = [tokenizer_en.vocab_size]
  end_token = [tokenizer_en.vocab_size + 1]

  inputs = start_token + tokenizer_en.encode(sentence) + end_token 
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([tokenizer_chi.vocab_size], 0)


  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()


    if predicted_id == tokenizer_chi.vocab_size+1:
      return result, sentence, attention_plot
    else:
      result += tokenizer_chi.decode([predicted_id])


    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)
    

  return result, sentence, attention_plot

In [187]:
evaluate("hi my name is tom")

('我想鋼琴',
 'hi my name is tom',
 array([[0.0921368 , 0.08826993, 0.09777708, 0.08960067, 0.08710279,
         0.08661309, 0.09152686, 0.09200553, 0.09192359, 0.09165885,
         0.09138484],
        [0.09069231, 0.09006741, 0.09123293, 0.09063593, 0.09081227,
         0.09101428, 0.09092752, 0.09116804, 0.09119379, 0.09115257,
         0.09110302],
        [0.0916662 , 0.08978347, 0.09438039, 0.09043141, 0.08919358,
         0.08930304, 0.09131841, 0.09124721, 0.09108305, 0.0908873 ,
         0.09070589],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [

In [201]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [208]:
def translate(sentence):
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

  segmentation = pynlpir.segment(result, pos_tagging=False)

#   attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
#   plot_attention(attention_plot, sentence.split(' '), result.split(' '))
#   if plot:
#       attention_plot = attention_plot[:len(segmentation), :len(sentence.split(' '))]
#       plot_attention(attention_plot, sentence.split(' '), segmentation)

  return result

In [120]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x241956dc648>

In [205]:
pynlpir.segment("我想鋼琴", pos_tagging=False)

['我', '想', '鋼', '琴']

In [209]:
translate('how are you?')

Input: how are you ?
Predicted translation: 我想鋼琴


'我想鋼琴'

In [190]:
translate('what did you eat for breakfast?')

Input: what did you eat for breakfast ?
Predicted translation: 我想鋼琴


'我想鋼琴'

In [191]:
translate('did you finish your homework?')

Input: did you finish your homework ?
Predicted translation: 我想鋼琴


'我想鋼琴'

In [192]:
translate('nice to meet you')

Input: nice to meet you
Predicted translation: 我想鋼琴


'我想鋼琴'

In [193]:
translate("I am hungry right now")

Input: i am hungry right now
Predicted translation: 我想鋼琴


'我想鋼琴'

In [194]:
translate("I love you")

Input: i love you
Predicted translation: 我想鋼琴


'我想鋼琴'

In [195]:
translate('Where are you going?')

Input: where are you going ?
Predicted translation: 我想鋼琴


'我想鋼琴'

In [None]:
# questions_val, answers_val = load_conversations(80001,80500)
# len(questions_val),len(answers_val)

(1634, 1634)

In [212]:
len(en_val)

497

In [214]:
bleuplus = 0
i = 0

for en_, chi_ in zip(en_val,chi_val):
    print(i)
    predicted = translate(en_)
    two_ = pynlpir.segment(predicted, pos_tagging=False)
    one_ = pynlpir.segment(chi_, pos_tagging=False)
    print('Real Translation: {}'.format(chi_))
    bleu_ = bleu.sentence_bleu([one_], two_)
    print("BLEU Score: ", bleu_)
    print("----" *20)

    bleuplus += bleu_
    i += 1

0
Input: what would you do if you had ten thousand dollars ?
Predicted translation: 我想鋼琴
Real Translation: 如果你有一万美元，你想做什么呢？
BLEU Score:  1.7434293862051595e-232
--------------------------------------------------------------------------------
1
Input: when i hear that song , i remember my younger days .
Predicted translation: 我想鋼琴
Real Translation: 每次听到这首歌，都会让我回忆起自己年轻的时候。
BLEU Score:  3.8901167815143597e-233
--------------------------------------------------------------------------------
2
Input: when was the last time you spent time on facebook ?
Predicted translation: 我想鋼琴
Real Translation: 你上一次用Facebook是什麼時候？
BLEU Score:  0
--------------------------------------------------------------------------------
3
Input: you are no better at remembering things than i am .
Predicted translation: 我想鋼琴
Real Translation: 你记事情的能力并不比我好多少。
BLEU Score:  1.7434293862051595e-232
--------------------------------------------------------------------------------
4
Input: you don t seem to be as afraid as t

Real Translation: 他说他没有进房间，那是骗人的。
BLEU Score:  0
--------------------------------------------------------------------------------
38
Input: he went to italy for the purpose of studying music .
Predicted translation: 我想鋼琴
Real Translation: 他去義大利的目的是學習音樂。
BLEU Score:  0
--------------------------------------------------------------------------------
39
Input: he s getting along well with all of his classmates .
Predicted translation: 我想鋼琴
Real Translation: 他和他所有的同学相处融洽。
BLEU Score:  0
--------------------------------------------------------------------------------
40
Input: her parents can t help worrying about her injuries .
Predicted translation: 我想鋼琴
Real Translation: 她的父母不能不担心她的伤势。
BLEU Score:  0
--------------------------------------------------------------------------------
41
Input: how long has it been since you saw your girlfriend ?
Predicted translation: 我想鋼琴
Real Translation: 你跟你的女朋友见面以来有多久了？
BLEU Score:  0
----------------------------------------------------------------------

Real Translation: 学习外语里的俚语很有趣。
BLEU Score:  0
--------------------------------------------------------------------------------
77
Input: let go of my arm ! i can t stand people touching me .
Predicted translation: 我想鋼琴
Real Translation: 放開我的手臂！我不能忍受有人碰我。
BLEU Score:  1.057443375777407e-232
--------------------------------------------------------------------------------
78
Input: life in prison is worse than the life of an animal .
Predicted translation: 我想鋼琴
Real Translation: 监狱生活比做畜牲还不如。
BLEU Score:  0
--------------------------------------------------------------------------------
79
Input: many schools are closed today because of the storm .
Predicted translation: 我想鋼琴
Real Translation: 今天许多学校因为风暴停课。
BLEU Score:  0
--------------------------------------------------------------------------------
80
Input: many young people in japan eat bread for breakfast .
Predicted translation: 我想鋼琴
Real Translation: 許多日本的年輕人吃麵包當作早餐。
BLEU Score:  0
--------------------------------------------------

Predicted translation: 我想鋼琴
Real Translation: 这家公司在巴黎证券交易所上市了。
BLEU Score:  0
--------------------------------------------------------------------------------
116
Input: this guidebook might be of use to you on your trip .
Predicted translation: 我想鋼琴
Real Translation: 这本导游册子或许会对你的旅行有帮助。
BLEU Score:  0
--------------------------------------------------------------------------------
117
Input: to the best of my knowledge , the rumor is not true .
Predicted translation: 我想鋼琴
Real Translation: 据我所知，这个谣言不是真的。
BLEU Score:  1.7434293862051595e-232
--------------------------------------------------------------------------------
118
Input: tom can t decide whether to buy a toyota or a ford .
Predicted translation: 我想鋼琴
Real Translation: 汤姆不能决定买丰田还是福特。
BLEU Score:  0
--------------------------------------------------------------------------------
119
Input: tom didn t think that anybody would recognize mary .
Predicted translation: 我想鋼琴
Real Translation: 汤姆不认为有人会认出玛丽。
BLEU Score:  0
------------

Real Translation: 不要逐字逐句的把英文翻譯成日文。
BLEU Score:  0
--------------------------------------------------------------------------------
156
Input: everybody except tom knew he didn t need to do that .
Predicted translation: 我想鋼琴
Real Translation: 除了汤姆以外任何人都知道他没必要这么做。
BLEU Score:  0
--------------------------------------------------------------------------------
157
Input: flies and mosquitoes interfered with his meditation .
Predicted translation: 我想鋼琴
Real Translation: 蒼蠅和蚊子干擾了他的冥想。
BLEU Score:  0
--------------------------------------------------------------------------------
158
Input: fluency in english is a very marketable skill today .
Predicted translation: 我想鋼琴
Real Translation: 流利的英語在今天是一種十分搶手的技能。
BLEU Score:  0
--------------------------------------------------------------------------------
159
Input: foreign investors withdrew their money from america .
Predicted translation: 我想鋼琴
Real Translation: 外国投资者从美国收回他们的钱。
BLEU Score:  0
---------------------------------------------------

Real Translation: 闭着你的眼睛直到我告诉你睁开为止。
BLEU Score:  1.3577841712062768e-232
--------------------------------------------------------------------------------
196
Input: let s meet at the station at eight tomorrow morning .
Predicted translation: 我想鋼琴
Real Translation: 明早八点车站见。
BLEU Score:  0
--------------------------------------------------------------------------------
197
Input: london , where i live , used to be famous for its fog .
Predicted translation: 我想鋼琴
Real Translation: 我住的地方－倫敦，從前以霧聞名。
BLEU Score:  8.235377291091144e-233
--------------------------------------------------------------------------------
198
Input: lots of people in japan are indifferent to politics .
Predicted translation: 我想鋼琴
Real Translation: 很多日本人对政治冷淡。
BLEU Score:  0
--------------------------------------------------------------------------------
199
Input: my driving instructor says i should be more patient .
Predicted translation: 我想鋼琴
Real Translation: 我的駕駛教練說我應該更有耐心一點。
BLEU Score:  6.413718283190248e-233

Real Translation: 汤姆本可以用更好的方式处理问题。
BLEU Score:  0
--------------------------------------------------------------------------------
236
Input: tom donates half his salary to his favorite charity .
Predicted translation: 我想鋼琴
Real Translation: 汤姆捐一半他的薪水给他最喜欢的慈善机构。
BLEU Score:  0
--------------------------------------------------------------------------------
237
Input: tom has contributed a lot of money to this hospital .
Predicted translation: 我想鋼琴
Real Translation: 湯姆捐獻了很多錢給這間醫院。
BLEU Score:  0
--------------------------------------------------------------------------------
238
Input: we must take into account the fact that he is young .
Predicted translation: 我想鋼琴
Real Translation: 我们必须考虑到他很年轻。
BLEU Score:  0
--------------------------------------------------------------------------------
239
Input: we take health for granted until illness intervenes .
Predicted translation: 我想鋼琴
Real Translation: 我們視健康為理所當然, 直到疾病侵襲。
BLEU Score:  6.413718283190248e-233
--------------------------------

Real Translation: 我是倒数第二个得知这个坏消息的。
BLEU Score:  1.7434293862051595e-232
--------------------------------------------------------------------------------
276
Input: i worked in a post office during the summer vacation .
Predicted translation: 我想鋼琴
Real Translation: 暑假期间，我在邮局工作。
BLEU Score:  4.739132419722992e-232
--------------------------------------------------------------------------------
277
Input: i d like to try this on . where are the fitting rooms ?
Predicted translation: 我想鋼琴
Real Translation: 我想试试这件。请问哪里是试衣间？
BLEU Score:  1.6562239472029865e-155
--------------------------------------------------------------------------------
278
Input: i ll arrange for someone to pick you up at your home .
Predicted translation: 我想鋼琴
Real Translation: 我会安排一下，叫个人去你家接你。
BLEU Score:  1.057443375777407e-232
--------------------------------------------------------------------------------
279
Input: i m now rich enough to afford to get anything i want .
Predicted translation: 我想鋼琴
Real Translation:

Real Translation: 警方會把你關在牢裡二十年。
BLEU Score:  0
--------------------------------------------------------------------------------
315
Input: the prince fell in love with a woodcutter s daughter .
Predicted translation: 我想鋼琴
Real Translation: 王子愛上了一個樵夫的女兒。
BLEU Score:  0
--------------------------------------------------------------------------------
316
Input: the teacher demonstrated the idea with an experiment .
Predicted translation: 我想鋼琴
Real Translation: 这位老师用试验论证了这个想法。
BLEU Score:  0
--------------------------------------------------------------------------------
317
Input: the teacher lined the children up in order of height .
Predicted translation: 我想鋼琴
Real Translation: 老师按照身高给孩子们排队。
BLEU Score:  0
--------------------------------------------------------------------------------
318
Input: there are many beautiful castles in northern germany .
Predicted translation: 我想鋼琴
Real Translation: 在德國北部有很多美麗的城堡？
BLEU Score:  0
--------------------------------------------------------------

Input: besides the piano , can you play any other instruments ?
Predicted translation: 我想鋼琴
Real Translation: 除了钢琴以外，还会玩什么乐器吗？
BLEU Score:  0
--------------------------------------------------------------------------------
355
Input: between you and me , i don t like our new team captain .
Predicted translation: 我想鋼琴
Real Translation: 我就只告诉你，我不喜欢我们的新队长。
BLEU Score:  1.057443375777407e-232
--------------------------------------------------------------------------------
356
Input: could you possibly suggest a way to solve the problem ?
Predicted translation: 我想鋼琴
Real Translation: 你能提出解决问题的方法吗？
BLEU Score:  0
--------------------------------------------------------------------------------
357
Input: do you have an extra english dictionary by any chance ?
Predicted translation: 我想鋼琴
Real Translation: 您碰巧有一本額外的英文字典嗎？
BLEU Score:  0
--------------------------------------------------------------------------------
358
Input: don t throw away this magazine . i haven t read it yet .
Predicted t

Real Translation: 出去之前确认一下所有的灯都关了。
BLEU Score:  0
--------------------------------------------------------------------------------
394
Input: mary has a bad back . it s hard for her to lift things .
Predicted translation: 我想鋼琴
Real Translation: 瑪麗的背不好, 她很難把東西舉起來。
BLEU Score:  0
--------------------------------------------------------------------------------
395
Input: mary said that she loved me , but i didn t believe her .
Predicted translation: 我想鋼琴
Real Translation: 玛丽说她爱我，但是我并不相信她所说的话。
BLEU Score:  4.995008821347951e-233
--------------------------------------------------------------------------------
396
Input: modern technology has made our lives more comfortable .
Predicted translation: 我想鋼琴
Real Translation: 现代科学技术让我们的生活更加舒适。
BLEU Score:  0
--------------------------------------------------------------------------------
397
Input: more than percent of the students go to university .
Predicted translation: 我想鋼琴
Real Translation: 40%以上的学生去读大学。
BLEU Score:  0
----------------------

Real Translation: 你是不是有能力做到你想要完成的所有事情呢？
BLEU Score:  6.413718283190248e-233
--------------------------------------------------------------------------------
434
Input: when i first came back to boston , i was really lonely .
Predicted translation: 我想鋼琴
Real Translation: 当我第一次回波士顿时，我真是孤独。
BLEU Score:  1.7434293862051595e-232
--------------------------------------------------------------------------------
435
Input: who is in charge of the office while the boss is away ?
Predicted translation: 我想鋼琴
Real Translation: 在主任不在的时候，办公室由谁来负责？
BLEU Score:  0
--------------------------------------------------------------------------------
436
Input: why don t you wait here while i finish what i m doing ?
Predicted translation: 我想鋼琴
Real Translation: 为什么你不在这里等到我做完为止？
BLEU Score:  1.7434293862051595e-232
--------------------------------------------------------------------------------
437
Input: with your approval , i would like to offer him the job .
Predicted translation: 我想鋼琴
Real Translation: 如果您

Real Translation: 我不知道把拐杖放哪去了，你有看到嗎？
BLEU Score:  8.235377291091144e-233
--------------------------------------------------------------------------------
472
Input: i don t want to go there . he doesn t want to go either .
Predicted translation: 我想鋼琴
Real Translation: 我不想去那儿，他也不想。
BLEU Score:  2.6621681380487264e-232
--------------------------------------------------------------------------------
473
Input: i felt so sleepy that i could hardly keep my eyes open .
Predicted translation: 我想鋼琴
Real Translation: 我感觉困得几乎不能睁开我的眼睛了。
BLEU Score:  1.057443375777407e-232
--------------------------------------------------------------------------------
474
Input: i found a good place to buy fruit a couple of days ago .
Predicted translation: 我想鋼琴
Real Translation: 前几天我发现了一个买水果的好地方。
BLEU Score:  1.3577841712062768e-232
--------------------------------------------------------------------------------
475
Input: i know that it is highly unlikely that anyone knows me .
Predicted translation: 我想鋼琴
Real 

In [215]:
bleuplus/len(en_val)

2.7883096473415878e-157