<a href="https://colab.research.google.com/github/ProfAI/nlp00/blob/master/9%20-%20Seq2Seq%20e%20Machine%20Translation/machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!wget http://www.manythings.org/anki/ita-eng.zip

--2019-04-19 17:55:58--  http://www.manythings.org/anki/ita-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:30::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3981147 (3.8M) [application/zip]
Saving to: ‘ita-eng.zip’


2019-04-19 17:56:04 (12.4 MB/s) - ‘ita-eng.zip’ saved [3981147/3981147]



In [3]:
!unzip ita-eng.zip

Archive:  ita-eng.zip
  inflating: ita.txt                 
  inflating: _about.txt              


In [4]:
with open("ita.txt") as eng_ita_sents:
  eng_ita_sents = eng_ita_sents.read().split("\n")

print("%d esempi\n" % len(eng_ita_sents))
  
for i in range(5):
  print(eng_ita_sents[i])

321434 esempi

Hi.	Ciao!
Run!	Corri!
Run!	Corra!
Run!	Correte!
Who?	Chi?


In [5]:
eng_sents = []
ita_sents = []

for eng_ita_sent in eng_ita_sents:
  eng_ita_sent = eng_ita_sent.split("\t")
  if(len(eng_ita_sent)==2):
    eng_sents.append(eng_ita_sent[0])
    ita_sents.append(eng_ita_sent[1])

print(eng_sents[:5])
print(ita_sents[:5])      

['Hi.', 'Run!', 'Run!', 'Run!', 'Who?']
['Ciao!', 'Corri!', 'Corra!', 'Correte!', 'Chi?']


In [0]:
max_samples = 10000

eng_sents = eng_sents[:max_samples]
ita_sents = ita_sents[:max_samples]

In [7]:
import re

def preprocess(text):
  text = text.lower()
  text = re.sub(r'[^\w\s]','',text)
  text = "SOS "+text+" EOS"
  return text


ita_sents = [preprocess(ita_sent) for ita_sent in ita_sents]
eng_sents = [preprocess(eng_sent) for eng_sent in eng_sents]

print(eng_sents[:5])
print(ita_sents[:5])

['SOS hi EOS', 'SOS run EOS', 'SOS run EOS', 'SOS run EOS', 'SOS who EOS']
['SOS ciao EOS', 'SOS corri EOS', 'SOS corra EOS', 'SOS correte EOS', 'SOS chi EOS']


In [8]:
def get_dictionary(sents):
  
  dictionary = set({})
  
  for sent in sents:
    dictionary|=set(sent.split())
    
  return list(dictionary)


eng_dict = get_dictionary(eng_sents)
ita_dict = get_dictionary(ita_sents)

print("%d parole nel dizionario inglese" % len(eng_dict))
print("%d parole nel dizionario italiano" % len(ita_dict))

1412 parole nel dizionario inglese
3571 parole nel dizionario italiano


In [0]:
eng_dict_rev = {k: v for k, v in zip(eng_dict, range(len(eng_dict)))}
ita_dict_rev = {k: v for k, v in zip(ita_dict, range(len(ita_dict)))}

In [10]:
def encode(sents, dict_rev):
  
  sents_enc = []
  
  for sent in sents:
    sents_enc.append([dict_rev[word] for word in sent.split()])
    
  return sents_enc

X_encoder = encode(ita_sents, ita_dict_rev)
X_decoder = encode(eng_sents, eng_dict_rev)

Y = []

for x in X_decoder:
  Y.append(x[1:])

print(eng_sents[:10])
print(X_decoder[:10])
print(Y[:10])

['SOS hi EOS', 'SOS run EOS', 'SOS run EOS', 'SOS run EOS', 'SOS who EOS', 'SOS wow EOS', 'SOS jump EOS', 'SOS jump EOS', 'SOS jump EOS', 'SOS jump EOS']
[[864, 782, 48], [864, 840, 48], [864, 840, 48], [864, 840, 48], [864, 119, 48], [864, 646, 48], [864, 855, 48], [864, 855, 48], [864, 855, 48], [864, 855, 48]]
[[782, 48], [840, 48], [840, 48], [840, 48], [119, 48], [646, 48], [855, 48], [855, 48], [855, 48], [855, 48]]


In [11]:
from keras.preprocessing.sequence import pad_sequences

X_encoder = pad_sequences(X_encoder)
X_decoder = pad_sequences(X_decoder)
Y = pad_sequences(Y, maxlen=X_decoder.shape[1])

print(X_encoder.shape)
print(X_decoder.shape)
print(Y.shape)

Using TensorFlow backend.


(10000, 9)
(10000, 6)
(10000, 6)


In [13]:
import numpy as np

Y_oh = np.zeros((Y.shape[0], Y.shape[1], len(eng_dict)))

for i in range(len(Y)):
  for j in range(len(Y[i])):
    Y_oh[i][j][Y[i][j]]=1
    
Y_oh.shape

(10000, 6, 1412)

In [14]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

latent_dim = 256
num_encoder_tokens = len(ita_dict)
num_decoder_tokens = len(eng_dict)

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
x = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
x, state_h, state_c = LSTM(latent_dim,
                           return_state=True)(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
x = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
x = LSTM(latent_dim, return_sequences=True)(x, initial_state=encoder_states)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!

Instructions for updating:
Colocations handled automatically by placer.


In [15]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    914176      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    361472      input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

In [16]:
model.fit([X_encoder, X_decoder], Y_oh,
          batch_size=512,
          epochs=100,
          validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epo

<keras.callbacks.History at 0x7f91f5ad5ac8>

In [34]:
ita_sent = "ciao"

ita_sent = [ita_dict_rev[ita_sent]]

ita_sent = pad_sequences([ita_sent], maxlen=9)

ita_sent = np.array([ita_sent])

print(ita_sent.shape)

y = model.predict(ita_sent)

(1, 1, 9)


ValueError: ignored

In [19]:
X_encoder.shape

(10000, 9)