<a href="https://colab.research.google.com/github/ProfAI/nlp00/blob/master/9%20-%20Seq2Seq%20e%20Machine%20Translation/machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget http://www.manythings.org/anki/ita-eng.zip

--2019-04-19 13:25:15--  http://www.manythings.org/anki/ita-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:30::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3981147 (3.8M) [application/zip]
Saving to: ‘ita-eng.zip’


2019-04-19 13:25:16 (13.7 MB/s) - ‘ita-eng.zip’ saved [3981147/3981147]



In [2]:
!unzip ita-eng.zip

Archive:  ita-eng.zip
  inflating: ita.txt                 
  inflating: _about.txt              


In [1]:
with open("ita.txt") as eng_ita_sents:
  eng_ita_sents = eng_ita_sents.read().split("\n")

print("%d esempi\n" % len(eng_ita_sents))
  
for i in range(5):
  print(eng_ita_sents[i])

321434 esempi

Hi.	Ciao!
Run!	Corri!
Run!	Corra!
Run!	Correte!
Who?	Chi?


In [2]:
eng_sents = []
ita_sents = []

for eng_ita_sent in eng_ita_sents:
  eng_ita_sent = eng_ita_sent.split("\t")
  if(len(eng_ita_sent)==2):
    eng_sents.append(eng_ita_sent[0])
    ita_sents.append(eng_ita_sent[1])

print(eng_sents[:5])
print(ita_sents[:5])      

['Hi.', 'Run!', 'Run!', 'Run!', 'Who?']
['Ciao!', 'Corri!', 'Corra!', 'Correte!', 'Chi?']


In [3]:
import re

def preprocess(text):
  text = text.lower()
  text = re.sub(r'[^\w\s]','',text)
  text = "SOS "+text+" EOS"
  return text


ita_sents = [preprocess(ita_sent) for ita_sent in ita_sents]
eng_sents = [preprocess(eng_sent) for eng_sent in eng_sents]

print(eng_sents[:5])
print(ita_sents[:5])

['SOS hi EOS', 'SOS run EOS', 'SOS run EOS', 'SOS run EOS', 'SOS who EOS']
['SOS ciao EOS', 'SOS corri EOS', 'SOS corra EOS', 'SOS correte EOS', 'SOS chi EOS']


In [4]:
def get_dictionary(sents):
  
  dictionary = set({})
  
  for sent in sents:
    dictionary|=set(sent.split())
    
  return list(dictionary)


eng_dict = get_dictionary(eng_sents)
ita_dict = get_dictionary(ita_sents)

print("%d parole nel dizionario inglese" % len(eng_dict))
print("%d parole nel dizionario italiano" % len(ita_dict))

13437 parole nel dizionario inglese
27162 parole nel dizionario italiano


In [0]:
eng_dict_rev = {k: v for k, v in zip(eng_dict, range(len(eng_dict)))}
ita_dict_rev = {k: v for k, v in zip(ita_dict, range(len(ita_dict)))}

In [6]:
def encode(sents, dict_rev):
  
  sents_enc = []
  
  for sent in sents:
    sents_enc.append([dict_rev[word] for word in sent.split()])
    
  return sents_enc

X_encoder = encode(ita_sents, ita_dict_rev)
X_decoder = encode(eng_sents, eng_dict_rev)

Y = []

for x in X_decoder:
  Y.append(x[1:])

print(eng_sents[:10])
print(X_decoder[:10])
print(Y[:10])

['SOS hi EOS', 'SOS run EOS', 'SOS run EOS', 'SOS run EOS', 'SOS who EOS', 'SOS wow EOS', 'SOS jump EOS', 'SOS jump EOS', 'SOS jump EOS', 'SOS jump EOS']
[[465, 4452, 463], [465, 5408, 463], [465, 5408, 463], [465, 5408, 463], [465, 1118, 463], [465, 9211, 463], [465, 4860, 463], [465, 4860, 463], [465, 4860, 463], [465, 4860, 463]]
[[4452, 463], [5408, 463], [5408, 463], [5408, 463], [1118, 463], [9211, 463], [4860, 463], [4860, 463], [4860, 463], [4860, 463]]


In [0]:
import numpy as np

Y_oh = np.zeros((len(Y), len(eng_dict)))

for y in Y[0]:
  y_oh[y] = 1
  Y_oh.append(y_oh)
  
Y_oh.shape

In [7]:
from keras.preprocessing.sequence import pad_sequences

X_encoder = pad_sequences(X_encoder)
X_decoder = pad_sequences(X_decoder)

Using TensorFlow backend.


In [8]:
print(X_encoder.shape)
print(X_decoder.shape)

(321433, 51)
(321433, 49)


In [0]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

latent_dim = 256
num_encoder_tokens = len(ita_dict)
num_decoder_tokens = len(eng_dict)

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
x = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
x, state_h, state_c = LSTM(latent_dim,
                           return_state=True)(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
x = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
x = LSTM(latent_dim, return_sequences=True)(x, initial_state=encoder_states)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!

In [88]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 256)    6953472     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 256)    3439872     input_6[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LS

In [91]:
model.fit([X_encoder, X_decoder], Y,
          batch_size=512,
          epochs=100,
          validation_split=0.2)

ValueError: ignored