<a href="https://colab.research.google.com/github/victorm0202/OP-2020-code/blob/master/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent, Embedding
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam, RMSprop
from nltk import FreqDist
import numpy as np
import os
import datetime
from keras.preprocessing.sequence import pad_sequences
import sys
import argparse

In [None]:
def load_data(source, dist, max_len, vocab_size):
    """Read a parallel corpus and encode both sides as vocabulary indices.

    Line ``n`` of ``source`` is paired with line ``n`` of ``dist``. A pair is
    kept only when both raw lines are non-empty and at most ``max_len``
    characters long; the limit is applied to the raw text, before
    tokenization. Source sentences are tokenized and reversed, target
    sentences are tokenized in their original order.

    Each side gets its own vocabulary: index 0 holds the placeholder 'ZERO',
    then come the ``vocab_size - 1`` most frequent words, and the last index
    holds 'UNK', which stands in for every word outside the vocabulary.

    Args:
        source: Path of the source-language text file, one sentence per line.
        dist: Path of the target-language text file, line-aligned with
            ``source``.
        max_len: Maximum number of characters allowed in a raw line.
        vocab_size: The ``vocab_size - 1`` most frequent words of each side
            are kept.

    Returns:
        The tuple ``(X, X_vocab_len, X_word_to_ix, X_ix_to_word,
        y, y_vocab_len, y_word_to_ix, y_ix_to_word)`` where ``X`` and ``y``
        are lists of lists of word indices, ``*_vocab_len`` counts the kept
        words plus the 'ZERO' and 'UNK' entries, ``*_word_to_ix`` is a dict
        and ``*_ix_to_word`` is the matching list.
    """
    # Context managers close the files even when reading fails.
    with open(source, 'r') as source_file:
        source_lines = source_file.read().split('\n')
    with open(dist, 'r') as target_file:
        target_lines = target_file.read().split('\n')

    # One pass over the aligned lines keeps X and y in lockstep.
    X = []
    y = []
    for source_line, target_line in zip(source_lines, target_lines):
        if 0 < len(source_line) <= max_len and 0 < len(target_line) <= max_len:
            X.append(text_to_word_sequence(source_line)[::-1])
            y.append(text_to_word_sequence(target_line))

    X_vocab_len, X_word_to_ix, X_ix_to_word = _index_sentences(X, vocab_size)
    y_vocab_len, y_word_to_ix, y_ix_to_word = _index_sentences(y, vocab_size)
    return (X, X_vocab_len, X_word_to_ix, X_ix_to_word,
            y, y_vocab_len, y_word_to_ix, y_ix_to_word)


def _index_sentences(sentences, vocab_size):
    """Build a vocabulary for ``sentences`` and replace words by indices in place.

    Returns ``(vocab_len, word_to_ix, ix_to_word)``.
    """
    # Counting words straight from the nested lists also works when the corpus
    # is empty or when a sentence has no words, unlike stacking into an array.
    frequencies = FreqDist(word for sentence in sentences for word in sentence)
    vocab = frequencies.most_common(vocab_size - 1)

    ix_to_word = ['ZERO'] + [word for word, _ in vocab] + ['UNK']
    word_to_ix = {word: ix for ix, word in enumerate(ix_to_word)}

    unk_ix = word_to_ix['UNK']
    for sentence in sentences:
        sentence[:] = [word_to_ix.get(word, unk_ix) for word in sentence]
    return len(vocab) + 2, word_to_ix, ix_to_word

def load_test_data(source, X_word_to_ix, max_len):
    """Read test sentences and encode them with an existing vocabulary.

    Lines that are empty or longer than ``max_len`` characters are skipped
    (the limit is applied to the raw text, before tokenization). Every kept
    line is tokenized, reversed, and each word is replaced by its index in
    ``X_word_to_ix``; words missing from the mapping get the 'UNK' index.

    Args:
        source: Path of the text file, one sentence per line.
        X_word_to_ix: Dict mapping words to indices; must contain 'UNK' if
            any word of the file is missing from it.
        max_len: Maximum number of characters allowed in a raw line.

    Returns:
        A list with one list of word indices per kept line.
    """
    # The context manager closes the file even when reading fails.
    with open(source, 'r') as source_file:
        lines = source_file.read().split('\n')

    X = []
    for line in lines:
        if 0 < len(line) <= max_len:
            words = text_to_word_sequence(line)[::-1]
            X.append([
                X_word_to_ix[word] if word in X_word_to_ix else X_word_to_ix['UNK']
                for word in words
            ])
    return X

def find_checkpoint_file(folder):
    """Return the name of the most recently modified checkpoint file in ``folder``.

    A checkpoint is any regular file whose name contains 'checkpoint'.
    Directories are ignored, so a folder such as '.ipynb_checkpoints' is
    never returned.

    Args:
        folder: Directory to search.

    Returns:
        The file name (without the folder part) of the newest checkpoint, or
        an empty list when there is none. The empty list is kept because the
        training cells test ``len(saved_weights)``.
    """
    candidates = [
        name for name in os.listdir(folder)
        if 'checkpoint' in name and os.path.isfile(os.path.join(folder, name))
    ]
    if not candidates:
        return []
    # os.listdir yields bare names, so the folder must be joined back before
    # asking for the modification time.
    return max(
        candidates,
        key=lambda name: os.path.getmtime(os.path.join(folder, name)),
    )

def process_data(word_sentences, max_len, word_to_ix):
    """One-hot encode sequences of word indices.

    Args:
        word_sentences: Sequence of sentences, each a sequence of integer
            word indices.
        max_len: Number of time steps allotted to every sentence.
        word_to_ix: Vocabulary mapping; only its size is used, as the width
            of the one-hot axis.

    Returns:
        A float array of shape ``(len(word_sentences), max_len,
        len(word_to_ix))`` holding 1.0 at ``[sentence, position, index]`` for
        every index in the input and 0.0 elsewhere.
    """
    vocab_len = len(word_to_ix)
    one_hot = np.zeros((len(word_sentences), max_len, vocab_len))
    for row, token_ids in enumerate(word_sentences):
        for step, token_id in enumerate(token_ids):
            # Switch on the single vocabulary slot of this time step
            one_hot[row, step, token_id] = 1.
    return one_hot

In [None]:
# Run configuration shared by the cells below.

# Passed to load_data / load_test_data as the maximum raw line length
# (in characters) for a sentence to be kept.
MAX_LEN = 200
# Passed to load_data; the VOCAB_SIZE - 1 most frequent words of each language are kept.
VOCAB_SIZE = 20000
# batch_size handed to model.fit.
BATCH_SIZE = 100
# Number of stacked decoder LSTM layers.
LAYER_NUM = 3
# Number of units in every LSTM layer.
HIDDEN_DIM = 1000
# Last epoch number of the training loop (epochs are counted from 1).
NB_EPOCH = 20
# 'train' runs the training loop; any other value runs the prediction branch.
MODE = 'train'

In [None]:
# Read the English (input) and Finnish (output) sides of the corpus and get,
# for each side, the encoded sentences, the vocabulary length and both
# word <-> index mappings
(X, X_vocab_len, X_word_to_ix, X_ix_to_word,
 y, y_vocab_len, y_word_to_ix, y_ix_to_word) = load_data(
    '../data/europarl-v8.fi-en.en',
    '../data/europarl-v8.fi-en.fi',
    MAX_LEN,
    VOCAB_SIZE,
)

In [None]:
# Number of words in the longest input sentence and in the longest output sentence
X_max_len = max(len(sentence) for sentence in X)
y_max_len = max(len(sentence) for sentence in y)

In [None]:
# Bring every sequence to the length of the longest one on its side.
# Zeros are added in front of shorter sequences ('pre' is the pad_sequences default).
X = pad_sequences(X, maxlen=X_max_len, dtype='int32', padding='pre', truncating='pre')
y = pad_sequences(y, maxlen=y_max_len, dtype='int32', padding='pre', truncating='pre')

In [None]:
# Creating the network model
#model = create_model(X_vocab_len, X_max_len, y_vocab_len, y_max_len, HIDDEN_DIM, LAYER_NUM)

In [None]:
# Sequence-to-sequence model: an encoder turns the input sentence into one
# vector, which is repeated once per output time step and decoded by a stack
# of LSTM layers into a distribution over the output vocabulary.
hidden_size = HIDDEN_DIM
num_layers = LAYER_NUM
model = Sequential()
# Creating encoder network
# NOTE(review): the embedding width 1000 is a literal; it matches HIDDEN_DIM
# today but is not tied to it -- confirm whether that is intended.
model.add(Embedding(X_vocab_len, 1000, input_length=X_max_len, mask_zero=True))
# Without return_sequences the LSTM emits only its final output: one vector per sentence
model.add(LSTM(hidden_size))
# Repeat that vector y_max_len times so the decoder receives one copy per output step
model.add(RepeatVector(y_max_len))

# Creating decoder network
for _ in range(num_layers):
    model.add(LSTM(hidden_size, return_sequences=True))
# Project every time step onto the output vocabulary, then normalize with softmax
model.add(TimeDistributed(Dense(y_vocab_len)))
model.add(Activation('softmax'))
# categorical_crossentropy pairs with the one-hot targets that the training
# loop builds with process_data
model.compile(loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'])

In [None]:
# Print the layer-by-layer table of output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 47, 1000)          20001000  
_________________________________________________________________
lstm (LSTM)                  (None, 1000)              8004000   
_________________________________________________________________
repeat_vector (RepeatVector) (None, 45, 1000)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 45, 1000)          8004000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 45, 1000)          8004000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 45, 1000)          8004000   
_________________________________________________________________
time_distributed (TimeDistri (None, 45, 20001)         2

In [None]:
# Finding trained weights of previous epoch if any
# The checkpoint lookup is disabled here: with the empty list below, the
# training cells see len(saved_weights) == 0 and start from epoch 1.
#saved_weights = find_checkpoint_file('.')
saved_weights = []
#len(saved_weights)

In [None]:
#checkpoint_file = [f for f in os.listdir('.') if 'checkpoint' in f]
#os.listdir('.')

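No se completó el entrenamiento por falta de tiempo; ustedes pueden terminar de entrenarlo. Tarda bastante: en la salida de abajo cada bloque de 1000 muestras toma unos 50 s, así que una sola época (1 415 032 muestras) requiere cerca de 20 horas.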

In [None]:
k_start = 1

# Resume from a checkpoint when one was located: the epoch number is the text
# between the last '_' and the last '.' of the file name
if saved_weights:
    print('[INFO] Saved weights found, loading...')
    epoch = saved_weights[saved_weights.rfind('_') + 1:saved_weights.rfind('.')]
    model.load_weights(saved_weights)
    k_start = int(epoch) + 1

i_end = 0
for k in range(k_start, NB_EPOCH + 1):
    # One shared random permutation per epoch keeps inputs and targets aligned
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    # Each fit() call receives a chunk of up to 1000 sequences; the targets
    # are one-hot encoded chunk by chunk
    for i in range(0, len(X), 1000):
        i_end = min(i + 1000, len(X))
        y_sequences = process_data(y[i:i_end], y_max_len, y_word_to_ix)

        print('[INFO] Training model: epoch {}th {}/{} samples'.format(k, i, len(X)))
        model.fit(X[i:i_end], y_sequences, batch_size=BATCH_SIZE, epochs=1, verbose=2)
    # One weights file per finished epoch
    model.save_weights('checkpoint_epoch_{}.hdf5'.format(k))


[INFO] Training model: epoch 1th 0/1415032 samples
10/10 - 50s - loss: 4.4504 - accuracy: 0.6457
[INFO] Training model: epoch 1th 1000/1415032 samples
10/10 - 52s - loss: 2.6989 - accuracy: 0.7262
[INFO] Training model: epoch 1th 2000/1415032 samples
10/10 - 53s - loss: 2.7249 - accuracy: 0.7244
[INFO] Training model: epoch 1th 3000/1415032 samples
10/10 - 52s - loss: 2.7007 - accuracy: 0.7273
[INFO] Training model: epoch 1th 4000/1415032 samples
10/10 - 52s - loss: 2.6994 - accuracy: 0.7231
[INFO] Training model: epoch 1th 5000/1415032 samples
10/10 - 49s - loss: 2.5946 - accuracy: 0.7303
[INFO] Training model: epoch 1th 6000/1415032 samples
10/10 - 48s - loss: 2.5947 - accuracy: 0.7335
[INFO] Training model: epoch 1th 7000/1415032 samples
10/10 - 49s - loss: 2.6285 - accuracy: 0.7217
[INFO] Training model: epoch 1th 8000/1415032 samples
10/10 - 49s - loss: 2.5548 - accuracy: 0.7310
[INFO] Training model: epoch 1th 9000/1415032 samples
10/10 - 49s - loss: 2.5653 - accuracy: 0.7244
[IN

In [None]:
# Train when MODE is 'train'; otherwise translate the sentences of the file
# 'test' with previously saved weights.
if MODE == 'train':
    k_start = 1

    # If any trained weight was found, load it and continue with the epoch
    # after the one encoded in the file name (between the last '_' and '.')
    if len(saved_weights) != 0:
        print('[INFO] Saved weights found, loading...')
        epoch = saved_weights[saved_weights.rfind('_')+1:saved_weights.rfind('.')]
        model.load_weights(saved_weights)
        k_start = int(epoch) + 1

    i_end = 0
    for k in range(k_start, NB_EPOCH+1):
        # Shuffling the training data every epoch; the same permutation is
        # applied to X and y so the sentence pairs stay aligned
        indices = np.arange(len(X))
        np.random.shuffle(indices)
        X = X[indices]
        y = y[indices]

        # Training 1000 sequences at a time; the targets are one-hot encoded
        # chunk by chunk
        for i in range(0, len(X), 1000):
            if i + 1000 >= len(X):
                i_end = len(X)
            else:
                i_end = i + 1000
            y_sequences = process_data(y[i:i_end], y_max_len, y_word_to_ix)

            print('[INFO] Training model: epoch {}th {}/{} samples'.format(k, i, len(X)))
            # 'epochs' is the keyword used by the other training cell of this
            # notebook; the older 'nb_epoch' spelling is not used there
            model.fit(X[i:i_end], y_sequences, batch_size=BATCH_SIZE, epochs=1, verbose=2)
        # One weights file per finished epoch
        model.save_weights('checkpoint_epoch_{}.hdf5'.format(k))

# Performing test if we chose test mode
else:
    # Only performing test if there is any saved weights
    if len(saved_weights) == 0:
        print("The network hasn't been trained! Program will exit...")
        sys.exit()
    else:
        X_test = load_test_data('test', X_word_to_ix, MAX_LEN)
        X_test = pad_sequences(X_test, maxlen=X_max_len, dtype='int32')
        model.load_weights(saved_weights)

        # Most probable vocabulary index at every output time step
        predictions = np.argmax(model.predict(X_test), axis=2)
        sequences = []
        for prediction in predictions:
            # y_ix_to_word is a list, so it is indexed rather than called;
            # index 0 is skipped when rebuilding the sentence
            sequence = ' '.join([y_ix_to_word[index] for index in prediction if index > 0])
            print(sequence)
            sequences.append(sequence)
        np.savetxt('test_result', sequences, fmt='%s')