In [22]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import regex as re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

In [7]:
normal = pd.read_csv('./data/sentence-aligned.v2/normal.aligned',sep='\t',header=None)
normal.head()

Unnamed: 0,0,1,2
0,"Cherokee, Oklahoma",0,It is the county seat of Alfalfa County .
1,"Cherokee, Oklahoma",0,"Cherokee is a city in Alfalfa County , Oklahom..."
2,Skateboard,5,Skateboard decks are usually between 28 and 33...
3,Skateboard,5,The underside of the deck can be printed with ...
4,Skateboard,6,This was created by two surfers ; Ben Whatson ...


In [8]:
simple = pd.read_csv('./data/sentence-aligned.v2/simple.aligned',sep='\t',header=None)
simple.head()

Unnamed: 0,0,1,2
0,"Cherokee, Oklahoma",0,It is the county seat of Alfalfa County .
1,"Cherokee, Oklahoma",0,Cherokee is a city of Oklahoma in the United S...
2,Skateboard,2,Skateboard decks are normally between 28 and 3...
3,Skateboard,2,The bottom of the deck can be printed with a d...
4,Skateboard,3,The longboard was made by two surfers ; Ben Wh...


In [15]:
num_train_samples = 100_000
num_val_samples = 10_000
embedding_dim = 100
learning_rate = 0.01
n_units = 128
vocab_size = 30_000
batch_size = 64
epochs = 100
max_seq_len = 100

In [None]:
for i,(normal,simple) in enumerate(zip(normal_sentences,simple_sentences)):
    

In [11]:
# Remove instances where Simple Wikipedia and Wikipedia sentences are identical.
identical_filter = (normal[2] != simple[2])

# Define a regex pattern used to remove some wikipeida formatting artefacts.
# pattern = r'(-lrb-)(?<=-lrb-)(.*?)(?=-rrb-)(-rrb-)'
pattern = '-lrb-.*?-rrb-'

# Select the column of the 'normal' dataframe with the relevant data.
normal_sentences = normal[2][identical_filter]

# For each sentence, we want to try to remove string artefacts from wikipedia, usually as a result of hyperlinks
# These can greatly increase the size of our sequences, needlessly, and are also noise.
input_texts=[]
input_vocab=set()
for text in normal_sentences:
    text = text.lower()
    matches = re.findall(pattern,text)
    for match in matches:
        match = "".join(match)
        text = text.replace(match,"")
    sentence = f'{text}'.split(' ')
    input_texts.append(sentence)
    
    # Add unique words to vocabulary
    for word in sentence:
        if word not in input_vocab:
            input_vocab.add(word)


# Define beginning of sentence and end of sentence tokens.
# These will allow us to initialize our decoder layer and also allow it to know when to stop.
bos='bos '
eos=' eos'

# Do the same for sentences from simplified wikipedia.
simplified_sentences = simple[2][identical_filter]
target_texts=[]
target_vocab=set()
for text in normal_sentences:
    text = text.lower()
    matches = re.findall(pattern,text)
    for match in matches:
        match = "".join(match)
        text = text.replace(match,"")
    sentence = f'{bos}{text}{eos}'.split(' ')
    target_texts.append(sentence)
    for word in sentence:
        if word not in target_vocab:
            target_vocab.add(word)
    
print(f'No. pairs before preprocessing: {len(normal[2])}')
print(f'No. pairs after preprocessing: {len(input_texts)}')

No. pairs before preprocessing: 167689
No. pairs after preprocessing: 117952


In [12]:
seqs = [len(seq) for seq in input_texts]
" ".join(input_texts[seqs.index(max(seqs))])

"asiatic cheetah  has for a long time been theoretically classified as a sub-species of the cheetah with the suffix `` venaticus '' applied at the end of its scientific binomial name acinonyx jubatus but at a cheetah reintroduction workshop organized in india on 9 september 2009 stephen j. o'brien from laboratory of genomic diversity of national cancer institute , usa who has in the past conducted numerous prestigious genetic studies including those on asiatic lions said that according to latest modern genetic studies which became possible only now it was discovered that in fact asiatic cheetah was genetically identical to the african cheetah with which it had separated only about 5000 years ago which was not enough time for a sub-species level differentiation , in comparison he said that the asian and african lion subspecies were separated some 100,000 years ago , so was the african and asian leopard subspecies 169,000 years ago . cheetah expert laurie marker of the cheetah conservati

In [13]:
# A wrapper for keras' tokenizer class which provides some much needed functionality.
# Code from https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/21_Machine_Translation.ipynb
class TokenizerWrap(Tokenizer):
    """Wrap the Tokenizer-class from Keras with more functionality."""
    def __init__(self, texts, padding,
                 reverse=False, num_words=None,oov_token=None):
        """
        :param texts: List of strings. This is the data-set.
        :param padding: Either 'post' or 'pre' padding.
        :param reverse: Boolean whether to reverse token-lists.
        :param num_words: Max number of words to use.
        """
        Tokenizer.__init__(self, num_words=num_words,oov_token=oov_token)
        self.fit_on_texts(texts)
        self.index_to_word = dict(zip(self.word_index.values(),
                                      self.word_index.keys()))
        self.tokens = self.texts_to_sequences(texts)
        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = 'pre'
        else:
            truncating = 'post'
        self.num_tokens = [len(x) for x in self.tokens]
        self.max_tokens = np.mean(self.num_tokens) \
                          + 2 * np.std(self.num_tokens)
        self.max_tokens = int(self.max_tokens)
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)
    def token_to_word(self, token):
        """Lookup a single word from an integer-token."""
        word = " " if token == 0 else self.index_to_word[token]
        return word 
    def tokens_to_string(self, tokens):
        """Convert a list of integer-tokens to a string."""
        words = [self.index_to_word[token]
                 for token in tokens
                 if token != 0]
        text = " ".join(words)
        return text
    def text_to_tokens(self, text, reverse=False, padding=False):
        """
        Convert a single text-string to tokens with optional
        reversal and padding.
        """
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)
        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'
        if padding:
            tokens = pad_sequences(tokens,maxlen=self.max_tokens,
                                   padding='pre',
                                   truncating=truncating)
        return tokens

In [16]:
%%time
input_tokenizer = TokenizerWrap(texts=input_texts,
                                padding='pre',
                                reverse=True,
                                num_words=vocab_size,
                                oov_token='unk')
inputs_tokenized = input_tokenizer.tokens_padded
inputs_tokenized.shape

CPU times: user 8.12 s, sys: 461 ms, total: 8.58 s
Wall time: 9.26 s


In [17]:
target_tokenizer = TokenizerWrap(texts=target_texts,
                                padding='post',
                                reverse=False,
                                num_words=vocab_size,
                                oov_token='unk')
targets_tokenized = target_tokenizer.tokens_padded
targets_tokenized.shape

(117952, 54)

In [18]:
encoder_input_data = inputs_tokenized
decoder_input_data = targets_tokenized[:,:-1]
decoder_input_data.shape

(117952, 53)

In [36]:
decoder_output_data = targets_tokenized[:,1:]
decoder_output_list=[]
for sequence in decoder_output_data[:batch_size]:
    one_hot = to_categorical(sequence,num_classes=vocab_size)
    decoder_output_list.append(one_hot)
decoder_output_data = np.array(decoder_output_list).reshape(decoder_output_data[:batch_size].shape[0],decoder_output_data[:batch_size].shape[1],vocab_size)
decoder_output_data.shape

(64, 53, 30000)

In [21]:
embeddings_dict = {}
f = open('./data/glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embeddings_dict[word] = coefs
f.close()
print(f'Loaded {len(embeddings_dict)} word vectors')

Loaded 400001 word vectors
