In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import pickle
import operator
%matplotlib inline

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dropout, RepeatVector, Bidirectional
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, merge
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from keras.engine import Model
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [2]:
normal = pd.read_csv('../data/sentence-aligned.v2/normal.aligned',sep='\t',header=None)
normal.head()

Unnamed: 0,0,1,2
0,"Cherokee, Oklahoma",0,It is the county seat of Alfalfa County .
1,"Cherokee, Oklahoma",0,"Cherokee is a city in Alfalfa County , Oklahom..."
2,Skateboard,5,Skateboard decks are usually between 28 and 33...
3,Skateboard,5,The underside of the deck can be printed with ...
4,Skateboard,6,This was created by two surfers ; Ben Whatson ...


In [3]:
simple = pd.read_csv('../data/sentence-aligned.v2/simple.aligned',sep='\t',header=None)
simple.head()

Unnamed: 0,0,1,2
0,"Cherokee, Oklahoma",0,It is the county seat of Alfalfa County .
1,"Cherokee, Oklahoma",0,Cherokee is a city of Oklahoma in the United S...
2,Skateboard,2,Skateboard decks are normally between 28 and 3...
3,Skateboard,2,The bottom of the deck can be printed with a d...
4,Skateboard,3,The longboard was made by two surfers ; Ben Wh...


In [24]:
batch_size = 128  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 300  # Latent dimensionality of the encoding space.
num_samples = 10_000  # Number of samples to train on.  ########### SHORTENED FOR PRACTICE PURPOSES

Identify sentences in the parallel corpus that were identical. This reduces the size of our training data by about 50 thousand. Another step of preprocessing is to turn everything to lower case.

In [5]:
identical_filter = (normal[2] != simple[2])
input_texts = normal[2][identical_filter]
input_texts = np.array([f'bos {text} eos'.lower().split(' ') for text in input_texts])
target_texts = simple[2][identical_filter]
target_texts = np.array([f'bos {text} eos'.lower().split(' ') for text in target_texts])
print(f'No. pairs before preprocessing: {len(normal[2])}')
print(f'No. pairs after preprocessing: {len(input_texts)}')

In [11]:
input_words = set()
target_words = set()
for i,sentence in enumerate(input_texts[:min(num_samples,len(input_texts)-1)]):
    for word in input_texts[i]:
        if word not in input_words:
            input_words.add(word)
    for word in target_texts[i]:
        if word not in target_words:
            target_words.add(word)
            
input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(sen) for sen in input_texts])
max_decoder_seq_length = max([len(sen) for sen in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 117952
Number of unique input tokens: 121211
Number of unique output tokens: 105709
Max sequence length for inputs: 236
Max sequence length for outputs: 192


Our input vocabulary is about twelve thousand words, whereas the output vocabulary is about ten thousand. This makes sense; we expect the simplified vocabulary to be smaller.

# Tokenize??

In [93]:
MAX_NUM_WORDS=100000

In [94]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(input_texts)
input_sequences = tokenizer.texts_to_sequences(input_texts)

In [95]:
len(input_sequences)

117952

In [99]:
input_sequences[0]

[7323, 8, 7, 44, 5, 29620, 62, 2, 893, 2, 50, 55, 3]

In [104]:
print(len(input_sequences[0]))
print(len(input_texts[0]))

13
13


In [100]:
output_sequences = tokenizer.texts_to_sequences(target_texts)
output_sequences[0]

[7323, 8, 7, 44, 4, 893, 5, 1, 50, 55, 3]

In [101]:
target_texts[0]

['cherokee',
 'is',
 'a',
 'city',
 'of',
 'oklahoma',
 'in',
 'the',
 'united',
 'states',
 '.']

In [103]:
print(len(output_sequences[0]))
print(len(target_texts[0]))

11
11


In [105]:
word_index = tokenizer.word_index

In [106]:
word_index

{'the': 1,
 ',': 2,
 '.': 3,
 'of': 4,
 'in': 5,
 'and': 6,
 'a': 7,
 'is': 8,
 'to': 9,
 '-lrb-': 10,
 '-rrb-': 11,
 'was': 12,
 'as': 13,
 'for': 14,
 'on': 15,
 'by': 16,
 '``': 17,
 'with': 18,
 "''": 19,
 "'s": 20,
 'that': 21,
 'from': 22,
 'it': 23,
 'an': 24,
 'at': 25,
 'or': 26,
 'are': 27,
 'his': 28,
 'he': 29,
 'which': 30,
 'also': 31,
 'be': 32,
 ';': 33,
 'one': 34,
 'first': 35,
 'has': 36,
 'born': 37,
 'were': 38,
 'who': 39,
 'its': 40,
 'france': 41,
 'this': 42,
 'â': 43,
 'city': 44,
 'known': 45,
 ':': 46,
 'department': 47,
 'commune': 48,
 'not': 49,
 'united': 50,
 'after': 51,
 'have': 52,
 'but': 53,
 'their': 54,
 'states': 55,
 'other': 56,
 'new': 57,
 'had': 58,
 'most': 59,
 'two': 60,
 'they': 61,
 'county': 62,
 'her': 63,
 'been': 64,
 'world': 65,
 'football': 66,
 'used': 67,
 'when': 68,
 'may': 69,
 'american': 70,
 'into': 71,
 'region': 72,
 'such': 73,
 'all': 74,
 'can': 75,
 'more': 76,
 'during': 77,
 'she': 78,
 "'": 79,
 'name': 80,
 'so

In [108]:
input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length)

In [117]:
output_data = pad_sequences(output_sequences,maxlen=max_decoder_seq_length)

In [109]:
input_data.shape

(117952, 236)

In [111]:
indices=np.arange(input_data.shape[0])

In [114]:
np.random.shuffle(indices)

In [116]:
data=input_data[indices]

In [118]:
targets=output_data[indices]

In [119]:
val_split=0.2
num_validation_samples = int(val_split * data.shape[0])

In [121]:
x_train = data[:-num_validation_samples]
y_train = targets[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = targets[-num_validation_samples:]

# Pre-Trained Embeddings

In [12]:
# %%time
# with open('../data/glove.6B/glove.6B.50d.txt','r',encoding='utf-8') as f:
#     lines = f.read().split('\n')

CPU times: user 1.15 s, sys: 1.28 s, total: 2.43 s
Wall time: 3.89 s


In [13]:
# vocab = {}
# for line in lines:
#     line = line.split(' ')
#     key = line[0]
#     line.remove(key)
#     this_vec = [float(val) for val in line]
#     vocab[key]=this_vec
# vocab_50 = vocab

In [14]:
%%time
with open('../data/glove.6B/glove.6B.300d.txt','r',encoding='utf-8') as f:
    lines = f.read().split('\n')

CPU times: user 8.34 s, sys: 20.6 s, total: 28.9 s
Wall time: 52.4 s


In [15]:
%%time
vocab = {}
for line in lines:
    line = line.split(' ')
    key = line[0]
    line.remove(key)
    this_vec = [float(val) for val in line]
    vocab[key]=this_vec
vocab_300 = vocab

In [63]:
# input_token_index = dict([(word,i) for i,word in enumerate(input_words)])

In [64]:
# target_token_index = dict([(word,i) for i,word in enumerate(target_words)])

In [57]:
# input_token_index={}
# for i,word in enumerate(input_words):
#     try:
#         input_token_index[i]=vocab_300[word]
#     except KeyError:
#         input_token_index[i]=vocab_300['unk']

In [58]:
# target_token_index={}
# for i,word in enumerate(target_words):
#     try:
#         target_token_index[i]=vocab_300[word]
#     except KeyError:
#         target_token_index[i]=vocab_300['unk']

In [25]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, latent_dim),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, latent_dim),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, latent_dim),
    dtype='float32')

In [32]:
encoder_input_data.shape

(117952, 236, 300)

In [68]:
for i, (input_text,target_text) in enumerate(zip(input_texts,target_texts)):
    for j, word in enumerate(input_text):
        try:
            encoder_input_data[i,j] = vocab_300[word]
        except KeyError:
            encoder_input_data[i,j] = vocab_300['unk']
    for j, word in enumerate(target_text):
        try:
            decoder_input_data[i,j] = vocab_300[word]
        except:
            decoder_input_data[i,j] = vocab_300['unk']
        if j > 0:
            try:
                decoder_input_data[i,j-1]=vocab_300[word]
            except:
                decoder_input_data[i,j-1]=vocab_300['unk']            
                

In [69]:
encoder_input_data.shape

(117952, 236, 300)

# Modelling Time

In [86]:
learning_rate=1e-3
model = Sequential()
model.add(GRU(128,input_shape=(max_encoder_seq_length,),return_sequences=False))
model.add(RepeatVector(max_decoder_length))
model.add(GRU(128,return_sequences=True))
model.add(TimeDistributed(Dense(len(target_words),activation='softmax')))

ValueError: Input 0 is incompatible with layer gru_7: expected ndim=3, found ndim=2

In [None]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [None]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
x = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
x, state_h, state_c = LSTM(latent_dim,
                           return_state=True)(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
x = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
x = LSTM(latent_dim, return_sequences=True)(x, initial_state=encoder_states)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

# Specific Trained Embeddings

# Constrained Seq2Seq

In [None]:
# ppdb = pd.read_csv('../data/ppdb-2.0-s-all',sep='|||')

In [None]:

xin = Input(batch_shape=(batch_size, seq_size), dtype='int32')
xemb = Embedding(embedding_size, mask_zero=True)(xin)

rnn_fwd1 = LSTM(rnn_size, return_sequence=True)(xemb)
rnn_bwd1 = LSTM(rnn_size, return_sequence=True, go_backwards=True)(xemb)
rnn_bidir1 = merge([rnn_fwd1, rnn_bwd1], mode='concat')

predictions = TimeDistributed(Dense(output_class_size, activation='softmax'))(rnn_bidir1) 

model = Model(input=xin, output=predictions)