In [2]:
import pandas as pd
import nltk
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
import numpy as np
# Download this file to load the pre-trained word2vec vectors with gensim's KeyedVectors
EMBEDDING_FILE = "GoogleNews-vectors-negative300.bin.gz"



In [9]:
# Load the dataset, keeping only the two question columns and the label.
NUM_ROWS = 5000  # change to a higher number on a faster machine
questions_cols = ['question1', 'question2']
# nrows stops parsing after NUM_ROWS rows instead of reading the whole file
# and discarding everything past the first NUM_ROWS rows afterwards.
data_csv = pd.read_csv("questions.csv", nrows=NUM_ROWS)
data_csv = data_csv[questions_cols + ['is_duplicate']]
print(data_csv.head())


                                           question1  \
0  What is the step by step guide to invest in sh...   
1  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2  How can I increase the speed of my internet co...   
3  Why am I mentally very lonely? How can I solve...   
4  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  


In [11]:
# English stop words, held in a set for constant-time membership tests.
stops = set()
stops.update(stopwords.words('english'))

# refer: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

def text_to_word_list(text):
    '''Pre-process a raw text and convert it to a list of lowercase tokens.

    The input is passed through ``str()`` first, so non-string values
    (e.g. a NaN from a missing cell) are tokenised from their string form.

    Steps, in order: lowercase, blank out unsupported characters, expand
    common English contractions, put spaces around the kept operators and
    punctuation, normalise a handful of special tokens (``5k`` -> ``5000``,
    ``9/11`` -> ``911``, ``e-mail`` -> ``email`` ...), then split on
    whitespace.

    Args:
        text: The text to tokenise (any object; converted with ``str``).

    Returns:
        list[str]: The tokens, in order of appearance.
    '''
    text = str(text).lower()

    # Replace every unsupported character with a space.
    # NOTE: inside the class, '+-=' is a *range* (0x2B..0x3D), so ':', ';'
    # and '<' are kept as well; the ':' rule further down relies on that.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

    # Expand contractions. Order matters: "what's" must run before the
    # generic "'s" rule, and "can't" before the generic "n't" rule.
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Drop separators, and isolate the operators that are kept as tokens.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)

    # "5k" -> "5000". The trailing \b keeps units such as "5kg" intact.
    text = re.sub(r"(\d+)k\b", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # "1990s" -> "1990". The previous pattern r"\0s" was a NUL escape and
    # could never match, because NUL is already blanked out above.
    text = re.sub(r"0s\b", "0", text)
    # "9/11" has become "9 11" by now; join it without swallowing the
    # neighbouring spaces (which used to glue it to the adjacent words).
    text = re.sub(r"\b9 11\b", "911", text)
    # Word boundaries stop these two rules from firing inside or across
    # unrelated words (e.g. "raj kumar" must not become "rajkumar").
    text = re.sub(r"\be - mail\b", "email", text)
    text = re.sub(r"\bj k\b", "jk", text)

    # split() with no argument already collapses runs of whitespace.
    return text.split()
# word -> integer id lookup; starts out empty.
vocabulary = {}
# id -> word lookup. Slot 0 holds '<unk>', which will never be used: it is
# only a placeholder for the [0, 0, ....0] embedding.
inverse_vocabulary = ['<unk>']

In [13]:

# Prepare embedding: load the pre-trained vectors stored in binary
# word2vec format at EMBEDDING_FILE.
print('loading word2vec')
word2vec = KeyedVectors.load_word2vec_format(fname=EMBEDDING_FILE, binary=True)
print('done')

loading word2vec
done


In [14]:
# Replace every question text by its list of integer word ids, extending
# `vocabulary` / `inverse_vocabulary` whenever a new word is met.
for index, row in data_csv.iterrows():

    for question in questions_cols:

        q2n = []  # q2n -> question numbers representation
        for word in text_to_word_list(row[question]):
            # Skip stop words that have no pre-trained vector.
            # `word in word2vec` works on gensim 3.x and 4.x alike, whereas
            # the `.vocab` attribute was removed in gensim 4.0.
            if word in stops and word not in word2vec:
                continue

            if word not in vocabulary:
                # Ids start at 1 because slot 0 of inverse_vocabulary is taken.
                vocabulary[word] = len(inverse_vocabulary)
                inverse_vocabulary.append(word)
            q2n.append(vocabulary[word])

        # DataFrame.set_value was deprecated in pandas 0.21 and removed in
        # 1.0; `.at` is the label-based scalar setter that replaces it.
        data_csv.at[index, question] = q2n

print('making embeddings')

embedding_dim = 300
# This will be the embedding matrix: one row per word id plus row 0.
# Rows start random, so words without a pre-trained vector keep a random one.
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix: copy the pre-trained vector of every known word.
for word, index in vocabulary.items():
    if word in word2vec:
        # Indexing by word is equivalent to word_vec()/get_vector() and is
        # available on both gensim 3.x and 4.x.
        embeddings[index] = word2vec[word]

# Release the reference to the loaded vectors; they are not used after this point.
del word2vec



making embeddings


In [15]:
import itertools
from keras.preprocessing.sequence import pad_sequences

# Longest encoded question over both columns; every sequence is padded to it.
max_seq_length = max(data_csv.question1.map(len).max(),
                     data_csv.question2.map(len).max())

print("max seq length: ", max_seq_length)

# Split to train validation.
# The hold-out size is taken from the rows actually loaded: deriving it from
# NUM_ROWS is wrong whenever the CSV holds fewer than NUM_ROWS rows.
validation_size = int(0.2 * len(data_csv))
training_size = len(data_csv) - validation_size

X = data_csv[questions_cols]
Y = data_csv['is_duplicate']

# An integer test_size is an absolute number of validation rows.
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)

# One entry per side of the question pair.
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}

Y_train = Y_train.values
Y_validation = Y_validation.values

# Pad every id sequence to the common length max_seq_length.
for dataset, side in itertools.product([X_train, X_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=int(max_seq_length))


                    

max seq length:  212


In [7]:
# Sanity check: show the embedding row stored for word id 1
# (row 0 is the zeroed padding row).
print(embeddings[1])

[ 0.13964844 -0.00616455  0.21484375  0.07275391 -0.16113281  0.07568359
  0.16796875 -0.20117188  0.12597656  0.00915527  0.05249023 -0.15136719
 -0.02758789  0.04199219 -0.234375    0.13867188 -0.02600098  0.07910156
  0.02746582 -0.13085938 -0.02478027  0.10009766 -0.07910156 -0.07714844
  0.03759766  0.16894531  0.05371094 -0.05200195  0.14453125 -0.04370117
 -0.12597656  0.06884766 -0.10595703 -0.14550781 -0.00331116  0.01367188
  0.13964844  0.01660156  0.03417969  0.16113281 -0.01080322  0.06689453
  0.06835938 -0.15136719 -0.16894531  0.03295898 -0.06884766  0.06787109
 -0.07373047  0.08300781  0.05761719  0.14550781 -0.11865234 -0.13671875
  0.12402344  0.04296875 -0.11962891 -0.08154297  0.06494141 -0.05639648
 -0.04394531  0.1484375  -0.07714844  0.04614258 -0.02624512 -0.06591797
  0.04980469  0.08886719 -0.01647949 -0.02294922  0.10546875  0.04199219
  0.11035156 -0.08251953 -0.13574219 -0.07324219  0.1015625   0.05371094
 -0.07275391  0.08496094 -0.04443359 -0.078125    0

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding, GRU, Lambda
import keras.backend as K
from keras.callbacks import ModelCheckpoint

# Model variables
n_hidden = 50
gradient_clipping_norm = 1.25  # NOTE(review): defined but never passed to the optimizer below
batch_size = 64
n_epoch = 25


# The visible layer. Inputs are sequences of integer word ids that index the
# embedding matrix, so they are declared as int32 rather than float.
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Frozen embedding lookup initialised from the `embeddings` matrix and shared
# by both sides.
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# A single GRU instance encodes both questions, so its weights are shared.
shared_gru = GRU(n_hidden)

left_output = shared_gru(encoded_left)
right_output = shared_gru(encoded_right)

def exponent_neg_manhattan_distance(left, right):
    '''Return exp(-L1 distance) between two batches of vectors.

    The sum runs over axis 1 with keepdims=True. The value is 1 for
    identical vectors and decreases towards 0 as the L1 distance grows.
    '''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

# Similarity between the two encodings: one value per question pair.
malstm_distance = Lambda(
    function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1),
)([left_output, right_output])

malstm = Model([left_input, right_input], [malstm_distance])


malstm.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])



# `epochs` is the Keras 2 name of the legacy `nb_epoch` argument.
malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, epochs=n_epoch,
                            validation_data=([X_validation['left'], X_validation['right']], Y_validation))





Train on 4000 samples, validate on 1000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25