## LSTM, Word Embeddings, semantic text similarity using Manhattan distance measure

In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = "glove.6B.300d.txt"
word2vec_output_file = "word2vec.txt"
glove2word2vec(glove_input_file,word2vec_output_file)
model = KeyedVectors.load_word2vec_format(word2vec_output_file,binary=False)



In [3]:
embedding_index = dict()
f = open('glove.6B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:],dtype='float32')
    embedding_index[word] = coefs
f.close()
print("Len of loaded word vectors: ",len(embedding_index))

Len of loaded word vectors:  400000


In [4]:
result = model.most_similar(positive=["woman","king"],negative=['man'],topn=1)
print(result)

[('queen', 0.6713277101516724)]


## Load Quora Questions dataset and Cleaning data

In [5]:
df = pd.read_csv("quora_duplicate_questions.tsv",delimiter='\t',encoding='utf-8')
df2 = df.copy()
df.drop(['id','qid1','qid2'],axis=1,inplace=True)
df['question1'] = df['question1'].apply(lambda s: " ".join(s1.lower() for s1 in str(s).split()))
df['question2'] = df['question2'].apply(lambda s: " ".join(s1.lower() for s1 in str(s).split()))
df['question1'] = df['question1'].str.replace('[^\w\s]','')
df['question2'] = df['question2'].str.replace(r'[^\w\s]','')

In [6]:
# function to clean data
import string
import itertools 
import re
from nltk.stem import WordNetLemmatizer
from string import punctuation

stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
def cleanData(text, lowercase = False, remove_stops = False, stemming = False, lemmatization = False):
    txt = str(text)
    
    # Replace apostrophes with standard lexicons
    txt = txt.replace("isn't", "is not")
    txt = txt.replace("aren't", "are not")
    txt = txt.replace("ain't", "am not")
    txt = txt.replace("won't", "will not")
    txt = txt.replace("didn't", "did not")
    txt = txt.replace("shan't", "shall not")
    txt = txt.replace("haven't", "have not")
    txt = txt.replace("hadn't", "had not")
    txt = txt.replace("hasn't", "has not")
    txt = txt.replace("don't", "do not")
    txt = txt.replace("wasn't", "was not")
    txt = txt.replace("weren't", "were not")
    txt = txt.replace("doesn't", "does not")
    txt = txt.replace("gotta","got to")
    txt = txt.replace("quikly","quickly")
    txt = txt.replace("'s", " is")
    txt = txt.replace("'re", " are")
    txt = txt.replace("'m", " am")
    txt = txt.replace("'d", " would")
    txt = txt.replace("'ll", " will")
    
    # More cleaning
    txt = re.sub(r"review", "", txt)
    txt = re.sub(r"Review", "", txt)
    txt = re.sub(r"TripAdvisor", "", txt)
    txt = re.sub(r"reviews", "", txt)
    txt = re.sub(r"Hotel", "", txt)
    txt = re.sub(r"what's", "", txt)
    txt = re.sub(r"What's", "", txt)
    txt = re.sub(r"\'s", " ", txt)
    txt = txt.replace("pic", "picture")
    txt = re.sub(r"\'ve", " have ", txt)
    txt = re.sub(r"can't", "cannot ", txt)
    txt = re.sub(r"n't", " not ", txt)
    txt = re.sub(r"I'm", "I am", txt)
    txt = re.sub(r" m ", " am ", txt)
    txt = re.sub(r"\'re", " are ", txt)
    txt = re.sub(r"\'d", " would ", txt)
    txt = re.sub(r"\'ll", " will ", txt)
    txt = re.sub(r"60k", " 60000 ", txt)
    txt = re.sub(r" e g ", " eg ", txt)
    txt = re.sub(r" b g ", " bg ", txt)
    txt = re.sub(r"\0s", "0", txt)
    txt = re.sub(r" 9 11 ", "911", txt)
    txt = re.sub(r"e-mail", "email", txt)
    txt = re.sub(r"\s{2,}", " ", txt)
    txt = re.sub(r"quikly", "quickly", txt)
    txt = re.sub(r" usa ", " America ", txt)
    txt = re.sub(r" USA ", " America ", txt)
    txt = re.sub(r" u s ", " America ", txt)
    txt = re.sub(r" uk ", " England ", txt)
    txt = re.sub(r" UK ", " England ", txt)
    txt = re.sub(r"india", "India", txt)
    txt = re.sub(r"switzerland", "Switzerland", txt)
    txt = re.sub(r"china", "China", txt)
    txt = re.sub(r"chinese", "Chinese", txt) 
    txt = re.sub(r"imrovement", "improvement", txt)
    txt = re.sub(r"intially", "initially", txt)
    txt = re.sub(r"quora", "Quora", txt)
    txt = re.sub(r" dms ", "direct messages ", txt)  
    txt = re.sub(r"demonitization", "demonetization", txt) 
    txt = re.sub(r"actived", "active", txt)
    txt = re.sub(r"kms", " kilometers ", txt)
    txt = re.sub(r"KMs", " kilometers ", txt)
    txt = re.sub(r" cs ", " computer science ", txt) 
    txt = re.sub(r" upvotes ", " up votes ", txt)
    txt = re.sub(r" iPhone ", " phone ", txt)
    txt = re.sub(r"\0rs ", " rs ", txt) 
    txt = re.sub(r"calender", "calendar", txt)
    txt = re.sub(r"ios", "operating system", txt)
    txt = re.sub(r"gps", "GPS", txt)
    txt = re.sub(r"gst", "GST", txt)
    txt = re.sub(r"programing", "programming", txt)
    txt = re.sub(r"bestfriend", "best friend", txt)
    txt = re.sub(r"dna", "DNA", txt)
    txt = re.sub(r"III", "3", txt) 
    txt = re.sub(r"the US", "America", txt)
    txt = re.sub(r"Astrology", "astrology", txt)
    txt = re.sub(r"Method", "method", txt)
    txt = re.sub(r"Find", "find", txt) 
    txt = re.sub(r"banglore", "Banglore", txt)
    txt = re.sub(r" J K ", " JK ", txt)

    # Emoji replacement
    txt = re.sub(r':\)',r' Happy ',txt)
    txt = re.sub(r':D',r' Happy ',txt)
    txt = re.sub(r':P',r' Happy ',txt)
    txt = re.sub(r':\(',r' Sad ',txt)
    
    # Remove urls and emails
    txt = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', txt, flags=re.MULTILINE)
    txt = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', txt, flags=re.MULTILINE)
    
    # Remove punctuation from text
    txt = ''.join([c for c in text if c not in punctuation])
#     txt = txt.replace(".", " ")
#     txt = txt.replace(":", " ")
#     txt = txt.replace("!", " ")
#     txt = txt.replace("&", " ")
#     txt = txt.replace("#", " ")
    
    # Remove all symbols
    txt = re.sub(r'[^A-Za-z0-9\s]',r' ',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    txt = re.sub(r'[0-9]',r' ',txt)
    
    # Replace words like sooooooo with so
    txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    
    # Split attached words
    #txt = " ".join(re.findall('[A-Z][^A-Z]*', txt))   
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    if stemming:
        st = PorterStemmer()
#         print (len(txt.split()))
#         print (txt)
        txt = " ".join([st.stem(w) for w in txt.split()])
    
    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join([wordnet_lemmatizer.lemmatize(w, pos='v') for w in txt.split()])

    return txt

In [7]:
df['question1'] = df['question1'].apply(lambda x: cleanData(x, lowercase = True, remove_stops = True, stemming = False, lemmatization = True))
df['question2'] = df['question2'].apply(lambda x: cleanData(x,lowercase = True, remove_stops = True, stemming = False, lemmatization = True))

In [8]:
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,step by step guide invest in share market in i...,step by step guide invest in share market,0
1,story kohinoor kohinoor diamond,would happen indian government steal kohinoor ...,0
2,how can i increase speed my internet connectio...,how can internet speed be increase by hack dns,0
3,why be i mentally very lonely how can i solve it,find remainder when math math divide by,0
4,one dissolve in water quikly sugar salt methan...,fish would survive in salt water,0


## Tokenizing and creating embedding matrix for question1 and question2 variables

In [9]:
from keras.preprocessing.text import one_hot,text_to_word_sequence,Tokenizer
raw_text = np.hstack([df.question1, df.question2])
t = Tokenizer()
t.fit_on_texts(raw_text)
df["seq_question1"] = t.texts_to_sequences(df.question1)
df["seq_question2"] = t.texts_to_sequences(df.question2)
vocabulary_size = len(t.word_index) + 1
print(vocabulary_size)
max_q1_seq = np.max(df.seq_question1.apply(lambda x: len(x)))
max_q2_seq = np.max(df.seq_question2.apply(lambda x: len(x)))
print("max question1 seq "+str(max_q1_seq))
print("max question2 seq "+str(max_q2_seq))

89578
max question1 seq 90
max question2 seq 181


In [10]:
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
#EMBEDDINGS MAX VALUE
#Base on the histograms, we select the next lengths
MAX_Q1_SEQ = 40
MAX_Q2_SEQ = 50
MAX_TEXT = np.max([np.max(df.seq_question1.max()),np.max(df.seq_question2.max())]) + 2
print(MAX_TEXT)
target = df['is_duplicate']

89578


In [14]:
from sklearn.cross_validation import train_test_split

#EXTRACT DEVELOPTMENT TEST
dtrain, dvalid = train_test_split(df, random_state=123, train_size=0.75)
print(dtrain.shape)
print(dvalid.shape)



(303217, 5)
(101073, 5)


In [15]:
from keras.preprocessing.sequence import pad_sequences
def get_keras_data(dataset):
    X = {
        'q1': pad_sequences(dataset.seq_question1, maxlen=MAX_Q1_SEQ),
        'q2': pad_sequences(dataset.seq_question2, maxlen=MAX_Q2_SEQ)
        
    }
    return X

X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)


In [16]:
X_train['q1'].shape

(303217, 40)

## Model building using output as Manhattan distance from LSTM 


In [18]:
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Merge
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K
from keras import optimizers
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))
dropout_r = 0.1
q1 = Input(shape=[X_train["q1"].shape[1]], name="q1")
q2 = Input(shape=[X_train["q2"].shape[1]], name="q2")
emb_q1 = Embedding(MAX_TEXT, 300)(q1)
emb_q2 = Embedding(MAX_TEXT,300)(q2)
left_output = LSTM(50) (emb_q1)
right_output = LSTM(50) (emb_q2)
main_l = concatenate([left_output, right_output])
main_l = Dropout(dropout_r) (Dense(128) (main_l))
main_l = Dropout(dropout_r) (Dense(64) (main_l))

# Calculates the distance as defined by the MaLSTM model
malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
model_new = Model([q1,q2],malstm_distance)
model_new.compile(loss='mean_squared_error',optimizer='adam',metrics=['accuracy'])




In [19]:
#FITTING THE MODEL
BATCH_SIZE = 64
epochs = 2

model_new.fit(X_train, dtrain.is_duplicate, epochs=epochs, batch_size=BATCH_SIZE
          , validation_data=(X_valid, dvalid.is_duplicate)
          , verbose=1)

Train on 303217 samples, validate on 101073 samples
Epoch 1/5
Epoch 2/5
 50304/303217 [===>..........................] - ETA: 3:24:05 - loss: 0.1379 - acc: 0.8075

KeyboardInterrupt: 

In [None]:
model_new.save("maLSTM.h5")
from keras.models import load_model
maLSTM = load_model('maLSTM.h5')

In [None]:
X_final = get_keras_data(df)
y_final_preds = model_new.predict(X_final)
df['predictions_probs'] = y_final_preds


In [None]:
# Plot accuracy
plt.plot(maLSTM.history['acc'])
plt.plot(maLSTM.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot loss
plt.plot(model_new.history['loss'])
plt.plot(model_new.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()