## RNN , Gated Recurrent Unit to build a model using a pre trained Word Embeddings Glove

In [2]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [3]:
from gensim.models import KeyedVectors




In [4]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [5]:
glove_input_file = "glove.6B.100d.txt"

In [6]:
word2vec_output_file = "word2vec.txt"

In [7]:
glove2word2vec(glove_input_file,word2vec_output_file)

(400000, 100)

In [8]:
model = KeyedVectors.load_word2vec_format(word2vec_output_file,binary=False)

In [9]:
embedding_index = dict()
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:],dtype='float32')
    embedding_index[word] = coefs
f.close()
print("Len of loaded word vectors: ",len(embedding_index))

Len of loaded word vectors:  400000


In [10]:
result = model.most_similar(positive=["woman","king"],negative=['man'],topn=1)
print(result)

[('queen', 0.7698541283607483)]


## Load Quora Questions dataset and Cleaning data

In [11]:
df = pd.read_csv("quora_duplicate_questions.tsv",delimiter='\t',encoding='utf-8')

In [12]:
df2 = df.copy()

In [13]:
df.drop(['id','qid1','qid2'],axis=1,inplace=True)

In [16]:
# function to clean data
import string
import itertools 
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from string import punctuation

stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
# punct = list(string.punctuation)
# punct.append("''")
# punct.append(":")
# punct.append("...")
# punct.append("@")
# punct.append('""')
def cleanData(text, lowercase = False, punct = False, remove_stops = False, stemming = False, lemmatization = False):
    txt = str(text)
    
    # Replace apostrophes with standard lexicons
    txt = txt.replace("isn't", "is not")
    txt = txt.replace("aren't", "are not")
    txt = txt.replace("ain't", "am not")
    txt = txt.replace("won't", "will not")
    txt = txt.replace("didn't", "did not")
    txt = txt.replace("shan't", "shall not")
    txt = txt.replace("haven't", "have not")
    txt = txt.replace("hadn't", "had not")
    txt = txt.replace("hasn't", "has not")
    txt = txt.replace("don't", "do not")
    txt = txt.replace("wasn't", "was not")
    txt = txt.replace("weren't", "were not")
    txt = txt.replace("doesn't", "does not")
    txt = txt.replace("gotta","got to")
    txt = txt.replace("quikly","quickly")
    txt = txt.replace("'s", " is")
    txt = txt.replace("'re", " are")
    txt = txt.replace("'m", " am")
    txt = txt.replace("'d", " would")
    txt = txt.replace("'ll", " will")
    
    # More cleaning
    txt = re.sub(r"review", "", txt)
    txt = re.sub(r"Review", "", txt)
    txt = re.sub(r"TripAdvisor", "", txt)
    txt = re.sub(r"reviews", "", txt)
    txt = re.sub(r"Hotel", "", txt)
    txt = re.sub(r"what's", "", txt)
    txt = re.sub(r"What's", "", txt)
    txt = re.sub(r"\'s", " ", txt)
    txt = txt.replace("pic", "picture")
    txt = re.sub(r"\'ve", " have ", txt)
    txt = re.sub(r"can't", "cannot ", txt)
    txt = re.sub(r"n't", " not ", txt)
    txt = re.sub(r"I'm", "I am", txt)
    txt = re.sub(r" m ", " am ", txt)
    txt = re.sub(r"\'re", " are ", txt)
    txt = re.sub(r"\'d", " would ", txt)
    txt = re.sub(r"\'ll", " will ", txt)
    txt = re.sub(r"60k", " 60000 ", txt)
    txt = re.sub(r" e g ", " eg ", txt)
    txt = re.sub(r" b g ", " bg ", txt)
    txt = re.sub(r"\0s", "0", txt)
    txt = re.sub(r" 9 11 ", "911", txt)
    txt = re.sub(r"e-mail", "email", txt)
    txt = re.sub(r"\s{2,}", " ", txt)
    txt = re.sub(r"quikly", "quickly", txt)
    txt = re.sub(r" usa ", " America ", txt)
    txt = re.sub(r" USA ", " America ", txt)
    txt = re.sub(r" u s ", " America ", txt)
    txt = re.sub(r" uk ", " England ", txt)
    txt = re.sub(r" UK ", " England ", txt)
    txt = re.sub(r"india", "India", txt)
    txt = re.sub(r"switzerland", "Switzerland", txt)
    txt = re.sub(r"china", "China", txt)
    txt = re.sub(r"chinese", "Chinese", txt) 
    txt = re.sub(r"imrovement", "improvement", txt)
    txt = re.sub(r"intially", "initially", txt)
    txt = re.sub(r"quora", "Quora", txt)
    txt = re.sub(r" dms ", "direct messages ", txt)  
    txt = re.sub(r"demonitization", "demonetization", txt) 
    txt = re.sub(r"actived", "active", txt)
    txt = re.sub(r"kms", " kilometers ", txt)
    txt = re.sub(r"KMs", " kilometers ", txt)
    txt = re.sub(r" cs ", " computer science ", txt) 
    txt = re.sub(r" upvotes ", " up votes ", txt)
    txt = re.sub(r" iPhone ", " phone ", txt)
    txt = re.sub(r"\0rs ", " rs ", txt) 
    txt = re.sub(r"calender", "calendar", txt)
    txt = re.sub(r"ios", "operating system", txt)
    txt = re.sub(r"gps", "GPS", txt)
    txt = re.sub(r"gst", "GST", txt)
    txt = re.sub(r"programing", "programming", txt)
    txt = re.sub(r"bestfriend", "best friend", txt)
    txt = re.sub(r"dna", "DNA", txt)
    txt = re.sub(r"III", "3", txt) 
    txt = re.sub(r"the US", "America", txt)
    txt = re.sub(r"Astrology", "astrology", txt)
    txt = re.sub(r"Method", "method", txt)
    txt = re.sub(r"Find", "find", txt) 
    txt = re.sub(r"banglore", "Banglore", txt)
    txt = re.sub(r" J K ", " JK ", txt)

    # Emoji replacement
    txt = re.sub(r':\)',r' Happy ',txt)
    txt = re.sub(r':D',r' Happy ',txt)
    txt = re.sub(r':P',r' Happy ',txt)
    txt = re.sub(r':\(',r' Sad ',txt)
    
    # Remove urls and emails
    txt = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', txt, flags=re.MULTILINE)
    txt = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', txt, flags=re.MULTILINE)
    
    # Remove punctuation from text
    if punct:
        txt = "".join([c for c in text if c not in punctuation])

   
       
    # Replace words like sooooooo with so
    txt = "".join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    
    # Split attached words
    #txt = " ".join(re.findall('[A-Z][^A-Z]*', txt))   
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    if stemming:
        st = PorterStemmer()
#         print (len(txt.split()))
#         print (txt)
        txt = " ".join([st.stem(w) for w in txt.split()])
    
    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join([wordnet_lemmatizer.lemmatize(w, pos='v') for w in txt.split()])

    return txt

In [17]:
df['question1'] = df['question1'].apply(lambda x: cleanData(x, lowercase = True, punct = False, remove_stops = False, stemming = True, lemmatization = True))
df['question2'] = df['question2'].apply(lambda x: cleanData(x,lowercase = True, punct = False, remove_stops = False, stemming = True, lemmatization = True))

In [18]:
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,what be the step by step guid to invest in sha...,what be the step by step guid to invest in sha...,0
1,what be the stori of kohinoor (koh-i-noor) dia...,what would happen if the indian govern steal t...,0
2,how can i increas the speed of my internet con...,how can internet speed be increas by hack thro...,0
3,whi be i mental veri lonely? how can i solv it?,find the remaind when [math]23^{24}[/math] be ...,0
4,"which one dissolv in water quickli sugar, salt...",which fish would surviv in salt water?,0


## Tokenizing and creating embedding matrix for question1 and question2 variables

In [19]:
from keras.preprocessing.text import one_hot,text_to_word_sequence,Tokenizer

In [20]:
raw_text = np.hstack([df.question1, df.question2])
t = Tokenizer()
t.fit_on_texts(raw_text)
df['q1_encoded'] = t.texts_to_sequences(df.question1)
df['q2_encoded'] = t.texts_to_sequences(df.question2)

In [21]:
vocabulary_size = len(t.word_index) + 1
print(vocabulary_size)

90150


In [22]:
MAX_TEXT = np.max([np.max(df.q1_encoded.max()),np.max(df.q2_encoded.max())]) + 2
print(MAX_TEXT)

89745


In [24]:
df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_encoded,q2_encoded
0,what be the step by step guid to invest in sha...,what be the step by step guid to invest in sha...,0,"[3, 1, 2, 651, 57, 651, 2211, 7, 299, 8, 545, ...","[3, 1, 2, 651, 57, 651, 2211, 7, 299, 8, 545, ..."
1,what be the stori of kohinoor (koh-i-noor) dia...,what would happen if the indian govern steal t...,0,"[3, 1, 2, 588, 10, 12708, 12030, 4, 19292, 3430]","[3, 44, 97, 25, 2, 80, 257, 2336, 2, 12708, 12..."
2,how can i increas the speed of my internet con...,how can internet speed be increas by hack thro...,0,"[5, 12, 4, 198, 2, 441, 10, 17, 373, 597, 203,...","[5, 12, 373, 441, 1, 198, 57, 303, 230, 17389]"
3,whi be i mental veri lonely? how can i solv it?,find the remaind when [math]23^{24}[/math] be ...,0,"[16, 1, 4, 1129, 325, 3815, 5, 12, 4, 535, 15]","[75, 2, 3781, 38, 216, 2078, 1296, 216, 1, 172..."
4,"which one dissolv in water quickli sugar, salt...",which fish would surviv in salt water?,0,"[24, 49, 5188, 8, 241, 1847, 1757, 1849, 13075...","[24, 1580, 44, 1149, 8, 1849, 241]"


In [25]:
from keras.preprocessing.sequence import pad_sequences
max_size = 32
def get_keras_data(dataset):
    X = {
        'q1': pad_sequences(dataset.q1_encoded, maxlen=50),
        'q2': pad_sequences(dataset.q2_encoded, maxlen=50)
        
    }
    return X

X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)





In [26]:
q1_embedding_matrix = np.zeros((vocabulary_size,100))
#embedding_index contains words from glove
for word,i in t.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        q1_embedding_matrix[i] = embedding_vector

In [27]:
#Creating embedding_matrix for q2
q2_embedding_matrix = np.zeros((vocabulary_size,100))
#embedding_index contains words from glove
for word,i in t.word_index.items():
    q2_embedding_vector = embedding_index.get(word)
    if q2_embedding_vector is not None:
        q2_embedding_matrix[i] = q2_embedding_vector

In [28]:
y_test = df['is_duplicate']

In [29]:
from sklearn.cross_validation import train_test_split

#EXTRACT DEVELOPTMENT TEST
dtrain, dvalid = train_test_split(df, random_state=123, train_size=0.75)
print(dtrain.shape)
print(dvalid.shape)

(303217, 5)
(101073, 5)


In [30]:
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K
from keras.optimizers import Adam
dropout_r = 0.1
adam = Adam(lr=0.001)
q1 = Input(shape=[X_train["q1"].shape[1]], name="q1", dtype='int32')
q2 = Input(shape=[X_train["q2"].shape[1]], name="q2", dtype='int32')

emb_q1 = Embedding(vocabulary_size,100,weights=[q1_embedding_matrix],input_length=32,trainable=False)(q1)
emb_q2 = Embedding(vocabulary_size,100,weights=[q2_embedding_matrix],input_length=32,trainable=False)(q2)
rnn_layer1 = GRU(16) (emb_q1)
rnn_layer2 = GRU(16) (emb_q2)
main_l = concatenate([rnn_layer1, rnn_layer2])
main_l = Dropout(dropout_r) (Dense(128) (main_l))
main_l = Dropout(dropout_r) (Dense(64) (main_l))
output = Dense(1, activation="sigmoid") (main_l)
model_GRU = Model([q1,q2],output)
model_GRU.compile(loss='binary_crossentropy',optimizer=adam, metrics=['acc'])




In [99]:
model_GRU.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
q1 (InputLayer)                 (None, 50)           0                                            
__________________________________________________________________________________________________
q2 (InputLayer)                 (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 32, 100)      9015000     q1[0][0]                         
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 32, 100)      9015000     q2[0][0]                         
__________________________________________________________________________________________________
gru_1 (GRU

In [31]:
#FITTING THE MODEL
BATCH_SIZE = 20000
epochs = 5

model_GRU.fit(X_train, y_train, epochs=epochs, batch_size=BATCH_SIZE
          , validation_data=(X_valid, y_)
          , verbose=1)

Train on 303217 samples, validate on 101073 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c2a85254a8>

In [32]:
#EVLUEATE THE MODEL ON DEV TEST: What is it doing?
val_preds = model_GRU.predict(X_valid)

In [110]:
val_preds_x_train = model_GRU.predict(X_train)

In [33]:
X_final = get_keras_data(df)

In [34]:
y_final_preds = model_GRU.predict(X_final)

In [132]:
y_final_classes = np.argmax(y_final_preds,axis=1)

In [35]:
df['predictions_probs'] = y_final_preds

In [36]:
def update_score(row):
    row['results'] = 1 if row['predictions_probs'] > 0.5 else 0
    return row

In [37]:
df['results'] = df['predictions_probs'].apply(lambda x: 1 if x > 0.5 else 0)

In [45]:
df.loc[(df['results']== 0) & (df['is_duplicate'] == 1)]

Unnamed: 0,question1,question2,is_duplicate,q1_encoded,q2_encoded,predictions_probs,results
5,astrology: i be a capricorn sun cap moon and c...,"i be a tripl capricorn (sun, moon and ascend i...",1,"[3309, 4, 1, 6, 8211, 876, 3441, 766, 11, 3441...","[4, 1, 6, 4425, 8211, 876, 766, 11, 8941, 8, 8...",0.150777,0
7,how can i be a good geologist?,what should i do to be a great geologist?,1,"[5, 12, 4, 1, 6, 42, 18330]","[3, 29, 4, 9, 7, 1, 6, 352, 18330]",0.410625,0
13,what wa your first sexual experi like?,what wa your first sexual experience?,1,"[3, 58, 33, 102, 1017, 361, 36]","[3, 58, 33, 102, 1017, 881]",0.423808,0
15,what would a trump presid mean for current int...,how will a trump presid affect the student pre...,1,"[3, 44, 6, 91, 284, 72, 13, 329, 382, 7608, 15...","[5, 34, 6, 91, 284, 259, 2, 151, 854, 8, 199, ...",0.144300,0
16,what doe manipul mean?,what doe manipul means?,1,"[3, 20, 3721, 72]","[3, 20, 3721, 2987]",0.306189,0
20,whi do rocket look white?,whi be rocket and booster paint white?,1,"[16, 9, 3545, 152, 348]","[16, 1, 3545, 11, 8246, 1195, 348]",0.312528,0
31,what be some special care for someon with a no...,how can i keep my nose from get stuffi at night?,1,"[3, 1, 26, 794, 610, 13, 86, 28, 6, 1797, 27, ...","[5, 12, 4, 296, 17, 1797, 32, 22, 11486, 41, 614]",0.318407,0
32,what game of throne villain would be the most ...,what game of throne villain would you most lik...,1,"[3, 187, 10, 1704, 4175, 44, 1, 2, 52, 36, 7, ...","[3, 187, 10, 1704, 4175, 44, 14, 52, 36, 7, 1,...",0.309191,0
38,how do we prepar for upsc?,how do i prepar for civil service?,1,"[5, 9, 53, 133, 13, 992]","[5, 9, 4, 133, 13, 443, 1536]",0.442758,0
48,what be some exampl of product that can be mak...,what be some of the product make from crude oil?,1,"[3, 1, 26, 233, 10, 261, 27, 12, 1, 37, 32, 60...","[3, 1, 26, 10, 2, 261, 37, 32, 6072, 704]",0.246882,0


In [39]:
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,recall_score
cm = confusion_matrix(df['is_duplicate'], df['results'])

In [40]:
cm

array([[219951,  35076],
       [ 96581,  52682]], dtype=int64)

In [41]:
Total = cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]
Accuracy = (cm[0][0] + cm[1][1])/Total
print("Accuracy %.2f" %(Accuracy*100))

Accuracy 67.44


In [42]:
Acc = accuracy_score(df['is_duplicate'], df['results'])

In [43]:
f1 = f1_score(df['is_duplicate'], df['results'])
print(f1)

0.444534450534


In [44]:
recall = recall_score(df['is_duplicate'], df['results'])
print(recall)

0.352947481961


In [49]:
submissions = pd.DataFrame({'question1': df2['question1'],'question2': df2['question2'],'is_duplicate':df['is_duplicate'],'results': df['results']})

In [51]:
submissions.to_csv("submissions.csv",index=False)