In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import backend as K

In [12]:
# Read both text files whole; each is kept as one raw string at this point.
with open('nva_phrases.txt', 'r') as phrases_file:
    candidates = phrases_file.read()
with open('nva_queries.txt', 'r') as queries_file:
    queries = queries_file.read()

In [13]:
# Turn each raw string into a list with one entry per '\n'-separated line.
# A trailing newline in the text produces a final '' entry.
candidates, queries = (raw_text.split('\n') for raw_text in (candidates, queries))

In [99]:
# De-duplicate while keeping first-seen order. list(set(...)) returned the
# strings in an arbitrary order that can change between kernel sessions
# (string hash randomisation), which made the row order of everything built
# from these lists non-reproducible. dict.fromkeys keeps insertion order and
# yields the same unique items.
candidates = list(dict.fromkeys(candidates))
queries = list(dict.fromkeys(queries))
print(len(candidates))
print(len(queries))

199
24


In [100]:
# Cartesian product of queries x candidates, grouped by query:
# every query is paired with every candidate as a [query, candidate] list.
questions = [
    [query_text, candidate_text]
    for query_text in queries
    for candidate_text in candidates
]

In [101]:
# One row per (query, candidate) pair.
df = pd.DataFrame(questions, columns=['question1', 'question2'])
# Drop pairs whose candidate is the empty string, then renumber rows from 0.
non_empty_candidate = df['question2'] != ''
df = df[non_empty_candidate].reset_index(drop=True)
print(df.shape)
df.tail()

(4752, 2)


Unnamed: 0,question1,question2
4747,Parathas were delicious,Chicken was tender
4748,Parathas were delicious,meat was indistinguishable ENDPAD due
4749,Parathas were delicious,Biryani was too good
4750,Parathas were delicious,first choices are always heaps
4751,Parathas were delicious,kebabs are good


##### Functions

In [102]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

# Words taken back out of the stop-word list so that text_process keeps them.
imp_stopwords = ['not', 'against', 'until']

stop_words = stopwords.words('english')
for kept_word in imp_stopwords:
    # list.remove raises ValueError if the word is not in the list.
    stop_words.remove(kept_word)
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'ca

In [103]:
def text_process(text):
    '''Clean one sentence and return it as a single space-joined string.

    The input is converted with str() and lower-cased, a fixed sequence of
    regex substitutions is applied in order, the result is split on
    whitespace, tokens found in the module-level ``stop_words`` list are
    dropped, and the remaining tokens are joined with single spaces.
    '''
    # (pattern, replacement) pairs, applied strictly top to bottom; later
    # rules rely on the output of earlier ones, so the order must not change.
    substitutions = (
        # NOTE(review): inside the class, "+-=" is a character range, so it
        # also admits characters such as ':' and ';' -- confirm this is intended.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): r"\0s" matches a NUL character followed by "s", not the
        # text "0s" -- confirm intent before changing; left untouched here.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Drop stop words and rebuild the sentence.
    kept_tokens = [token for token in cleaned.split() if token not in stop_words]
    return ' '.join(kept_tokens)

In [104]:
def mandist(left, right):
    '''Similarity of two encodings: exp(-sum(|left - right|)) along axis 1.

    The sum keeps its reduced axis (keepdims=True). Because the exponent is
    never positive, the result is at most 1, reached when both inputs match.
    '''
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

##### Preprocess

In [105]:
# Add a cleaned "<column>_p" copy of each text column using text_process.
for source_col in ('question1', 'question2'):
    df[source_col + '_p'] = df[source_col].apply(text_process)
df.tail(10)

Unnamed: 0,question1,question2,question1_p,question2_p
4742,Parathas were delicious,burger and sandwich was good,parathas delicious,burger sandwich good
4743,Parathas were delicious,sizzlers are simply yumm,parathas delicious,sizzlers simply yumm
4744,Parathas were delicious,chole gravy was fine,parathas delicious,chole gravy fine
4745,Parathas were delicious,Pizzas are delicious,parathas delicious,pizzas delicious
4746,Parathas were delicious,Homemade lichi flavoured lemonade,parathas delicious,homemade lichi flavoured lemonade
4747,Parathas were delicious,Chicken was tender,parathas delicious,chicken tender
4748,Parathas were delicious,meat was indistinguishable ENDPAD due,parathas delicious,meat indistinguishable endpad due
4749,Parathas were delicious,Biryani was too good,parathas delicious,biryani good
4750,Parathas were delicious,first choices are always heaps,parathas delicious,first choices always heaps
4751,Parathas were delicious,kebabs are good,parathas delicious,kebabs good


In [106]:
import pickle

# Load the pickled tokenizer. The context manager closes the file handle,
# which the previous bare open(...) call never did.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('BILSTM_stop_tokenizer', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [107]:
# Sequence length passed to pad_sequences; the loaded model's summary lists
# both inputs as (None, 109).
MAX_LEN = 109

# Convert the cleaned text of each side to token sequences with the loaded tokenizer.
X_left, X_right = (
    tokenizer.texts_to_sequences(df[text_col])
    for text_col in ('question1_p', 'question2_p')
)
# Apply pad_sequences with maxlen=MAX_LEN to each side.
X_left_padded, X_right_padded = (
    pad_sequences(sequences, maxlen=MAX_LEN)
    for sequences in (X_left, X_right)
)

In [108]:
# Load the saved Keras model from its HDF5 file and print its layer summary.
model_path = 'BILSTM_stop.h5'
model = tf.keras.models.load_model(model_path)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 109)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 109)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 109, 300)     24493200    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 50)           140400      embedding[0][0]              

##### Predict

In [109]:
%%time
# Run the model on the padded left/right inputs (the cell magic above must
# stay on the first line of the cell). A later cell reads predictions[idx][0]
# as the score for pair idx.
predictions = model.predict([X_left_padded, X_right_padded])

Wall time: 6.22 s


In [110]:
# One record per pair: raw texts, cleaned texts, then the first value of the
# matching prediction row.
records = [
    [raw_left, raw_right, clean_left, clean_right, prediction_row[0]]
    for raw_left, raw_right, clean_left, clean_right, prediction_row in zip(
        df.question1, df.question2, df.question1_p, df.question2_p, predictions
    )
]

In [111]:
# Column order mirrors the order in which each record was assembled.
result_columns = ['question1', 'question2', 'question1_p', 'question2_p', 'similarity']
results_df = pd.DataFrame(records, columns=result_columns)
print(results_df.shape)
results_df.head(10)

(4752, 5)


Unnamed: 0,question1,question2,question1_p,question2_p,similarity
0,Paratha,Food was decent,paratha,food decent,0.096995
1,Paratha,gulab jamun are too good,paratha,gulab jamun good,0.292841
2,Paratha,home made specials,paratha,home made specials,0.184465
3,Paratha,pizza was thin,paratha,pizza thin,0.093448
4,Paratha,Seekh kebab was just okay,paratha,seekh kebab okay,0.017054
5,Paratha,brownie seemed stale,paratha,brownie seemed stale,0.037703
6,Paratha,tandoored chicken was delicious,paratha,tandoored chicken delicious,0.173765
7,Paratha,sauce was penty ..,paratha,sauce penty,0.24983
8,Paratha,food was very nice,paratha,food nice,0.201407
9,Paratha,Mamas sauce was yummy,paratha,mamas sauce yummy,0.218082


In [210]:
# results_df.to_excel('/content/drive/My Drive/Chatbot/data/general_results.xlsx')

#### Sort Top similar candidates

In [112]:
# Distinct query texts (q1) and distinct candidate texts (q2) present in df.
q1, q2 = (set(df[text_col]) for text_col in ('question1', 'question2'))

In [113]:
# Number of distinct candidate texts left in df (empty candidates were
# filtered out when df was built).
len(q2)

198

In [114]:
# Materialise the groupby as a list of (query text, sub-frame) tuples.
groups = list(results_df.groupby('question1'))

In [115]:
# Sanity check: number of candidate rows per query.
df['question1'].value_counts()

Misal was worth it                   198
The Chicken Wings were amazing       198
I loved pani puri                    198
The pizza and pasta was delicious    198
Pizza                                198
The food was terrible                198
I love the pizza                     198
The fries were terrible              198
I like the Pasta                     198
Pasta                                198
Pizza was amazing                    198
The food was amazing                 198
dal makhni was terrble               198
Keema pav was good                   198
the pizza was bad                    198
Mango juice was good                 198
Parathas were delicious              198
Paratha                              198
Chicken                              198
the food was not good                198
The pizzas were awesome              198
I liked Pizza                        198
The biryani was decent               198
Food                                 198
Name: question1,

In [116]:
# For every query keep the 10 most similar and the 10 least similar candidates.
# Previously the "bottom" frame was built with the same descending sort and
# slice as the "top" frame, so bottom_candidates was a copy of top_candidates.
ranking_columns = ['question1', 'question2', 'similarity', 'question1_p', 'question2_p']
n_ranked = 10

top_candidates = []
bottom_candidates = []
for _, query_results in groups:
    # Highest similarity first.
    most_similar = query_results.sort_values(by='similarity', ascending=False)[ranking_columns].iloc[:n_ranked]
    # Lowest similarity first.
    least_similar = query_results.sort_values(by='similarity', ascending=True)[ranking_columns].iloc[:n_ranked]
    top_candidates.append(most_similar)
    bottom_candidates.append(least_similar)

In [117]:
# Stack the per-query top-10 frames into one frame with a fresh 0..n-1 index.
final_df = pd.concat(top_candidates, ignore_index=True)
print(final_df.shape)
final_df.head()

(240, 5)


Unnamed: 0,question1,question2,similarity,question1_p,question2_p
0,Chicken,Chicken was too good,0.234482,chicken,chicken good
1,Chicken,Lucknowi chicken was good,0.234482,chicken,lucknowi chicken good
2,Chicken,tandoored chicken was delicious,0.173395,chicken,tandoored chicken delicious
3,Chicken,Chicken was tender,0.150997,chicken,chicken tender
4,Chicken,chicken dish was perfect ..,0.134056,chicken,chicken dish perfect


In [118]:
# Sanity check: number of retained rows per query.
final_df['question1'].value_counts()

The fries were terrible              10
Parathas were delicious              10
Misal was worth it                   10
Paratha                              10
Pizza                                10
I liked Pizza                        10
Pizza was amazing                    10
Chicken                              10
Keema pav was good                   10
The Chicken Wings were amazing       10
I loved pani puri                    10
the pizza was bad                    10
dal makhni was terrble               10
I like the Pasta                     10
the food was not good                10
I love the pizza                     10
The pizza and pasta was delicious    10
Pasta                                10
Food                                 10
The food was amazing                 10
Mango juice was good                 10
The biryani was decent               10
The pizzas were awesome              10
The food was terrible                10
Name: question1, dtype: int64

In [119]:
# Write the top-candidate table to CSV in the working directory. No index
# argument is passed, so pandas' default handling of the row index applies.
final_df.to_csv('NVA_similarity_215.csv')