In [12]:
import pandas as pd;

#Load dataset and examine dataset, rename columns to questions and answers

df=pd.read_csv("FAQ_NLP_Questions.csv",encoding='cp1252');
df.columns=["questions","answers"];

df

Unnamed: 0,questions,answers
0,Explain in details about Natural Processing la...,Natural Language Processing (NLP) is designed ...
1,There have some various common elements of nat...,There have a lot of components normally using ...
2,Explain details about varieties areas availabl...,Natural language processing (NLP) can have an ...
3,In the case of processing natural language we ...,These are the basic NLP Interview Questions as...
4,One another very common terminology used in th...,TF-IDF or tf-IDF is basically stood for some c...
5,There are several tagging using for processing...,Part of speech tagger is a very interesting an...
6,As the analysis is one of the critical require...,Pragmatic analysis is one of the critical anal...
7,Again as NLP is used for multiple language pro...,Dependency parsing is actually known in the in...
8,One of the very basic requirement of NLP is ke...,This is the most asked NLP Interview Question ...
9,There have some classification model define in...,There have several classifications followed by...


# Preprocessing

For this task we are performing the following preprocessing :
<br>-Removing all characters that are not alpha numeric
<br>-Removing stopwords - commonly used words such as 'a', 'to', 'in' and so on.. that do not contribute to the semantic similarity between two sentences.
<br> We apply this to both the FAQ questions and the user query sentence.


In [13]:
import re
import gensim 
from gensim.parsing.preprocessing import remove_stopwords

#from nltk.stem.lancaster import LancasterStemmer
#st = LancasterStemmer()

def clean_sentence(sentence, stopwords=False):
    
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    
    if stopwords:
         sentence = remove_stopwords(sentence)
    
    return sentence
                    
def get_cleaned_sentences(df,stopwords=False):    
    sents=df[["questions"]];
    cleaned_sentences=[]

    for index,row in df.iterrows():
        #print(index,row)
        cleaned=clean_sentence(row["questions"],stopwords);
        cleaned_sentences.append(cleaned);
    return cleaned_sentences;

cleaned_sentences=get_cleaned_sentences(df,stopwords=True)
print("Without Stopwords:\n")
print(cleaned_sentences);

print("\n")
print("With Stopwords:\n")
cleaned_sentences_with_stopwords=get_cleaned_sentences(df,stopwords=False)
print(cleaned_sentences_with_stopwords);

Without Stopwords:

['explain details natural processing language nlp currently key artificial language learning processes started industry', 'common elements natural language processing elements essential understanding nlp properly explain example', 'explain details varieties areas available processing natural languages smartly know impacted areas small processing started recently', 'case processing natural language normally mentioned common terminology nlp binding language terminology properly explain details nlp terminology example', 'common terminology case natural learning processing called tfidf explain details understanding tfidf properly come example', 'tagging processing natural languages tagging parts speech pos tagging industrys popular ones explain speech pos tagging properly', 'analysis critical requirements natural language processing nlp follow analysis approaches understanding nlp properly key analysis called pragmatic analysis explain pragmatic analysis details', 'nlp 

# Bag of words model
The first model we will use for semantic similarity is leveraging Bag of Words (BOW). With BOW, each sentence is encoded into a vector whose length is the number of words in the vocabulary. Each element of the vector indicates how many times the particular word occurs in the sentence. <br>Note that a vector representation of a sentence is often also called an "Embedding" since it is a way of embedding a sentence in M-dimensional space if the vector is of length M.

In [14]:
import numpy

sentences=cleaned_sentences_with_stopwords
#sentences=cleaned_sentences

# Split it by white space 
sentence_words = [[word for word in document.split() ]
         for document in sentences]

from gensim import corpora

dictionary = corpora.Dictionary(sentence_words)
for key, value in dictionary.items():
    print(key, ' : ', value)

import pprint
bow_corpus = [dictionary.doc2bow(text) for text in sentence_words]
for sent,embedding in zip(sentences,bow_corpus):
    print(sent)
    print(embedding)

question_orig="What really natural language processing is?"
question=clean_sentence(question_orig,stopwords=False);
question_embedding = dictionary.doc2bow(question.split())


print("\n\n",question,"\n",question_embedding)

0  :  about
1  :  artificial
2  :  been
3  :  currently
4  :  details
5  :  explain
6  :  have
7  :  in
8  :  industry
9  :  is
10  :  key
11  :  language
12  :  learning
13  :  natural
14  :  nlp
15  :  of
16  :  one
17  :  processes
18  :  processing
19  :  started
20  :  that
21  :  the
22  :  which
23  :  an
24  :  are
25  :  can
26  :  common
27  :  detail
28  :  elements
29  :  essential
30  :  example
31  :  for
32  :  please
33  :  properly
34  :  same
35  :  some
36  :  there
37  :  those
38  :  understanding
39  :  various
40  :  with
41  :  you
42  :  areas
43  :  as
44  :  available
45  :  impacted
46  :  know
47  :  languages
48  :  recently
49  :  small
50  :  smartly
51  :  this
52  :  varieties
53  :  very
54  :  we
55  :  whether
56  :  and
57  :  binding
58  :  case
59  :  every
60  :  mentioned
61  :  normally
62  :  terminology
63  :  another
64  :  called
65  :  come
66  :  on
67  :  tfidf
68  :  used
69  :  all
70  :  be
71  :  how
72  :  industrys
73  :  it
74  :

# cosine similarity
Once we find a vector representation for each sentence using BOW,we can compute the distance between two vectors by taking the cosine similarity. Note other similarity measure can be used as well, but we will stick to cosine similarity throughout for simplicity.
<br>The closest matching answer can be retrieved by finding the cosine similarity of the query vector with each of the FAQ question vectors.

In [15]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity;
def retrieveAndPrintFAQAnswer(question_embedding,sentence_embeddings,FAQdf,sentences):
    max_sim=-1;
    index_sim=-1;
    for index,faq_embedding in enumerate(sentence_embeddings):
        #sim=cosine_similarity(embedding.reshape(1, -1),question_embedding.reshape(1, -1))[0][0];
        sim=cosine_similarity(faq_embedding,question_embedding)[0][0];
        print(index, sim, sentences[index])
        if sim>max_sim:
            max_sim=sim;
            index_sim=index;
       
    print("\n")
    print("Question: ",question)
    print("\n");
    print("Retrieved: ",FAQdf.iloc[index_sim,0]) 
    print(FAQdf.iloc[index_sim,1])        
    
retrieveAndPrintFAQAnswer(question_embedding,bow_corpus,df,sentences);

0 0.11043152607484653 explain in details about natural processing language nlp  which is currently one of the key artificial language learning processes that have been started in the industry
1 0.996240588195683 there have some various common elements of natural language processing those elements are essential for understanding nlp properly can you please explain the same in detail with an example
2 0.11043152607484653 explain details about varieties areas available in processing natural languages smartly whether we know impacted areas are very small as this processing started very recently
3 0.11043152607484653 in the case of processing natural language we normally mentioned one common terminology nlp and binding every language with the same terminology properly please explain in details about this nlp terminology with an example
4 0.9909924304103231 one another very common terminology used in the case of natural learning processing is called tfidf please explain in details on the und

# Word2Vec Embeddings
Word2Vec ebeddings are popularly trained using the skipgram model. These embeddings are trained to take a word as input and reconstruct its context. As a result, they are able to take into account semantic similarity of words based on context information. The resulting embeddings are such that words with similar meaning tend to be closer in terms of cosine similarity.
# Skipgram model :
The most popular word2vec model is the skipgram model. Particularly, the most commonly used pre-trained model is based on the Google News dataset that has 3 billion running words and creates upto 300 dimensional embedding for 3 Million words.
# Glove Embeddings :
Glove is an alternate approach to build word embeddings using matrix factorization techinques on the word-word co-occurance matrix.<br>While both the techniques are popular, glove performs better on some datasets while word2vec skipgram model performs better on some. Here, we experiment with both the word2vec and the glove models.

In [16]:

from gensim.models import Word2Vec 
import gensim.downloader as api


glove_model=None;
try:
    glove_model = gensim.models.KeyedVectors.load("./glovemodel.mod")
    print("Loaded glove model")
except:            
    glove_model = api.load('glove-twitter-25')
    glove_model.save("./glovemodel.mod")
    print("Saved glove model")
    
v2w_model=None;
try:
    v2w_model = gensim.models.KeyedVectors.load("./w2vecmodel.mod")
    print("Loaded w2v model")
except:            
    v2w_model = api.load('word2vec-google-news-300')
    v2w_model.save("./w2vecmodel.mod")
    print("Saved glove model")

w2vec_embedding_size=len(v2w_model['computer']);
glove_embedding_size=len(glove_model['computer']);

Loaded glove model
Loaded w2v model


# Finding phrase embedding from word embeddings
To find the phrase embedding, there are several specialized techniques, the most simple technique to convert word embeddings to phrase embeddings is applicable to word2vec and glove embeddings, is to sum up the individual word embeddings in the phrase to get the phrase vector.

In [17]:


def getWordVec(word,model):
        samp=model['computer'];
        vec=[0]*len(samp);
        try:
                vec=model[word];
        except:
                vec=[0]*len(samp);
        return (vec)


def getPhraseEmbedding(phrase,embeddingmodel):
                       
        samp=getWordVec('computer', embeddingmodel);
        vec=numpy.array([0]*len(samp));
        den=0;
        for word in phrase.split():
            #print(word)
            den=den+1;
            vec=vec+numpy.array(getWordVec(word,embeddingmodel));
        #vec=vec/den;
        #return (vec.tolist());
        return vec.reshape(1, -1)



In [18]:


#With w2Vec

sent_embeddings=[];
for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,v2w_model));

question_embedding=getPhraseEmbedding(question,v2w_model);

retrieveAndPrintFAQAnswer(question_embedding,sent_embeddings,df, cleaned_sentences);



0 0.809038982448252 explain details natural processing language nlp currently key artificial language learning processes started industry
1 0.7357250616707345 common elements natural language processing elements essential understanding nlp properly explain example
2 0.6672128531896241 explain details varieties areas available processing natural languages smartly know impacted areas small processing started recently
3 0.6695720880902569 case processing natural language normally mentioned common terminology nlp binding language terminology properly explain details nlp terminology example
4 0.722072672357894 common terminology case natural learning processing called tfidf explain details understanding tfidf properly come example
5 0.5505312478214434 tagging processing natural languages tagging parts speech pos tagging industrys popular ones explain speech pos tagging properly
6 0.5585174439833444 analysis critical requirements natural language processing nlp follow analysis approaches und

In [19]:
#With Glove

sent_embeddings=[];
for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,glove_model));
    
question_embedding=getPhraseEmbedding(question,glove_model);

retrieveAndPrintFAQAnswer(question_embedding,sent_embeddings,df, cleaned_sentences);

0 0.9283977592726819 explain details natural processing language nlp currently key artificial language learning processes started industry
1 0.9041228179020536 common elements natural language processing elements essential understanding nlp properly explain example
2 0.8934297228771688 explain details varieties areas available processing natural languages smartly know impacted areas small processing started recently
3 0.8921201177469507 case processing natural language normally mentioned common terminology nlp binding language terminology properly explain details nlp terminology example
4 0.9623097698639631 common terminology case natural learning processing called tfidf explain details understanding tfidf properly come example
5 0.8941870591043388 tagging processing natural languages tagging parts speech pos tagging industrys popular ones explain speech pos tagging properly
6 0.8484073726956479 analysis critical requirements natural language processing nlp follow analysis approaches u