#### Purpose of this algorithm
* Build an algorithm to answer the questions from document which has question and answer markers.
* Note: This algorithm works only if the document has markers.

#### High level steps of the algorithm
* Read the documents from the folder which has the markers, below are the markers used for question and answer extraction.
    * question_start = 'QUE-S'
    * question_end = 'QUE-E'
    * answer_start = 'ANS-S'
    * answer_end = 'ANS-E'
* Create the question and answering knowledge base.
* Create a search index to understand the question and return the answer.
* Used soft cosine similarity from gensim in this algorithm for search index, this is to bring in semantic meaning to the sentence or question.


In [1]:
# Import the required packages 
import os

import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

from nltk.corpus import stopwords
import re

from docx import Document
import pandas as pd

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
"""
Read the document, loop through all the paragraphs and return them as a text
"""
def create_doc_text(doc):
    doc_paragraphs = []
    for para in doc.paragraphs:
        doc_paragraphs.append(para.text)
    
    doc_paragraphs = list(filter(None, doc_paragraphs))
    
    doc_text = ''
    for doc_paragraph in doc_paragraphs:
        doc_text = doc_text + doc_paragraph
    
    return doc_text

In [3]:
"""
Split the sentences with required markers from upcoming functions
"""
def split(text, start, end):
    s_splits = text.split(start)
    s_splits = list(filter(None, s_splits))
    
    e_splits = []
    for split in s_splits:
        e_splits.append(split.split(end))
    
    return e_splits

In [4]:
"""
Create knowledge base function by integrating above two functions
"""
def create_knowledge(doc):
    # create the markers for question and answer pair extraction
    q_start = 'QUE-S'
    q_end = 'QUE-E'
    a_start = 'ANS-S'
    a_end = 'ANS-E'
    
    doc_text = create_doc_text(doc)
    q_splits = split(doc_text.strip(), q_start, q_end)
    
    q_a_pairs = []
    for i in range(len(q_splits)):
        question = q_splits[i][0]
        answer = q_splits[i][1]
        answer = split(answer.strip(), a_start, a_end)
        #print(answer)
        answer = answer[0][0]
        q_a_pairs.append((question.strip(), answer.strip()))
    
    return q_a_pairs


In [5]:
"""
Create the knowledgebase from the document 
"""
data_dir = "C:\\srini\\qna\\word"
file_name = "XXX_HR_XXX_Code_FAQ.docx"
data_path = os.path.join(data_dir, file_name)

doc = Document(data_path)
q_a_pairs = create_knowledge(doc)
knowledge_base = pd.DataFrame(q_a_pairs, columns =['question', 'answer'])

In [6]:
# Sanity check the created knowledgebase
knowledge_base.head()

Unnamed: 0,question,answer
0,Our team is not in a client-facing or sales ro...,Whether your role requires you to interact wit...
1,Why are the physical security teams instructed...,A people manager could have a number of member...
2,"Does this policy apply to all members, includi...",The dress code policy is applicable to all mem...
3,Formal attire during high-profile business mee...,CGI believes that members should maintain a pr...
4,Is Jeans paired with a formal shirt considered...,"No, denim clothing or footwear is not acceptab..."


In [7]:
# get stop words from nltk
stopWords = stopwords.words('english')

In [8]:
# pre processing data function
def clean_data(sentence):
    # convert to lowercase, ignore all special characters - 
    # keep only alpha-numericals and spaces (not removing full-stop here)
    sentence = re.sub(r'[^A-Za-z0-9\s.]', r'', str(sentence).lower())
    sentence = re.sub(r'\n', r' ', sentence)
    
    # remove stop words
    sentence = " ".join([word for word in sentence.split() if word not in stopWords])
    
    return sentence.split()

# Pre-process all the questions
questions_list = knowledge_base.question.map(lambda x: clean_data(x))
# Make list of lists to feed the data into the algorithm
questions_list = questions_list.tolist()
# Sanity check the documents after pre-processing
questions_list[4:6]


[['jeans', 'paired', 'formal', 'shirt', 'considered', 'business', 'casuals'],
 ['work', 'night', 'shift.', 'need', 'follow', 'dress', 'code']]

In [9]:
# methods to train word-vectors
"""
Using below we can build our own word to vector embeddings 
"""
#w2v_model = Word2Vec(questions_list, size=50, min_count=1, iter=50)  


"""
Using below, we are loading pre-trained word vectors from the large corpus
List of pre-trained word vectors available in gensim, below link for details
https://github.com/RaRe-Technologies/gensim-data
"""
#w2v_model = api.load("glove-wiki-gigaword-50")
w2v_model = api.load("word2vec-google-news-300")

2020-02-25 20:07:26,914 : INFO : loading projection weights from C:\Users\mommasani.srinivasul/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-02-25 20:10:24,073 : INFO : loaded (3000000, 300) matrix from C:\Users\mommasani.srinivasul/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


In [10]:
# Sanity checks for the word2vec model
print(w2v_model.similarity('maharaja','maharani'))
print(w2v_model.most_similar('Kharagpur'))
print(len(w2v_model.wv['save']))
print(len(w2v_model.wv.vocab))


2020-02-25 20:10:24,116 : INFO : precomputing L2-norms of word weight vectors


0.6258443
[('Rourkela', 0.7121847867965698), ('Asansol', 0.7043161392211914), ('Dankuni', 0.6961910724639893), ('Bokaro', 0.6898972392082214), ('Howrah', 0.6849250793457031), ('Sambalpur', 0.6841989159584045), ('Jhargram', 0.683922290802002), ('Burdwan', 0.6817904114723206), ('Uluberia', 0.6772913336753845), ('Bilaspur', 0.6764135360717773)]
300
3000000


  after removing the cwd from sys.path.
  """


In [11]:
# Construct term simiarity index the trained or pre-trained word 2 vectors
termsim_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
# Construct the dictionary from our documents
dictionary = Dictionary(questions_list)
# Construct document to bag of words using the dictionary built above
bow_corpus = [dictionary.doc2bow(question) for question in questions_list]
# Construct similarity matrix using term similarity index built from word2vec and from the 
# document dictionary
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  
# Construct final document similarity index
scs_sim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=5)

  
2020-02-25 20:10:33,541 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-02-25 20:10:33,544 : INFO : built Dictionary(73 unique tokens: ['clientfacing', 'code', 'dress', 'necessary', 'role']...) from 11 documents (total 101 corpus positions)
2020-02-25 20:10:33,546 : INFO : constructing a sparse term similarity matrix using <gensim.models.keyedvectors.WordEmbeddingSimilarityIndex object at 0x00000233D1FC48C8>
2020-02-25 20:10:33,548 : INFO : iterating over columns in dictionary order
2020-02-25 20:10:33,553 : INFO : PROGRESS: at 1.37% columns (1 / 73, 1.369863% density, 1.369863% projected density)
2020-02-25 20:10:51,776 : INFO : constructed a sparse term similarity matrix with 2.720961% density


In [12]:
# make a query
query = 'jeans and formal shirt are business casual?'
query = clean_data(query)

# calculate similarity of query to each doc from bow_corpus
scs_sims = scs_sim_index[dictionary.doc2bow(query)]
for i in range(len(scs_sims)):
    print("confidence score --> {} for record --> {}" .format(scs_sims[i][1], knowledge_base.question[scs_sims[i][0]]))


confidence score --> 1.0 for record --> Is Jeans paired with a formal shirt considered business casuals?
confidence score --> 0.5443339347839355 for record --> Formal attire during high-profile business meetings and visits is fine but why do we need to have a dress code throughout the week?
confidence score --> 0.23741625249385834 for record --> Due to medical reasons, I need to wear certain clothing or footwear. Can I be exempted from the dress code?
confidence score --> 0.17929314076900482 for record --> Our team is not in a client-facing or sales role, why is a dress code necessary?
confidence score --> 0.16627000272274017 for record --> The dress code is very restrictive. I cycle to work every day and so wear biking attire, I later change at work. Will the security team allow me to enter the facility?


  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))
  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))
