In [None]:
# !pip install bert-embedding
# !pip install pandas

In [None]:
import os, re, io
import numpy as np
import requests
import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import spacy
nlp = spacy.load('en_core_web_sm')
from bert_embedding import BertEmbedding
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
## Data Preprocessing

"""
Blueprint of preprocessing and lemmatization for the text similarity

Args:
  data_df: Name of the Dataframe
  column_name : column name

Returns: 
  final_token : The list of the words

"""

class TextSimilarity():
    """Clean and lemmatize one text column of a DataFrame for similarity matching.

    On construction every column of the frame is converted to lowercase
    strings (missing values become empty strings). The cleaning steps then
    rewrite ``column_name`` in place on the internal frame and store the
    stopword-free / lemmatized text in ``processed_<column_name>``.

    Args:
        data_df: DataFrame holding the text. The caller's frame is not
            modified; the class works on its own converted copy.
        column_name: name of the text column to preprocess (mandatory).

    Attributes:
        data_df: the lowercased working DataFrame.
        column: name of the text column being cleaned.
        processed_column_name: name of the derived output column.
        applied_stopword: True once remove_stopwords() has run.

    Raises:
        ValueError: if ``column_name`` is missing, empty or not a string.
    """

    def __init__(self, data_df, column_name=None):
        """Init the Preprocessing"""
        # Reject a missing/empty name as well as a non-string one. The two
        # checks must be joined with `or`: with `and`, None and non-string
        # values slip through unvalidated.
        if not column_name or not isinstance(column_name, str):
            raise ValueError("column name is mandatory. Make sure type is string format")
        self.data_df = data_df
        self.column = column_name
        self.convert_lowercase()
        self.applied_stopword = False
        self.processed_column_name = f"processed_{self.column}"

    def convert_lowercase(self):
        """Replace missing values with '' and lowercase every column as text."""
        # fillna() without inplace returns a new frame, so the DataFrame the
        # caller handed in is left untouched.
        self.data_df = (
            self.data_df
            .fillna('')
            .apply(lambda column: column.astype(str).str.lower(), axis=0)
        )

    def remove_question_no(self):
        """Replace a leading question number such as '12.' with a space."""
        self.data_df[self.column] = self.data_df[self.column].apply(
            lambda row: re.sub(r'^\d+[.]', ' ', row)
        )

    def remove_symbols(self):
        """Replace every character that is not a letter, digit or whitespace with a space."""
        self.data_df[self.column] = self.data_df[self.column].apply(
            lambda row: re.sub(r'[^A-Za-z0-9\s]', ' ', row)
        )

    def remove_stopwords(self):
        """Remove stopwords and store the result in the processed column."""
        # Inside a method body the bare name resolves to the module-level
        # remove_stopwords function (imported from gensim), not this method.
        # Assigning the whole Series keeps rows aligned by index label, which
        # positional `.loc[idx]` writes do not on a non-default index.
        self.data_df[self.processed_column_name] = self.data_df[self.column].apply(remove_stopwords)
        self.applied_stopword = True

    def apply_lemmatization(self, perform_stopword):
        """Lemmatize each text with spaCy and store it in the processed column.

        Args:
            perform_stopword: when truthy, lemmatize the stopword-free text
                already stored in the processed column; otherwise lemmatize
                the cleaned source column.
        """
        ## get the column name to perform lemma operation whether stopwords removed text or not
        if perform_stopword:
            source_column = self.processed_column_name
        else:
            source_column = self.column

        def lemmatize(text):
            # join once per text; an empty text yields an empty string
            return " ".join(token.lemma_ for token in nlp(text.strip()))

        self.data_df[self.processed_column_name] = self.data_df[source_column].apply(lemmatize)

    def run_all(self, perform_stopword = True):
        """Run every preprocessing step and return the processed DataFrame.

        Args:
            perform_stopword: when True (default) stopwords are removed
                before lemmatization.

        Returns:
            The working DataFrame including the processed column.
        """
        self.remove_question_no()
        self.remove_symbols()
        if perform_stopword:
            self.remove_stopwords()
        self.apply_lemmatization(perform_stopword)
        return self.data_df

In [None]:
## load the FAQ data from disk
faq_csv_path = "COVID19_FAQ.csv"
df = pd.read_csv(faq_csv_path)
df.head(10)

## pre-process training question data (a copy is handed over, df itself stays raw)
question_answer = TextSimilarity(data_df=df.copy(), column_name="questions")
clean_df = question_answer.run_all()
clean_df.head(10)


Unnamed: 0,questions,answers,processed_questions
0,how does covid 19 spread,people can catch covid-19 from others who have...,covid 19 spread
1,what are the symptoms of covid 19,the most common symptoms of covid-19 are fever...,symptom covid 19
2,how do i know if it is covid 19 or just the ...,a covid-19 infection has the same signs and sy...,know covid 19 common flu
3,can the virus that causes covid 19 be transm...,studies to date suggest that the virus that ca...,virus cause covid 19 transmit air
4,what can i do to protect myself and prevent ...,protection measures for everyone stay aware ...,protect prevent spread disease
5,i am well and asymptomatic should i use a m...,"according to the who, for individuals without ...",asymptomatic use mask
6,how likely am i to catch covid 19,the risk depends on where you are - and more s...,likely catch covid 19
7,are pregnant women more susceptible to the c...,we do not have information from published scie...,pregnant woman susceptible covid 19 virus harm...
8,what is the risk of my child becoming sick w...,"based on available evidence, children do not a...",risk child sick covid 19
9,are the symptoms of covid 19 different in ch...,no. the symptoms of covid-19 are similar in ch...,symptom covid 19 different child adult


In [None]:
## raw user queries that will be matched against the FAQ questions
test_query_questions = [
    "Am I considered a close contact if I was wearing a mask?",
    "Is the virus that causes COVID-19 found in feces (stool)?",
    "Can the COVID-19 virus spread through sewerage?",
]

test_df = pd.DataFrame(test_query_questions, columns=["test_questions"])

## pre-process testing QA data
## hand over a copy, like the training data, so preprocessing cannot alter test_df
question_answer = TextSimilarity(test_df.copy(), column_name="test_questions")
query_df = question_answer.run_all()
query_df

Unnamed: 0,test_questions,processed_test_questions
0,am i considered a close contact if i was weari...,consider close contact wear mask
1,is the virus that causes covid 19 found in fec...,virus cause covid 19 fece stool
2,can the covid 19 virus spread through sewerage,covid 19 virus spread sewerage


In [None]:
## get bert embeddings
_BERT_EMBEDDING_MODEL = None  # created on first use and shared by later calls


def func_get_bert_embeddings(sentences):
    """Return the BERT embeddings of ``sentences``.

    The BertEmbedding model is built once and reused, so repeated calls do
    not construct (and load) a fresh model every time.

    Args:
        sentences: list of sentence strings to embed.

    Returns:
        Whatever the BertEmbedding instance returns for ``sentences``; the
        rest of this notebook reads element 1 of every returned item.
    """
    global _BERT_EMBEDDING_MODEL
    if _BERT_EMBEDDING_MODEL is None:
        _BERT_EMBEDDING_MODEL = BertEmbedding()
    return _BERT_EMBEDDING_MODEL(sentences)

## embed the cleaned text of both frames: clean_df and query_df went through the
## same lowercasing / symbol removal, whereas test_df still holds the raw queries
question_QA_bert_embeddings_list = func_get_bert_embeddings(clean_df["questions"].to_list())
query_QA_bert_embeddings_list = func_get_bert_embeddings(query_df["test_questions"].to_list())

Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/book_corpus_wiki_en_uncased-a6607397.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip...
Downloading /root/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip...


In [None]:
## keep element 1 of every FAQ embedding result
## (presumably the per-token vectors - confirm against bert_embedding's return format)
question_QA_bert_embeddings = [result[1] for result in question_QA_bert_embeddings_list]

## same extraction for the query string embedding results
query_QA_bert_embeddings = [result[1] for result in query_QA_bert_embeddings_list]

In [None]:
## helps to retrieve similar question based of input vectors/embeddings for test query
def _pool_sentence_vector(token_vectors):
    """Collapse the embedding(s) of one sentence into a single (1, dim) row.

    A 2-D input (one row per token) is averaged over its rows; a 1-D input is
    treated as an already pooled sentence vector. Returns None when the input
    holds no values, so callers can skip it.
    """
    vectors = np.asarray(token_vectors)
    if vectors.size == 0:
        return None
    if vectors.ndim == 1:
        return vectors.reshape(1, -1)
    return vectors.reshape(-1, vectors.shape[-1]).mean(axis=0, keepdims=True)


def func_get_SimilarFAQ(train_question_vectors, test_question_vectors, train_df, train_column_name, test_df, test_column_name):
    """Print, for every test question, the most similar training question.

    Each sentence is reduced to one vector (mean of its token vectors) before
    the cosine similarity is taken. Calling cosine_similarity directly on two
    token matrices returns a token-by-token matrix, and reading its [0][0]
    entry would compare only the first token of each sentence.

    Args:
        train_question_vectors: one embedding per training question, either a
            sequence of token vectors or a single sentence vector.
        test_question_vectors: embeddings of the test questions, same layout.
        train_df: DataFrame whose rows line up positionally with
            ``train_question_vectors``.
        train_column_name: column of ``train_df`` holding the question text.
        test_df: DataFrame whose rows line up positionally with
            ``test_question_vectors``.
        test_column_name: column of ``test_df`` holding the query text.

    Returns:
        None. The best score, the query and the matched question are printed
        per test question; score and match are printed as None when no
        candidate could be compared.
    """
    # pool every training sentence once instead of once per test question
    train_sentence_vectors = [_pool_sentence_vector(vectors) for vectors in train_question_vectors]

    for test_index, test_token_vectors in enumerate(test_question_vectors):
        test_sentence_vector = _pool_sentence_vector(test_token_vectors)
        best_score, best_index = None, None

        if test_sentence_vector is not None:
            for train_index, train_sentence_vector in enumerate(train_sentence_vectors):
                if train_sentence_vector is None:
                    continue
                sim_score = cosine_similarity(train_sentence_vector, test_sentence_vector)[0][0]
                # strict comparison keeps the first of equally good candidates
                if best_score is None or best_score < sim_score:
                    best_score, best_index = sim_score, train_index

        matched_question = None
        if best_index is not None:
            matched_question = train_df[train_column_name].iloc[best_index]

        print("final_similarity_score:", best_score)
        print(f"Query Question: {test_df[test_column_name].iloc[test_index]}")
        print(f"Get Question: {matched_question}")
        print("\n")

## match every pre-processed query against the FAQ questions and print the best hit
func_get_SimilarFAQ(
    train_question_vectors=question_QA_bert_embeddings,
    test_question_vectors=query_QA_bert_embeddings,
    train_df=clean_df,
    train_column_name="questions",
    test_df=query_df,
    test_column_name="test_questions",
)

sim_score 0.50462335
sim_score 0.5484787
sim_score 0.60838175
sim_score 0.63710284
sim_score 0.63902116
sim_score 0.6493133
sim_score 0.65950215
sim_score 0.6864102
sim_score 0.73272663
sim_score 0.9094605
final_similarity_score: 0.9094605
Query Question: am i considered a close contact if i was wearing a mask 
Get Question:   am i considered a close contact if i was wearing a mask 


sim_score 0.5194749
sim_score 0.6951481
sim_score 0.7672975
final_similarity_score: 0.7672975
Query Question: is the virus that causes covid 19 found in feces  stool  
Get Question:   is there a vaccine  drug or treatment for covid 19 


sim_score 0.5936823
sim_score 0.76098454
sim_score 0.79319656
final_similarity_score: 0.79319656
Query Question: can the covid 19 virus spread through sewerage 
Get Question:   can the covid 19 virus spread through sewerage systems 


