### Code to Match User Search to Database FAQs

###### Jupyter implementation of the code used in the backend of the website

In [1]:
#Load FAQ dataset

import pandas as pd
import numpy

df = pd.read_csv("faq_database.csv",header=0)
df

Unnamed: 0,Questions,Answers
0,How is data scientist different than data analyst,Data Science is a field which contains various...
1,What is regularisation? Explain L1 and L2 regu...,Regularisation is a mathematical way of solvin...
2,How do Data Scientists use statistics?,Statistics plays a powerful role in Data Scien...
3,What does the job hunting experience look like ?,Job hunting experience involves networking to...
4,Any insights you can offer about the DS job ma...,There are many kinds of roles data scientist ...
5,What?s the impact of Covid on hiring for DS ro...,Hiring is going to slow down. First in small c...
6,What skills and qualities do employers look fo...,The following are some skills employers usuall...
7,Do employers look for an advanced ML degree?,For more senior roles: People typically look f...
8,How does a typical day of a data scientist loo...,Here are some tasks in the typical day of a da...
9,Do I need to prepare algorithms and data struc...,Yes. In many data science interviews (ML Scien...


In [2]:
#Code to clean datasets:

import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords

def clean_sentence(sentence, stopwords=False):
    
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '',sentence)
    
    if stopwords:
        sentence = remove_stopwords(sentence)
        
    return sentence
        
def get_cleaned_sentences(df,stopwords=False):
    sents = df[['Questions']];
    cleaned_sentences =[]
    
    for index,row in df.iterrows():
        cleaned = clean_sentence(row["Questions"],stopwords);
        cleaned_sentences.append(cleaned);
    return cleaned_sentences;

cleaned_sentences=get_cleaned_sentences(df,stopwords=True)
print(cleaned_sentences);

print("\n")

cleaned_sentences_with_stopwords=get_cleaned_sentences(df,stopwords=False)
print(cleaned_sentences_with_stopwords);



['data scientist different data analyst', 'regularisation explain l1 l2 regularisation', 'data scientists use statistics', 'job hunting experience look like', 'insights offer ds job market', 'whats impact covid hiring ds roles', 'skills qualities employers look data scientist', 'employers look advanced ml degree', 'typical day data scientist look like', 'need prepare algorithms data structures data science interview', 'proficient data scientist coding', 'mathematical background required data scientist', 'rounds data scientist interview', 'data cleansing important', 'linear logistic regression', 'normal distribution', 'difference interpolation extrapolation', 'recommender', 'r python choose text analysis', 'explain ab testing', 'data scientists increase salaries', 'data scientists work home', 'data scientist considered demand occupation']


['how is data scientist different than data analyst', 'what is regularisation explain l1 and l2 regularisation', 'how do data scientists use statist

In [3]:
#Function to return closest matching question and corresponding answer
#Uses Cosine Similarity metric

import sklearn
from sklearn.metrics.pairwise import cosine_similarity;

def retrieveAndPrintFAQAnswer(question_embedding,sentence_embeddings,FAQdf,sentences):
    max_sim = -1;
    index_sim = -1;
    for index,faq_embedding in enumerate(sentence_embeddings):
        sim=cosine_similarity(faq_embedding,question_embedding)[0][0];
        print(index,sim,sentences[index])
        if sim>max_sim:
            max_sim = sim;
            index_sim = index;
        
    print("\n")
    print("Question: ",question)
    print("\n");
    print("Retrieved: ",FAQdf.iloc[index_sim,0])
    print(FAQdf.iloc[index_sim,1])
        

In [4]:
#Load Glove and Google News Model
from gensim.models import Word2Vec
import gensim.downloader as api

glove_model = None;
try:
    glove_model = gensim.models.KeyedVectors.load("./glovemodel.mod")
    print("Loaded glove model")
except:
    glove_model = api.load('glove-twitter-25')
    glove_model.save("./glovemodel.mod")
    print("Saved glove model")

v2w_model = None;
try:
    v2w_model = gensim.models.KeyedVectors.load("./w2vecmodel.mod")
    print("Loaded W2Vec model")
except:
    v2w_model = api.load('word2vec-google-news-300')
    glove_model.save("./w2vecmodel.mod")
    print("Saved W2Vec model")

w2vec_embedding_size=len(v2w_model['computer'])
glove_embedding_size=len(glove_model['computer'])

    

Loaded glove model
Loaded W2Vec model


In [5]:
#Start BERT Server
#*Please make sure BERT Server Client is running in background*
#*Refer: https://bert-as-service.readthedocs.io/en/latest/section/get-start.html

from bert_serving.client import BertClient
bc = BertClient()
res = bc.encode(['Data Science requirements'])
print(res.shape)

(1, 768)


In [6]:
#Create Embeddings of Words and Sentences:

def getWordVec(word,model):
    samp = model['computer']
    vec = [0]*len(samp);
    try:
        vec = model[word]
    except:
        vec = [0]*len(samp)
    return(vec)

def getPhraseEmbedding(phrase,embeddingmodel):
    samp = getWordVec('computer',embeddingmodel);
    vec = numpy.array([0]*len(samp))
    den=0;
    for word in phrase.split():
        den = den + 1;
        vec = vec + numpy.array(getWordVec(word,embeddingmodel));
    return vec.reshape(1,-1)

### Use the following to search for a question

### Glove-Twitter-25

In [7]:
#Please change question as required
#FOR GLOVE MODEL

question = "What is the role of a data scientist?"

question_embedding = getPhraseEmbedding(question,glove_model);

sent_embeddings = [];

for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,glove_model));

retrieveAndPrintFAQAnswer(question_embedding,sent_embeddings,df,cleaned_sentences)

0 0.8313907929825542 data scientist different data analyst
1 0.7783146034532031 regularisation explain l1 l2 regularisation
2 0.8028992774077468 data scientists use statistics
3 0.9177990480036843 job hunting experience look like
4 0.8253179912453033 insights offer ds job market
5 0.8719168769853538 whats impact covid hiring ds roles
6 0.8423017372494708 skills qualities employers look data scientist
7 0.8319386259540305 employers look advanced ml degree
8 0.9386164743220934 typical day data scientist look like
9 0.8677292128612414 need prepare algorithms data structures data science interview
10 0.6312426048899856 proficient data scientist coding
11 0.765853408452705 mathematical background required data scientist
12 0.880996711503084 rounds data scientist interview
13 0.7950051355207921 data cleansing important
14 0.23777662317838924 linear logistic regression
15 0.8436636233723235 normal distribution
16 0.8346646142688189 difference interpolation extrapolation
17 0.0 recommender
18 

### Google-News-300

In [8]:
#Please change question as required
#FOR GOOGLE NEWS MODEL

question = "What is the role of a data scientist?"

question_embedding = getPhraseEmbedding(question,v2w_model);

sent_embeddings = [];

for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,v2w_model));

retrieveAndPrintFAQAnswer(question_embedding,sent_embeddings,df,cleaned_sentences)

0 0.4414584837680419 data scientist different data analyst
1 0.11078282357511264 regularisation explain l1 l2 regularisation
2 0.5061646051852381 data scientists use statistics
3 0.4314382088086639 job hunting experience look like
4 0.45196381140765984 insights offer ds job market
5 0.4938778533904861 whats impact covid hiring ds roles
6 0.5243024201806661 skills qualities employers look data scientist
7 0.31313168148776815 employers look advanced ml degree
8 0.5271634331068429 typical day data scientist look like
9 0.5316223730356031 need prepare algorithms data structures data science interview
10 0.34271566978097806 proficient data scientist coding
11 0.42443813190010304 mathematical background required data scientist
12 0.36354749539109327 rounds data scientist interview
13 0.5318969980969773 data cleansing important
14 0.2686188082082466 linear logistic regression
15 0.22453467590673362 normal distribution
16 0.27423390906019884 difference interpolation extrapolation
17 0.22793090

### BERT

In [9]:
#Please change question as required
#FOR BERT MODEL

question_orig = "What is the role of a data scientist?"
question = clean_sentence(question_orig,stopwords=False)
cleaned_sentences = get_cleaned_sentences(df,stopwords=False)

sent_bertphrase_embedding = [];

for sent in cleaned_sentences:
    sent_bertphrase_embedding.append(bc.encode([sent]))

question_embedding=bc.encode([question]);

retrieveAndPrintFAQAnswer(question_embedding,sent_bertphrase_embedding,df,cleaned_sentences)

0 0.8184699 how is data scientist different than data analyst
1 0.69985795 what is regularisation explain l1 and l2 regularisation
2 0.81766987 how do data scientists use statistics
3 0.78070855 what does the job hunting experience look like 
4 0.7190356 any insights you can offer about the ds job market 
5 0.7553308 whats the impact of covid on hiring for ds roles
6 0.8318986 what skills and qualities do employers look for in a data scientist
7 0.72800684 do employers look for an advanced ml degree
8 0.7915081 how does a typical day of a data scientist look like
9 0.7796669 do i need to prepare algorithms and data structures for a data science interview 
10 0.83909994 how proficient should a data scientist be in coding
11 0.91271245 what is the mathematical background required for a data scientist 
12 0.83308864 what are the various rounds in a data scientist interview 
13 0.8028996 why data cleansing is important
14 0.7286502 what is linear and logistic regression
15 0.7572076 what i