### Sarthak Mishra
### Roll No- 18388
### Department of Physics, IISER Bhopal

# Question Answering from e-manual using word embeddings in NLP

In [1]:
import pandas as pd

# data collected from kaggle competition: e-manual Datathon 2021
df = pd.read_json("../data/emanual_dataset.json")
df

Unnamed: 0,Question,Type,Answer,id
0,what is sound mode?,Normal,"when the focus is moved to the icon, a list of...",0
1,could you explain about sound output?,Normal,"when the focus is moved to the icon, a list of...",1
2,what is the function of sleep timer?,Normal,"when the focus is moved to the icon, a list of...",2
3,where can i can view the current network?,Normal,"when the focus is moved to the icon, a list of...",3
4,how to select an external device connected to ...,Normal,you can select an external device connected to...,4
...,...,...,...,...
541,can i cancel scheduled viewing / cancel schedu...,Yes_No,yes,541
542,is my samsung smart remote automatically pairs...,Yes_No,yes,542
543,is it possible to install different apps using...,Yes_No,yes,543
544,can i change the content and settings for ambi...,Yes_No,yes,544


In [2]:
df['Type'].unique()

array(['Normal', 'Paraphrased', 'Compound', 'Yes_No'], dtype=object)

In [3]:
# only the required columns 
df = df[['Question', 'Answer']].reset_index(drop=True)
# renaming better accessibility
df.columns = ['questions', 'answers']
df

Unnamed: 0,questions,answers
0,what is sound mode?,"when the focus is moved to the icon, a list of..."
1,could you explain about sound output?,"when the focus is moved to the icon, a list of..."
2,what is the function of sleep timer?,"when the focus is moved to the icon, a list of..."
3,where can i can view the current network?,"when the focus is moved to the icon, a list of..."
4,how to select an external device connected to ...,you can select an external device connected to...
...,...,...
541,can i cancel scheduled viewing / cancel schedu...,yes
542,is my samsung smart remote automatically pairs...,yes
543,is it possible to install different apps using...,yes
544,can i change the content and settings for ambi...,yes


# Preprocessing 

For this task I have performed the following preprocessing : 
1. Removing all characters that are not alpha numeric
2. Removing stopwords - commonly used words such as 'a', 'to', 'in' and so on, since they do not contribute to the semantic similarity between two sentences.

I have applied this to both the e-Manual questions and the user query sentence.

I have also printed a list of sentences without removing stopwords.

In [4]:
import re
import gensim 
from gensim.parsing.preprocessing import remove_stopwords

def clean_sentence(sentence, stopwords=False):
    
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    
    if stopwords:
         sentence = remove_stopwords(sentence)
    return sentence
                    
def get_cleaned_sentences(df,stopwords=False):    
    sents=df[["questions"]]
    cleaned_sentences=[]

    for index,row in df.iterrows():
        cleaned=clean_sentence(row["questions"],stopwords)
        cleaned_sentences.append(cleaned)
    return cleaned_sentences

cleaned_sentences=get_cleaned_sentences(df,stopwords=True)
print("Cleaned sentences without stop words:-\n", cleaned_sentences)

print("\n\n\n")

cleaned_sentences_with_stopwords=get_cleaned_sentences(df,stopwords=False)
print("Cleaned sentences with stop words:-\n", cleaned_sentences_with_stopwords)
    

Cleaned sentences without stop words:-
 ['sound mode', 'explain sound output', 'function sleep timer', 'view current network', 'select external device connected tv', 'search apps smart hub services', 'use feature apps', 'turn tv ambient mode', 'item home screen', 'turn autorun smart hub function', 'launch app automatically', 'test smart hub connections', 'reset smart hub', 'create manage samsung account', 'create new account', 'create samsung account facebook account', 'create samsung account paypal account', 'change information samsung account', 'configure sync internet settings', 'delete samsung account', 'sign samsung account', 'view privacy policy', 'install app', 'delete app', 'add apps home screen', 'need lock app', 'app location', 'reinstall apps', 'update apps automatically', 'rate apps', 'use universal guide', 'smartthings', 'select location list smart', 'create new account smartthings', 'items turn notifications', 'create custom mode', 'view list connected smart devices', 'la



# Bag of words Model    

In [5]:
import numpy

sentences=cleaned_sentences_with_stopwords

# Split it by white space 
sentence_words = [[word for word in document.split() ]
         for document in sentences]

from gensim import corpora

dictionary = corpora.Dictionary(sentence_words)
for key, value in dictionary.items():
    print(key, ' : ', value)

import pprint
bow_corpus = [dictionary.doc2bow(text) for text in sentence_words]
for sent,embedding in zip(sentences,bow_corpus):
    print(sent)
    print(embedding)

question_orig="signal information under self diagnosis isn't activated. how do i activate that?"
question=clean_sentence(question_orig,stopwords=False)
question_embedding = dictionary.doc2bow(question.split())


print("\n\n",question,"\n",question_embedding)

0  :  is
1  :  mode
2  :  sound
3  :  what
4  :  about
5  :  could
6  :  explain
7  :  output
8  :  you
9  :  function
10  :  of
11  :  sleep
12  :  the
13  :  timer
14  :  can
15  :  current
16  :  i
17  :  network
18  :  view
19  :  where
20  :  an
21  :  connected
22  :  device
23  :  external
24  :  how
25  :  select
26  :  to
27  :  tv
28  :  apps
29  :  hub
30  :  in
31  :  search
32  :  services
33  :  smart
34  :  feature
35  :  use
36  :  ambient
37  :  turn
38  :  home
39  :  item
40  :  move
41  :  on
42  :  screen
43  :  autorun
44  :  app
45  :  automatically
46  :  last
47  :  launch
48  :  used
49  :  connections
50  :  test
51  :  reset
52  :  account
53  :  and
54  :  create
55  :  manage
56  :  my
57  :  samsung
58  :  new
59  :  a
60  :  facebook
61  :  using
62  :  paypal
63  :  change
64  :  information
65  :  configure
66  :  internet
67  :  settings
68  :  sync
69  :  delete
70  :  out
71  :  sign
72  :  policy
73  :  privacy
74  :  install
75  :  add
76  :  do
7

# Cosine Similarity

##### BOW representation did not do very well and retrieved the wrong answer since it is looking for exact word match which is very rare in real life scenarios.

In [6]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

def retrieveAndPrintAnswer(question_embedding,sentence_embeddings,eManualdf,sentences):
    max_sim=-1
    index_sim=-1
    for index,eManual_embedding in enumerate(sentence_embeddings):
        sim=cosine_similarity(eManual_embedding,question_embedding)[0][0]
        print(index, sim, sentences[index])
        if sim>max_sim:
            max_sim=sim
            index_sim=index
    
    print("\n")
    print("Question: ",question)
    print("\n")
    print("Retrieved: ", eManualdf.iloc[index_sim,0]) 
    print("Answer: ", eManualdf.iloc[index_sim,1])        
    
retrieveAndPrintAnswer(question_embedding,bow_corpus,df,sentences)


0 0.06237828615518053 what is sound mode
1 0.9205817818752566 could you explain about sound output
2 0.06237828615518053 what is the function of sleep timer
3 0.9997852982986982 where can i can view the current network
4 0.9997852982986982 how to select an external device connected to the tv
5 0.9997852982986982 where can i search the apps in smart hub services
6 0.9993063048340446 how to use the feature of apps
7 0.7498378553650925 how to turn tv in ambient mode
8 0.9997852982986982 how to move item on the home screen
9 0.9988367534372583 how to turn on the autorun smart hub function
10 0.9997852982986982 how to launch the last used app automatically
11 0.9997852982986982 how to test the smart hub connections
12 0.9997841818782067 how to reset smart hub
13 0.9997841818782067 how to create and manage my samsung account
14 0.9997841818782067 how to create new account
15 0.9997841818782067 how to create a samsung account using a facebook account
16 0.9999604961681129 can i create a samsu

368 0.9999604961681129 can i change auto volume
369 0.9205817818752566 can i select sound feedback
370 0.9205817818752566 can i reset the sound
371 0.9947279261542804 can i connect to the bluetooth audio devices to the tv
372 0.9988367534372583 can i connect to the samsung wireless audio devices which has wifi enabled function
373 0.9947279261542804 can i set the current time on the tv
374 0.9665615578617778 what are the different cases do i need to reset the clock time
375 0.9997852982986982 can i set the clock automatically
376 0.9999604961681129 can i set time zone
377 0.9999604961681129 can i set daylight saving time dst
378 0.9997852982986982 can i change the current time on tv
379 0.9996012472825965 how do i set sleep timer for the tv
380 0.9997852982986982 can i turn off the tv using off timer
381 0.9999604961681129 can i prevent screen burn
382 0.9999604961681129 can i select ambient light detection 
383 0.9999604961681129 can i select minimum backlight 
384 0.9999604961681129 

# Word2Vec Embeddings

**Skipgram model** 

In [7]:
from gensim.models import Word2Vec 
import gensim.downloader as api

w2v_model=None
try:
    w2v_model = gensim.models.KeyedVectors.load("../data/w2vecmodel.mod")
    print("Loaded w2v model")
except:            
    w2v_model = api.load('word2vec-google-news-300')
    w2v_model.save("../data/w2vecmodel.mod")
    print("Saved glove model")

w2vec_embedding_size=len(w2v_model['computer'])


Loaded w2v model


**Getting Phrase Embeddings from Word Embeddings**

In [8]:
def getWordVec(word,model):
        samp=model['computer']
        vec=[0]*len(samp)
        try:
                vec=model[word]
        except:
                vec=[0]*len(samp)
        return (vec)


def getPhraseEmbedding(phrase,embeddingmodel):
                       
        samp=getWordVec('computer', embeddingmodel)
        vec=numpy.array([0]*len(samp))
        den=0
        for word in phrase.split():
            den=den+1
            vec=vec+numpy.array(getWordVec(word,embeddingmodel))
        return vec.reshape(1, -1)


In [9]:
#With w2Vec

sent_embeddings=[]
for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent,w2v_model))

question_embedding=getPhraseEmbedding(question,w2v_model)

retrieveAndPrintAnswer(question_embedding,sent_embeddings,df,cleaned_sentences)


0 0.3917313045094571 sound mode
1 0.3925412569781399 explain sound output
2 0.4112559041323438 function sleep timer
3 0.3360633031857873 view current network
4 0.531323061182427 select external device connected tv
5 0.3651834009951936 search apps smart hub services
6 0.3623130384427682 use feature apps
7 0.4176118181970096 turn tv ambient mode
8 0.31533836105953894 item home screen
9 0.5101820539192932 turn autorun smart hub function
10 0.4420087058408628 launch app automatically
11 0.38104795592075474 test smart hub connections
12 0.40335298329957014 reset smart hub
13 0.4170619784798318 create manage samsung account
14 0.3343663782196549 create new account
15 0.34221912531764354 create samsung account facebook account
16 0.32744196850905294 create samsung account paypal account
17 0.5213624832295437 change information samsung account
18 0.4383281337794791 configure sync internet settings
19 0.39535950905715017 delete samsung account
20 0.4040719696892737 sign samsung account
21 0.303

460 0.3658512861264085 fix flickering dimming issues
461 0.38896484718464125 fix screen color issues
462 0.3711260757531712 fix screen brightness issues
463 0.40378673091133943 steps fix blurring issues tv screen
464 0.35409261559361727 fix unwanted powering issue
465 0.3371237998705755 fix powering issue
466 0.365865609773843 getting channel error fix
467 0.22134565444507104 picture distorted
468 0.3546142673273839 fix missingwrong color issue
469 0.3681773954407397 fix poor color issue
470 0.37425544054505067 fix dotted line issue edge tv screen
471 0.3113797293731168 fix black white issue
472 0.3375765410556681 fix low volume issue
473 0.3486167076320997 hear sound
474 0.3656552085064101 fix odd sound speaker
475 0.4552994936036495 fix weak signal issue
476 0.3205805697080376 tv receiving channels
477 0.3419458684269567 way captions digital channels
478 0.4096373546388024 getting distorted picture tv fix issue
479 0.32545448521815684 tv connecting network
480 0.4339431620602112 wire

##### Dataframe to output some random user queries and questions retrieved out of the e-manual for the two models, BOW and W2Vec.

In [10]:
df_show = pd.read_csv("user_queries.csv")
df_show

# after long search I couldn't get a solution to show the complete column intead of '...'
# A solution will be highly appreciable.

Unnamed: 0,user_queries
0,what is picture mode? ...
1,how to activate the caption function? ...
2,what is the use of universal guide? ...
3,can i remove item on the home screen? ...
4,how can i manage my payment information saved ...
5,what are the features of app service? ...
6,how to launch app? ...
7,how can i check app details? ...
8,where to view photos? ...
9,can i play media content saved on my mobile? ...


In [11]:
def retrieveQuestion(question_embedding,sentence_embeddings,sentences):
    max_sim=-1
    index_sim=-1
    for index,eManual_embedding in enumerate(sentence_embeddings):
        sim=cosine_similarity(eManual_embedding,question_embedding)[0][0]
        if sim>max_sim:
            max_sim=sim
            index_sim=index
    
    return df.iloc[index_sim,0]

In [12]:
# function to display df of retrieved queries from BOW and word2vec model

def show_user(df_show):
    cosine_list = []
    w2vec_list = []
    
    for i in range(df_show.shape[0]):
        question_orig = df_show.iloc[i, 0]
        question = clean_sentence(question_orig,stopwords=False)
        question_embedding = dictionary.doc2bow(question.split())
        
        retreivedQuestion = retrieveQuestion(question_embedding,bow_corpus,sentences)
        cosine_list.append(retreivedQuestion)
        
        question_embedding_w2v = getPhraseEmbedding(question,w2v_model)
        retreivedQuestion_w2v = retrieveQuestion(question_embedding_w2v,sent_embeddings,cleaned_sentences)
        w2vec_list.append(retreivedQuestion_w2v)
    
    df_show['bow'] = cosine_list
    df_show['w2vec'] = w2vec_list
    return df_show

df_temp = show_user(df_show)
df_temp

Unnamed: 0,user_queries,bow,w2vec
0,what is picture mode? ...,what is sound mode?,how to change the picture mode?
1,how to activate the caption function? ...,how to turn on the autorun smart hub function?,can i activate the caption function?
2,what is the use of universal guide? ...,what is sound mode?,what is the use of universal guide?
3,can i remove item on the home screen? ...,where can i can view the current network?,how to move item on the home screen?
4,how can i manage my payment information saved ...,where can i can view the current network?,i want to know the information about the tv. h...
5,what are the features of app service? ...,what are the uses of buttons in the e-manual?,i want to use the feature of apps. how can i d...
6,how to launch app? ...,how to reset smart hub?,how to launch the last used app automatically?
7,how can i check app details? ...,can i create a samsung account using a paypal ...,can i check app details?
8,where to view photos? ...,where to view privacy policy?,where can i view bixby guide?
9,can i play media content saved on my mobile? ...,can i create a samsung account using a paypal ...,can i turn on the tv with a mobile device?
