In [38]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [39]:
df=pd.read_excel('dialog_talk_agent.xlsx')
df.head(2)

Unnamed: 0,Context,Answer
0,Tell me about your personality,Just think of me as the ace up your sleeve. I ...
1,I want to know you better,


In [40]:
data = pd.read_excel("WHO_FAQ.xlsx", encoding='utf8')
data2=pd.read_csv("Chatbot_dataset.csv")
data2.drop('Unnamed: 0',axis=1,inplace=True)
data2.head(2)

Unnamed: 0,Context,Answer
0,What is coronavirus?,"Coronavirus is not a single virus, but a famil..."
1,What is COVID-19?,COVID-19 (Coronavirus disease-2019) is the off...


In [41]:
df=pd.concat([df,data,data2]).reset_index(drop=True)

In [42]:
df.shape[0] # returns the number of rows in dataset

1733

In [43]:
df

Unnamed: 0,Context,Answer
0,Tell me about your personality,Just think of me as the ace up your sleeve. I ...
1,I want to know you better,
2,Define yourself,
3,Describe yourself,
4,tell me about yourself,
...,...,...
1728,How can travellers protect themselves from the...,The CDC has recommended to avoid travelling to...
1729,How will self-isolated travelers be monitored ...,Public Health Authorities are screening suspec...
1730,What care should I take if travelling to China...,"In case of unavoidable travel to China, the CD..."
1731,Can pregnant women with coronavirus deliver he...,Pregnant women diagnosed with COVID-19 can del...


In [44]:
df=df.ffill() # fills the null value with the previous value.

In [45]:
df

Unnamed: 0,Context,Answer
0,Tell me about your personality,Just think of me as the ace up your sleeve. I ...
1,I want to know you better,Just think of me as the ace up your sleeve. I ...
2,Define yourself,Just think of me as the ace up your sleeve. I ...
3,Describe yourself,Just think of me as the ace up your sleeve. I ...
4,tell me about yourself,Just think of me as the ace up your sleeve. I ...
...,...,...
1728,How can travellers protect themselves from the...,The CDC has recommended to avoid travelling to...
1729,How will self-isolated travelers be monitored ...,Public Health Authorities are screening suspec...
1730,What care should I take if travelling to China...,"In case of unavoidable travel to China, the CD..."
1731,Can pregnant women with coronavirus deliver he...,Pregnant women diagnosed with COVID-19 can del...


In [46]:
df1=df.head(10) # copy of first ten rows of dataset

In [47]:
# function that converts text into lower case and removes special characters

def step1(x):
    for i in x:
        a=str(i).lower()
        p=re.sub(r'[^a-z0-9]',' ',a)
        print(p)

In [48]:
step1(df1['Context'])

tell me about your personality
i want to know you better
define yourself
describe yourself
tell me about yourself
all about you
tell me some stuff about you
talk some stuff about you
talk about yourself
about yourself


In [49]:
 # word tokenizing
    
s='tell me about your personality'
words=word_tokenize(s)
print(words)

['tell', 'me', 'about', 'your', 'personality']


In [50]:
lemma = wordnet.WordNetLemmatizer() # intializing lemmatizer
lemma.lemmatize('absorbed', pos = 'v')

'absorb'

In [51]:
pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word

[('tell', 'VB'),
 ('me', 'PRP'),
 ('about', 'IN'),
 ('your', 'PRP$'),
 ('personality', 'NN')]

In [52]:
# function that performs text normalization steps

def text_normalization(text):
    text=str(text).lower() # text to lower case
    spl_char_text=re.sub(r'[^ a-z]','',text) # removing special characters
    tokens=nltk.word_tokenize(spl_char_text) # word tokenizing
    lema=wordnet.WordNetLemmatizer() # intializing lemmatization
    tags_list=pos_tag(tokens,tagset=None) # parts of speech
    lema_words=[]   # empty list 
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lema_token=lema.lemmatize(token,pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token into a list
    
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence 

In [53]:
text_normalization('telling you some stuff about me: I am a data scientist ')

'tell you some stuff about me i be a data scientist'

In [54]:
df['lemmatized_text']=df['Context'].apply(text_normalization) # applying the fuction to the dataset to get clean text
df.tail(15)

Unnamed: 0,Context,Answer,lemmatized_text
1718,When did coronavirus reach China?,The first coronavirus case in China was confir...,when do coronavirus reach china
1719,How is the coronavirus outbreak affecting China?,Coronavirus affects only few provinces in Chin...,how be the coronavirus outbreak affect china
1720,How are travellers from China during the coron...,Travellers arriving from China are undergoing ...,how be traveller from china during the coronav...
1721,When did coronavirus reach Hong Kong?,The first confirmed coronavirus case in Hong K...,when do coronavirus reach hong kong
1722,When did coronavirus reach India?,India's Ministry of Health & Family Welfare co...,when do coronavirus reach india
1723,When did coronavirus reach the US?,The first novel coronavirus infection (COVID-1...,when do coronavirus reach the u
1724,When did coronavirus reach Australia?,Australia confirmed its first case of the COVI...,when do coronavirus reach australia
1725,When did coronavirus reach Japan?,Japan reported its first confirmed coronavirus...,when do coronavirus reach japan
1726,When did coronavirus reach the UK?,The UK reported confirmed coronavirus cases fi...,when do coronavirus reach the uk
1727,Can posts and packages coming from China trans...,"No, WHO says, coronavirus can’t survive for ve...",can post and package come from china transmit ...


In [55]:
# all the stop words we have 

stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# bag of words

In [56]:
cv = CountVectorizer() # intializing the count vectorizer
X = cv.fit_transform(df['lemmatized_text']).toarray()

In [57]:
# returns all the unique word from data 

features = cv.get_feature_names()
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()

Unnamed: 0,abbvie,abort,about,absolutely,abysmal,actually,adore,advice,advise,affect,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [58]:
Question ='Will you help me and tell me about yourself more' # considering an example query

In [59]:
# checking for stop words

Q=[]
a=Question.split()
for i in a:
    if i in stop:
        continue
    else:
        Q.append(i)
    b=" ".join(Q) 

In [60]:
Question_lemma = text_normalization(b) # applying the function that we created for text normalizing
Question_bow = cv.transform([Question_lemma]).toarray() # applying bow

In [61]:
text_normalization

<function __main__.text_normalization(text)>

In [62]:
Question_bow

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# similarity

In [63]:
# cosine similarity for the above question we considered.

cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )
(cosine_value)

array([[0.25819889],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [64]:
df['similarity_bow']=cosine_value # creating a new column 

In [65]:
df_simi = pd.DataFrame(df, columns=['Answer','similarity_bow']) # taking similarity value of responses for the question we took
df_simi 

Unnamed: 0,Answer,similarity_bow
0,Just think of me as the ace up your sleeve. I ...,0.258199
1,Just think of me as the ace up your sleeve. I ...,0.000000
2,Just think of me as the ace up your sleeve. I ...,0.000000
3,Just think of me as the ace up your sleeve. I ...,0.000000
4,Just think of me as the ace up your sleeve. I ...,0.288675
...,...,...
1728,The CDC has recommended to avoid travelling to...,0.000000
1729,Public Health Authorities are screening suspec...,0.166667
1730,"In case of unavoidable travel to China, the CD...",0.000000
1731,Pregnant women diagnosed with COVID-19 can del...,0.000000


In [66]:
df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False) # sorting the values
df_simi_sort.head()

Unnamed: 0,Answer,similarity_bow
211,I am glad to help. What can I do for you?,0.57735
194,I am glad to help. What can I do for you?,0.57735
1565,Thanks for being so patient. Sometimes these t...,0.408248
186,I am glad to help. What can I do for you?,0.408248
200,I am glad to help. What can I do for you?,0.408248


In [67]:
threshold = 0.2 # considering the value of p=smiliarity to be greater than 0.2
df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold] 
df_threshold

Unnamed: 0,Answer,similarity_bow
211,I am glad to help. What can I do for you?,0.57735
194,I am glad to help. What can I do for you?,0.57735
1565,Thanks for being so patient. Sometimes these t...,0.408248
186,I am glad to help. What can I do for you?,0.408248
200,I am glad to help. What can I do for you?,0.408248
184,I am glad to help. What can I do for you?,0.408248
188,I am glad to help. What can I do for you?,0.333333
1589,All right. I will be here.,0.333333
222,I am glad to help. What can I do for you?,0.333333
214,I am glad to help. What can I do for you?,0.333333


- Finally using bow for the question 'Will you help me and tell me about yourself more' , the above are the responses we got using bow and the smiliarity value of responses, we consider the response with highest similarity

In [68]:
index_value = cosine_value.argmax() # returns the index number of highest value
index_value 

194

In [69]:
(Question)

'Will you help me and tell me about yourself more'

In [70]:
df['Answer'].loc[index_value] # The text at the above index becomes the response for the question

'I am glad to help. What can I do for you?'

# tf-idf

In [71]:
Question1 ='Tell me about yourself.'

In [72]:
# using tf-idf

tfidf=TfidfVectorizer() # intializing tf-id 
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray() # transforming the data into array

In [73]:
Question_lemma1 = text_normalization(Question1)
Question_tfidf = tfidf.transform([Question_lemma1]).toarray() # applying tf-idf

In [74]:
# returns all the unique word from data with a score of that word

df_tfidf=pd.DataFrame(x_tfidf,columns=tfidf.get_feature_names()) 
df_tfidf.head()

Unnamed: 0,abbvie,abort,about,absolutely,abysmal,actually,adore,advice,advise,affect,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0.0,0.0,0.402789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333476,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.226077,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642593,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642593,0.0
4,0.0,0.0,0.448696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.60944,0.0


# similarity

In [75]:
cos=1-pairwise_distances(df_tfidf,Question_tfidf,metric='cosine')  # applying cosine similarity
cos

array([[0.56427161],
       [0.        ],
       [0.39162151],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [76]:
df['similarity_tfidf']=cos # creating a new column 
df_simi_tfidf = pd.DataFrame(df, columns=['Answer','similarity_tfidf']) # taking similarity value of responses for the question we took
df_simi_tfidf 

Unnamed: 0,Answer,similarity_tfidf
0,Just think of me as the ace up your sleeve. I ...,0.564272
1,Just think of me as the ace up your sleeve. I ...,0.000000
2,Just think of me as the ace up your sleeve. I ...,0.391622
3,Just think of me as the ace up your sleeve. I ...,0.391622
4,Just think of me as the ace up your sleeve. I ...,1.000000
...,...,...
1728,The CDC has recommended to avoid travelling to...,0.000000
1729,Public Health Authorities are screening suspec...,0.000000
1730,"In case of unavoidable travel to China, the CD...",0.000000
1731,Pregnant women diagnosed with COVID-19 can del...,0.000000


In [77]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False) # sorting the values
df_simi_tfidf_sort.head(10)

Unnamed: 0,Answer,similarity_tfidf
4,Just think of me as the ace up your sleeve. I ...,1.0
16,Just think of me as the ace up your sleeve. I ...,0.770149
9,Just think of me as the ace up your sleeve. I ...,0.756799
8,Just think of me as the ace up your sleeve. I ...,0.648152
379,I should get one. It's all work and no play la...,0.593181
500,The virtual world is my playground. I am alway...,0.589239
0,Just think of me as the ace up your sleeve. I ...,0.564272
6,Just think of me as the ace up your sleeve. I ...,0.513776
48,I am not programmed for that exact question. T...,0.470817
24,"I am a relatively new bot, but I am wise beyon...",0.437328


In [78]:
threshold = 0.2 # considering the value of p=smiliarity to be greater than 0.2
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 
df_threshold

Unnamed: 0,Answer,similarity_tfidf
4,Just think of me as the ace up your sleeve. I ...,1.0
16,Just think of me as the ace up your sleeve. I ...,0.770149
9,Just think of me as the ace up your sleeve. I ...,0.756799
8,Just think of me as the ace up your sleeve. I ...,0.648152
379,I should get one. It's all work and no play la...,0.593181
500,The virtual world is my playground. I am alway...,0.589239
0,Just think of me as the ace up your sleeve. I ...,0.564272
6,Just think of me as the ace up your sleeve. I ...,0.513776
48,I am not programmed for that exact question. T...,0.470817
24,"I am a relatively new bot, but I am wise beyon...",0.437328


- by using tfidf for the question 'Will you help me and tell me about yourself more' , the above are the responses we got and the smiliarity value of responses, we consider the response with highest similarity

In [79]:
index_value1 = cos.argmax() # returns the index number of highest value
index_value1

4

In [80]:
Question1

'Tell me about yourself.'

In [81]:
df['Answer'].loc[index_value1]  # returns the text at that index

'Just think of me as the ace up your sleeve. I can help you work smarter instead of harder'

# Model Using Bag of Words

In [82]:
# Function that removes stop words and process the text

def stopword_(text):   
    tag_list=pos_tag(nltk.word_tokenize(text),tagset=None)
    stop=stopwords.words('english')
    lema=wordnet.WordNetLemmatizer()
    lema_word=[]
    for token,pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'):
            pos_val='v'
        elif pos_token.startswith('J'):
            pos_val='a'
        elif pos_token.startswith('R'):
            pos_val='r'
        else:
            pos_val='n'
        lema_token=lema.lemmatize(token,pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word) 

In [83]:
# defining a function that returns response to query using bow

def chat_bow(text):
    s=stopword_(text)
    lemma=text_normalization(s) # calling the function to perform text normalization
    bow=cv.transform([lemma]).toarray() # applying bow
    cosine_value = 1- pairwise_distances(df_bow,bow, metric = 'cosine' )
    index_value=cosine_value.argmax() # getting index value 
    return df['Answer'].loc[index_value]

In [84]:
chat_bow('hi there')

'Hey!'

In [85]:
chat_bow('Your are amazing')

'Terrific!'

In [86]:
chat_bow('i miss you')

"I've been right here all along!"

# Model Using tf-idf

In [87]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    lemma=text_normalization(text) # calling the function to perform text normalization
    tf=tfidf.transform([lemma]).toarray() # applying tf-idf
    cos=1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value=cos.argmax() # getting index value 
    return df['Answer'].loc[index_value]

In [88]:
chat_tfidf('how are you')

'Lovely, thanks.'

In [112]:
chat_tfidf('You are bad')

'I am sorry but I am just a baby bot 🤖. I am learning and getting better all the time.'

In [117]:
chat_tfidf('Is wearing a mask necessary?')

'Only wear a mask if you are ill with COVID-19 symptoms (especially coughing) or looking after someone who may have COVID-19. Disposable face mask can only be used once.'

In [110]:
chat_tfidf('When did covid19 reach India?')

"India's Ministry of Health & Family Welfare confirmed the first coronavirus case on 30 January 2020 in the state of Kerala."

In [109]:
chat_tfidf('I am a smokers. Am I vulnerable?')

'Smokers are likely to be more vulnerable to COVID-19 as the act of smoking means that fingers (and possibly contaminated cigarettes) are in contact with lips which increases the possibility of transmission of virus from hand to mouth.'

In [98]:
chat_tfidf('WhAT is Coronavirus?')

'Coronaviruses are a large family of viruses which may cause illness in animals or humans.\xa0'

In [95]:
chat_tfidf('you are amazing and hope to see u soon.')

'Bye.'

# Conclusion
- Our chat bot worked well with both bow and tf-idf, tf-idf model worked well even with stop words compared to bow where we had to remove stop words before using bag of words.

In [96]:
print('*****************************************************')
x=[input("YOUR QUERY: \n")]

while(x[0].lower() not in ['bye','thanks','ok','cya']):
    print("\n\nBOT: ",chat_tfidf(x))
    x=[input("YOUR QUERY :")]
print("\nBye !! Stay Safe!!")
print('*****************************************************')

*****************************************************
YOUR QUERY: 
hi


BOT:  Hey!
YOUR QUERY :how are you


BOT:  Lovely, thanks.
YOUR QUERY :what is coronavirus


BOT:  Coronaviruses are a large family of viruses which may cause illness in animals or humans. 
YOUR QUERY :what is covid19


BOT:  COVID-19 is the infectious disease caused by the most recently discovered coronavirus. This new virus and disease were unknown before the outbreak began in Wuhan, China, in December 2019.
YOUR QUERY :ok

Bye !! Stay Safe!!
*****************************************************
