In [115]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [116]:
df=pd.read_excel('conversation table.xlsx') #to import the dialog box file
df.head(15)

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,
3,Describe yourself,
4,tell me about yourself,
5,all about you,
6,tell me some stuff about you,
7,talk some stuff about you,
8,talk about yourself,
9,about yourself,


In [117]:
# fills the null value with the previous value.

In [118]:
df.ffill(axis = 0,inplace=True)
df

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,I can help you work smarter instead of harder
3,Describe yourself,I can help you work smarter instead of harder
4,tell me about yourself,I can help you work smarter instead of harder
...,...,...
1587,can we chat,Talking is what I do best.
1588,I'll be back in a few minutes,I'll be waiting.
1589,I'll be back,All right. I'll be here.
1590,I'll get back to you in a moment,Till next time.


In [119]:
# function that performs text normalization steps

def text_normalization(text):
    text=str(text).lower() # text to lower case
    spl_char_text=re.sub(r'[^ a-z]','',text) # removing special characters
    tokens=nltk.word_tokenize(spl_char_text) # word tokenizing
    lema=wordnet.WordNetLemmatizer() # intializing lemmatization
    tags_list=pos_tag(tokens,tagset=None) # parts of speech
    lema_words=[]   # empty list 
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lema_token=lema.lemmatize(token,pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token into a list
    
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence

In [120]:
# function that converts text into lower case and removes special characters

def step1(x):
    for i in x:
        a=str(i).lower()
        p=re.sub(r'[^a-z0-9]',' ',a)
        print(p)

In [121]:
df1=df.head(5) # copy of first five rows of dataset

In [122]:
step1(df1['Context'])

tell me about your personality
i want to know you better
define yourself
describe yourself
tell me about yourself


In [123]:
# word tokenizing
    
s='he always spoke in grammatical sentences'
words=word_tokenize(s)
print(words)

['he', 'always', 'spoke', 'in', 'grammatical', 'sentences']


In [124]:
lemma = wordnet.WordNetLemmatizer() # intializing lemmatizer
lemma.lemmatize('converted', pos = 'v')

'convert'

In [125]:
pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word

[('he', 'PRP'),
 ('always', 'RB'),
 ('spoke', 'VBD'),
 ('in', 'IN'),
 ('grammatical', 'JJ'),
 ('sentences', 'NNS')]

In [126]:
text_normalization('telling you i cooked waterfowl belonging to her')

'tell you i cook waterfowl belong to her'

In [127]:
df['lemmatized_text']=df['Context'].apply(text_normalization) # applying the fuction to the dataset to get clean text
df.tail(20)

In [None]:
# all the stop words we have 

stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
cv = CountVectorizer() # intializing the count vectorizer
X = cv.fit_transform(df['lemmatized_text']).toarray()

In [None]:
# returns all the unique word from data 

features = cv.get_feature_names()
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()

Unnamed: 0,abort,about,absolutely,abysmal,actually,adore,advice,advise,affirmative,afraid,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
Question ='Will you help me and tell me about yourself more' # considering an example query

In [None]:
# checking for stop words

Q=[]
a=Question.split()
for i in a:
    if i in stop:
        continue
    else:
        Q.append(i)
    b=" ".join(Q) 

In [None]:
Question_lemma = text_normalization(b) # applying the function that we created for text normalizing
Question_bow = cv.transform([Question_lemma]).toarray() # applying bow mins describe the occurence of word within docu

In [None]:
text_normalization

<function __main__.text_normalization(text)>

In [None]:
Question_bow

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# similarity

In [None]:
# cosine similarity for the above question we considered.

cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )
(cosine_value)

array([[0.25819889],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [None]:
df['similarity_bow']=cosine_value # creating a new column 

In [None]:
df_simi = pd.DataFrame(df, columns=['Text Response','similarity_bow']) # taking similarity value of responses for the question we took
df_simi 

Unnamed: 0,Text Response,similarity_bow
0,Just think of me as the ace up your sleeve.,0.258199
1,I can help you work smarter instead of harder,0.000000
2,I can help you work smarter instead of harder,0.000000
3,I can help you work smarter instead of harder,0.000000
4,I can help you work smarter instead of harder,0.288675
...,...,...
1587,Talking is what I do best.,0.000000
1588,I'll be waiting.,0.000000
1589,All right. I'll be here.,0.000000
1590,Till next time.,0.000000


In [None]:
df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False) # sorting the values
df_simi_sort.head()

Unnamed: 0,Text Response,similarity_bow
211,I'm glad to help. What can I do for you?,0.57735
194,I'm glad to help. What can I do for you?,0.57735
184,I'm glad to help. What can I do for you?,0.408248
186,I'm glad to help. What can I do for you?,0.408248
200,I'm glad to help. What can I do for you?,0.408248


In [None]:
threshold = 0.2 # considering the value of p=smiliarity to be greater than 0.2
df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold] 
df_threshold

Unnamed: 0,Text Response,similarity_bow
211,I'm glad to help. What can I do for you?,0.57735
194,I'm glad to help. What can I do for you?,0.57735
184,I'm glad to help. What can I do for you?,0.408248
186,I'm glad to help. What can I do for you?,0.408248
200,I'm glad to help. What can I do for you?,0.408248
219,I'm glad to help. What can I do for you?,0.333333
728,It's my pleasure to help.,0.333333
188,I'm glad to help. What can I do for you?,0.333333
190,I'm glad to help. What can I do for you?,0.333333
191,I'm glad to help. What can I do for you?,0.333333


In [None]:
# Finally using bow for the question 'Will you help me and tell me about yourself more' , the above are the responses we 
#got using bow and the smiliarity value of responses, we consider the response with highest similarity

In [None]:
index_value = cosine_value.argmax() # returns the index number of highest value
index_value 

194

In [None]:
(Question)

'Will you help me and tell me about yourself more'

In [None]:
df['Text Response'].loc[index_value] # The text at the above index becomes the response for the question

"I'm glad to help. What can I do for you?"

In [None]:
# tf-idf

In [None]:
Question1 ='Describe yourself'

In [None]:
# using tf-idf

tfidf=TfidfVectorizer() # intializing tf-id 
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray() # transforming the data into array

In [None]:
Question_lemma1 = text_normalization(Question1)
Question_tfidf = tfidf.transform([Question_lemma1]).toarray() # applying tf-idf

In [None]:
# returns all the unique word from data with a score of that word

df_tfidf=pd.DataFrame(x_tfidf,columns=tfidf.get_feature_names()) 
df_tfidf.head()

Unnamed: 0,abort,about,absolutely,abysmal,actually,adore,advice,advise,affirmative,afraid,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0.0,0.407572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.330555,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.218768,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64179,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64179,0.0
4,0.0,0.45379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.608937,0.0


In [None]:
# similarity

In [None]:
cos=1-pairwise_distances(df_tfidf,Question_tfidf,metric='cosine')  # applying cosine similarity
cos

array([[0.        ],
       [0.        ],
       [0.41189475],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [None]:
df['similarity_tfidf']=cos # creating a new column 
df_simi_tfidf = pd.DataFrame(df, columns=['Text Response','similarity_tfidf']) # taking similarity value of responses for the question we took
df_simi_tfidf 

Unnamed: 0,Text Response,similarity_tfidf
0,Just think of me as the ace up your sleeve.,0.000000
1,I can help you work smarter instead of harder,0.000000
2,I can help you work smarter instead of harder,0.411895
3,I can help you work smarter instead of harder,1.000000
4,I can help you work smarter instead of harder,0.390810
...,...,...
1587,Talking is what I do best.,0.000000
1588,I'll be waiting.,0.000000
1589,All right. I'll be here.,0.000000
1590,Till next time.,0.000000


In [None]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False) # sorting the values
df_simi_tfidf_sort.head(10)

Unnamed: 0,Text Response,similarity_tfidf
3,I can help you work smarter instead of harder,1.0
9,I can help you work smarter instead of harder,0.514611
8,I can help you work smarter instead of harder,0.441754
11,I can help you work smarter instead of harder,0.411895
2,I can help you work smarter instead of harder,0.411895
4,I can help you work smarter instead of harder,0.39081
1061,"Yeah, I crack myself up too.",0.0
1062,Laughter is good for you. Keep it up.,0.0
1063,See? Now we're having fun.,0.0
1064,You have a great laugh.,0.0


In [None]:
threshold = 0.2 # considering the value of p=smiliarity to be greater than 0.2
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold] 
df_threshold

Unnamed: 0,Text Response,similarity_tfidf
3,I can help you work smarter instead of harder,1.0
9,I can help you work smarter instead of harder,0.514611
8,I can help you work smarter instead of harder,0.441754
11,I can help you work smarter instead of harder,0.411895
2,I can help you work smarter instead of harder,0.411895
4,I can help you work smarter instead of harder,0.39081


In [None]:
index_value1 = cos.argmax() # returns the index number of highest value
index_value1

3

In [None]:
Question1

'Describe yourself'

In [None]:
df['Text Response'].loc[index_value1]  # returns the text at that index

'I can help you work smarter instead of harder'

In [None]:
# Model Using Bag of Words

In [None]:
# Function that removes stop words and process the text

def stopword_(text):   
    tag_list=pos_tag(nltk.word_tokenize(text),tagset=None)
    stop=stopwords.words('english')
    lema=wordnet.WordNetLemmatizer()
    lema_word=[]
    for token,pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'):
            pos_val='v'
        elif pos_token.startswith('J'):
            pos_val='a'
        elif pos_token.startswith('R'):
            pos_val='r'
        else:
            pos_val='n'
        lema_token=lema.lemmatize(token,pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word) 

In [None]:
# Model Using tf-idf

In [None]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    lemma=text_normalization(text) # calling the function to perform text normalization
    tf=tfidf.transform([lemma]).toarray() # applying tf-idf
    cos=1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value=cos.argmax() # getting index value 
    return df['Text Response'].loc[index_value]

In [None]:
chat_tfidf("I don't think you're fake")

"I'm not a real person, but I certainly exist. I chat, therefore I am."

In [None]:
chat_tfidf('in which city do you live')

'Right here in your device. Whenever you need me.'

In [None]:
chat_tfidf('are you there')

"Of course. I'm always here."

In [None]:
chat_tfidf('are you there')

"Of course. I'm always here."

In [None]:
chat_tfidf('are you in astu')

'Thanks! The feeling is mutual.'

In [None]:
chat_tfidf('astu')

'Just think of me as the ace up your sleeve.'

In [None]:
chat_tfidf('how old are you')

"I'm a relatively new bot, but I'm wise beyond my years."

In [None]:
chat_tfidf('do you think i am wise?')

'Thanks! The feeling is mutual.'

In [None]:
chat_tfidf('hey')

'Hi there, friend!'