# **Task**: To build a marketing chatbot

**Importing required libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import nltk #natural language toolkit
from nltk import bigrams,ngrams,trigrams
import re

**Reading dataset using pandas and storing as a data frame**

In [2]:
# Load the dialog dataset from the Excel workbook into a DataFrame.
df = pd.read_excel(io='./dialog_talk_agent(correct).xlsx')

**Previewing the first 5 rows**

In [3]:
# Show the first five rows as a quick sanity check of the loaded data.
df.head(n=5)

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,I can help you work smarter instead of harder
3,Describe yourself,I can help you work smarter instead of harder
4,tell me about yourself,I can help you work smarter instead of harder


**Checking for null values**

In [4]:
# Count missing entries per column.
df.isna().sum()

Context          11
Text Response    11
dtype: int64

**Dropping null values if present**

In [5]:
# Drop rows that contain a missing value, then renumber the index 0..n-1.
# The vectorizer matrices built later are positional, and the chat loop looks
# rows up by the position returned from argmax; without the reset the frame's
# labels keep gaps from the dropped rows and no longer line up with positions.
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

# Step 1: TEXT NORMALIZATION

**Importing libraries for lemmatizing of the text**

In [6]:
from nltk import pos_tag #for finding parts of speech of words in corpus
from nltk.stem import wordnet #for lemmatizing the words in corpus

**Initializing the lemmatizer**

In [7]:
# Shared WordNet lemmatizer instance; text_normalize calls lema.lemmatize(token, pos).
lema=wordnet.WordNetLemmatizer()

**Creating a function for the normalization of the data**

In [8]:
def text_normalize(text):
    """Lower-case, clean, tokenize and lemmatize a piece of text.

    Steps:
      1. Convert the text to lower case.
      2. Replace every character other than ``a-z`` and ``0-9`` with a space
         (punctuation and symbols are dropped; digits are kept).
      3. Tokenize the cleaned text and POS-tag the tokens with NLTK.
      4. Map each tag's first letter to a WordNet POS code:
         'V' -> verb ('v'), 'R' -> adverb ('r'), 'J' -> adjective ('a'),
         anything else -> noun ('n').
      5. Lemmatize every token with that POS and join the lemmas with spaces.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. a numeric spreadsheet cell) are
        converted with ``str()`` first instead of failing on ``.lower()``.

    Returns
    -------
    str
        The space-separated lemmatized tokens.

    Notes
    -----
    Relies on the notebook-level ``lema`` lemmatizer and on NLTK's tokenizer,
    tagger and WordNet data being available -- presumably already downloaded;
    TODO confirm, since the notebook does not call ``nltk.download``.
    """
    if not isinstance(text, str):
        text = str(text)

    # Steps 1-2: lower-case, then blank out everything except letters/digits.
    cleaned = re.sub(r'[^a-z0-9]', ' ', text.lower())

    # Step 3: tokenize and tag.
    tagged_tokens = pos_tag(nltk.word_tokenize(cleaned), tagset=None)

    # Step 4: tag prefix -> WordNet POS; unknown prefixes fall back to noun.
    prefix_to_wordnet = {'V': 'v', 'R': 'r', 'J': 'a'}

    # Step 5: lemmatize each token with its POS.
    lemmas = [
        lema.lemmatize(token, prefix_to_wordnet.get(tag[:1], 'n'))
        for token, tag in tagged_tokens
    ]
    return ' '.join(lemmas)


**Applying the above function to the text**

In [9]:
# Normalize every Context entry and store the result in a new 'doc_tp' column.
df['doc_tp'] = df['Context'].map(text_normalize)
df.head(n=3)

Unnamed: 0,Context,Text Response,doc_tp
0,Tell me about your personality,Just think of me as the ace up your sleeve.,tell me about your personality
1,I want to know you better,I can help you work smarter instead of harder,i want to know you good
2,Define yourself,I can help you work smarter instead of harder,define yourself


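**Saving the above function in a pickle** (note: pickle stores only a reference to the function by name, not its code, so `text_normalize` must be defined again wherever the pickle is loaded)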

In [10]:
import pickle

# NOTE: pickle serializes a function by reference (module + qualified name),
# not its source code. Loading this file therefore requires text_normalize
# (and the objects it uses) to be defined/importable in the loading process.
# The context manager makes sure the file handle is flushed and closed.
with open('./text_normalise.pickle', 'wb') as pickle_file:
    pickle.dump(text_normalize, pickle_file)

# Step 2: WORD EMBEDDING
It is done using,
1. BAG OF WORDS model(BOW)
2. TERM FREQUENCY AND INVERSE DOCUMENT FREQUENCY model (TFIDF)

### BOW

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

**Applying bag of words using CountVectorizer**

In [12]:
# Learn the bag-of-words vocabulary from the normalized text, then expand the
# sparse count matrix into a dense array.
cv = CountVectorizer()
bow_sparse = cv.fit_transform(df['doc_tp'])
X = bow_sparse.toarray()

**Getting features after applying BOW**

In [13]:
# Vocabulary terms in the column order of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned array into a plain list of Python strings so
# the printed output keeps the same form as before.
if hasattr(cv, 'get_feature_names_out'):
    features = cv.get_feature_names_out().tolist()
else:
    features = cv.get_feature_names()
print(features)

['21', 'abort', 'about', 'absolutely', 'abysmal', 'actually', 'adore', 'advice', 'advise', 'affirmative', 'afraid', 'afternoon', 'again', 'age', 'agree', 'ah', 'ahah', 'ahaha', 'ahahah', 'ahahaha', 'ahead', 'all', 'almost', 'alone', 'already', 'alright', 'alrighty', 'also', 'always', 'amaze', 'amazing', 'an', 'and', 'angry', 'annoy', 'annoying', 'annul', 'answer', 'any', 'anymore', 'anything', 'anytime', 'apologise', 'apologize', 'apology', 'apparently', 'appreciate', 'aren', 'ask', 'asleep', 'assist', 'assistance', 'at', 'attractive', 'aways', 'awesome', 'awful', 'baby', 'back', 'bad', 'be', 'bear', 'beautiful', 'because', 'bed', 'beg', 'best', 'bestie', 'birth', 'birthday', 'bore', 'boring', 'bos', 'bot', 'brainy', 'bravo', 'brilliant', 'buddy', 'busy', 'but', 'bye', 'can', 'cancel', 'care', 'celebrate', 'certainly', 'chat', 'chatbot', 'cheer', 'childhood', 'city', 'clever', 'come', 'confirm', 'cook', 'cookie', 'cool', 'correct', 'could', 'country', 'course', 'crack', 'crazy', 'cry',

**Converting it to a DataFrame**

In [14]:
# Wrap the count matrix in a DataFrame with one column per vocabulary term.
df_bow = pd.DataFrame(data=X, columns=features)
df_bow

Unnamed: 0,21,abort,about,absolutely,abysmal,actually,adore,advice,advise,affirmative,...,year,yeh,yep,yes,yet,you,your,yours,yourself,yup
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1577,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1578,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


**Saving the BOW data**

In [15]:
# Persist the dense bag-of-words matrix X to 'X_bow.npz' in the working directory.
np.savez('X_bow.npz',X)

**Saving the BOW data into pickle**

In [16]:
# Serialize the bag-of-words DataFrame; the context manager guarantees the
# file handle is flushed and closed (the bare open() call never closed it).
with open('./df_bow.pickle', 'wb') as pickle_file:
    pickle.dump(df_bow, pickle_file)

### TFIDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings; fitted on df['doc_tp'] in the next
# cell and reused in the chat loop to transform user queries.
tfidf=TfidfVectorizer()

**Applying it to the normalized corpus**

In [18]:
# Fit TF-IDF on the normalized text and expand the sparse result into a dense array.
tfidf_sparse = tfidf.fit_transform(df['doc_tp'])
X_tfidf = tfidf_sparse.toarray()
X_tfidf.shape

(1581, 491)

**Getting the features after applying TFIDF**

In [19]:
# Vocabulary terms in the column order of X_tfidf.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    features = tfidf.get_feature_names_out().tolist()
else:
    features = tfidf.get_feature_names()
# One column per term, one row per normalized context.
df_tfidf = pd.DataFrame(X_tfidf, columns=features)
df_tfidf.shape

(1581, 491)

**Checking the length of the features**

In [20]:
# Number of TF-IDF vocabulary terms (should equal the column count of X_tfidf).
len(features)

491

**Saving it in a pickle**

In [21]:
# Serialize the TF-IDF DataFrame; the context manager guarantees the file
# handle is flushed and closed (the bare open() call never closed it).
with open('./df_tfidf.pickle', 'wb') as pickle_file:
    pickle.dump(df_tfidf, pickle_file)

# Query Processing

**Importing required libraries for finding COSINE SIMILARITY between user query and database**

In [22]:
from sklearn.metrics import pairwise_distances

**Creating USER INTERFACE**

In [23]:
# Minimum cosine similarity the best-matching stored context must exceed
# before its response is returned.
SIMILARITY_THRESHOLD = 0.2

# Interactive chat loop: read a query, find the most similar stored context
# by TF-IDF cosine similarity, and print its response. Typing 'bye' (any
# letter case, surrounding whitespace ignored) ends the chat.
while True:
    Q = input('User: ')
    if Q.strip().lower() == 'bye':
        print('Bye,Hope we had a great talk!!!')
        break
    # step-1: Cleaning and lemmatization (same normalization as the corpus;
    # no stop-word removal is performed)
    Q_lema = text_normalize(Q)
    # step-2: Applying the fitted tfidf to the user query
    Q_tfidf = tfidf.transform([Q_lema]).toarray()
    # step-3: Cosine similarity between every stored context and the query.
    # cos has one row per stored context and a single column.
    cos = 1 - pairwise_distances(X_tfidf, Q_tfidf, metric='cosine')
    ind = cos.argmax()       # row position of the best match
    best_score = cos.max()   # similarity of that best match
    # step-4: Respond only when the similarity VALUE clears the threshold.
    # (Comparing the row index `ind` to the threshold, as before, accepted
    # almost any query and rejected matches on row 0.)
    if best_score > SIMILARITY_THRESHOLD:
        # `ind` is positional (it indexes X_tfidf), so use .iloc rather than
        # label-based .loc, whose labels may have gaps after dropna().
        matched_context = df['Context'].iloc[ind]
        res = df.loc[df['Context'] == matched_context, 'Text Response']
        print('System:', res.iloc[0])
    else:
        print('Could you ask in another way!!!')
    

User: Tell me something new
System: I can help you work smarter instead of harder
User: oops
Could you ask in another way!!!
User: sure
System: Indeed.
User: bye
Bye,Hope we had a great talk!!!


**The chat terminates when the user types 'bye' (or 'Bye')**