Importing all the Libraries

In [8]:
import numpy as np      #Numerical Computations with arrays
import nltk             #For Natural Language Processing
import random           #Generating random numbers
import pandas as pd     #Provides Data Structures and Analytical Tools
import pickle           #To Save files at some point of the code, Saving and loading files
import operator         #To Perform Common Operations
from sklearn.svm import SVC        #Imports Support Vector Classifier for Classification of Tasks
from sklearn.model_selection import train_test_split as train_test_split         #Train Test Split function to Split the Data into Training and Testing Sets for training the Retreival Based Chatbot
from sklearn.feature_extraction.text import TfidfVectorizer         #Converts Raw Documents into matrix of tfidf features vectors
from sklearn.preprocessing import LabelEncoder as label_encoder     #Encodes labels into numerical values, assigns unique numerical integer value to unique category
from sklearn.metrics.pairwise import cosine_similarity              #Computes Cosine Similarity in vectors
from nltk.stem.lancaster import LancasterStemmer as stemmer         #Used for stemming to bring the word to its root form like running ran run into a single category word run
nltk.download('punkt')
nltk.download('punkt_tab')
import nltk

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


1. Data Pre-Processing

In [9]:
# Mount the user's Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable inside Google Colab,
# so this cell is expected to fail in other Jupyter environments - confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Defining functions and performing Data Cleaning

In [10]:
# Load the dataset from the current working directory. The rest of the
# notebook reads its 'Question', 'Class' and 'Answer' columns.
data = pd.read_csv('dataset_updated194.csv')

# Single shared LancasterStemmer instance, reused by cleanup() for every sentence.
stemming = stemmer()

def cleanup(sentence):
    """Tokenize ``sentence`` and stem every token.

    The sentence is split with ``nltk.word_tokenize``, each token is passed
    through the module-level ``stemming`` stemmer, and the stemmed tokens are
    joined back into one string separated by single spaces.

    Parameters
    ----------
    sentence : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed tokens joined with ``' '``.
    """
    return ' '.join(map(stemming.stem, nltk.word_tokenize(sentence)))


# Encoder that maps each distinct value of the 'Class' column to an integer id.
le = label_encoder()

# TF-IDF vectoriser; English stop words are dropped and every remaining term
# is kept (min_df=1).
tfv = TfidfVectorizer(stop_words='english', min_df=1)

# Raw question strings taken from the dataset.
questions = data['Question'].values

# Pre-processed questions: every raw question tokenized and stemmed by cleanup().
pre_questions = list(map(cleanup, questions))

# Learn the vocabulary and IDF weights from the pre-processed questions,
# and the label <-> integer mapping from the 'Class' column.
tfv.fit(pre_questions)
le.fit(data['Class'])

2. Training the Model

In [11]:
# Turn the pre-processed questions into TF-IDF vectors and the class labels
# into their integer codes.
Pre_Question = tfv.transform(pre_questions)
Pre_Class = le.transform(data['Class'])

# Hold out 25% of the samples for evaluation; the fixed random_state keeps
# the split reproducible between runs.
trainq, testq, trainc, testc = train_test_split(
    Pre_Question,
    Pre_Class,
    test_size=0.25,
    random_state=42,
)

# Linear-kernel support vector classifier; fit() returns the fitted estimator.
model = SVC(kernel='linear').fit(trainq, trainc)

# The reported score is computed on the held-out test split only.
print("Accuracy Score of the Training and Testing Data:", model.score(testq, testc)*100,"%")


def get_max5(arr):
    """Return the positions of the (up to) five largest items of ``arr``.

    Items are ranked by ``(value, position)``, so among equal values the one
    that appears later in ``arr`` ranks higher.

    Parameters
    ----------
    arr : sequence
        Mutually comparable values.

    Returns
    -------
    list of int
        Positions ordered from the largest value to the smallest; fewer than
        five entries when ``arr`` has fewer than five items.
    """
    # Ascending order of (value, position) pairs; the best five sit at the end.
    ranked = sorted((value, position) for position, value in enumerate(arr))
    return [position for _, position in reversed(ranked[-5:])]

Accuracy Score of the Training and Testing Data: 53.06122448979592 %


3. Chatbot Implementation

In [12]:
def chat():
    """Run the interactive console loop of the retrieval-based chatbot.

    Each user message is cleaned with ``cleanup``, vectorised with ``tfv`` and
    classified by ``model``. The dataset questions of the predicted class are
    then ranked by cosine similarity to the message, and either the best
    answer or the five best question/answer pairs are printed.

    Commands (case-insensitive, surrounding whitespace ignored):
        Q / E   quit the chat
        TOP5    show the five most relevant results from now on
        CONF    show only the single most relevant result from now on
        yes/no  small-talk replies; they are not sent to the classifier

    After every answer the user is asked for feedback. On "no" the bot can
    list the closest questions and print the answer of the one selected by
    number. Q / E at the feedback prompt also ends the chat, as that prompt
    advertises.

    Relies on the notebook globals ``tfv``, ``le``, ``model``, ``data``,
    ``cleanup`` and ``get_max5``. Returns ``None``.
    """
    print("Follow the Following Steps if needed for a better response")
    print()
    print("PRESS Q or E to QUIT")
    print("TYPE \"TOP5\" to Display 5 most relevant results")
    print("TYPE \"CONF\" to Display the most confident result")
    print()
    print()
    TOP5 = False

    print("Bot: Hi, Welcome to our E-Assit Chatbot! How may I assit you today?")
    while True:
        usr = input("You: ")
        command = usr.strip().lower()

        if command == 'yes':
            print("Bot: Yes! How may I assit you today?")
            continue

        if command == 'no':
            print("Bot: Could you please explain?")
            continue

        if command in ('q', 'e'):
            print("Bot: Thank You, We hope we were able to Assit you!")
            break

        if command == 'top5':
            TOP5 = True
            print("Will display 5 most relevant results now")
            continue

        if command == 'conf':
            TOP5 = False
            print("Only the most relevant result will be displayed")
            continue

        # Classify the query, then restrict the search to that class.
        t_usr = tfv.transform([cleanup(command)])
        class_ = le.inverse_transform(model.predict(t_usr))[0]
        questionset = data[data['Class'] == class_]

        # The vectoriser was fitted on cleanup()-processed questions, so the
        # candidates must go through the same preprocessing as the query;
        # otherwise their unstemmed words may miss the learned vocabulary.
        candidates = tfv.transform(
            [cleanup(question) for question in questionset['Question']]
        )
        # One similarity per candidate question, as a flat 1-D float array.
        cos_sims = cosine_similarity(candidates, t_usr).ravel()
        inds = get_max5(cos_sims)

        if not TOP5:
            # argmax returns the first best match when several tie.
            best = int(cos_sims.argmax())
            print("Bot:", questionset['Answer'].iloc[best])
        else:
            for ix in inds:
                print("Question: " + questionset['Question'].iloc[ix])
                print("Answer: " + questionset['Answer'].iloc[ix])
                print('-' * 50)

        print("\n" * 2)
        outcome = input("We hope we solved your Query? Yes/No: If Yes Press Q or E to Quit").lower().strip()
        if outcome in ('q', 'e'):
            # The prompt promises that Q/E quits, so honour it here as well.
            print("Bot: Thank You, We hope we were able to Assit you!")
            break
        if outcome != 'no':
            continue

        sugg_choice = input("Bot: Do you want me to suggest you questions ? Yes/No: ").lower().strip()
        if sugg_choice != 'yes':
            continue

        for q_cnt, ix in enumerate(inds, start=1):
            print(q_cnt, "Question: " + questionset['Question'].iloc[ix])
            print('-' * 50)

        choice = input("Please enter the question number you find most relevant: ")
        try:
            num = int(choice)
        except ValueError:
            num = 0  # falls through to the invalid-number message below
        if 1 <= num <= len(inds):
            print("Bot: ", questionset['Answer'].iloc[inds[num - 1]])
        else:
            # Reject 0, negatives and out-of-range numbers instead of crashing
            # or silently wrapping around to another suggestion.
            print("Bot: Sorry, that is not a valid question number.")


# Start the interactive session; this blocks on input() until the user quits with Q or E.
chat()

Follow the Following Steps if needed for a better response

PRESS Q or E to QUIT
TYPE "TOP5" to Display 5 most relevant results
TYPE "CONF" to Display the most confident result


Bot: Hi, Welcome to our E-Assit Chatbot! How may I assit you today?
You: Hi
Bot: Hi! Sure, I'd be happy to help. What product are you looking for?



We hope we solved your Query? Yes/No: no
Bot: Do you want me to suggest you questions ? Yes/No: no
You: how do i track my order?
Bot: You can track your order on our website by logging into your account and selecting the "Order Status" option. You'll be able to see the current status of your order and any updates.



We hope we solved your Query? Yes/No: I want to return an item i bought
You: no
Bot: Could you please explain?
You: I want to return an item i bought
Bot: Hello! You can initiate a return within 30 days of purchase through your account on our website or by contacting our customer support team.



We hope we solved your Query? Yes/No: yes
You: thank y

In [None]:
# `model` is a scikit-learn SVC, which has no Keras-style .save() method, so
# the estimator is serialised with pickle instead of being written as HDF5.
# NOTE(review): answering queries also needs the fitted `tfv` and `le`;
# persist them the same way if the model is to be reused in another session.
# Only unpickle this file from a trusted source: pickle.load can run arbitrary code.
with open('my_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
