# Chat-Bot

### Problem Statement: 


To create a virtual answering tool that will giving meaningful answer to all our questions using Artificial intelligence

### Importing necessary libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

### Text Pre-Processing 

###### Loading Data Set 

In [2]:
df_chat_bot_faq=pd.read_csv("/home/admin1/phoenix/MyDoc/data_sets/chat_bot/Dataset_for_chatbot/Data.csv",encoding='latin1')

In [3]:
"""......................Label Encoding Intent Column......................."""
le = LabelEncoder()
le.fit(df_chat_bot_faq['Intent'])
df_chat_bot_faq["intent_classes"]=le.transform(df_chat_bot_faq['Intent'])

In [4]:
""" Method that will add <EOS> at the end of question and <SOS> at the end of answer"""
def tagger(decoder_input_sentence,question):
    if question:
        return decoder_input_sentence+" <EOS>"
    else:
        return decoder_input_sentence+"<SOS> "

In [5]:
"""Method to do the pre-processing of text"""
def text_pre_processing(text,question=False):
    text = str(text).lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = text.replace("?","")
    text = text.replace(",","")
    return tagger(text,question)

In [6]:
#obtaining the pre-processed questions in a list
chat_questions_cleaned = []
#obtaining the pre-processed answers in a list
chat_answers_cleaned=[]
chat_questions = df_chat_bot_faq['Question'].values
chat_answers=df_chat_bot_faq['Answer'].values
for question in chat_questions:
    chat_questions_cleaned.append(text_pre_processing(question,True))
for answer in chat_answers:
    chat_answers_cleaned.append(text_pre_processing(answer))

In [7]:
"""Making a new DataFrame Of pre-processed questions and answers"""
cleaned_chat=pd.DataFrame({"questions":chat_questions_cleaned,"answers":chat_answers_cleaned,"intent":df_chat_bot_faq['intent_classes'].values})

In [8]:
"""Making a list Of all the sentences in questions and answers"""
sentence_dict=[]
for sentence in cleaned_chat["questions"]:
    sentence_dict.append(sentence)
for sentence in cleaned_chat["answers"]:
    sentence_dict.append(sentence)

In [9]:
"""Obtaining an object of countvectorizer and converting all sentences of question and answers into vectors"""
vectorizer = CountVectorizer()
word_dict_vector = vectorizer.fit(sentence_dict)
questions_vector = vectorizer.transform(cleaned_chat['questions'])
answers_vector =vectorizer.transform(cleaned_chat['answers'])
print("Shape of question vector  : ",questions_vector.shape)
print("Shape of answer vector  : ",answers_vector.shape)

Shape of question vector  :  (123, 413)
Shape of answer vector  :  (123, 413)


In [10]:
"""Obtaing a dictionary of unique words in document and a integer as their key value"""
dict_word={}
list_word=vectorizer.get_feature_names()
for i in range(len(list_word)):
    dict_word[i]=list_word[i]

### Model

In [11]:
"""XG-Boost classifier to find the intent of given question"""
from sklearn.ensemble import GradientBoostingClassifier
classifier= GradientBoostingClassifier(n_estimators=200)

In [12]:
classifier.fit(questions_vector,df_chat_bot_faq["intent_classes"].values)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [13]:
"""Method to find the most simillar question saved in our question to the question asked by user"""
def find_question(vector,object_vectorizer,intent):
    intent=intent[0]
    tf_vectorizer=object_vectorizer
    df=cleaned_chat.loc[cleaned_chat["intent"]==intent]
    tf_vectorizer.fit(np.concatenate((df.questions,df.answers)))
    answer_vector = tf_vectorizer.transform(cleaned_chat.loc[cleaned_chat["intent"]==intent]["questions"])
    result =np.where(vector==1)[1]
    result =[dict_word[i] for i in result]
    inp_ans_vec = tf_vectorizer.transform(result)
    simillarities = cosine_similarity(inp_ans_vec,answer_vector)
    closest = np.argmax(simillarities,axis=1)
    return df.questions.iloc[closest].values[0]

In [14]:
"""Method to return the answer to the question asked"""
def return_result(object_vectorizer,intent,vector):
    intent=intent[0]
    tf_vectorizer=object_vectorizer
    df=df_chat_bot_faq.loc[df_chat_bot_faq["intent_classes"]==intent]
    answer_vector_obj = tf_vectorizer.fit(np.concatenate((df.Question,df.Answer)))
    ques_vector = tf_vectorizer.transform(df_chat_bot_faq.loc[df_chat_bot_faq["intent_classes"]==intent]["Question"].values)
    result =np.where(vector==1)[1]
    result =[dict_word[i] for i in result]
    inp_ans_vec = tf_vectorizer.transform(result)
    simillarities = cosine_similarity(inp_ans_vec,ques_vector)
    closest = np.argmax(simillarities,axis=1)
    return df.Answer.iloc[closest].values[0]  

### Saving Our Model and Other Important Objects in Pickel File 

In [15]:
import joblib
path_of_pkl_objects="/home/admin1/phoenix/repos/project_with_phoenix/chat_bot/chat_bot/pkl_objects/"
joblib.dump(chat_questions_cleaned,path_of_pkl_objects+"chat_questions_cleaned.pkl")
joblib.dump(chat_answers_cleaned,path_of_pkl_objects+"chat_answers_cleaned.pkl")
joblib.dump(sentence_dict,path_of_pkl_objects+"sentence_dict.pkl")
joblib.dump(dict_word,path_of_pkl_objects+"dict_word.pkl")
joblib.dump(vectorizer,path_of_pkl_objects+"count_vectorizer.pkl")
joblib.dump(classifier,path_of_pkl_objects+"classifier.pkl")
joblib.dump(df_chat_bot_faq,path_of_pkl_objects+"df_chat_bot_faq.pkl")
joblib.dump(cleaned_chat,path_of_pkl_objects+"cleaned_chat.pkl")

['/home/admin1/phoenix/repos/project_with_phoenix/chat_bot/chat_bot/pkl_objects/cleaned_chat.pkl']