# Importing the required libraries

In [1]:
import numpy as np
import nltk
import string
import random

In [2]:
import warnings
# Silence every warning category for the rest of the session (process-wide side effect).
warnings.filterwarnings('ignore')

# Importing and reading the corpus

In [3]:
# Read the whole corpus into memory. The `with` block guarantees the file handle is
# closed even if reading fails (the handle was previously left open for the session).
# NOTE(review): this is an absolute, machine-specific path; consider a configurable
# relative path so the notebook runs on other machines.
with open('C:/Users/user/OneDrive/Desktop/chatbot.txt', 'r', errors = 'ignore') as f:
    raw_doc = f.read()
raw_doc = raw_doc.lower() # Converts text to lowercase
nltk.download('punkt') # Using the Punkt tokenizer
nltk.download('wordnet') # Using the WordNet dictionary
sentence_tokens = nltk.sent_tokenize(raw_doc) # Converts our doc to list of sentences
word_tokens = nltk.word_tokenize(raw_doc) # Converts our doc to list of words



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Example of sentence tokens

In [4]:
# Show only the first two sentence tokens rather than the whole list.
print(sentence_tokens[:2])

['data science definition:-\n- data science is an interdisciplinary field that combines domain expertise, programming skills, and statistical knowledge to extract meaningful insights and knowledge from data.', 'it encompasses various techniques, algorithms, and tools to analyze and visualize data.']


# Example of word tokens



In [5]:
# Show only the first two word tokens rather than the whole list.
print(word_tokens[:2])

['data', 'science']


# Text preprocessing

In [6]:
# Shared lemmatizer instance, reused by every call to LemTokens.
# WordNet is a semantically-oriented dictionary of English included in NLTK.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return a new list holding the lemmatized form of each token, in input order."""
    lemmas = []
    for tok in tokens:
        lemmas.append(lemmer.lemmatize(tok))
    return lemmas

# Translation table for str.translate: maps every ASCII punctuation code point to None (i.e. delete it).
remove_punct_dict = {ord(punct): None for punct in string.punctuation}

def LemNormalize(text):
    """Lowercase `text`, delete punctuation, tokenize into words and lemmatize each word.

    Returns the list of lemmas produced by LemTokens.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    words = nltk.word_tokenize(without_punct)
    return LemTokens(words)

# Defining the greeting function

In [7]:
GREET_INPUTS = {"hello", "hi", "greetings", "sup", "what's up", "hey", "good day", "howdy", "salutations", "hola", "hey there", "hello there"}
GREET_RESPONSES = ["Hi!", "Hey!", "Hi there!", "Hello!", "I am glad! You are talking to me.", "Greetings!", "Hey, how can I assist you?", "Hello! How can I help you today?", "Hey, nice to see you!", "What's up?"]

def greet(sentence):
    """Return a random greeting if `sentence` contains a known greeting, else None.

    Matching is case-insensitive and works on whole words, so "hi" does not match
    inside "high". Multi-word entries of GREET_INPUTS (e.g. "what's up", "good day")
    are matched as phrases; splitting the sentence into single words alone could
    never match them. Punctuation attached to a word ("hello!") is ignored.
    """
    # Strip punctuation hugging each word ("hi!" -> "hi") but keep inner apostrophes ("what's").
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so a phrase only matches on whole-word boundaries.
    padded = " " + " ".join(word for word in words if word) + " "

    if any(" " + phrase + " " in padded for phrase in GREET_INPUTS):
        return random.choice(GREET_RESPONSES)
    return None
            

# Defining a function for handling unknown inputs

In [8]:
def handle_unknown():
    """Pick, at random, one of the canned replies used when no answer could be found."""
    return random.choice([
        "I'm not sure I understand. Can you please rephrase?",
        "I didn't catch that. Could you please clarify?",
        "I'm sorry, I'm not sure what you mean. Can you try rephrasing that?",
        "I'm having trouble understanding. Could you please say it another way?",
    ])


# Response generation

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer # Tf -> Term frequency, idf -> Inverse document frequency
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
def response(user_response):
    """Return the corpus passage most similar to the user's latest query.

    Contract with the caller: the query must already be the LAST element of the
    global `sentence_tokens` list when this is called. `user_response` itself is
    not read here; the parameter is kept so existing callers keep working.

    The sentences are vectorized with TF-IDF (tokenized by LemNormalize, English
    stop words removed) and the query is compared to each corpus sentence by
    cosine similarity.

    Returns:
        The best-matching corpus sentence joined by a space with the sentence
        that follows it (if any), or a fallback reply from handle_unknown() when
        nothing in the corpus shares a term with the query.
    """
    # Need at least one corpus sentence in addition to the query.
    if len(sentence_tokens) < 2:
        return handle_unknown()

    TfidfVec = TfidfVectorizer(tokenizer = LemNormalize, stop_words = "english")
    tfidf = TfidfVec.fit_transform(sentence_tokens)

    # Compare the query (last row) against the corpus rows only. Ranking the query
    # against itself and taking the runner-up can select the query again whenever a
    # corpus sentence ties with it.
    vals = cosine_similarity(tfidf[-1], tfidf[:-1])
    idx = int(vals[0].argmax())
    req_tfidf = vals[0][idx]

    # A best score of zero means no corpus sentence shares any term with the query.
    if req_tfidf == 0:
        return handle_unknown()

    # Matched sentence plus the one after it; stop before the last element, which
    # is the user's own query and must not be echoed back.
    end_idx = min(len(sentence_tokens) - 1, idx + 2)
    return ' '.join(sentence_tokens[idx:end_idx])

# Defining conversation start/end protocols

In [11]:
flag = True
print("BOT: My name is Jarvis. Let's have a conversation! Also, if you want to exit any time, just type Bye!")

while flag:
    # Normalize once so the exit/thanks checks ignore case and stray whitespace.
    user_response = input("User: ").strip().lower()

    if user_response == 'bye':
        flag = False
        print("BOT: Goodbye! Take care.")

    elif user_response in ('thanks', 'thank you'):
        # Thanking the bot also ends the conversation.
        flag = False
        print("BOT: You are welcome...")

    else:
        # Call greet() once and reuse the result instead of evaluating it twice.
        greeting = greet(user_response)

        if greeting is not None:
            print("BOT: " + greeting)

        else:
            # response() expects the query to be the last element of sentence_tokens.
            sentence_tokens.append(user_response)
            try:
                # NOTE(review): word_tokens / final_words are not read by response()
                # in the visible code; kept in case later cells rely on them.
                word_tokens = word_tokens + nltk.word_tokenize(user_response)
                final_words = list(set(word_tokens))
                print("BOT: " + response(user_response))
            finally:
                # pop() removes exactly the entry appended above. remove() deletes the
                # FIRST equal element, which could be a real corpus sentence when the
                # user types text identical to one. `finally` keeps the corpus clean
                # even if response() raises.
                sentence_tokens.pop()


BOT: My name is Jarvis. Let's have a conversation! Also, if you want to exit any time, just type Bye!
User: Hi
BOT: Hey, nice to see you!
User: Data science definition
BOT: data science definition:-
- data science is an interdisciplinary field that combines domain expertise, programming skills, and statistical knowledge to extract meaningful insights and knowledge from data. it encompasses various techniques, algorithms, and tools to analyze and visualize data.
User: Importance of data science
BOT: importance of data science:-
- data science plays a crucial role in today's data-driven world. it helps businesses make informed decisions, improve products and services, and gain a competitive edge.
User: Challenges in data science
BOT: challenges in data science:-
- data quality: ensuring the accuracy and reliability of data is a persistent challenge in data science. - ethical considerations: handling sensitive or private data requires careful ethical considerations and compliance with reg