In [75]:
import nltk
import string
import random
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = open("/content/universe.txt", "r")
raw_doc = data.read()
raw_doc = raw_doc.lower()

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")

word_tokens = nltk.word_tokenize(raw_doc)
sent_tokens = nltk.sent_tokenize(raw_doc)

lemmer = nltk.stem.WordNetLemmatizer()

def lemtokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def lem_normalise(text):
    return lemtokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

# Defining greeting function
greet_inputs = ["hi", "hello", "wassup", "how are you?"]
greet_response = ["hi", "hey", "hey there", "bathike unna"]

def greet(sentence):
    for word in sentence.split():
        if word.lower() in greet_inputs:
            return random.choice(greet_response)
    return None

def response(user_response):
    robo1_response = ''
    Tfidfvector = TfidfVectorizer(tokenizer=lem_normalise, stop_words="english")
    tfidf = Tfidfvector.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx = vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if req_tfidf == 0:
        robo1_response = robo1_response + "I am sorry, I am unable to understand you."
        return robo1_response
    else:
        robo1_response = robo1_response + sent_tokens[idx]
        return robo1_response

flag = True
print("Hey, I am a learning bot. I can help you by answering your questions based on my training.")

while flag:
    user_response = input()
    user_response = user_response.lower()

    if user_response != "bye":
        if user_response == "thank you" or user_response == "thanks":
            flag = False
            print("Bot: You are welcome.")
        else:
            if greet(user_response) is not None:
                print("Bot: " + greet(user_response))
            else:
                sent_tokens.append(user_response)
                word_tokens = word_tokens + nltk.word_tokenize(user_response)
                final_words = list(set(word_tokens))
                print("Bot: ", end="")
                print(response(user_response))
                sent_tokens.remove(user_response)
    else:
        flag = False
        print("Bot: Goodbye.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Hey, I am a learning bot. I can help you by answering your questions based on my training.
hi
Bot: bathike unna
hello
Bot: hey
wassup
Bot: hi
universe
Bot: some physicists have suggested various multiverse hypotheses, in which our universe might be one among many universes that likewise exist.
theory
Bot: the big bang theory is the prevailing cosmological description of the development of the universe.
bye
Bot: Goodbye.


### **Building a chat bot and the process behind it (each and every step)



# Importing necessry libraries

**NLTK**

 NLTK is a versatile and comprehensive library that offers a rich set of tools, data, and algorithms for NLP tasks. It is widely used by researchers, developers, and practitioners in the field of natural language processing and computational linguistics

In [47]:
import nltk
import random   # it is used to choose random text
import numpy as np  #used for numerical computation
import string     #it is used for string operation (we are working with text data)

Downloading the data

In [48]:
data=open("/content/universe.txt")   #reading data by using file handling and saving in a variable
raw_doc=data.read()
raw_doc

'The universe is all of space and time[a] and their contents,[10] including planets, stars, galaxies, and all other forms of matter and energy. The Big Bang theory is the prevailing cosmological description of the development of the universe. According to this theory, space and time emerged together 13.787±0.020 billion years ago,[11] and the universe has been expanding ever since the Big Bang. While the spatial size of the entire universe is unknown,[3] it is possible to measure the size of the observable universe, which is approximately 93 billion light-years in diameter at the present day.\n\nSome of the earliest cosmological models of the universe were developed by ancient Greek and Indian philosophers and were geocentric, placing Earth at the center.[12][13] Over the centuries, more precise astronomical observations led Nicolaus Copernicus to develop the heliocentric model with the Sun at the center of the Solar System. In developing the law of universal gravitation, Isaac Newton 

In [49]:
raw_doc=raw_doc.lower()  #to avoid case sensitive issues
raw_doc

'the universe is all of space and time[a] and their contents,[10] including planets, stars, galaxies, and all other forms of matter and energy. the big bang theory is the prevailing cosmological description of the development of the universe. according to this theory, space and time emerged together 13.787±0.020 billion years ago,[11] and the universe has been expanding ever since the big bang. while the spatial size of the entire universe is unknown,[3] it is possible to measure the size of the observable universe, which is approximately 93 billion light-years in diameter at the present day.\n\nsome of the earliest cosmological models of the universe were developed by ancient greek and indian philosophers and were geocentric, placing earth at the center.[12][13] over the centuries, more precise astronomical observations led nicolaus copernicus to develop the heliocentric model with the sun at the center of the solar system. in developing the law of universal gravitation, isaac newton 

In [50]:
nltk.download("punkt")   #The punkt tokenizer is a pre-trained model used for tokenization, which means it can split text into individual words or sentences
nltk.download("wordnet") #WordNet is a large lexical database of English words. It provides lexical and semantic information, including synonyms, antonyms, hypernyms, hyponyms, and more.
                        #WordNet is often used for various NLP tasks such as word sense disambiguation, semantic similarity, and information retrieval.
nltk.download("omw-1.4") #Open Multi-lingual wordnet
#It includes wordnets for different languages, allowing you to access lexical and semantic information for words in languages other than English.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

**Tokenisation**

In [51]:
sent_tokens=nltk.sent_tokenize(text=raw_doc)
word_tokens=nltk.word_tokenize(text=raw_doc)

In [52]:
sent_tokens[:5]  #glimpse of 5 sentences

['the universe is all of space and time[a] and their contents,[10] including planets, stars, galaxies, and all other forms of matter and energy.',
 'the big bang theory is the prevailing cosmological description of the development of the universe.',
 'according to this theory, space and time emerged together 13.787±0.020 billion years ago,[11] and the universe has been expanding ever since the big bang.',
 'while the spatial size of the entire universe is unknown,[3] it is possible to measure the size of the observable universe, which is approximately 93 billion light-years in diameter at the present day.',
 'some of the earliest cosmological models of the universe were developed by ancient greek and indian philosophers and were geocentric, placing earth at the center.']

In [53]:
word_tokens[:10]   #glimpse of word tokens

['the', 'universe', 'is', 'all', 'of', 'space', 'and', 'time', '[', 'a']

**Lemmatisation**

In [54]:
lemmer=nltk.stem.WordNetLemmatizer()
lemmer

<WordNetLemmatizer>

In [55]:
sent_tokens

['the universe is all of space and time[a] and their contents,[10] including planets, stars, galaxies, and all other forms of matter and energy.',
 'the big bang theory is the prevailing cosmological description of the development of the universe.',
 'according to this theory, space and time emerged together 13.787±0.020 billion years ago,[11] and the universe has been expanding ever since the big bang.',
 'while the spatial size of the entire universe is unknown,[3] it is possible to measure the size of the observable universe, which is approximately 93 billion light-years in diameter at the present day.',
 'some of the earliest cosmological models of the universe were developed by ancient greek and indian philosophers and were geocentric, placing earth at the center.',
 '[12][13] over the centuries, more precise astronomical observations led nicolaus copernicus to develop the heliocentric model with the sun at the center of the solar system.',
 "in developing the law of universal gra

**lemmatisation and normalisimg**

In [57]:
def lemtokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]   #make sure lemmatisation perfomed on each token

remove_punct_dict = dict((ord(punct), punct) for punct in string.punctuation)   #getting to know about index values of punctuation marks

def lem_normalise(text):    #perfoming normalisation on text which we gave and remove punctuation by using translate
    return lemtokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
print(remove_punct_dict)

{33: '!', 34: '"', 35: '#', 36: '$', 37: '%', 38: '&', 39: "'", 40: '(', 41: ')', 42: '*', 43: '+', 44: ',', 45: '-', 46: '.', 47: '/', 58: ':', 59: ';', 60: '<', 61: '=', 62: '>', 63: '?', 64: '@', 91: '[', 92: '\\', 93: ']', 94: '^', 95: '_', 96: '`', 123: '{', 124: '|', 125: '}', 126: '~'}


**Defining greeting functions at start**

In [66]:

# Defining greeting function
greet_inputs = ["hi", "hello", "wassup", "how are you?"]
greet_response = ["hi", "hey", "hey there", "bathike unna"]

def greet(sentence):
    for word in sentence.split():
        if word.lower() in greet_inputs:
            return random.choice(greet_response)
    return None
user_input=input("enter a sentence:   ")
response=greet(user_input)
print(response)

enter a sentence:   wassup
hey


In [73]:
def response(user_response):
    robo1_response = ''
    Tfidfvector = TfidfVectorizer(tokenizer=lem_normalise, stop_words="english")
    tfidf = Tfidfvector.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx = vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if req_tfidf == 0:
        robo1_response = robo1_response + "I am sorry, I am unable to understand you."
        return robo1_response
    else:
        robo1_response = robo1_response + sent_tokens[idx]
        return robo1_response
userr_response=input("enter a repsone")
print(response(userr_response))


enter a repsonebig bang theory
the universe is all of space and time[a] and their contents,[10] including planets, stars, galaxies, and all other forms of matter and energy.


mentioning outline commands along with the response

In [74]:
flag = True
print("Hey, I am a learning bot. I can help you by answering your questions based on my training.")

while flag:
    user_response = input()
    user_response = user_response.lower()

    if user_response != "bye":
        if user_response == "thank you" or user_response == "thanks":
            flag = False
            print("Bot: You are welcome.")
        else:
            if greet(user_response) is not None:
                print("Bot: " + greet(user_response))
            else:
                sent_tokens.append(user_response)
                word_tokens = word_tokens + nltk.word_tokenize(user_response)
                final_words = list(set(word_tokens))
                print("Bot: ", end="")
                print(response(user_response))
                sent_tokens.remove(user_response)
    else:
        flag = False
        print("Bot: Goodbye.")

Hey, I am a learning bot. I can help you by answering your questions based on my training.
hi
Bot: bathike unna
hey
Bot: I am sorry, I am unable to understand you.
hello
Bot: hey there
wassup
Bot: hey
universe
Bot: some physicists have suggested various multiverse hypotheses, in which our universe might be one among many universes that likewise exist.
big bang theory
Bot: the big bang theory is the prevailing cosmological description of the development of the universe.
etymology
Bot: [30][31]

etymology
the word universe derives from the old french word univers, which in turn derives from the latin word universum.
bye
Bot: Goodbye.
