### Importing the required Libraries 

In [1]:
import numpy as np
import nltk
import random 
import string

### Importing and Reading the Corpus 

In [3]:
# Read the whole corpus and lowercase it so later text matching is
# case-insensitive. The context manager closes the file handle as soon as
# the text has been read; errors='ignore' skips undecodable bytes instead of
# raising.
with open('Chatbot.txt','r',errors='ignore') as f:
  raw_doc=f.read().lower()
# NLTK resources: 'punkt' backs the tokenizers called below; 'wordnet' is
# downloaded for the WordNet lemmatizer used during text preprocessing.
nltk.download('punkt')
nltk.download('wordnet')
sent_tokens=nltk.sent_tokenize(raw_doc)  # corpus split into sentences
word_tokens=nltk.word_tokenize(raw_doc)  # corpus split into words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Example of sentence tokens 

In [4]:
# Show the first two sentence tokens as a quick sanity check of the sentence tokenizer.
sent_tokens[:2]

['data science is an interdisciplinary academic field [1] that uses statistics, scientific computing, scientific methods, processes, algorithms and systems to extract or extrapolate knowledge and insights from noisy, structured, and unstructured data.',
 '[2]\n\ndata science also integrates domain knowledge from the underlying application domain (e.g., natural sciences, information technology, and medicine).']

### Example of Word Tokens 

In [5]:
# Show the first two word tokens as a quick sanity check of the word tokenizer.
word_tokens[:2]

['data', 'science']

### Text Preprocessing 

In [18]:
# WordNet is a semantically oriented dictionary of English included in NLTK.
lemmer=nltk.stem.WordNetLemmatizer()
def LemTokens(token):
  """Lemmatize every token in an iterable of word tokens.

  Args:
    token: iterable of token strings. The parameter keeps its original
      (singular) name so the signature is unchanged, but it is the whole
      collection of tokens, not a single one.

  Returns:
    A list with `lemmer.lemmatize(...)` applied to each token, in order.
  """
  # The previous body iterated over an undefined name `tokens`, which raised
  # NameError on the first call; iterate over the actual argument instead.
  return [lemmer.lemmatize(tok) for tok in token]
# Translation table mapping every code point in string.punctuation to None,
# so that str.translate() deletes those characters.
remove_punct_dict={ord(symbol): None for symbol in string.punctuation}
def LemNormalize(text):
  """Normalize raw text into a list of lemmatized tokens.

  Steps: lowercase the text, delete punctuation via `remove_punct_dict`,
  split into word tokens with NLTK, then lemmatize them with `LemTokens`.
  """
  lowered=text.lower()
  without_punct=lowered.translate(remove_punct_dict)
  words=nltk.word_tokenize(without_punct)
  return LemTokens(words)

### Defining the greeting function  

In [19]:
# Words/phrases recognised as a greeting, and the replies to pick from.
GREET_INPUTS=("hello","hi","greeting","sup","what's up","hey",)
GREET_RESPONSES= ["hi","hey","*nods*","hi there","hello","I am glad! You are talking to me",]
def greet(sentence):
  """Return a random greeting reply if `sentence` contains a greeting.

  The sentence is lowercased and punctuation attached to the edges of each
  word is stripped, so inputs such as "Hello!" or "hi," are recognised.
  Matching is done on whole words/phrases, which lets multi-word entries
  such as "what's up" match while "this" does not match "hi".

  Args:
    sentence: the user's input text.

  Returns:
    A random element of GREET_RESPONSES when a greeting is found,
    otherwise None.
  """
  # Strip only leading/trailing punctuation so the apostrophe in "what's" survives.
  words=[word.strip(string.punctuation) for word in sentence.lower().split()]
  # Pad with spaces so a substring test only matches on word boundaries.
  padded=" "+" ".join(words)+" "
  for greeting in GREET_INPUTS:
    if " "+greeting+" " in padded:
      return random.choice(GREET_RESPONSES)
  return None

### Response Generation 

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
def response(user_response):
  """Return the corpus sentence most similar to the latest user input.

  The function vectorizes the global `sent_tokens` list with TF-IDF and
  compares its LAST entry against every entry, so the caller must append
  the user's text to `sent_tokens` before calling (and remove it afterwards).
  The highest cosine similarity is normally the query matched against
  itself, so the second-highest score identifies the best corpus sentence.

  Args:
    user_response: the user's input text. It is not read directly here;
      the query is taken from `sent_tokens[-1]`.

  Returns:
    The best-matching sentence from `sent_tokens`, or an apology string
    when no sentence has a non-zero similarity to the query.
  """
  # At least one corpus sentence plus the query is required to take the
  # second-best score; otherwise there is nothing to answer with.
  if len(sent_tokens)<2:
    return "I am sorry! I don't understand you"
  TfidfVec=TfidfVectorizer(tokenizer=LemNormalize,stop_words='english')
  tfidf=TfidfVec.fit_transform(sent_tokens)
  # Similarity of the query (last row) against every sentence.
  scores=cosine_similarity(tfidf[-1],tfidf)[0]
  idx=scores.argsort()[-2]
  if scores[idx]==0:
    return "I am sorry! I don't understand you"
  return sent_tokens[idx]

# Keep the earlier misspelled name bound so any caller using it still works.
responce=response

### Defining conversation start/end protocols

In [25]:
# Interactive chat loop: reads a line, answers it, and stops on
# 'bye', 'thanks' or 'thank you'.
flag=True
print("BOT: My name is Helen. Let's have a conversation! Also, if you want to exit any time, just type Bye!")
while flag:
  # Normalize case and surrounding whitespace so " Bye " still ends the chat.
  user_response=input().lower().strip()
  if user_response=='bye':
    flag=False
    print("BOT: Goodbye! Take care <3 ")
  elif user_response in ('thanks','thank you'):
    flag=False
    print("BOT: You are welcome..")
  else:
    # Call greet() once and reuse the result instead of calling it twice.
    greeting=greet(user_response)
    if greeting is not None:
      print("BOT: "+greeting)
    else:
      # The response generator reads the query from the end of sent_tokens,
      # so add it temporarily for the duration of the lookup.
      sent_tokens.append(user_response)
      try:
        word_tokens=word_tokens+nltk.word_tokenize(user_response)
        final_words=list(set(word_tokens))
        print("BOT: ",end="")
        # `responce` (sic) is the name bound by the response-generation cell;
        # calling `response` here raised NameError.
        print(responce(user_response))
      finally:
        # pop() removes exactly the entry appended above, even on error.
        # remove(user_response) would delete the first equal element, which
        # could be a genuine corpus sentence identical to the input.
        sent_tokens.pop()

BOT: My name is Helen. Let's have a conversation! Also, if you want to exit any time, just type Bye!
hi
BOT: hi
hello
BOT: hello
hey
BOT: hello
greeting
BOT: I am glad! You are talking to me
bye
BOT: Goodbye! Take care <3 
