In [1]:
import numpy as np
import nltk
import random
import string
import sklearn

# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes (the original left it open for the kernel's lifetime).
# errors='ignore' silently drops bytes that cannot be decoded.
with open('data.txt', 'r', errors='ignore') as f:
    raw = f.read()



In [2]:
# Split the raw corpus two ways: into individual words and into whole sentences.
word_tokens = nltk.word_tokenize(raw)  # list of word/punctuation tokens
sent_tokens = nltk.sent_tokenize(raw)  # list of sentences
# Only the last expression of a cell is displayed, so the first sentence is
# evaluated here but not shown; the first ten word tokens are the cell output.
sent_tokens[0]
word_tokens[:10]

['A', 'chatbot', '(', 'also', 'known', 'as', 'a', 'spy', ',', 'conversational']

In [3]:
# Lemmatization: one shared WordNet lemmatizer instance is reused for every token.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list with the lemma of each entry of ``tokens``, in the same order."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmer.lemmatize(token))
    return lemmas

# Translation table for str.translate: every character of string.punctuation
# maps to None, which makes translate() delete it.
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))

def LemNormalize(text):
    """Lower-case ``text``, delete punctuation, tokenize it and lemmatize each token.

    Returns the list produced by ``LemTokens`` for the cleaned-up text.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    words = nltk.word_tokenize(without_punct)
    return LemTokens(words)

# Smoke-test the preprocessing function on the first corpus sentence. As the last
# expression of the cell, the returned token list is displayed as the cell output.
LemNormalize(sent_tokens[0])


['a',
 'chatbot',
 'also',
 'known',
 'a',
 'a',
 'spy',
 'conversational',
 'bot',
 'chatterbot',
 'interactive',
 'agent',
 'conversational',
 'interface',
 'conversational',
 'ai',
 'talkbot',
 'or',
 'artificial',
 'spy',
 'entity',
 'is',
 'a',
 'computer',
 'program',
 'or',
 'an',
 'artificial',
 'intelligence',
 'which',
 'conduct',
 'a',
 'conversation',
 'via',
 'auditory',
 'or',
 'textual',
 'method']

In [4]:
# Words/phrases that count as a greeting from the user (matched case-insensitively).
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
# Canned replies; one is picked at random whenever a greeting is detected.
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greeting(sentence):
    """Return a random greeting reply if ``sentence`` contains a greeting, else ``None``.

    The sentence is lower-cased and split on whitespace, and leading/trailing
    punctuation is stripped from every word, so "Hello!" and "hi," are recognised.
    Each entry of ``GREETING_INPUTS`` is matched as a contiguous run of whole
    words, which lets the multi-word entry "what's up" match (a plain
    word-by-word comparison can never equal a phrase containing a space)
    without matching substrings such as the "hi" inside "this".
    """
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    for phrase in GREETING_INPUTS:
        target = phrase.split()
        span = len(target)
        # Slide a window of the phrase's length over the user's words.
        if any(words[start:start + span] == target
               for start in range(len(words) - span + 1)):
            return random.choice(GREETING_RESPONSES)
    return None
        
        
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Fit a TF-IDF model on the corpus sentences. LemNormalize is used as the
# tokenizer and scikit-learn's built-in English stop-word list is applied.
# NOTE(review): the recorded output of this cell ends with a fragment mentioning
# 'stop_words.' -- presumably scikit-learn's warning that the stop-word list is
# inconsistent with the custom tokenizer; confirm whether that matters here.
TfidfVec = TfidfVectorizer(
    stop_words='english',
    tokenizer=LemNormalize,
)
tfidf = TfidfVec.fit_transform(sent_tokens)
# Displayed as the cell output.
tfidf.shape

  'stop_words.' % sorted(inconsistent))


(126, 946)

In [6]:

# Match the user's input against the preprocessed corpus sentences.
def response(user_response):
    """Return the corpus sentence most similar to ``user_response``.

    The input is vectorized with the already-fitted ``TfidfVec`` and compared
    with every row of ``tfidf`` by cosine similarity. If the best similarity is
    exactly 0, a fixed apology string is returned instead of a sentence.
    """
    query_vec = TfidfVec.transform([user_response])
    # One row of similarities: the query against each fitted sentence.
    vals = cosine_similarity(query_vec[0], tfidf)
    # argsort is ascending, so the last index of row 0 is the best match.
    idx = vals.argsort()[0][-1]
    best_score = vals[0][idx]
    if best_score == 0:
        return "I am sorry! I don't understand you"
    return sent_tokens[idx]




In [None]:
# Start the bot: read lines from the user until they say bye or thank the bot.
flag = True
print("CHATTY: My name is CHATTY. I will answer your queries about Chatbots. If you want to exit, type Bye!")
while flag:
    # Normalize once: surrounding whitespace is dropped so "bye " still exits.
    user_response = input().strip().lower()
    if user_response == 'bye':
        flag = False
        print("CHATTY: Bye! take care..")
    elif user_response in ('thanks', 'thank you'):
        # Thanking the bot also ends the conversation.
        flag = False
        print("CHATTY: You are welcome..")
    else:
        # Call greeting() once and reuse its result; calling it a second time to
        # print would draw a second, independent random reply.
        greeting_reply = greeting(user_response)
        if greeting_reply is not None:
            print("CHATTY: " + greeting_reply)
        else:
            print("CHATTY: " + response(user_response))
        

## NLP analysis

In [1]:

import spacy
from spacy import displacy

# Load the en_core_web_sm spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over one example sentence. Python 3 string literals are
# already unicode, so no u prefix is needed.
doc1 = nlp("What is the weather today in Seattle?")

# To visualize the results in a browser (http://localhost:5000), uncomment one of:
# displacy.serve(doc1, style="dep")
# displacy.serve(doc1, style="ent")

# http://nlp.stanford.edu:8080/corenlp/process


In [3]:
# Detailed results: one line of linguistic attributes per token ...
for token in doc1:
    print(
        token.text, token.lemma_, token.pos_, token.tag_,
        token.dep_, token.head,
        token.shape_, token.is_alpha, token.is_stop,
    )
# ... followed by one line per named entity: text, character span and label.
for ent in doc1.ents:
    print(" ".join(str(part) for part in (ent.text, ent.start_char, ent.end_char, ent.label_)))
          

What what PRON WP attr is Xxxx True True
is be AUX VBZ ROOT is xx True True
the the DET DT det weather xxx True True
weather weather NOUN NN nsubj is xxxx True False
today today NOUN NN npadvmod weather xxxx True False
in in ADP IN prep weather xx True True
Seattle Seattle PROPN NNP pobj in Xxxxx True False
? ? PUNCT . punct is ? False False
today 20 25 DATE
Seattle 29 36 GPE


In [4]:
# Load the en_core_web_md pipeline; it is used further down for the
# word-vector based comparisons.
nlpd = spacy.load('en_core_web_md')

# Baseline: document similarity against doc1 using the `nlp` pipeline.
doc2 = nlp("Apple is looking at buying U.K. startup for $1 billion")
doc3 = nlp("What's the time now in Singapore?")
print(doc2.similarity(doc1), doc3.similarity(doc1), sep="\n")

# Use nlpd, the model with word vectors, which enables more accurate semantic
# similarity comparison.
tokens = nlpd('king queen man woman')

# Per-token vector information and linguistic attributes.
for token in tokens:
    print(
        token.text, token.has_vector, token.vector_norm, token.is_oov,
        token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )

# Pairwise similarity of every token with every token (including itself).
for token1, token2 in ((left, right) for left in tokens for right in tokens):
    print(token1.text, token2.text, token1.similarity(token2))
        
# Repeat the document-similarity comparison with the nlpd pipeline, using the
# same three sentences as the earlier `nlp`-based comparison.
doc_d = nlpd("What is the weather today in Seattle?")
doc2_d = nlpd("Apple is looking at buying U.K. startup for $1 billion")
doc3_d = nlpd("What's the time now in Singapore?")
print(doc2_d.similarity(doc_d), doc3_d.similarity(doc_d), sep="\n")


0.43775487742397196
0.7431442177902458
king True 7.1417456 False king PROPN NNP compound xxxx True False
queen True 6.8297405 False queen PROPN NNP compound xxxx True False
man True 6.352939 False man PROPN NNP compound xxx True False
woman True 6.8987513 False woman NOUN NN ROOT xxxx True False
king king 1.0
king queen 0.72526103
king man 0.40884617
king woman 0.26556593
queen king 0.72526103
queen queen 1.0
queen man 0.27109137
queen woman 0.40660653
man king 0.40884617
man queen 0.27109137
man man 1.0
man woman 0.7401744
woman king 0.26556593
woman queen 0.40660653
woman man 0.7401744
woman woman 1.0
0.6995620076752264
0.9084785787989824


  "__main__", mod_spec)
  "__main__", mod_spec)
