In [0]:
#Description: This is a 'self learning' chatbot program
#Resources:
#(1) https://www.freecodecamp.org/news/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3/ 
#(2) https://github.com/randerson112358/Building-a-Simple-Chatbot-in-Python-using-NLTK 
#(3) http://www.tfidf.com/ 
#(4) https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
#(5) https://youtube.be/QpMsT0Wulul

In [0]:
#Install the package NLTk
pip install nltk



In [0]:
#Install the package newspaper3k
pip install newspaper3k

Collecting newspaper3k
[?25l  Downloading https://files.pythonhosted.org/packages/d7/b9/51afecb35bb61b188a4b44868001de348a0e8134b4dfa00ffc191567c4b9/newspaper3k-0.2.8-py3-none-any.whl (211kB)
[K     |█▌                              | 10kB 15.6MB/s eta 0:00:01[K     |███                             | 20kB 1.7MB/s eta 0:00:01[K     |████▋                           | 30kB 2.3MB/s eta 0:00:01[K     |██████▏                         | 40kB 1.6MB/s eta 0:00:01[K     |███████▊                        | 51kB 1.8MB/s eta 0:00:01[K     |█████████▎                      | 61kB 2.2MB/s eta 0:00:01[K     |██████████▉                     | 71kB 2.4MB/s eta 0:00:01[K     |████████████▍                   | 81kB 2.6MB/s eta 0:00:01[K     |██████████████                  | 92kB 2.9MB/s eta 0:00:01[K     |███████████████▌                | 102kB 2.7MB/s eta 0:00:01[K     |█████████████████               | 112kB 2.7MB/s eta 0:00:01[K     |██████████████████▋             | 122kB 2.7MB/

In [0]:
#Import libraries
from newspaper import Article
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np
import warnings

In [0]:
#Ignore any warning messages
warnings.filterwarnings('ignore')

In [0]:
#Download the packages from NLTK
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [0]:
#Get the article URL
article = Article('https://en.wikipedia.org/wiki/Coronavirus')
article.download()
article.parse()
article.nlp()
corpus = article.text

#Print the corpus/text
print(corpus)

Subfamily of viruses in the family Coronaviridae

Coronaviruses are a group of related viruses that cause diseases in mammals and birds. In humans, coronaviruses cause respiratory tract infections that can be mild, such as some cases of the common cold (among other possible causes, predominantly rhinoviruses), and others that can be lethal, such as SARS, MERS, and COVID-19. Symptoms in other species vary: in chickens, they cause an upper respiratory tract disease, while in cows and pigs they cause diarrhea. There are yet to be vaccines or antiviral drugs to prevent or treat human coronavirus infections.

Coronaviruses constitute the subfamily Orthocoronavirinae, in the family Coronaviridae, order Nidovirales, and realm Riboviria.[5][6] They are enveloped viruses with a positive-sense single-stranded RNA genome and a nucleocapsid of helical symmetry. The genome size of coronaviruses ranges from approximately 27 to 34 kilobases, the largest among known RNA viruses.[7] The name coronaviru

In [0]:
#Tokenization
text = corpus
sent_tokens = nltk.sent_tokenize(text) #Convert the text into a list of sentences

#Print the list of sentences
print(sent_tokens)

['Subfamily of viruses in the family Coronaviridae\n\nCoronaviruses are a group of related viruses that cause diseases in mammals and birds.', 'In humans, coronaviruses cause respiratory tract infections that can be mild, such as some cases of the common cold (among other possible causes, predominantly rhinoviruses), and others that can be lethal, such as SARS, MERS, and COVID-19.', 'Symptoms in other species vary: in chickens, they cause an upper respiratory tract disease, while in cows and pigs they cause diarrhea.', 'There are yet to be vaccines or antiviral drugs to prevent or treat human coronavirus infections.', 'Coronaviruses constitute the subfamily Orthocoronavirinae, in the family Coronaviridae, order Nidovirales, and realm Riboviria.', '[5][6] They are enveloped viruses with a positive-sense single-stranded RNA genome and a nucleocapsid of helical symmetry.', 'The genome size of coronaviruses ranges from approximately 27 to 34 kilobases, the largest among known RNA viruses.'

In [0]:
#Create a dictionary (key:value) pair to remove punctuations
remove_punct_dict = dict(  ( ord(punct),None) for punct in string.punctuation)

#Print the punctuations
print(string.punctuation)

#Print the dictionary
print(remove_punct_dict)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [0]:
#Create a function to return a list of lemmatized lower case words after removing punctuations
def LemNormalize(text):
  return nltk.word_tokenize(text.lower().translate(remove_punct_dict))

#Print the tokenization text
print(LemNormalize(text))

['subfamily', 'of', 'viruses', 'in', 'the', 'family', 'coronaviridae', 'coronaviruses', 'are', 'a', 'group', 'of', 'related', 'viruses', 'that', 'cause', 'diseases', 'in', 'mammals', 'and', 'birds', 'in', 'humans', 'coronaviruses', 'cause', 'respiratory', 'tract', 'infections', 'that', 'can', 'be', 'mild', 'such', 'as', 'some', 'cases', 'of', 'the', 'common', 'cold', 'among', 'other', 'possible', 'causes', 'predominantly', 'rhinoviruses', 'and', 'others', 'that', 'can', 'be', 'lethal', 'such', 'as', 'sars', 'mers', 'and', 'covid19', 'symptoms', 'in', 'other', 'species', 'vary', 'in', 'chickens', 'they', 'cause', 'an', 'upper', 'respiratory', 'tract', 'disease', 'while', 'in', 'cows', 'and', 'pigs', 'they', 'cause', 'diarrhea', 'there', 'are', 'yet', 'to', 'be', 'vaccines', 'or', 'antiviral', 'drugs', 'to', 'prevent', 'or', 'treat', 'human', 'coronavirus', 'infections', 'coronaviruses', 'constitute', 'the', 'subfamily', 'orthocoronavirinae', 'in', 'the', 'family', 'coronaviridae', 'orde

In [0]:
#Keyword Matching

#Greeting Inputs
GREETING_INPUTS = ["hi", "hello", "hola", "how are you", "greetings", "wassup", "hey"]
#Greeting responses back to the user
GREETING_RESPONSES=["howdy", "hi", "hey", "i'm fit." "what's good", "hello", "hey there"]

#Function to return a random greeting response to a users greeting
def greeting(sentence):
  #if the user's input is a greeting, then return a randomly chosen greeting response
  for word in sentence.split():
    if word.lower() in GREETING_INPUTS:
      return random.choice(GREETING_RESPONSES)

In [0]:
#Generate the response
def response(user_response):
  #The users response / query
  #user_response = 'What is chronic kidney disease'

  user_response = user_response.lower() #Make the response lower case

  ###Print the users query/ response
  #print(user_response)

  #Set the chatbot response to an empty string
  robo_response = ''

  #Append the users response to the sentence list
  sent_tokens.append(user_response)

  ###Print the sentence list after appending the users response
  #print(sent_tokens)

  #Create a TfidfVectorizer Object
  TfidfVec = TfidfVectorizer(tokenizer = LemNormalize, stop_words='english')

  #Convert the text to a matrix of TF-IDF features
  tfidf = TfidfVec.fit_transform(sent_tokens)

  ###Print the TFIDF features
  #print(tfidf)

  #Get the measure of similarity (similarity scores)
  vals = cosine_similarity(tfidf[-1], tfidf)

  #Print the similarity scores
  #print(vals)

  #Get the index of the most similar text/sentence to the users response
  idx = vals.argsort()[0][-2]

  #Reduce the dimensionality of vals
  flat = vals.flatten()

  #sort the list in ascending order
  flat.sort()

  #Get the most similar score to the users response
  score = flat[-2]

  #Print the similarity score
  #print(score)

  #If the variable 'score' is 0 then their is no text similar to the users response
  if(score == 0):
    robo_response = robo_response+"I apologize, I don't understand."
  else:
    robo_response = robo_response+sent_tokens[idx]
  
  #Print the chat bot response
  #print(robo_response)
  
  #Remove the users response from the sentence tokens list
  sent_tokens.remove(user_response)
  
  return robo_response

In [0]:
flag = True
print("DOCBot: I am Doctor Bot or DOCBot for short. I will answer your queries about Corona Virus or CoVid-19. If you want to exit, type Bye!")
while(flag == True):
  print("YOU :")
  user_response = input()
  user_response = user_response.lower()
  if(user_response != 'bye'):
    if(user_response == 'thanks' or user_response =='thank you'):
      flag=False
      print("DOCBot: You are welcome !")
    else:
      if(greeting(user_response) != None):
        print("DOCBot: "+greeting(user_response))
      else:
        print("DOCBot: "+response(user_response))       
  else:
    flag = False
    print("DOCBot: Chat with you later !")