## Importing relevant libraries

In [1]:
import nltk, random, string, numpy as np, requests, re, bs4 as bs

In [2]:
import warnings
warnings.filerwarnings = True

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
nltk.download('punkt') #Using punkt tokenizer
nltk.download('wordnet') #Using the wordnet dictonary

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91994\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91994\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import your website links here

In [5]:
links = ['https://my.clevelandclinic.org/health/diseases/21214-coronavirus-covid-19',
         'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public/myth-busters',
         'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/question-and-answers-hub/q-a-detail/coronavirus-disease-covid-19'
        
        ]    

In [6]:
raw_html = ''
for link in links:
    raw_html += " "+ (requests.get(link)).text

In [7]:
#raw_html = r.text
raw_html



# Obtaining text from the html file

In [8]:
#Creating a beautifulsoup object
corpus_html = bs.BeautifulSoup(raw_html)

#Obtaining paragraph from raw html
corpus_paras = corpus_html.find_all('p')
corpus_text = ''

#Combining all paragraphs
for para in corpus_paras:
    corpus_text += ' ' + para.text
    
corpus_text = corpus_text.lower()  



In [9]:
corpus_text



## Removing reference numbers and special characters

In [10]:
#removing article references(if any) like [14].etc 
corpus_text = re.sub(r'\[[0-9]*\]',' ', corpus_text)
corpus_text = re.sub(r'\s+', ' ', corpus_text)

In [11]:
corpus_text 



## Making sentence tokens and word tokens

In [12]:
corpus_sentences = nltk.sent_tokenize(corpus_text)

In [13]:
corpus_sentences

[" now scheduling covid-19 vaccine appointments for ages 12+schedule a vaccine appointmentcovid-19 vaccine faqs going to a cleveland clinic location?new visitation hoursmasks are required for patients and visitors (even if you're vaccinated) coronaviruses are a family of viruses that can cause respiratory illness in humans.",
 'they get their name, “corona,” from the many crown-like spikes on the surface of the virus.',
 'severe acute respiratory syndrome (sars), middle east respiratory syndrome (mers) and the common cold are examples of coronaviruses that cause illness in humans.',
 'the new strain of coronavirus, covid-19, was first reported in wuhan, china in december 2019. the virus has since spread to all continents (except antarctica).',
 'the number of people infected changes daily.',
 'organizations that collect this information, including the world health organization (who) and the centers for disease control and prevention (cdc), are gathering information and continuously lea

## Greeting response

In [14]:
greeting_responses = ['hi', 'hello','hola','hey',"hi there", "greetings"]
greeting_inputs = ['hi', 'hello','hola','hey',"hi there", "greetings"]

def greet_response(greeting):
    for token in greeting.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses) + ', ask me anything about Covid-19, I\'ll clarify'

## Lemmatization and removing punctuation 

In [15]:
#Creating a lemmatization object using a class called nltk.stem.WordNetLemmatizer()
wn_lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_corpus(tokens):
    return [wn_lemmatizer.lemmatize(token) for token in tokens] #Used a list comprehension here

punct_removal_dict = dict((ord(x), None) for x in string.punctuation)
#Creating an ascii dictionary of punctuations to use .translate() ie replace all punctuations with None

def get_processed_text(document):
    return lemmatize_corpus(nltk.word_tokenize(document.lower().translate(punct_removal_dict)))
#lower case it -> remove all punctuations -> tokenize it to list of words -> give the lammatized list of words

# Language Modelling with TF-IDF

In [16]:
def respond(user_input):
    bot_response = ''
    corpus_sentences.append(user_input)
    
    #Vectorizing the user input
    word_vectorizer = TfidfVectorizer(tokenizer= get_processed_text, stop_words= 'english') #Created an object from class
    corpus_word_vectors = word_vectorizer.fit_transform(corpus_sentences)
    
    cos_sim_vectors = cosine_similarity(corpus_word_vectors[-1], corpus_word_vectors)
    similar_response_idx = cos_sim_vectors.argsort()[0][-2]
    
    
    matched_vector = cos_sim_vectors.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0:
        bot_response = bot_response + "I am sorry, I could not understand you"
        return bot_response
    else:
        bot_response = bot_response + corpus_sentences[similar_response_idx]
        return bot_response
    

In [17]:
chat = True
while(chat==True):
    user_query = input()
    user_query = user_query.lower()
    if user_query !='quit':
        if greet_response(user_query) != None:
            print('Dr Bot:'+ greet_response(user_query))
        else:
            print('Dr Bot: ', end = '')
            print(respond(user_query))
            corpus_sentences.remove(user_query)
    else:
        chat = False
        print('Dr Bot: Bye, stay safe')

hi
Dr Bot:hi, ask me anything about Covid-19, I'll clarify
should i wear a mask
Dr Bot: 



stay six feet away from others (“social distancing”) and wear a cloth mask.
what are the symptoms of covid 19
Dr Bot: the cdc says you may have coronavirus if you have these symptoms or combination of symptoms: additional symptoms are possible.
thanks
Dr Bot: I am sorry, I could not understand you
bye
Dr Bot: I am sorry, I could not understand you

Dr Bot: I am sorry, I could not understand you
quit
Dr Bot: Bye
