In [30]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
cd /content/drive/My Drive/retrieval based+generative chatbot

/content/drive/My Drive/retrieval based+generative chatbot


Opening and reading the data

In [0]:
data = open("neuroscience.txt","r",errors = "ignore")
data = data.read()

In [0]:
#Converting all the uppercase letters to lowecase
data = data.lower()

Now preprocessing and cleaning the data

In [34]:
#Tokenizing the data(using the sentence tokenizer)
import nltk
import re,string
nltk.download("punkt")
token_sentence = nltk.sent_tokenize(data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
#Now normalizing these tokens(sentences)
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
import unicodedata
from collections import defaultdict
nltk.download('averaged_perceptron_tagger')

def Normalize(text):
    remove_punct = dict((ord(punct), None) for punct in string.punctuation)
    #word tokenization(word tokenizer for creating word as tokens)
    word_token = nltk.word_tokenize(text.lower().translate(remove_punct))
    
    #Removing ascii
    new_words_tokens = []
    for word in word_token:
        #Basically a text normalization process()
        #ascii uses one byte for each character,unicode uses four byte
        #This function returns the normal form for the Unicode string,Valid values for form are ‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’.
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words_tokens.append(new_word)
    
    #Removing tags
    removing_tags = []
    for w in new_words_tokens:
        #text = re.sub(r'\[[0-9]*\]', '', w)
        #text = re.sub(r'\s+', '', w)
        text=re.sub("&lt;/?.*?&gt;","&lt;&gt;",w)
        removing_tags.append(text)
    #removing_tags is a list containing word tokens after removing the punctuations,ascii and the tags
        
    #Now we will perform the pos tagging(Part of Speech tagging) and lemmatization on the list of words
    #Here basically we label each word with appropriate part of speech(making word to its corresponding part of speech in content given as data)
    #Pos tagging is essential for building the lemmantizer which are basically used to reduce word to its root form
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ#For adjective
    tag_map['V'] = wn.VERB#For verb
    tag_map['R'] = wn.ADV#For Adverb
    #Pos tagging basically creates tuples containing word and its corresponding tag
    lemmatizer = WordNetLemmatizer()
    lemmatized_list = []
    removing_tags = [i for i in removing_tags if i]
    for token, tag in nltk.pos_tag(removing_tags):
        lemma = lemmatizer.lemmatize(token, tag_map[tag[0]])
        lemmatized_list.append(lemma)
    return lemmatized_list

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Removing the stopwords(now the output will not be containing definations with starting words as the,then etc.

Or we can also keep the stopwords

In [36]:
print(Normalize(data))

['conscious', 'experience', 'in', 'human', 'depends', 'on', 'brain', 'activity', 'so', 'neuroscience', 'will', 'contribute', 'to', 'explain', 'consciousness', 'what', 'would', 'it', 'be', 'for', 'neuroscience', 'to', 'explain', 'consciousness', 'how', 'much', 'progress', 'have', 'neuroscience', 'make', 'in', 'do', 'so', 'what', 'challenge', 'do', 'it', 'face', 'how', 'can', 'it', 'meet', 'those', 'challenge', 'what', 'be', 'the', 'philosophical', 'significance', 'of', 'it', 'finding', 'this', 'entry', 'address', 'these', 'and', 'related', 'question', 'to', 'bridge', 'the', 'gulf', 'between', 'brain', 'and', 'consciousness', 'we', 'need', 'neural', 'data', 'computational', 'and', 'psychological', 'model', 'and', 'philosophical', 'analysis', 'to', 'identify', 'principle', 'to', 'connect', 'brain', 'activity', 'to', 'conscious', 'experience', 'in', 'an', 'illuminating', 'way', 'this', 'entry', 'will', 'focus', 'on', 'identify', 'such', 'principle', 'without', 'shy', 'away', 'from', 'the',

Now as the data preprocessing part is over so lets make some UI responses at starting the conversation

In [0]:
#Creating the input and output responses of the chatbot
import random
Conversation_starting_input = ["hii","hi","hello","hey","what's up","good morning", "good evening", "morning", "evening"]
conversation_starting_output = ["hii","hi","hello","hi there how can i help you","hey there! Good to see you here. i'm at your service","hey hows you?", "*nods*", "hello, how you doing", "Welcome, I am good and you"]
def starting(user_input):
  for word in user_input.split():
    if word.lower() in Conversation_starting_input:
      return random.choice(conversation_starting_output)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

def generate_response(user_input):
    response = ''
    token_sentence.append(user_input)

    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    all_word_vectors = word_vectorizer.fit_transform(token_sentence)
    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
    similar_sentence_number = similar_vector_values.argsort()[0][-2]

    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0:
        response = response + "I am sorry, I could not understand you"
        return response
    else:
        response = response + token_sentence[similar_sentence_number]
        return response

In [39]:
continue_dialogue = True
print("Hello, I am your friend. You can ask me any question regarding neuroscience: And type bye if you want to leave")
while(continue_dialogue == True):
    human_text = input()
    human_text = human_text.lower()
    if human_text != 'bye':
        if human_text == 'thanks' or human_text == 'thank you very much' or human_text == 'thank you':
            continue_dialogue = False
            print("Most welcome")
        else:
            if starting(human_text) != None:
                print("Hey " + starting(human_text))
            else:
                print("Hey ", end="")
                print(generate_response(human_text))
                article_sentences.remove(human_text)
    else:
        continue_dialogue = False
        print("Good bye and take care of yourself...")

Hello, I am your friend. You can ask me any question regarding neuroscience:
hi
Hey hi
bye
Good bye and take care of yourself...
