<a href="https://colab.research.google.com/github/Teasotea/DialogSystem/blob/main/ConversationalAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
!pip install transformers



In [61]:
!pip install nltk



In [62]:
import numpy as np
import pandas as pd
import time
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import nltk
from nltk.stem.lancaster import LancasterStemmer

In [63]:
stemmer = LancasterStemmer()

# Part I: Greeting Classification

In [64]:
training_data = []
greetings = ['hi', "hola", 'hey', 'hello','morning', 'evening', 'good day', 'good morning', 'greetings', 'howdy', 'welcome', 'bonjour',
             'buenas noches', 'buenos dias', 'salutation', 'salut', 'hail', 'salaam', 'aloha', 'ciao', 'good wishes', 'respects', 'high-five',
             'aloha', 'yoo-hoo', 'yawp', 'psst', 'oh', 'toast', 'ave', "how is it going?", 'yo', 'hi there']
# other = ['face','wisecrack','care','thick','reference','deserve','engine','cry','mud','worth',
#          'railroad','permanent','throne','tradition','loan','employ','resource','privilege','parachute',
#          'rent','of','characteristic','coin','teenager','established','reveal','bad','undress','revoke','ward']
for i in greetings:
  training_data.append({"class":"greeting", "sentence":i})
# for i in other:
#   training_data.append({"class":"other", "sentence":i})
training_data.append({"class":"other", "sentence":'word'})
  
greet_df = pd.DataFrame(training_data)
greet_df

Unnamed: 0,class,sentence
0,greeting,hi
1,greeting,hola
2,greeting,hey
3,greeting,hello
4,greeting,morning
5,greeting,evening
6,greeting,good day
7,greeting,good morning
8,greeting,greetings
9,greeting,howdy


In [65]:
nltk.download('punkt')
corpus_words = {}
class_words = {}
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    class_words[c] = []

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [66]:
classes

['other', 'greeting']

In [67]:
for data in training_data:
    for word in nltk.word_tokenize(data['sentence']):
        # ignore a some things
        if word not in ["?", "'s"]:
            # stem and lowercase each word
            stemmed_word = stemmer.stem(word.lower())
            # have we not seen this word already?
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1

            # add the word to our words in class list
            class_words[data['class']].extend([stemmed_word])

# we now have each stemmed word and the number of occurances of the word in our training corpus (the word's commonality)
print ("Corpus words and counts: %s \n" % corpus_words)
# also we have all words in each class
print ("Class words: %s" % class_words)

Corpus words and counts: {'hi': 2, 'hol': 1, 'hey': 1, 'hello': 1, 'morn': 2, 'ev': 1, 'good': 3, 'day': 1, 'greet': 1, 'howdy': 1, 'welcom': 1, 'bonjo': 1, 'buena': 1, 'noch': 1, 'bueno': 1, 'dia': 1, 'salut': 2, 'hail': 1, 'salaam': 1, 'aloh': 2, 'ciao': 1, 'wish': 1, 'respect': 1, 'high-five': 1, 'yoo-hoo': 1, 'yawp': 1, 'psst': 1, 'oh': 1, 'toast': 1, 'av': 1, 'how': 1, 'is': 1, 'it': 1, 'going': 1, 'yo': 1, 'ther': 1, 'word': 1} 

Class words: {'other': ['word'], 'greeting': ['hi', 'hol', 'hey', 'hello', 'morn', 'ev', 'good', 'day', 'good', 'morn', 'greet', 'howdy', 'welcom', 'bonjo', 'buena', 'noch', 'bueno', 'dia', 'salut', 'salut', 'hail', 'salaam', 'aloh', 'ciao', 'good', 'wish', 'respect', 'high-five', 'aloh', 'yoo-hoo', 'yawp', 'psst', 'oh', 'toast', 'av', 'how', 'is', 'it', 'going', 'yo', 'hi', 'ther']}


In [68]:
# calculate a score for a given class
def calculate_class_score(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # treat each word with same weight
            score += 1
            
            if show_details:
                print ("   match: %s" % stemmer.stem(word.lower() ))
    return score

In [69]:
# we can now calculate a score for a new sentence
sentence = "good day for us to have lunch?"

# now we can find the class with the highest score
for c in class_words.keys():
    print ("Class: %s  Score: %s \n" % (c, calculate_class_score(sentence, c)))

Class: other  Score: 0 

   match: good
   match: day
Class: greeting  Score: 2 



In [70]:
# calculate a score for a given class taking into account word commonality
def calculate_class_score_commonality(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # treat each word with relative weight
            score += (1 / corpus_words[stemmer.stem(word.lower())])

            if show_details:
                print ("   match: %s (%s)" % (stemmer.stem(word.lower()), 1 / corpus_words[stemmer.stem(word.lower())]))
    return score

In [71]:
# now we can find the class with the highest score
for c in class_words.keys():
    print ("Class: %s  Score: %s \n" % (c, calculate_class_score_commonality(sentence, c)))

Class: other  Score: 0 

   match: good (0.3333333333333333)
   match: day (1.0)
Class: greeting  Score: 1.3333333333333333 



In [72]:
# return the class with highest score for sentence
def classify(sentence):
    high_class = 'other'
    high_score = 0
    # loop through our classes
    for c in class_words.keys():
        # calculate score of sentence for each class
        score = calculate_class_score_commonality(sentence, c, show_details=False)
        # keep track of highest score
        if score > high_score:
            high_class = c
            high_score = score

    return high_class

In [73]:
classify("oh! are u a human?")

'greeting'

# Part II: Question Answering

In [74]:
!pip install datasets



In [75]:
import datasets

# [ds for ds in datasets.list_datasets() if 'ml' in ds.lower()]

In [76]:
qa_ds = datasets.load_dataset('squad', streaming = False)
qa_ds

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [77]:
qa_ds['train'].description

'Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n'

In [78]:
print(qa_ds['train'].dataset_size)
qa_ds['train'].features

89846964


{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
 'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None)}

In [79]:
qa_ds['train'].to_pandas().head()

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [80]:
from transformers import BertTokenizer
b_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [81]:
qa_ds['train'] = qa_ds['train'].map(
    lambda x: b_tokenizer(
        x['question'], x['context'], max_length = 512, padding = 'max_length', truncation = True
    ), batched = True, batch_size = 32
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-3538d80e5b2bcda5.arrow


### Pretraining BERT using MLM and NSP

In [82]:
from transformers import BertForPreTraining #for MLM and NSP

In [83]:
bert_qa_model = BertForPreTraining.from_pretrained('bert-base-uncased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Part III: Natural Language Generation

In [84]:
# checkpoint 
checkpoint = "microsoft/DialoGPT-medium"
# download and cache tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# download and cache pre-trained model
modelNLG = AutoModelForCausalLM.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/823M [00:00<?, ?B/s]

In [85]:
#change the code later, make it better
class ChatBot():
    def __init__(self):
        # once chat starts, the history will be stored for chat continuity
        self.chat_history_ids = None
        # make input ids global to use them anywhere within the object
        self.bot_input_ids = None
        # a flag to check whether to end the conversation
        self.end_chat = False
        # greet while starting
        self.welcome()
        self.is_greeting = False
        
    def welcome(self):
        print("Initializing ChatBot ...")
        # some time to get user ready
        time.sleep(2)
        print('Type "bye" or "quit" or "exit" to end chat \n')
        # give time to read what has been printed
        time.sleep(3)

        
    def user_input(self):
        # receive input from user
        text = input("User    >> ")
        # end conversation if user wishes so
        if text.lower().strip() in ['bye', 'quit', 'exit']:
            # turn flag on 
            self.end_chat=True
            # a closing comment
            print('ChatBot >>  See you soon! Bye!')
            time.sleep(1)
            print('\nQuitting ChatBot ...')
        else:
            # continue chat, preprocess input text
            # encode the new user input, add the eos_token and return a tensor in Pytorch
            if classify(text) == 'other':
              self.new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, \
                                                       return_tensors='pt')
            else: 
              self.is_greeting = True
              # self.new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, \
              #                                          return_tensors='pt')


###CHANGE HERE THE CODE
###
###

    def bot_greet(self):
        greeting = np.random.choice([
            "Welcome, I am ChatBot, here for your kind service",
            "Hey, Great day! I am your virtual assistant",
            "Hello, it's my pleasure meeting you",
            "Hi, I am a ChatBot. Let's chat!"
        ])
        print("ChatBot >>  " + greeting)
        self.is_greeting = False

    def bot_response(self):
        # append the new user input tokens to the chat history
        # if chat has already begun
        if self.chat_history_ids is not None:
            self.bot_input_ids = torch.cat([self.chat_history_ids, self.new_user_input_ids], dim=-1) 
        else:
            # if first entry, initialize bot_input_ids
            self.bot_input_ids = self.new_user_input_ids
        
        # define the new chat_history_ids based on the preceding chats
        # generated a response while limiting the total chat history to 1000 tokens, 
        self.chat_history_ids = modelNLG.generate(self.bot_input_ids, max_length=1000, \
                                               pad_token_id=tokenizer.eos_token_id)
            
        # last ouput tokens from bot
        response = tokenizer.decode(self.chat_history_ids[:, self.bot_input_ids.shape[-1]:][0], \
                               skip_special_tokens=True)
        # in case, bot fails to answer
        if response == "":
            response = self.random_response()
        # print bot response
        print('ChatBot >>  '+ response)
        
    def random_response(self):
        i = -1
        response = tokenizer.decode(self.chat_history_ids[:, self.bot_input_ids.shape[i]:][0], \
                               skip_special_tokens=True)
        # iterate over history backwards to find the last token
        while response == '':
            i = i-1
            response = tokenizer.decode(self.chat_history_ids[:, self.bot_input_ids.shape[i]:][0], \
                               skip_special_tokens=True)
        # if it is a question, answer suitably
        if response.strip() == '?':
            reply = np.random.choice(["I don't know", 
                                     "I am not sure"])
        # not a question? answer suitably
        else:
            reply = np.random.choice(["Great", 
                                      "Fine. What's up?", 
                                      "Okay"
                                     ])
        return reply

In [86]:
# build a ChatBot object
bot = ChatBot()
# start chatting
while True:
    # receive user input
    bot.user_input()
    # check whether to end chat
    if bot.end_chat:
        break
    # output bot response
    if bot.is_greeting == False:
      bot.bot_response()  
    else: bot.bot_greet()  

Initializing ChatBot ...
Type "bye" or "quit" or "exit" to end chat 

User    >> bye
ChatBot >>  See you soon! Bye!

Quitting ChatBot ...
