In [1]:
from datasets import load_dataset

# Load MultiWOZ 2.2 dataset
dataset = load_dataset("multi_woz_v22")

# Print available splits (train, validation, test)
print(dataset)

# View a sample dialogue
print(dataset["train"][0])

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 8437
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
})
{'dialogue_id': 'PMUL4398.json', 'services': ['restaurant', 'hotel'], 'turns': {'turn_id': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'speaker': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], 'utterance': ['i need a place to dine in the center thats expensive', 'I have several options for you; do you prefer African, Asian, or British food?', 'Any sort of food would be fine, as long as it is a bit expensive. Could I get the phone number for your recommendation?', 'There is an Afrian place named Bedouin in the centre. How does that sound?', 'Sounds good, could I get that phone number? Also, could you recommend me an expensive hotel?', "Bedo

In [3]:
total_categories = list(set(element for sublist in dataset['train']['services'] for element in sublist))

print(total_categories)

['restaurant', 'taxi', 'hospital', 'train', 'bus', 'hotel', 'attraction']


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import spacy
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors



In [None]:
import json
import gensim.downloader as api

info = api.info()
print(json.dumps(info, indent=4))

# Retry downloading the pre-trained Word2Vec model
#pretrained_model_path = api.load('word2vec-google-news-300', return_path=True)

{
    "corpora": {
        "semeval-2016-2017-task3-subtaskBC": {
            "num_records": -1,
            "record_format": "dict",
            "file_size": 6344358,
            "reader_code": "https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py",
            "license": "All files released for the task are free for general research use",
            "fields": {
                "2016-train": [
                    "..."
                ],
                "2016-dev": [
                    "..."
                ],
                "2017-test": [
                    "..."
                ],
                "2016-test": [
                    "..."
                ]
            },
            "description": "SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collect

In [19]:

nlp = spacy.load("en_core_web_sm")


pretrained_model_path = "/Users/rocco02/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz"

# Load the downloaded model
pretrained_model = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)


In [30]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [None]:
def calculate_similarity_indices(vector1, vector2):
    # Calculate the cosine similarity between each row in vector1 and vector2
    similarities = cosine_similarity(vector1, vector2)

    # For each row in vector1, find the index of the most similar row in vector2 
    most_similar_indices = np.argmax(similarities, axis=1)
    
    return most_similar_indices


def tokenize_sentence(sentence):
    
    doc = nlp(sentence)
    tokens = [token.text.lower() for token in doc if not token.is_punct]
    return tokens


def embed_sentence(sentence):
    # Split the sentence into tokens
    tokens = tokenize_sentence(sentence)
    
    # Initialize an empty array to store the word embeddings
    embeddings = np.zeros(pretrained_model.vector_size)
    
    n = 0
    # Iterate over each token in the sentence
    for token in tokens:
        # Check if the token is present in the pretrained word2vec model
        if token in pretrained_model:
            # Add the word embedding to the sentence embeddings
            embeddings += pretrained_model[token]
            n+=1
    
    # Normalize the sentence embeddings
    embeddings /= n+1
    
    return embeddings

In [6]:
class ChatBot:

    def __init__(self):
        
        self.dataframes = {}
        self.themes = total_categories
        self.name = 'MarioBot'

        self.presentation = 'Hi, my name is MarioBot, I am a chatbot. I am here to help you with any questions you may have regarding: '


        for theme in self.themes:

            temp_df = pd.read_csv(f"data/{theme}.csv")
            self.dataframes[theme] = temp_df

        self.nlp = None
        self.w2v = None

    def load_models(self):

        self.nlp = spacy.load("en_core_web_sm")

        pretrained_model_path = "/Users/rocco02/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz"
        self.w2v = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

    
    def get_dialogue(self):

        print(f"{self.name}: {self.presentation + ', '.join(self.themes)}.\nPlease ask me a question:")

        self.load_models()
        
        while True:

            query = input("Insert the question: ")

            print(f"\nUser: {query}")

            matching_words =  self.find_matching_words(query)

            topic = matching_words[0]

            if len(matching_words) > 1:
                
                print(f'\n{self.name}: Looks like you are asking about multiple topics. We will solve one topic at time to avoid confusion. We start with the first topic: {matching_words[0]}.')

            elif len(matching_words) == 0:

                print(f"\n{self.name}: I am sorry, I do not have information about that topic. Try to rephrase the question or ask me something else.")

            
            answer = self.find_best_answer(query, topic)
            print(f"\n{self.name}: {answer}")

        
    def tokenize_sentence(self, sentence):
        
        doc = self.nlp(sentence)
        tokens = [token.text.lower() for token in doc if not token.is_punct]
        return tokens


    def embed_sentence(self, sentence):
        # Split the sentence into tokens
        tokens = tokenize_sentence(sentence)
        
        # Initialize an empty array to store the word embeddings
        embeddings = np.zeros(self.w2v.vector_size)
        
        n = 0
        # Iterate over each token in the sentence
        for token in tokens:
            # Check if the token is present in the pretrained word2vec model
            if token in self.w2v:
                # Add the word embedding to the sentence embeddings
                embeddings += self.w2v[token]
                n+=1
        
        # Normalize the sentence embeddings
        embeddings /= n+1
        
        return embeddings


    def find_matching_words(self, query):
        return [word for word in self.themes if word in query]
    
    def find_best_answer(self, query, topic):
        
        embed_query = embed_sentence(query)
        df = self.dataframes[topic]['question_embeddings'].to_numpy()

        most_similar_responce = calculate_similarity_indices(embed_query, df)

        predicted_sentence = df.iloc[most_similar_responce[0]]['answer']

        return predicted_sentence
        

In [9]:
c = ChatBot()
c.dataframes['restaurant'].head()

Unnamed: 0,question,answer
0,i need a place to dine in the center thats exp...,I have several options for you; do you prefer ...
1,"Any sort of food would be fine, as long as it ...",There is an Afrian place named Bedouin in the ...
2,"Sounds good, could I get that phone number? Al...",Bedouin's phone is 01223367660. As far as hote...
3,Hi there! Can you give me some info on Cityroomz?,"Cityroomz is located at Sleeperz Hotel, Statio..."
4,Yes please. I need it for 7 people for 3 night...,How many days would you like to book it for?


In [21]:
def calculate_similarity_indices(vector1, vector2):
    # Calculate the cosine similarity between each row in vector1 and vector2
    similarities = cosine_similarity(vector1, vector2)

    # For each row in vector1, find the index of the most similar row in vector2 
    most_similar_indices = np.argmax(similarities, axis=1)
    
    return most_similar_indices


calculate_similarity_indices([[1, 2, 3]], [[1, 2, 3], [4, 5, 6]])

array([0])