### Import the knowledge graph

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Speakeasy_Project')

Mounted at /content/drive


In [2]:
!pip install rdflib # Install the rdflib package
from rdflib.term import URIRef, Literal
import rdflib
import torch
import numpy as np


Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/562.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m512.0/562.4 kB[0m [31m15.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.1.1


In [61]:
graph = rdflib.Graph()
graph.parse('14_graph.nt', format='turtle')

<Graph identifier=N6d6d0318edd14803af89a2803db02046 (<class 'rdflib.graph.Graph'>)>

### NameSpaces

The entities are stored with different URIs. The most common namespaces are the following:


In [62]:
# define some prefixes
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
DDIS = rdflib.Namespace('http://ddis.ch/atai/')
RDFS = rdflib.namespace.RDFS
SCHEMA = rdflib.Namespace('http://schema.org/')

In [63]:
print('Some subjects from the knowledge graph')
for objs in list(set(graph.subjects()))[:10]:
    print(objs)

print('\n Some objects from the knowledge graph')
for objs in list(set(graph.objects()))[10:20]:
    print(objs)

Some subjects from the knowledge graph
http://www.wikidata.org/entity/Q19637493
http://www.wikidata.org/entity/Q28416606
http://www.wikidata.org/entity/Q15061318
http://www.wikidata.org/entity/Q2986066
http://www.wikidata.org/entity/Q545599
http://www.wikidata.org/entity/Q30782103
http://www.wikidata.org/entity/Q3
http://www.wikidata.org/entity/Q1323212
http://www.wikidata.org/entity/Q504191
http://www.wikidata.org/entity/Q1066948

 Some objects from the knowledge graph
Kemal İnci
Edgar Barens
http://www.wikidata.org/entity/Q1323212
http://www.wikidata.org/entity/Q1066948
Eau d'Heure lakes
tt1579247
tt0072053
1972 film by George A. Romero
American singer, actor and film producer
http://www.wikidata.org/entity/Q3161584


Some ways to access the label of an entity in the graph subjects given it's URI:

In [64]:
for node in graph.subjects():
    if graph.value(subject=node, predicate=RDFS.label): # Check if the triple exists
        print(f"node {node} has label {graph.value(subject=node, predicate=RDFS.label)}")
    break


node http://www.wikidata.org/entity/Q26705136 has label Juan Andrés Arango


We want to check if every subject in the graph has a label

In [65]:
i = 0
j = 0
for node in graph.subjects():
    j += 1
    if graph.value(subject=node, predicate=RDFS.label): # Check if the triple exists
        i += 1

print(f"Number of subjects with a label: {i}\n")
print(f"Number of subjects in the graph: {j}\n")
if i != j:
    print(f"There are {j-i} subject entities without a label")

Number of subjects with a label: 2051387

Number of subjects in the graph: 2056777

There are 5390 subject entities without a label


### Make a dictionary of nodes URIs with the respective labels

We want to make a dictionary in which the keys are the nodes URIs and the values are the nodes labels

In [66]:
# Function to extract the local part of a URI (e.g., after the last / or #)
def extract_label_from_uri(uri, namespaces):
    # Loop through all namespaces and remove the matching part
    for namespace in namespaces:
        if str(uri).startswith(str(namespace)):
            return str(uri).replace(str(namespace), "")
    # If no match, return the original URI
    return str(uri).split('/')[-1]

# Function to build a dictionary of nodes and their labels
def build_node_label_dict(graph, namespaces):
    nodes = {}

    for node in graph.all_nodes():
        if isinstance(node, rdflib.term.URIRef):  # Only process URIs
            # Check if the node has a label
            label = graph.value(node, RDFS.label)

            if label:
                # If label exists, use it
                nodes[node.toPython()] = str(label)
            else:
                # If no label, extract the local part of the URI
                local_label = extract_label_from_uri(node, namespaces)
                nodes[node.toPython()] = local_label

    return nodes

namespaces = [WD, WDT, DDIS, RDFS, SCHEMA]

nodes = build_node_label_dict(graph, namespaces)

# Check the result
for uri, label in nodes.items():
    print(f"URI: {uri}, Label: {label}")
    break

URI: http://www.wikidata.org/entity/Q15061318, Label: Aleksandr Chubaryan


Make an inverse dictionary to find URIs of the entities given the labels

In [67]:
ent2uri = {ent: uri for uri, ent in nodes.items()}

We also make another dictionary specifically for predicates

In [68]:
# Function to build a dictionary of predicates and their labels
def build_pred_label_dict(graph, namespaces):
    predicates = {}

    for node in graph.predicates():
        if isinstance(node, rdflib.term.URIRef):  # Only process URIs
            # Check if the node has a label
            label = graph.value(node, RDFS.label)

            if label:
                # If label exists, use it
                predicates[node.toPython()] = str(label)

            # This condition is never evaluated cause all the predicates have labels
            else:
                # If no label, extract the local part of the URI
                local_label = extract_label_from_uri(node, namespaces)
                predicates[node.toPython()] = local_label

    return predicates

# TODO: change the name of predicates into 'pred2lbl'
predicates = build_pred_label_dict(graph, namespaces)

# Check the result
for uri, label in predicates.items():
    print(f"URI: {uri}, Label: {label}")
    break

URI: http://www.wikidata.org/prop/direct/P27, Label: country of citizenship


Make an inverse dictionary to find URIs of the predicates given the labels

In [69]:
pred2uri = {pred: uri for uri, pred in predicates.items()}

## Embeddings

We will now implement an approach that relies on embeddings rather than querying the graph directly. For this we will need to extract entities from the graph in a more dynamic way and will resort to NER

### NER

We choose model 'Babelscape' because it was already trained on a large wikidata dataset and it is by far the best at recognizing movie titles as 'MISC'

In [70]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)



### Synonyms handling

To account for the presence of synonyms in the question we decided to implement a model that computes the similarity between a phrase and the list of predicates from the knowledge graph and returns the most similar matches

In [71]:
import spacy

# this command downloads the Spacy model
spacy.cli.download("en_core_web_md")

nlp = spacy.load("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [72]:
def find_match(phrase, predicate_dict, n=5, confidence=0.6):
    """
    Given a phrase, a dictionary of predicate values, an integer n, and a confidence threshold,
    return the top n most similar words to the phrase from the dictionary values
    that have a similarity score above the confidence threshold.
    """
    phrase_token = nlp(phrase)
    similarities = []

    # Calculate similarity between phrase and each predicate value
    for predicate in predicate_dict.values():
        predicate_token = nlp(predicate)
        similarity = phrase_token.similarity(predicate_token)

        # Only consider matches above the confidence threshold
        if similarity > confidence:
            similarities.append((predicate, similarity))

    # Sort by similarity in descending order and get the top n matches
    top_n_matches = sorted(similarities, key=lambda x: x[1], reverse=True)[:n]

    # Return only the most similar words
    return [match[0] for match in top_n_matches]


Some weakness of this methods: "Children" is not correctly associated to predicate "Child".

In [73]:
# Example usage
phrase = "release date"
n = 3
print(find_match(phrase, predicates, n))  # Should return the top 3 most similar predicates

['publication date']


### EditDistance Matching

For entities the problem of syninyms is not that relevent because generally we can assume that people's names and movie's titles have no synonims. However we still need to make sure that the entities recognized by the NER algorithm correspond to real entities in the knowledge graph, otherwise we cannot map them to an embedding. To achieve this we can use the match_entity function based on editdistance. This function is also useful for predicates.

In [74]:
import editdistance

def match_entity_editdistance(entity, dictionary=nodes, threshold=5):
    """
    Matches the given entity to the closest node in the dictionary based on edit distance.
    Returns None if the closest match exceeds the specified distance threshold.

    Args:
    - entity (str): The entity to match.
    - dictionary (dict): The graph dictionary with nodes to match against.
    - threshold (int): The maximum allowable edit distance for a match.

    Returns:
    - (str, str) or (None, None): Returns (node_key, node_value) if a match is found within the threshold,
      otherwise returns (None, None).
    """
    tmp = float('inf')  # Start with the highest possible distance
    match_node = None
    match_value = None

    for key, value in dictionary.items():
        # Calculate edit distance between the entity and current node value
        distance = editdistance.eval(value, entity)
        if distance < tmp:
            tmp = distance
            match_node = key
            match_value = value

    # Return None if the closest match exceeds the threshold
    if tmp > threshold:
        return None

    return match_node, match_value, tmp

In [75]:
# Example usage
phrase = "Incption"
print(match_entity_editdistance(phrase, nodes))

('http://www.wikidata.org/entity/Q25188', 'Inception', 1)


## Extracting Predicates

We have implemented the following pipeline to extract predicates from the question:

- Extract meaningful words from the question with spacy
    - For exaple: from question 'who directed...' only 'directed' is extracted
- Generate ngrams from meaningful words
    - If the predicate is made of 2 words like "publication date" the meaningful word would be ['publication', 'date'] which would not be mapped to 'publication date' but to other words. This is why we generate a list of ngrams like ["publication 'date", "publication", "date"].
- Starting with the longest ngram, try to find the predicate from the predicate list that is closest to the ngram. If a match is found, we return it. This means that we prioritize matching longest ngrams
    - Before we compare the ngram to the list of predicates we lemmatize it and turn it into a noun using the verb_to_noun dictionary we wrote. This is because many predicates in the list are in the form "director", "writer" instead of "direct" and "write"
    - We also check if the ngram corresponds exactly to a predicate in the graph via the EditDistance matching function. In tha case the matching predicate is returned immediately


In [76]:
verb_to_noun = {
    "affiliate": "affiliation",
    "animate": "animator",
    "base": "based on",
    "cast": "cast member",
    "characterize": "characters",
    "depict": "depicts",
    "describe": "node description",
    "design": "designed by",
    "distribute": "distributed by",
    "educate": "educated at",
    "employ": "employer",
    "found": "founded by",
    "influence": "influenced by",
    "locate": "location",
    "narrate": "narrator",
    "originate": "country of origin",
    "participate": "participant in",
    "perform": "performer",
    "produce": "producer",
    "publish": "publication date",
    "rate": "rating",
    "receive": "award received",
    "represent": "represented by",
    "screen": "screenwriter",
    "study": "student of",
    "write": "screenwriter",
    "direct": "director",
    "photograph": "director of photography",
    "edit": "film editor",
    "speak": "languages spoken, written or signed",
    "produce": "production company",
    "confer": "conferred by",
    "broadcast": "broadcast by",
    "present": "presented in",
    "voice": "voice actor",
    "film": "filming location",
    "release": "publication date",
    "award": "award received",
    "create": "creator",
    "develop": "developer",
    "choreograph": "choreographer",
    "make": "production company",
    "assemble": "crew member(s)",
    "inspire": "inspired by",
    "contribute": "contributor to the creative work or subject",
    "style": "costume designer",
    "nominate": "nominated for",
    "portray": "cast member",
    "describe": "node description",
    "label": "node label",
    "set": "narrative location",
    "shot": "filming location",
    "character" : "characters"
}


In [77]:
# Check if some values in the dict do not correspond to actual entities in the graph
for value in verb_to_noun.values():
    if value not in predicates.values():
        print(f"{value} to be deleted")

### Check ngram match

We are going to use the check_ngram_match match also to match predicates in the factual question part

In [78]:
def check_ngram_match(ngram, predicate_dict, threshold=2, n=5, confidence=0.6):
    """
    Checks if an n-gram closely matches a predicate in the dictionary.
    First, it attempts an exact or close match based on edit distance.
    If no close match is found, it falls back to finding the best similarity match.

    Args:
    - ngram (str): The n-gram to check.
    - predicate_dict (dict): Dictionary of known predicates.
    - threshold (int): Maximum edit distance for an exact match.
    - n (int): Number of top matches to return for similarity matching.
    - confidence (float): Minimum similarity threshold for a match.

    Returns:
    - list: List containing the best-matching predicate or an empty list if no match is found.
    """
    # Check if the ngram matches a predicate exactly or almost exactly
    if match_entity_editdistance(ngram, dictionary=predicate_dict, threshold=threshold):
        match_node, match_value, _ = match_entity_editdistance(ngram, dictionary=predicate_dict, threshold=threshold)
        return [match_value]

    # Apply lemmatization before similarity matching
    ngram = " ".join([verb_to_noun.get(token.lemma_, token.lemma_) for token in nlp(ngram)])
    matches = find_match(ngram, predicate_dict, n=n, confidence=confidence)

    return matches


In [79]:
def extract_relation_embeddings(sentence, predicate_dict, n=5, confidence=0.6, max_ngram_size=3):
    """
    Extracts the relation from a sentence by finding the most similar predicates,
    prioritizing longer n-grams first. If a match with similarity > confidence
    is found, it returns that result immediately.

    Args:
    - sentence (str): The input sentence from which to extract the relation.
    - predicate_dict (dict): Dictionary of known predicates with their descriptions.
    - n (int): Number of top matches to return.
    - confidence (float): Minimum similarity threshold for a match.
    - max_ngram_size (int): Maximum number of words in an n-gram to consider for matching.

    Returns:
    - list: Top `n` predicate matches that have a similarity score above the confidence threshold.
    """
    # Step 1: Parse the sentence to filter stop words and prioritize key phrases
    doc = nlp(sentence)
    meaningful_words = [token.text for token in doc if not token.is_stop and token.is_alpha]

    # Step 2: Generate prioritized n-grams from meaningful words (starting with the longest n-grams)
    ngrams = []
    for size in range(max_ngram_size, 0, -1):  # Start with larger n-grams
        ngrams += [" ".join(meaningful_words[i:i+size]) for i in range(len(meaningful_words) - size + 1)]

    # Step 3: Check each n-gram for similarity, starting with the longest
    for ngram in ngrams:
        matches = check_ngram_match(ngram, predicate_dict, threshold=2, n=n, confidence=confidence)
        if matches:
            return matches

    # Step 4: If no matches above the confidence threshold are found, return an empty list
    return []


In [80]:
sentence = "Who is the director of ?"
relation = extract_relation_embeddings(sentence, predicates, n=3, confidence=0.5)
print("Extracted Relation:", relation)

Extracted Relation: ['director']


## Extract embeddings from the files

We extract embeddings from the files. We will explain how to use them after the process_question function. Since there is a problem with the relation embeddings we need to extract them now and account for that in the process_question function

In [81]:
'''import numpy as np
import csv

entity_matrix = np.load('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/entity_embeds.npy')
predicate_matrix = np.load('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/relation_embeds.npy')

with open('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/entity_ids.del') as ifile:
    ent2id = {ent: int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
with open('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/relation_ids.del') as ifile:
    pred2id = {rel: int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2pred = {v: k for k, v in pred2id.items()}'''

"import numpy as np\nimport csv\n\nentity_matrix = np.load('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/entity_embeds.npy')\npredicate_matrix = np.load('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/relation_embeds.npy')\n\nwith open('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/entity_ids.del') as ifile:\n    ent2id = {ent: int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}\n    id2ent = {v: k for k, v in ent2id.items()}\nwith open('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/relation_ids.del') as ifile:\n    pred2id = {rel: int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}\n    id2pred = {v: k for k, v in pred2id.items()}"

In [None]:
#Abu Colab Code

import numpy as np
import csv

file_path_embeds = "/content/drive/My Drive/Speakeasy_Project/ddis-graph-embeddings/entity_embeds.npy"
entity_matrix = np.load(file_path_embeds)

file_path_predicate = "/content/drive/My Drive/Speakeasy_Project/ddis-graph-embeddings/relation_embeds.npy"
predicate_matrix = np.load(file_path_predicate)

# Assuming 'entity_ids.del' is in the same directory as other files
file_path_entity_ids = "/content/drive/My Drive/Speakeasy_Project/ddis-graph-embeddings/entity_ids.del"  # Construct the path for entity_ids.del

# Use the constructed path for opening the file
with open(file_path_entity_ids) as ifile:  # Changed 'entity_ids.del' to file_path_entity_ids
    ent2id = {ent: int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}

# Assuming 'relation_ids.del' is in the same directory as other files
file_path_relation_ids = "/content/drive/My Drive/Speakeasy_Project/ddis-graph-embeddings/relation_ids.del"  # Construct the path for relation_ids.del

# Use the constructed path for opening the file
with open(file_path_relation_ids) as ifile:  # Changed 'relation_ids.del' to file_path_relation_ids
    pred2id = {rel: int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2pred = {v: k for k, v in pred2id.items()}

### Predicates without embeddings

There seems to be a problem with the embeddings. Some of them are missing

In [82]:
print(f"predicates list: {len(predicates)}")
print(f"relation embeddings list: {len(predicate_matrix)}\n")
#pred2id['http://www.wikidata.org/prop/direct/P577']
pred_without_embeddings = []
# Which predicates are missing an embedding?
for predicate in predicates.values():
    try:
        id = pred2id[pred2uri[predicate]]
    except KeyError:
        print(f"{predicate} has no embedding")
        pred_without_embeddings.append(predicate)

predicates list: 255
relation embeddings list: 248

tag has no embedding
IMDb ID has no embedding
node label has no embedding
image has no embedding
node description has no embedding
publication date has no embedding
box office has no embedding
rating has no embedding


# Error handling

If our model is unable to find the answer we provide a human like response using the paraphrasing model "pegasus". Here is a dimostration of how we plan to use it

In [83]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [92]:
question = "Who is the main character of The Masked Gang: Cyprus? "
num_beams = 10
num_return_sequences = 3
context = f"{question} i don't know"
get_response(context,num_return_sequences,num_beams)

["I don't know who the main character is in The Masked Gang: Cyprus.",
 "I don't know who the main character of The Masked Gang is.",
 "I don't know who the main character is of The Masked Gang: Cyprus."]

# Handle multiple questions at once

We also implemented a way to split the question into 2 subquestions using spacy, so that we can answer them separately

In [127]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_md")

def split_questions(text):
    doc = nlp(text)
    questions = []
    current_question = []

    for token in doc:
        # Add token to the current question
        current_question.append(token.text)

        # If token is a conjunction like 'and', treat it as a potential separator
        if token.dep_ == "cc" and token.text.lower() == "and":
            # Join tokens accumulated so far and start a new question
            questions.append(" ".join(current_question[:-1]))
            current_question = []

    # Add the final question after loop ends
    if current_question:
        questions.append(" ".join(current_question))

    return [q.strip() for q in questions if q.strip()]

# Example compound question
question = "Who is the director of Star Wars and who is the screenwriter of inception"
split_questions_list = split_questions(question)

# Result
print(split_questions_list)


['Who is the director of Star Wars', 'who is the screenwriter of inception']


# NER Pipeline

We can now combine all these functions to successfully extract both predicates and entities from a question. We will use the model "..." (we can still decide on a different model) to recognize entities and proceed in the following way:
- Preprocess the question so that special characters that hold no important meaning like ! or : are removed
- Extract a list of dictionaries with all the entities from the question using NER
    - The dictionaries will look like: {'entity_group' : 'PER', 'word': Andrew Garfield'}
- Map the extracted entities to actual nodes in the graph via the editdistance function
    - If the distance from the entity in the question and the closest entity in the graph is > 5 then no entity is matched
    - If the distance from the entity in the question and the closest entity in the graph is < 5 but > 1 then we prompt the chatbot to ask the user to verify if they matched the right enitity
- Remove the entities from the question
- Pass the question without entities to the predicate_extraction function
- Add the extracted predicates to the list of dictionaries as {'entity_group' : 'predicate', 'word': 'screenwriter'}


Some notes on how to handle the 'Entity matching too distant' case. In the final notebook with the speakeasy infrastructure you should make a variable with the matched entities that were too distant. so that they are stored for generating the next message in case they answer 'yes' to the question 'did you mean -matched_entity-?'

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

In [125]:
import re

def preprocess_question_embeddings(question):
    # Remove symbols like :, !, -, etc., by replacing them with an empty string
    cleaned_question = re.sub(r'[:!\\-]', '', question)
    # Remove any extra spaces that might result from removing symbols
    cleaned_question = re.sub(r'\s+', ' ', cleaned_question).strip()
    return cleaned_question

# Define a function to extract entities and relation from a given question
def extract_entities_NER(question, predicate_dict=predicates, n=5, confidence=0.5, max_ngram_size=3):
    extracted_entities = []

    exit_status = ""
    question = preprocess_question_embeddings(question)
    print(f"Question after preprocessing: {question}\n")

    # Step 1: Use the NER pipeline to get entities in the question
    entities = ner_pipeline(question)

    recommendation_keywords = ["Recommend", "recommend", "Recommendation", "recommendation", "Suggest", "suggest", "Suggestion", "suggestion", "like"]


    # If there are no entities in the question return (maybe prompt the user to double check if the capitalized the right letters)
    if entities:

        # Step 2: Turn dictionaries in entities into simplified dictionaries and concatenate words to join_entity['word']
        for entity in entities:
            simplified_entity = {
                'entity_group': entity['entity_group'],
                'word': entity['word']
            }
            extracted_entities.append(simplified_entity)



        # Step 3: Remove extracted entities from the question to isolate the predicate phrase
        question_no_entities = question
        for entity in extracted_entities:
            print(f"Extracted entity: {entity['word']}\n")
            # Convert both the question and entity to lowercase for consistent replacement
            question_no_entities = re.sub(r'\b' + re.escape(entity['word'].lower()) + r'\b', '', question_no_entities.lower(), flags=re.IGNORECASE)

        # Replace multiple spaces with a single space and trim leading/trailing whitespace
        question_no_entities = re.sub(r'\s+', ' ', question_no_entities).strip()

        print(f"Question after removing entities: {question_no_entities}\n")

        # Step 3.5: Match each entity to the closest node in the graph. Remove them if there is no match
        for entity in extracted_entities:

            if match_entity_editdistance(entity['word'], threshold=5):
                match_node, match_value, distance = match_entity_editdistance(entity['word'])

                # If the closest entity we can find in the graph is still distant, return the best matched value
                # with exit status Entity matching too distant. Then ask the user if the match_value actually
                # corresponds to what they wanted
                if distance > 5:
                    exit_status = 'Entity matching too distant'
                    return match_value, exit_status
                else:
                    # Update 'word' in entity to be the best-matching node's label
                    entity['word'] = match_value
            else:
                # Remove the entity from extracted_entities if no match was found
                extracted_entities.remove(entity)

    else:
        exit_status = 'No entities found by NER'

    liked_movies = []
    if any(keyword in question for keyword in recommendation_keywords):
        #extracted_entities.append({'entity_group': 'recommend_action', 'word': 'recommend'})
        liked_movies = [entity['word'] for entity in extracted_entities if entity['entity_group'] == 'MISC']
        print(f"Liked movies: {liked_movies}\n")
        return liked_movies, exit_status



    # Step 4: Extract the relation from the modified question using the extract_relation function
    relations = extract_relation_embeddings(question_no_entities if entities else question, predicates, n=n, confidence=confidence, max_ngram_size=max_ngram_size)

    # Step 4.5: Check if the extracted relations have an embedding
    for relation in relations:
        if relation in pred_without_embeddings:
            exit_status = 'predicate missing embedding'
            return relation, exit_status

    # Step 5: Add the relation to the extracted_entities list if a match is found
    '''if relations:
        print(f"Extracted predicates: {relations}\n")
        extracted_entities.append({'entity_group': 'predicate', 'word': []})
        for relation in relations:
            extracted_entities[-1]['word'].append(relation)'''

    if relations:
        extracted_entities.append({'entity_group': 'predicate', 'word': []})
        for relation in relations:
            if relation in pred2uri and pred2uri[relation] in pred2id:  # Check if relation has embedding
                extracted_entities[-1]['word'].append(relation)

    return extracted_entities, exit_status


In [None]:
sentence = "Who is the director of The Godfather"
extracted_entities, _ = extract_entities_NER(sentence, predicates, n=2, confidence=0.6)
print("Extracted entities:", extracted_entities[0])
print("Exit status:", _)

## Turn labels into Embeddings

Now that we have a reliable way of extracting entities and predicates from the question we can turn them into embeddigs:

ent2id can be used to retrieve the index of an entity in the embedding matrix given it's Uri. Retriving the embedding of an entity given it's label would look like this:

In [None]:
entity_label = 'The Godfather'

# Turn label into URI
Uri = ent2uri[entity_label]
print(f"The URI of {entity_label} is {Uri}\n")

# Turn URI into a row index
id = ent2id[Uri]
print(f"The id of {entity_label} is {id}\n")

# Look up the row index in the embedding matrix
entity_embedding = entity_matrix[id]
print(f"The embedding of {entity_label} has lenght {len(entity_embedding)}\n") # I don't print it cause it's long


In [None]:
entity_label = 'director'

# Turn label into URI
Uri = pred2uri[entity_label]
print(f"The URI of {entity_label} is {Uri}\n")

# Turn URI into a row index
id = pred2id[Uri]
print(f"The id of {entity_label} is {id}\n")

# Look up the row index in the embedding matrix
entity_embedding = predicate_matrix[id]
print(f"The embedding of {entity_label} has lenght {len(entity_embedding)}\n") # I don't print it cause it's long

### Extract embeddings

We write a function to make embedding retrival more straightforward:

In [None]:
def extract_embedding(label, type='entity'):

    if type=='entity':
        pipeline = [ent2uri, ent2id, entity_matrix]
    else:
        pipeline = [pred2uri, pred2id, predicate_matrix]


    Uri = pipeline[0][label]

    # Turn URI into a row index
    id = pipeline[1][Uri]

    # Look up the row index in the embedding matrix
    entity_embedding = pipeline[2][id]

    return entity_embedding

In [None]:
# Example usage

entity_label = 'The Godfather'
entity_embedding = extract_embedding(entity_label)
print(f"The embedding of {entity_label} has lenght {len(entity_embedding)}\n")

pred_label = 'director'
pred_embedding = extract_embedding(pred_label, 'predicate')
print(f"The embedding of {pred_label} has lenght {len(pred_embedding)}\n")

### Extract labels

We need also a way to turn an embedding into a label

In [None]:
def extract_label(embedding, type='entity'):

    if type=='entity':
        pipeline = [entity_matrix, id2ent, nodes]
    else:
        pipeline = [predicate_matrix, id2pred, predicates]

    # Find the index in the entity embeddings matrix that corresponds to the embedding vector
    id = np.where((pipeline[0] == embedding).all(axis=1))[0][0]

    # Turn the id into a URI
    Uri = pipeline[1][id]

    # Turn the URI into a label
    label = pipeline[2][Uri]

    return label


In [None]:
# Example usage

entity_label = 'The Godfather'
entity_embedding = extract_embedding(entity_label)
print(f"The embedding of {entity_label} has lenght {len(entity_embedding)}\n")

# Turn the embedding back into a label
label = extract_label(entity_embedding)
print(f"The extracted label for entity: {entity_label} is {label}\n")

pred_label = 'characters'
pred_embedding = extract_embedding(pred_label, 'predicate')
print(f"The embedding of {pred_label} has lenght {len(pred_embedding)}\n")

# Turn the embedding back into a label
label = extract_label(pred_embedding, 'predicate')
print(f"The extracted label for predicate: {pred_label} is {label}")

### Evaluate embeddings similarity

Given the embedding of an entity we want to find the most similar entities in the graph to said entity

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
def find_similarities(embedding, n):

    embedding = np.atleast_2d(embedding)

    answer = []

    dist = pairwise_distances(embedding, entity_matrix)
    for idx in dist.argsort().reshape(-1)[:n]:
        answer.append(nodes[id2ent[idx]])

    return answer

In [None]:
# Example Usage

entity_embedding = extract_embedding('Batman')

print(find_similarities(entity_embedding, 5))

##RecSys


The NER extracts a list of movies that the user likes and appends them to a list. Based on the liked movies, the RecSys will generate recommendations by leveraging both the embeddings of these movies and their genres. First, it checks if each liked movie exists in a knowledge graph. If a movie isn't found, it attempts to find the closest matching entity using the editdistance function. The embeddings for the liked movies (or their closest matches) are then averaged to create a representative vector.

Using cosine similarity, it calculates how similar this average embedding is to all other movie embeddings in the graph. It sorts these similarities to identify the most similar movies while excluding any that are already in the question or their closest matches. Finally, it returns a specified number of unique movie recommendations that are not part of the user's liked list.

In [128]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies_by_genre(liked_movies, n=5):
    """
    Recommends movies similar to the user's liked movies based on embeddings and genre.
    Uses closest matching entity if the exact movie is not in the knowledge graph.

    Args:
        liked_movies (list): List of movie labels liked by the user.
        n (int): Number of recommendations to generate.

    Returns:
        list: List of recommended movie labels.
    """
    liked_movies_embeddings = []
    for movie in liked_movies:
        #print(f"Processing movie: {movie}")
        if movie in ent2uri:  # If the movie is in the graph, use its embedding directly
            liked_movies_embeddings.append(extract_embedding(movie))
            #print(liked_movies_embeddings)


        else:  # If not, find the closest matching entity
            threshold = max(3, int(len(movie) * 0.9))  # Adjust threshold
            match = match_entity_editdistance(movie, threshold=threshold)
            if match:
                match_node, match_value, _ = match
                # Check if the match_value is a valid key before proceeding
                if match_value in ent2uri:
                    liked_movies_embeddings.append(extract_embedding(match_value))
                    print(f"Using '{match_value}' instead of '{movie}' for recommendation.")
                else:
                    print(f"Closest match '{match_value}' not found in embedding data. Skipping.")
            else:
                print(f"Could not find a close match for '{movie}' in the knowledge graph.")

    if not liked_movies_embeddings:
        return "None of the provided movies were found in the knowledge graph."

    # Calculate the average embedding of liked movies (or their closest matches)
    #print(f"Liked movies embeddings: {liked_movies_embeddings}")
    avg_embedding = np.mean(liked_movies_embeddings, axis=0)

    # Calculate similarity to all other movie embeddings
    similarities = cosine_similarity(avg_embedding.reshape(1, -1), entity_matrix)

    # Get indices of most similar movies (excluding liked movies and their close matches)
    sorted_indices = similarities.argsort()[0][::-1]  # Sort in descending order

    excluded_movies = set(liked_movies + [match_value for _, match_value, _ in [match_entity_editdistance(m) for m in liked_movies if match_entity_editdistance(m)]if match_value])

    # Filter recommended indices, excluding liked movies and their close matches
    # The change is here: Add a check if id2ent[i] is in nodes before accessing it
    recommended_indices = [i for i in sorted_indices if id2ent[i] in nodes and nodes[id2ent[i]] not in excluded_movies]

    # Return labels of recommended movies
    recommendations = [nodes[id2ent[i]] for i in recommended_indices[:n]]
    return recommendations

# Answer questions with Embeddings

We can now use the following pipeline for answering questions:
- Extract the entities and relation from the question
- Turn entities and relation into embeddings
- If the entity is a subject, retrieve the object by: _object = subject + relation_
- If the entity is an object, retrieve the subject by _subject = object - relation_

In [126]:
def answer_question_embeddings(question):
    entities, exit_status = extract_entities_NER(question, predicates, n=3, confidence=0.6)
    recommendation_keywords = ["Recommend", "recommend", "Recommendation", "recommendation", "Suggest", "suggest", "Suggestion", "suggestion", "like"]
    liked_movies = []

    # Handle cases based on exit_status
    if exit_status == 'No entities found by NER':
        return "We could not find any entities in the question. Could you verify that you have capitalized the right letters, such as movie titles or people’s names?"

    elif exit_status == 'Entity matching too distant':
        #match_value, _ = entities  # entities contains the match value in this case
        #return f"The closest entity match found was '{match_value}', but it seems too distant. Could you rephrase it or specify it more clearly?"
        # Check if entities is a tuple with more than 2 elements
        if isinstance(entities, tuple) and len(entities) > 2:
            # If so, assume the first element is the match value
            match_value = entities[0]
        else:
            # Otherwise, unpack as before if there are only two elements
            try:
                match_value, _ = entities
            except (TypeError, ValueError):
                # Handle cases where entities is not iterable or has unexpected format
                return "Error: Unexpected format for entities. Please check the extract_entities_NER function."
        return f"The closest entity match found was '{match_value}', but it seems too distant. Could you rephrase it or specify it more clearly?"


    elif exit_status == 'predicate missing embedding':
        relation, _ = entities  # entities contains the relation in this case
        return f"Unfortunately, we were not provided with an embedding for the relation '{relation}'. Please try another question."

    elif any(keyword in question for keyword in recommendation_keywords):
        # Assuming extract_entities_NER returns a tuple of (match_value, liked_movies)
        # for recommendation-type queries
        liked_movies = entities
        # If entities contains more than two elements, assume the liked movies
        # are stored in the second element

        return recommend_movies_by_genre(liked_movies)
    # Proceed if everything worked correctly
    # Check if entities is a list of dictionaries before proceeding
    if not exit_status and isinstance(entities, list) and all(isinstance(item, dict) for item in entities):
        extracted_predicates = [d['word'] for d in entities if d['entity_group'] == 'predicate']

        # Check if extracted_predicates is empty
        if extracted_predicates:
            extracted_predicates = extracted_predicates[0]  # Access the first element only if it exists
        else:
            # Handle the case where no predicates are found (e.g., return an error message or a default value)
            return "No predicate found in the question."  # Or handle it differently

        extracted_entities = [d['word'] for d in entities if d['entity_group'] != 'predicate']
        # Extract predicates and entities
        #extracted_predicates = [d['word'] for d in entities if d['entity_group'] == 'predicate'][0]
        #extracted_entities = [d['word'] for d in entities if d['entity_group'] != 'predicate']

        # Convert predicates and entities to embeddings
        predicates_embeddings = [extract_embedding(pred, 'predicate') for pred in extracted_predicates]
        entities_embeddings = [extract_embedding(ent) for ent in extracted_entities]

        # Compute answer using similarity function
        answer = find_similarities(entities_embeddings[0] + predicates_embeddings[0], 3)
        return answer
    else:
        return "Error: Invalid format for entities. Please check the extract_entities_NER function."

In [124]:
# Example Usage

question = "Who is the director of Star Wars: Episode VI - Return of the Jedi?"

print(answer_question_embeddings(question))

Question after preprocessing: Who is the director of Star Wars Episode VI Return of the Jedi?

Extracted entity: Star Wars Episode VI Return of the Jedi

Question after removing entities: who is the director of ?

['George Lucas', 'Anthony Daniels', 'Ellis Rubin']


In [120]:
question1 = "Given that I like Inception and Jumanji, can you recommend some movies?"

print(answer_question_embeddings(question1))


Question after preprocessing: Given that I like Inception and Jumanji, can you recommend some movies?

Extracted entity: Inception

Extracted entity: Jumanji

Question after removing entities: given that i like and , can you recommend some movies?

Liked movies: ['Inception', 'Jumanji']

['Bee Movie', 'The Wolverine', 'The Little Mermaid', 'Through the Wormhole', 'The Rocketeer']


In [121]:
question2 = "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween."

print(answer_question_embeddings(question2))

Question after preprocessing: Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.

Extracted entity: Nightmare on Elm Street

Extracted entity: Friday the 13th

Extracted entity: Halloween

Question after removing entities: recommend movies like , , and .

Liked movies: ['A Nightmare on Elm Street', 'Friday the 13th', 'Halloween']

['A Nightmare on Elm Street 4: The Dream Master', 'Final Destination', 'Jason X', 'A Nightmare on Elm Street 5: The Dream Child', 'A Nightmare on Elm Street 3: Dream Warriors']


In [122]:
question3 = "Given that I like Madagascar 1, Pocahontas, and Rio, can you recommend some movies? "

print(answer_question_embeddings(question3))

Question after preprocessing: Given that I like Madagascar 1, Pocahontas, and Rio, can you recommend some movies?

Extracted entity: Madagascar 1

Extracted entity: Pocahontas

Extracted entity: Rio

Question after removing entities: given that i like , , and , can you recommend some movies?

Liked movies: ['Madagascar', 'Pocahontas', 'Rio']

['Robots', 'Home', 'Penguins of Madagascar', 'Shrek Forever After', "Madagascar 3: Europe's Most Wanted"]


In [123]:
question6 = "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?"

print(answer_question_embeddings(question6))

Question after preprocessing: Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?

Extracted entity: The Lion King

Extracted entity: Pocahontas

Extracted entity: The Beauty and the Beast

Question after removing entities: given that i like , , and , can you recommend some movies?

Liked movies: ['The Lion King', 'Pocahontas', 'Beauty and the Beast']

["The Lion King II: Simba's Pride", 'The Lion Guard', 'The King', 'The Little Mermaid', 'The Great Mouse Detective']


# TODO

- Implement way to handle double questions like "who is the director of ... AND who is the screenwriter of ...."
- Finish and perfect factual questions queries
- Implement language model to generate more realistic responses

## Factual question answering

For factual question we will proceed in the following way:
- Write a list of common question patterns using re to extract relations and entities from them
- Extract the probable relations and entities from the question matching it to the pattern and map those probable entities to actual entities in the graph using edit distance or embedding similarity with spacy
- Generate a custom sparql query based on the question pattern matched
- Query the graph with the custom query

### Step 1: Write a list of question patterns
The question pattern contains:
- The actual pattern like r"(?:find|which) movies.*contain(?:s)?(?: the word)? (?P<word>\w+)"
- A string associated to that specific question type which we are going to use to map question type to custom queries
- A boolean value that is 1 if we want to match the entities from the question to actual entities in the graph and 0 if we don't. One example could be question type: r"movies rated below (?P<number>\d+(\.\d+)?)" which asks the chatbot to list all the movies rated below a certain score. In this case there are no entities to be retrieved from the graph


In [None]:
import re

question_patterns = [

    # Pattern 0: who and what
    (r"who is the (?P<relation>.+?) of (?P<entity>.+)", 'who', 1),

    # Pattern 1: Find movies with (word) in their titles
    (r"(?:find|which) movies.*contain(?:s)?(?: the word)? (?P<word>\w+)", 'find_word_in_title', 0),
    (r"(?:find|which) movies with (?P<word>\w+) in (?:their )?titles?", 'find_word_in_title', 0),
    (r"(?:find|which) movies (?:whose )?(?:title|name) contains? (?P<word>\w+)", 'find_word_in_title', 0),

    # Pattern 2: Highest-rated movies (optional 'above' and a number)
    (r"(?:what are|list)(?: the)?(?: highest[-\s]rated)? movies(?: rated)?(?: above| greater than)?(?: (?P<number>\d+(\.\d+)?))?", 'movies_rating_above', 0),
    (r"movies (?:rated )?(?:above )?(?P<number>\d+(\.\d+)?)?", 'movies_rating_above', 0),

    # Pattern 3: Lowest-rated movies (optional 'below' and a number)
    (r"(?:what are|list)(?: the)?(?: lowest[-\s]rated)? movies(?: rated)?(?: below| less than)?(?: (?P<number>\d+(\.\d+)?))?", 'movies_rating_below', 0),
    (r"movies (?:rated )?(?:below )?(?P<number>\d+(\.\d+)?)?", 'movies_rating_below', 0),

    # Pattern 4: Entities in alphabetical order
    (r"which (?P<entity>.+) comes first alphabetically", 'entity_first_alphabetically', 1),
    (r"list (?P<entity>.+) in alphabetical order", 'entity_first_alphabetically', 1),

    # Pattern 5: Entities in reverse alphabetical order
    (r"which (?P<entity>.+) comes last alphabetically", 'entity_last_alphabetically', 1),
    (r"list (?P<entity>.+) in reverse alphabetical order", 'entity_last_alphabetically', 1),

]

### Step 2: Process question to extract entities

We will proceed in the following way:

- Extract the dictionary of matched entities via the .groupdict() method.
    - For question r"who is the director of Star Wars" matched to pattern r"who is the (?P<relation>.+?) of (?P<entity>.+)" the dictionary looks like this {'relation': 'director', 'entity': 'Star Wars'}
- Append the question type to the dictionary. So for r"who is the director of Star Wars" it would result in: {'relation': 'director', 'entity': 'Star Wars', 'qtype': 'who'}
- If the question type needs to match entities to the knowledge graph (boolean value == 1) then we gather the relation and/or entity that we extracted from the question via matching the pattern and we match those values to actual entities in the graph
    - For entities we try to match them with the match_entity_editdistance function.
    - For predicates we need to account for synonyms so we use the check_ngram_match function that takes a potential predicate and tries to match it via editDistance to an actual predicate and if it doesn't work it tries by evaluating embeddings similarity via spacy

In [None]:
def process_question_factual(question, entity_dictionary, predicate_dictionary):

    for pattern, qtype, matching in question_patterns:

        print(f"pattern = {pattern}")
        match = re.match(pattern, question, re.IGNORECASE)

        if match:
            params = match.groupdict()
            params['type'] = qtype  # Add the question type to the params
            print(f"Question matched to pattern {qtype}\n")

            if matching:
                # Extract and match the relation and entity
                relation = params.get('relation', "").lower()  # Set default as empty string
                entity = params.get('entity', "") # Set default as empty string (don't lower it)

                # Match the entity to the closest in the knowledge graph. Returns an uri and label of the closest entity, and the distance
                _, matched_entity_label, _ = match_entity_editdistance(entity, dictionary=entity_dictionary) if entity else None

                # Match the relation to the closest in the knowledge graph. check_ngram_match returns a list
                matched_predicate_label = check_ngram_match(relation, predicates, threshold=2, n=5, confidence=0.6) if relation else None

                # Update entity and predicate with the matched labels
                if matched_entity_label:
                    params['entity'] = matched_entity_label
                if matched_predicate_label:
                    params['relation'] = matched_predicate_label[0]

            return params

    return None

In [None]:
# Example usage

user_input = {
    0: "Who is the director of Star Wars",
    1: "Which movies whose name contains italy",
    2: "List movies rated above 7",
    3: "what are the lowest-rated movies?",
    4: "Which films comes first alphabetically",
    5: "list actors in reverse alphabetical order"
}

for pattern, question in user_input.items():

    params = process_question_factual(question, nodes, predicates)

    if params:
        print(f"Pattern {pattern}: Question: {question}\n")

        for key, value in params.items():
            print(f"{key} : {params[key]}\n")
        print("\n\n")

    else:
        print(f"\nPattern {pattern} not matched\n\n\n\n")

### Step 3: Match question types to SPARQL queries

The generate_sparql_query function takes the type of the question found in the params dictionary and generates the correspnding query. For example for question "Who is the director of Star Wars" it generates a query that query the graph looking for an entity that has relation "director" with the entity "Star Wars"

In [None]:
def generate_sparql_query(params):
    qtype = params.get('type')

    if qtype == 'who':
        sparql_query = f"""
        SELECT ?result WHERE {{
            ?entity rdfs:label "{params['entity']}"@en .
            ?entity <{pred2uri[params['relation']]}> ?item .
            ?item rdfs:label ?result .
            FILTER (lang(?result) = 'en')
        }}
        """
        return sparql_query


    # Fix: this query returns names of all the entities whose label contains the word, not just movies
    elif qtype == 'find_word_in_title':
        word = params.get('word')
        sparql_query = f"""
        SELECT ?movieLabel WHERE {{
            ?movie rdfs:label ?movieLabel .
            FILTER(CONTAINS(LCASE(?movieLabel), LCASE("{word}"))) .
            FILTER (lang(?movieLabel) = 'en')
        }}
        """
        return sparql_query

    elif qtype == 'movies_rating_above':
        number = params.get('number')
        sparql_query = f"""
        SELECT ?movieLabel WHERE {{
            ?movie ddis:rating ?rating .
            FILTER(?rating > {number}) .
            ?movie rdfs:label ?movieLabel .
            FILTER (lang(?movieLabel) = 'en')
        }} ORDER BY DESC(?rating) LIMIT 1
        """
        return sparql_query

    elif qtype == 'movies_rating_below':
        number = params.get('number')
        sparql_query = f"""
        SELECT ?movieLabel WHERE {{
            ?movie ddis:rating ?rating .
            FILTER(?rating < {number}) .
            ?movie rdfs:label ?movieLabel .
            FILTER (lang(?movieLabel) = 'en')
        }} ORDER BY DESC(?rating)
        """
        return sparql_query

    elif qtype == 'entity_first_alphabetically':
        sparql_query = f"""
        SELECT ?entity_label WHERE {{
            ?entity wdt:P31 <{params['matched_entity_uri']}> .
            ?entity rdfs:label ?entity_label .
            FILTER (lang(?entity_label) = 'en')
        }} ORDER BY ASC(?entity_label)
        """
        return sparql_query

    elif qtype == 'entity_last_alphabetically':
        sparql_query = f"""
        SELECT ?entity_label WHERE {{
            ?entity wdt:P31 <{params['matched_entity_uri']}> .
            ?entity rdfs:label ?entity_label .
            FILTER (lang(?entity_label) = 'en')
        }} ORDER BY DESC(?entity_label)
        """
        return sparql_query

    else:
        return None

'''
TODO
you shoud include prefixes in each query!
'''

In [96]:
print(check_ngram_match('Star Wars', nodes))

sparql_query = f"""
        SELECT ?result WHERE {{
            ?entity rdfs:label "{check_ngram_match('Star Wars', nodes)[0]}"@en .
            ?entity <{pred2uri['director']}> ?item .
            ?item rdfs:label ?result .
            FILTER (lang(?result) = 'en')
        }}
        """
for res in graph.query(sparql_query):
    print(res)

['Star Wars']


### Step 4: Query the graph

We implement a function that takes the graph and query and returns the result and an exit code that is an empty string if results were found, and "No results" if no results were found

In [97]:
def query_graph(graph, sparql_query):

    # Execute the query
    qres = graph.query(sparql_query)

    # Process the results
    results = []
    for row in qres:
        results.append(str(row.result))

    # Check if we have results, if not return exit_code = "No results"
    return results

### Step 5: Put everything together

We implement a function that takes a question and computes the factual answer

In [98]:
def answer_question_factual(question):

    if process_question_factual(question, nodes, predicates):
        params = process_question_factual(question, nodes, predicates)
        print(f"Parameters found: {params}\n")
    else:
        return exit_message

    sparql_query = generate_sparql_query(params)

    answer = query_graph(graph, sparql_query)

    if answer:
        return answer
    else:
        return None


In [99]:
def answer_question_factual(question):

    try:

        # Process question to get parameters
        params = process_question_factual(question, nodes, predicates)

        print(f"Parameters found: {params}\n")

        # Generate the SPARQL query based on the extracted parameters
        sparql_query = generate_sparql_query(params)

        # Execute the query on the knowledge graph
        answer = query_graph(graph, sparql_query)

        # Check if an answer was returned
        if answer:
            return answer
        else:
            return None

    except Exception as e:

        return None



# Final Chatbot Structure

We have successfully implemented a way to retrieve embeddings answers and factual answers. Now we put everything together and implement a way for the chatbot to seem as human as possible

In [119]:
def generate_answer(graph, question):

    questions_list = split_questions(question)


    Answer = ""

    if len(questions_list) > 1:

        for i, q in enumerate(questions_list):

            embedding_answer = answer_question_embeddings(q)

            print("\n\n-----------\n\n")

            factual_answer = answer_question_factual(q)

            print("\n\n-----------\n\n")

            Answer += f"For question {q} the answer suggested by the embeddings is: {embedding_answer} while the answer obtained by quering the graph is: {factual_answer}\n"

    else:

        if answer_question_embeddings(question):

            embedding_answer = answer_question_embeddings(question)

            Answer += f"The answer suggested by the embeddings is: {embedding_answer}\n"

        print("\n\n-----------\n\n")

        if answer_question_factual(question):

            factual_answer = answer_question_factual(question)

            Answer += f"The answer obtained by querying the graph is: {factual_answer}\n"

        else:
            # Generate a response message with pegasus
            num_beams = 10
            num_return_sequences = 1
            context = f"{question} i don't know"
            print(context)
            print(get_response(context,num_return_sequences,num_beams)[0])
            print("\n")
            Answer += get_response(context,num_return_sequences,num_beams)[0]


        print("\n\n-----------\n\n")

    return Answer

In [113]:
# Test

question = "Who is the director of Star Wars and who is the screenwriter of The Godfather"

print(generate_answer(graph, question))

Question after preprocessing: Who is the director of Star Wars and who is the screenwriter of The Godfather

Extracted entity: Star Wars

Extracted entity: The Godfather

Question after removing entities: who is the director of and who is the screenwriter of

Question after preprocessing: Who is the director of Star Wars and who is the screenwriter of The Godfather

Extracted entity: Star Wars

Extracted entity: The Godfather

Question after removing entities: who is the director of and who is the screenwriter of



-----------


pattern = who is the (?P<relation>.+?) of (?P<entity>.+)
Question matched to pattern who

Who is the director of Star Wars and who is the screenwriter of The Godfather i don't know




I don't know who is the director of Star Wars and who is the writer of The Godfather.




-----------


The answer suggested by the embeddings is: ['George Lucas', 'Anthony Daniels', 'Andy Secombe']
I don't know who is the director of Star Wars and who is the writer of The Godfather.


In [118]:
question6 = "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?"

#print(generate_answer(graph, question1))
print(generate_answer(graph, question6))


Question after preprocessing: Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?

Extracted entity: The Lion King

Extracted entity: Pocahontas

Extracted entity: The Beauty and the Beast

Question after removing entities: given that i like , , and , can you recommend some movies?

Liked movies: ['The Lion King', 'Pocahontas', 'Beauty and the Beast']

Question after preprocessing: Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?

Extracted entity: The Lion King

Extracted entity: Pocahontas

Extracted entity: The Beauty and the Beast

Question after removing entities: given that i like , , and , can you recommend some movies?

Liked movies: ['The Lion King', 'Pocahontas', 'Beauty and the Beast']



-----------


pattern = who is the (?P<relation>.+?) of (?P<entity>.+)
pattern = (?:find|which) movies.*contain(?:s)?(?: the word)? (?P<word>\w+)
pattern = (?:find|which) movies wi



Can you recommend some movies that I like?




-----------


The answer suggested by the embeddings is: ["The Lion King II: Simba's Pride", 'The Lion Guard', 'The King', 'The Little Mermaid', 'The Great Mouse Detective']
Can you recommend some movies that I like?


# SpeakEasy Environment

In [104]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import rdflib

In [None]:
!pip install '/content/drive/MyDrive/Speakeasy_Project/speakeasy-python-client-library/dist/speakeasypy-1.0.0-py3-none-any.whl'

from speakeasypy import Speakeasy, Chatroom
from typing import List
import time

DEFAULT_HOST_URL = 'https://speakeasy.ifi.uzh.ch'
listen_freq = 2


class Agent:
    def __init__(self, username, password):
        self.username = username
        # Initialize the Speakeasy Python framework and login.
        self.speakeasy = Speakeasy(host=DEFAULT_HOST_URL, username=username, password=password)
        self.speakeasy.login()  # This framework will help you log out automatically when the program terminates.

    def listen(self):
        graph = rdflib.Graph()
        #graph.parse('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Speakeasy Project/Datasets/14_graph.nt', format='turtle')
        graph.parse('/content/drive/MyDrive/14_graph.nt', format='turtle')
        while True:
            # only check active chatrooms (i.e., remaining_time > 0) if active=True.
            rooms: List[Chatroom] = self.speakeasy.get_rooms(active=True)
            for room in rooms:
                if not room.initiated:
                    # send a welcome message if room is not initiated
                    room.post_messages(f'Hello! This is a welcome message from {room.my_alias}.')
                    room.initiated = True
                # Retrieve messages from this chat room.
                # If only_partner=True, it filters out messages sent by the current bot.
                # If only_new=True, it filters out messages that have already been marked as processed.
                for message in room.get_messages(only_partner=True, only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new message #{message.ordinal}: '{message.message}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #
                    result = generate_answer(graph, message.message)

                    # Send a message to the corresponding chat room using the post_messages method of the room object.
                    room.post_messages(f"Received your message: '{result}' ")
                    # Mark the message as processed, so it will be filtered out when retrieving new messages.
                    room.mark_as_processed(message)

                # Retrieve reactions from this chat room.
                # If only_new=True, it filters out reactions that have already been marked as processed.
                for reaction in room.get_reactions(only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new reaction #{reaction.message_ordinal}: '{reaction.type}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #

                    room.post_messages(f"Received your reaction: '{reaction.type}' ")
                    room.mark_as_processed(reaction)

            time.sleep(listen_freq)

    @staticmethod
    def get_time():
        return time.strftime("%H:%M:%S, %d-%m-%Y", time.localtime())


if __name__ == '__main__':
    demo_bot = Agent("swift-comet", "X2wqU6D3")
    demo_bot.listen()