In [2]:
from rdflib.term import URIRef, Literal
import rdflib

In [3]:
import re
import editdistance

### Import the knowledge graph

In [4]:
graph = rdflib.Graph()
graph.parse('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Speakeasy Project/Datasets/14_graph.nt', format='turtle')

<Graph identifier=N647463251d334519b4912ed2bf80bf01 (<class 'rdflib.graph.Graph'>)>

### Some info on the knowledge graph

The entities are stored with different URIs. The most common namespaces are the following:


In [5]:
# define some prefixes
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
DDIS = rdflib.Namespace('http://ddis.ch/atai/')
RDFS = rdflib.namespace.RDFS
SCHEMA = rdflib.Namespace('http://schema.org/')

In [6]:
print('Some subjects from the knowledge graph')
for objs in list(set(graph.subjects()))[:10]:
    print(objs)
    
print('\n Some objects from the knowledge graph')
for objs in list(set(graph.objects()))[10:20]:
    print(objs)

Some subjects from the knowledge graph
http://www.wikidata.org/entity/Q3701012
http://www.wikidata.org/entity/Q3547244
http://www.wikidata.org/entity/Q2159242
http://www.wikidata.org/entity/Q1273264
http://www.wikidata.org/entity/Q7612508
http://www.wikidata.org/entity/Q392915
http://www.wikidata.org/entity/Q1374891
http://www.wikidata.org/entity/Q4455695
http://www.wikidata.org/entity/Q14954903
http://www.wikidata.org/entity/Q635352

 Some objects from the knowledge graph
1995 film by Paolo Virzì
nm0221657
http://www.wikidata.org/entity/Q1273264
http://www.wikidata.org/entity/Q7612508
http://www.wikidata.org/entity/Q1374891
http://www.wikidata.org/entity/Q4455695
tt1336988
http://www.wikidata.org/entity/Q14954903
American actor (1891-1938)
2019 film by Ladj Ly


Some ways to access the label of an entity in the graph subjects given it's URI:

In [7]:
for node in graph.subjects():
    if graph.value(subject=node, predicate=RDFS.label): # Check if the triple exists
        print(f"node {node} has label {graph.value(subject=node, predicate=RDFS.label)}")
    break


node http://www.wikidata.org/entity/Q320417 has label A Map of the World


We want to check if every subject in the graph has a label

In [8]:
i = 0
j = 0
for node in graph.subjects():
    j += 1
    if graph.value(subject=node, predicate=RDFS.label): # Check if the triple exists
        i += 1

print(f"Number of subjects with a label: {i}\n")
print(f"Number of subjects in the graph: {j}\n")
if i != j:
    print(f"There are {j-i} subject entities without a label")

Number of subjects with a label: 2051387

Number of subjects in the graph: 2056777

There are 5390 subject entities without a label


In [9]:
for node in graph.subjects():
    if graph.value(subject=node, predicate=RDFS.label) == None:
        print(f"entity {node} has no label")

entity http://www.wikidata.org/entity/Q94685087 has no label
entity http://www.wikidata.org/entity/Q23827934 has no label
entity http://www.wikidata.org/entity/Q101245050 has no label
entity http://www.wikidata.org/entity/Q65092062 has no label
entity http://www.wikidata.org/entity/Q104631284 has no label
entity http://www.wikidata.org/entity/Q2688886 has no label
entity http://www.wikidata.org/entity/Q15849464 has no label
entity http://www.wikidata.org/entity/Q55598789 has no label
entity http://www.wikidata.org/entity/Q4197865 has no label
entity http://www.wikidata.org/entity/Q5980862 has no label
entity http://www.wikidata.org/entity/Q2937568 has no label
entity http://www.wikidata.org/entity/Q86725850 has no label
entity http://www.wikidata.org/entity/Q106464367 has no label
entity http://www.wikidata.org/entity/Q102315151 has no label
entity http://www.wikidata.org/entity/Q20636406 has no label
entity http://www.wikidata.org/entity/Q106473918 has no label
entity http://www.wikid

### Make a dictionary of nodes URIs with the respective labels

We want to make a dictionary in which the keys are the nodes URIs and the values are the nodes labels

In [23]:
# Function to extract the local part of a URI (e.g., after the last / or #)
def extract_label_from_uri(uri, namespaces):
    # Loop through all namespaces and remove the matching part
    for namespace in namespaces:
        if str(uri).startswith(str(namespace)):
            return str(uri).replace(str(namespace), "")
    # If no match, return the original URI
    return str(uri).split('/')[-1]

# Function to build a dictionary of nodes and their labels
def build_node_label_dict(graph, namespaces):
    nodes = {}
    
    for node in graph.all_nodes():
        if isinstance(node, rdflib.term.URIRef):  # Only process URIs
            # Check if the node has a label
            label = graph.value(node, RDFS.label)
            
            if label:
                # If label exists, use it
                nodes[node.toPython()] = str(label)
            else:
                # If no label, extract the local part of the URI
                local_label = extract_label_from_uri(node, namespaces)
                nodes[node.toPython()] = local_label
    
    return nodes

namespaces = [WD, WDT, DDIS, RDFS, SCHEMA]

nodes = build_node_label_dict(graph, namespaces)

# Check the result
for uri, label in nodes.items():
    print(f"URI: {uri}, Label: {label}")
    break

URI: http://www.wikidata.org/entity/Q1273264, Label: Michael Coulter


Make an inverse dictionary to find URIs of the entities given the labels

In [101]:
ent2uri = {ent: uri for uri, ent in nodes.items()}

http://www.wikidata.org/entity/Q18914861


We also make another dictionary specifically for predicates

In [27]:
# Function to build a dictionary of predicates and their labels
def build_pred_label_dict(graph, namespaces):
    predicates = {}
    
    for node in graph.predicates():
        if isinstance(node, rdflib.term.URIRef):  # Only process URIs
            # Check if the node has a label
            label = graph.value(node, RDFS.label)
            
            if label:
                # If label exists, use it
                predicates[node.toPython()] = str(label)

            # This condition is never evaluated cause all the predicates have labels
            else:
                # If no label, extract the local part of the URI
                local_label = extract_label_from_uri(node, namespaces)
                predicates[node.toPython()] = local_label
    
    return predicates

predicates = build_pred_label_dict(graph, namespaces)

# Check the result
for uri, label in predicates.items():
    print(f"URI: {uri}, Label: {label}")
    break

URI: http://www.w3.org/2000/01/rdf-schema#label, Label: node label


Make an inverse dictionary to find URIs of the predicates given the labels

In [102]:
pred2uri = {pred: uri for uri, pred in predicates.items()}

### Matching function

Suppose we find an entity "Batman_1989" in the question we want to answer. However "Batman_1989" is registered in the knowledge graph as "Batman 1989". We need a function that takes the entity from the questions and finds the closest entity in the knowledge graph

In [59]:
def match_entity(entity, dictionary=nodes):
    
    tmp = 9999
    match_node = ""
    match_value = ""
    
    for key, value in dictionary.items():

        if editdistance.eval(value, entity) == 0:

            return key, value
        
        if editdistance.eval(value, entity) < tmp:
            tmp = editdistance.eval(value, entity)
            match_node = key
            match_value = value
    
    return match_node, match_value

We can also use the match_entity function to match a predicate to the closest predicate in the graph by specifing dictionary=predicates


For example:

In [66]:
match_node, match_value = match_entity('direcror', predicates)
print(f"URI: {match_node}, label: {match_value}")

URI: http://www.wikidata.org/prop/direct/P57, label: director


### Processing questions

Our first approach to answer factual question is very naive. The following function takes a question and tries to fit it to a series of questions patterns to extract a relation and an entity. For example the question "Who is the director of Batman" corresponds to pattern "who is the (?P<relation>.*) of (?P<entity>.*)". When we call method re.match on the question and the pattern it produces a match object (we call it match in the function) that contains a dictionary: {'relation': 'director', 'entity': Batman}. To access this dictionary we call .groupdict() on the match object (so match.groupdict() will be the dictionary). We retrieve relation and entity from the dictionary unsing get('relation') and get('entity') and specifing that if the dictionary doesn't have that key it should output "".

In [92]:
# List of patterns with named groups for 'relation' and 'entity'
question_patterns = [
    r"who is the (?P<relation>.*) of (?P<entity>.*)",   # e.g. who is the director of Batman
    r"what is the (?P<relation>.*) of (?P<entity>.*)",   # e.g. what is the movie rating of Batman
    r"when was (?P<entity>.*) (?P<relation>.*)",   # e.g. when was the godfather released
    r"who wrote the script of (?P<entity>.*)" # This one does not work with this approach
]

# Function to match the question to a pattern and extract relation and entity
def process_question(question, entity_dictionary, predicate_dictionary):
    
    for pattern in question_patterns:
        match = re.match(pattern, question, re.IGNORECASE)

        if match:
            # Extract relation and entity from named groups
            relation = match.groupdict().get('relation', "").lower()  # Set default as empty string
            entity = match.groupdict().get('entity', "")  # Set default as empty string
            
            # Match the entity to the closest in the knowledge graph
            matched_entity_uri, matched_entity_label = match_entity(entity, dictionary=entity_dictionary) if entity else (None, None)
            
            # Match the relation to the closest in the knowledge graph
            matched_predicate_uri, matched_predicate_label = match_entity(relation, dictionary=predicate_dictionary) if relation else (None, None)
            
            return matched_entity_uri, matched_entity_label, matched_predicate_uri, matched_predicate_label
    
    return None  # Return None if no pattern matches

# Example usage
question = "What is the release date of batman?" 
matched_entity_uri, matched_entity_label, matched_predicate_uri, matched_predicate_label = process_question(question, nodes, predicates)

print(f"Matched entity: {matched_entity_label} ({matched_entity_uri})")
print(f"Matched relation: {matched_predicate_label} ({matched_predicate_uri})")


Matched entity: Batman (http://www.wikidata.org/entity/Q4869384)
Matched relation: relative (http://www.wikidata.org/prop/direct/P1038)


One problem with this is that relation "release date" is not correctly catched because it is closer to relation "relative" than it is to relation "publication date" which actually exists as a predicate in the knowledge graph

IDEA TO FIX IT: modify the matching function for the predicates so that instead of relying on the edit distance it relies on embeddings similarity. Come back to this when you have started implementing embeddings

In [118]:
def query_graph(graph, entity_label, relation_predicate, namespaces):
    """
    Queries the graph to find the relation (e.g., director) of a given entity (e.g., movie).
    
    Args:
        graph (rdflib.Graph): The RDF graph to query.
        entity_label (str): The label of the entity (e.g., the movie title).
        relation_predicate (str): The predicate URI for the relation (e.g., director).
        namespaces (list): List of namespaces used in the graph.
    
    Returns:
        str: The result (e.g., director's name) or a message if no result is found.
    """
    # Define the query based on your format
    query_template = """
    SELECT ?result WHERE {{
        ?entity rdfs:label "{}"@en .  
        ?entity <{}> ?item . 
        ?item rdfs:label ?result . 
    }}  
    """.format(entity_label, relation_predicate)

    print("--- SPARQL query ---")
    print(query_template)

    # Execute the query
    qres = graph.query(query_template)

    # Process the results
    results = []
    for row in qres:
        results.append(row.result)
    
    # Check if we have results, if not return a friendly message
    if results:
        for result in results:
            print(f"Answer: {result}")
        return results[0]  # Return the first result (as there is a LIMIT 1)
    else:
        return "No results found for the given query."



In [119]:
# Example usage:
entity_label = "Apocalypse Now"  # Movie title (entity)
relation_predicate = pred2uri['screenwriter']   # Predicate for 'director'

namespaces = [WD, WDT, DDIS, RDFS, SCHEMA]  # Your namespaces list
result = query_graph(graph, entity_label, relation_predicate, namespaces)

print(f"The director of {entity_label} is: {result}")

--- SPARQL query ---

    SELECT ?result WHERE {
        ?entity rdfs:label "Apocalypse Now"@en .  
        ?entity <http://www.wikidata.org/prop/direct/P58> ?item . 
        ?item rdfs:label ?result . 
    }  
    
Answer: Francis Ford Coppola
Answer: John Milius
Answer: Michael Herr
The director of Apocalypse Now is: Francis Ford Coppola


In [108]:
# Example usage:
match_predicate = pred2uri['director']  
match_entity = ent2uri['Batman']

result = query_graph(graph, match_predicate, match_entity, namespaces)

print(result)  # This should print the answer returned by the SPARQL query

--- SPARQL query ---

    SELECT ?result WHERE {
        ?entity rdfs:label "http://www.wikidata.org/prop/direct/P57"@en .  
        ?entity <http://www.wikidata.org/entity/Q18914861> ?item . 
        ?item rdfs:label ?result . 
    }  
    LIMIT 1
    
No results found for the given query.


In [115]:
print(graph.value(subject=WD.Q18914861, predicate=WDT.P57))

http://www.wikidata.org/entity/Q295207


In [113]:
print(ent2uri['Batman'])

http://www.wikidata.org/entity/Q18914861


In [114]:
print(pred2uri['director'])

http://www.wikidata.org/prop/direct/P57
