In [1]:
# !pip install sparqlwrapper openai tqdm pandas networkx numpy pandas label-studio guidance

import networkx as nx
import numpy as np
from collections import Counter
from SPARQLWrapper import SPARQLWrapper, JSON

# !label-studio start

In [2]:
import os
import json
import pandas as pd
import pickle
import uuid
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm, trange
from concurrent.futures import ThreadPoolExecutor, as_completed

# set the maximum number of retries
MAX_RETRIES = 10

# Load environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize OpenAI client
client = OpenAI(api_key=openai_api_key)

uuid = str(uuid.uuid4()).split("-")[-1]
uuid

'0f2c3482f46a'

Read the queries from CSV

In [3]:
df = pd.read_csv("filtered_questions_v3.csv", index_col=0)
df["dbpedia_entities"] = df["dbpedia_entities"].apply(lambda x: eval(x))
df["placeholders"] = df["placeholders"].apply(lambda x: eval(x))
df["dbpedia_entities_re"] = df["dbpedia_entities"].apply(
    lambda x: {k: v.split("/")[-1].split("#")[-1] for k, v in x.items()}
)
df

Unnamed: 0,question,type,placeholders,naturalness,difficulty,dbpedia_entities,dbpedia_entities_re
0,Please describe Marie Curie's contributions to...,descriptive,{'person': 'Marie Curie'},high,medium,{'person': 'http://dbpedia.org/resource/Marie_...,{'person': 'Marie_Curie'}
1,Explain the relationship between the United Na...,explanatory,"{'entity A': 'United Nations', 'entity B': 'in...",high,medium,{'entity A': 'http://dbpedia.org/resource/Unit...,"{'entity A': 'United_Nations', 'entity B': 'Pe..."
2,Based on current advancements in artificial in...,predictive,"{'entity': 'artificial intelligence', 'event':...",high,medium,{'entity': 'http://dbpedia.org/resource/Artifi...,"{'entity': 'Artificial_intelligence', 'event':..."
3,Compare and contrast the educational philosoph...,comparative,"{'entity A': 'John Dewey', 'entity B': 'Paulo ...",high,medium,{'entity A': 'http://dbpedia.org/resource/John...,"{'entity A': 'John_Dewey', 'entity B': 'Paulo_..."
4,How do you evaluate the impact of the Industri...,critical,"{'event': 'Industrial Revolution', 'field': 'm...",high,hard,{'event': 'http://dbpedia.org/resource/Industr...,"{'event': 'Industrial_Revolution', 'field': 'E..."
...,...,...,...,...,...,...,...
789,Compare and contrast the educational philosoph...,comparative,"{'entity A': 'John Dewey', 'entity B': 'Maria ...",high,hard,{'entity A': 'http://dbpedia.org/resource/John...,"{'entity A': 'John_Dewey', 'entity B': 'Maria_..."
790,How do you evaluate the impact of Rachel Carso...,critical,"{'person': 'Rachel Carson', 'event': 'her work...",high,medium,{'person': 'http://dbpedia.org/resource/Rachel...,"{'person': 'Rachel_Carson', 'event': 'Silent_S..."
791,Based on the current advancements in technolog...,predictive,"{'entity': 'artificial intelligence', 'time': ...",high,medium,{'entity': 'http://dbpedia.org/resource/Artifi...,"{'entity': 'Artificial_intelligence', 'time': ..."
792,Compare and contrast the similarities and diff...,comparative,"{'entity A': 'traditional education systems', ...",high,medium,{'entity A': 'http://dbpedia.org/resource/Educ...,"{'entity A': 'Education', 'entity B': 'Massive..."


SPARQL Template for single entity, two entities and three entities

In [None]:
# SPARQL query for single entity
single_entity_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT DISTINCT ?entity ?p ?firstHopEntity ?p2 ?secondHopEntity
WHERE {{
  {{
    dbr:{entity} ?p ?firstHopEntity.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity} AS ?entity)
  }} UNION {{
    dbr:{entity} ?p ?firstHopEntity.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity}.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity}.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity} AS ?entity)
  }}
}}
"""

# SPARQL query for two entities
two_entity_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT DISTINCT ?entity ?p ?firstHopEntity ?p2 ?secondHopEntity
WHERE {{
    {{
    dbr:{entity1} ?p ?firstHopEntity.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity1} AS ?entity)
    }} UNION {{
    dbr:{entity1} ?p ?firstHopEntity.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity1} AS ?entity)
    }} UNION {{
    ?firstHopEntity ?p dbr:{entity1}.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity1} AS ?entity)
    }} UNION {{
    ?firstHopEntity ?p dbr:{entity1}.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity1} AS ?entity)
    }} UNION {{
    dbr:{entity2} ?p ?firstHopEntity.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity2} AS ?entity)
    }} UNION {{
    dbr:{entity2} ?p ?firstHopEntity.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity2} AS ?entity)
    }} UNION {{
    ?firstHopEntity ?p dbr:{entity2}.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity2} AS ?entity)
    }} UNION {{
    ?firstHopEntity ?p dbr:{entity2}.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity2} AS ?entity)
    }}
}}
"""

# SPARQL query for three entities

three_entity_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT DISTINCT ?entity ?p ?firstHopEntity ?p2 ?secondHopEntity
WHERE {{
  {{
    dbr:{entity1} ?p ?firstHopEntity.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity1} AS ?entity)
  }} UNION {{
    dbr:{entity1} ?p ?firstHopEntity.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity1} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity1}.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity1} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity1}.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity1} AS ?entity)
  }} UNION {{
    dbr:{entity2} ?p ?firstHopEntity.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity2} AS ?entity)
  }} UNION {{
    dbr:{entity2} ?p ?firstHopEntity.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity2} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity2}.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity2} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity2}.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity2} AS ?entity)
  }} UNION {{
    dbr:{entity3} ?p ?firstHopEntity.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity3} AS ?entity)
  }} UNION {{
    dbr:{entity3} ?p ?firstHopEntity.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity3} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity3}.
    ?firstHopEntity ?p2 ?secondHopEntity.
    BIND(dbr:{entity3} AS ?entity)
  }} UNION {{
    ?firstHopEntity ?p dbr:{entity3}.
    ?secondHopEntity ?p2 ?firstHopEntity.
    BIND(dbr:{entity3} AS ?entity)
  }}
}}
"""


def run_sparql(entities):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")

    if len(entities) == 1:
        query = single_entity_query
        query = query.format(entity=entities[0])
    elif len(entities) == 2:
        query = two_entity_query
        query = query.format(entity1=entities[0], entity2=entities[1])
    elif len(entities) == 3:
        query = three_entity_query
        query = query.format(
            entity1=entities[0], entity2=entities[1], entity3=entities[2]
        )

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results

Test case

In [6]:
# test_case = run_sparql(single_entity_query, ["Albert_Einstein"])
# test_case_2 = run_sparql(two_entity_query, ["Albert_Einstein", "Germany"])
# test_case_3 = run_sparql(three_entity_query, ["Albert_Einstein", "Germany", "Berlin"])

save all the nodes/edges to NetworkX and then calculate the statistics of the graph

In [7]:
def extract_value(obj):
    if obj["type"] == "uri":
        return obj["value"].split("/")[-1].split("#")[-1]
    else:
        return obj["value"]


def build_up_graph(rdfs):
    """_summary_

    Args:
        rdfs (dict): results from SPARQL query

    Returns:
        _type_: networkx graph
    """
    # hold directed edges. self loops are allowed but multiple(parell) edges are not.
    G = nx.DiGraph()
    central_node = set()

    for result in rdfs["results"]["bindings"]:
        node = extract_value(result["entity"])
        central_node.add(node)

        first_hop_nei = extract_value(result["firstHopEntity"])
        second_hop_nei = extract_value(result["secondHopEntity"])
        r1 = extract_value(result["p"])
        r2 = extract_value(result["p2"])

        if node == first_hop_nei:
            # dbr:entity --> first_hop_nei --> second_hop_nei
            G.add_edge(node, first_hop_nei, relation=r1)
            G.add_edge(node, second_hop_nei, relation=r2)
        elif node == second_hop_nei:
            # dbr:entity -> first_hop_nei <- second_hop_nei
            G.add_edge(node, first_hop_nei, label=r1)
            G.add_edge(second_hop_nei, first_hop_nei, label=r2)
        elif first_hop_nei == second_hop_nei:
            # first_hop_nei -> dbr:entity -> second_hop_nei
            G.add_edge(first_hop_nei, node, label=r1)
            G.add_edge(node, second_hop_nei, label=r2)
        else:
            # first_hop_nei -> dbr:entity <- second_hop_nei
            G.add_edge(first_hop_nei, node, label=r1)
            G.add_edge(second_hop_nei, first_hop_nei, label=r2)

    return G, central_node


class PPR_Utils:
    # PPR to prune the graph
    def __init__(self):
        # define the parameters
        self.alpha = 0.85
        self.tol = 1e-6
        self.max_iter = 100
        self.threshold = 1e-5  # threshold for pruning, if the node's PPR is less than the threshold, it will be pruned

    def calculate_ppr(self, G, central_node):
        # calcultae Personalized PageRank
        personalization = {node: 0 for node in G.nodes()}
        # set the central node weight to 1
        for node in list(central_node):
            personalization[node] = 1

        ppr = nx.pagerank(
            G,
            personalization=personalization,
            alpha=self.alpha,
            tol=self.tol,
            max_iter=self.max_iter,
        )
        return ppr

    def prune_graph(self, G, central_node):
        pruned_G = nx.DiGraph()
        ppr = self.calculate_ppr(G, central_node)

        for node, score in ppr.items():
            if score >= self.threshold:
                pruned_G.add_node(node)

        for u, v, data in G.edges(data=True):
            if u in pruned_G and v in pruned_G:
                pruned_G.add_edge(u, v, **data)

        return pruned_G

Calculate the statistics of the builded graph

In [8]:
def calculate_statistics(G: nx.DiGraph):
    """_summary_

    Args:
       G (nx.DiGraph): _description_
    """

    # statistics
    print("Graph Statistics:")
    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")

    # Degree statistics
    in_degrees = [d for n, d in G.in_degree()]
    out_degrees = [d for n, d in G.out_degree()]
    total_degrees = [d for n, d in G.degree()]

    print(f"Average in-degree: {sum(in_degrees) / len(in_degrees):.2f}")
    print(f"Average out-degree: {sum(out_degrees) / len(out_degrees):.2f}")
    print(f"Average total degree: {sum(total_degrees) / len(total_degrees):.2f}")

    # Most common relations
    relations = [data['label'] for u, v, data in G.edges(data=True)]
    common_relations = Counter(relations).most_common(5)
    print("\nTop 5 most common relations:")
    for relation, count in common_relations:
        print(f"{relation}: {count}")

    # Number of connected components (for undirected version of the graph)
    undirected_G = G.to_undirected()
    num_components = nx.number_connected_components(undirected_G)
    print(f"\nNumber of connected components: {num_components}")

    # Largest connected component
    largest_cc = max(nx.connected_components(undirected_G), key=len)
    print(f"Size of the largest connected component: {len(largest_cc)}")

    # # Check if the graph is a DAG (Directed Acyclic Graph)
    # is_dag = nx.is_directed_acyclic_graph(G)
    # print(f"\nIs the graph a DAG? {is_dag}")
    # print(f"Is the graph strongly connected? {nx.is_strongly_connected(G)}")
    # print(f"Is the graph weakly connected? {nx.is_weakly_connected(G)}")

    # # Calculate the diameter of the largest connected component
    # largest_cc_subgraph = undirected_G.subgraph(largest_cc)
    # diameter = nx.diameter(largest_cc_subgraph)
    # print(f"Diameter of the largest connected component: {diameter}")

    # # Calculate the centrality of the top 5 nodes
    # print("\nCentrality Measures (for top 5 nodes):")
    # in_degree_centrality = nx.in_degree_centrality(G)
    # print("Top 5 nodes by In-Degree Centrality:")
    # for node, centrality in sorted(
    #     in_degree_centrality.items(), key=lambda x: x[1], reverse=True
    # )[:5]:
    #     print(f"{node}: {centrality:.4f}")

    # out_degree_centrality = nx.out_degree_centrality(G)
    # print("\nTop 5 nodes by Out-Degree Centrality:")
    # for node, centrality in sorted(
    #     out_degree_centrality.items(), key=lambda x: x[1], reverse=True
    # )[:5]:
    #     print(f"{node}: {centrality:.4f}")

    # # Betweenness Centrality (can be slow for large graphs)
    # betweenness_centrality = nx.betweenness_centrality(G)
    # print("\nTop 5 nodes by Betweenness Centrality:")
    # for node, centrality in sorted(
    #     betweenness_centrality.items(), key=lambda x: x[1], reverse=True
    # )[:5]:
    #     print(f"{node}: {centrality:.4f}")

    # Clustering Coefficient
    clustering_coefficient = nx.average_clustering(G)
    print(f"\nAverage Clustering Coefficient: {clustering_coefficient:.4f}")

    # Shortest Paths
    print("\nShortest Path Statistics:")
    shortest_paths = dict(nx.all_pairs_shortest_path_length(G))
    path_lengths = [
        length for paths in shortest_paths.values() for length in paths.values()
    ]
    print(f"Average Shortest Path Length: {np.mean(path_lengths):.2f}")
    print(f"Maximum Shortest Path Length (Diameter): {max(path_lengths)}")

    # Density
    density = nx.density(G)
    print(f"\nGraph Density: {density:.4f}")

Generate the subgraphs

In [15]:
try:
    os.mkdir("OKG")
    os.mkdir("OKG/subgraphs")
    os.mkdir("OKG/subgraphs/raw")
    os.mkdir("OKG/subgraphs/pruned_ppr")
except:
    pass

error_subgraph_indices = []
ppr = PPR_Utils()

entry_nodes = []
for entities in df["dbpedia_entities_re"]:
    entry_nodes.append(list(entities.values()))

with tqdm(
    total=len(entry_nodes), desc="Building subgraphs...", leave=True, ncols=100
) as pbar:
    for index, entry_node in enumerate(entry_nodes):
        try:
            rdfs = run_sparql(entry_node)
            G, central_node = build_up_graph(rdfs)
            ppr_G = ppr.prune_graph(G, central_node)

            # pbar.write(
            #     f"Graph {index}: Nodes = {G.number_of_nodes()}, Edges = {G.number_of_edges()}"
            # )
            # pbar.write(
            #     f"Pruned Graph {index}: Nodes = {ppr_G.number_of_nodes()}, Edges = {ppr_G.number_of_edges()}"
            # )

            pickle.dump(G, open(f"OKG/subgraphs/raw/{index}.pkl", "wb"))
            pickle.dump(ppr_G, open(f"OKG/subgraphs/pruned_ppr/{index}.pkl", "wb"))
        except Exception as e:
            pbar.write(f"Error: {e} occurred for subgraph {index}")
            error_subgraph_indices.append(index)
            continue

        # Update the progress bar
        pbar.update(1)

Building subgraphs...:   4%|█▌                                  | 34/794 [10:25<13:06:28, 62.09s/it]

Error: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 9: syntax error at 'company' before ')'\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\nPREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX dbr: <http://dbpedia.org/resource/>\nSELECT DISTINCT ?entity ?p ?firstHopEntity ?p2 ?secondHopEntity\nWHERE {\n  {\n    dbr:Amazon_(company) ?p ?firstHopEntity.\n    ?firstHopEntity ?p2 ?secondHopEntity.\n    BIND(dbr:Amazon_(company) AS ?entity)\n  } UNION {\n    dbr:Amazon_(company) ?p ?firstHopEntity.\n    ?secondHopEntity ?p2 ?firstHopEntity.\n    BIND(dbr:Amazon_(company) AS ?entity)\n  } UNION {\n    ?firstHopEntity ?p dbr:Amazon_(company).\n    ?firstHopEntity ?p2 ?secondHopEntity.\n    BIND(dbr:Amazon_(company) AS ?entity)\n  } UNION {\n    ?firstHopEntity ?p dbr:Amazon_(company).\n

Building subgraphs...:   5%|█▉                                   | 42/794 [12:02<2:45:06, 13.17s/it]

Error: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 9: syntax error at '.' before '?p'\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\nPREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX dbr: <http://dbpedia.org/resource/>\nSELECT DISTINCT ?entity ?p ?firstHopEntity ?p2 ?secondHopEntity\nWHERE {\n  {\n    dbr:Apple_Inc. ?p ?firstHopEntity.\n    ?firstHopEntity ?p2 ?secondHopEntity.\n    BIND(dbr:Apple_Inc. AS ?entity)\n  } UNION {\n    dbr:Apple_Inc. ?p ?firstHopEntity.\n    ?secondHopEntity ?p2 ?firstHopEntity.\n    BIND(dbr:Apple_Inc. AS ?entity)\n  } UNION {\n    ?firstHopEntity ?p dbr:Apple_Inc..\n    ?firstHopEntity ?p2 ?secondHopEntity.\n    BIND(dbr:Apple_Inc. AS ?entity)\n  } UNION {\n    ?firstHopEntity ?p dbr:Apple_Inc..\n    ?secondHopEntity ?p2 ?firstHopEntity.\n    

Building subgraphs...:   6%|██▍                                  | 51/794 [13:29<2:03:24,  9.97s/it]