In [1]:
pip install requests networkx matplotlib SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.3-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.9/564.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 rdflib-7.1.3


In [4]:
import requests
import networkx as nx
import matplotlib.pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON
import warnings
warnings.filterwarnings("ignore", message=".*Glyph*")
import time
import json
import logging
from rdflib import Graph, URIRef, Literal, Namespace, RDF
from rdflib import Namespace, Graph, RDF, RDFS, Literal
from rdflib.namespace import URIRef
import json
from SPARQLWrapper import SPARQLWrapper, JSON

## Data Process

Data correction: Modify the incorrect wiki item to the correct one, such as the entry "wikidata_description": "female given name" or "wikidata_description": "male given name"

In [5]:
# Set up Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Define namespaces
MP = Namespace("http://masterproject.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create RDF graph
g = Graph()
g.bind("mp", MP)
g.bind("wikidata", WIKIDATA)

# Avoid duplicate queries
queried_nodes = set()

# List of relevant properties
RELEVANT_PROPERTIES = [
    "P31", "P279", "P361", "P1269", "P527", "P2670", "P2701", "P1163",
    "P1195", "P4330", "P366", "P1535", "P101", "P921", "P1552", "P13044",
    "P3575"]

def query_wikidata_properties(qid):
    """Query Wikidata properties for a given entity"""
    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{", wd:".join(RELEVANT_PROPERTIES)}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3, dataset=None):
    """Recursively query and construct the knowledge graph"""
    if depth > max_depth:
        return

    # Query entity properties
    results = query_wikidata_properties(qid)

    for result in results:
        prop = result["property"]["value"].split('/')[-1]
        value = result.get("valueLabel", {}).get("value", result["value"]["value"])
        value_uri = result["value"]["value"]
        property_uri = f"https://www.wikidata.org/wiki/{prop}"

        if "wikidata.org/" in value_uri:
            value_qid = value_uri.split("/")[-1]

            # Process value as NewNode if not already queried
            if not any(value_qid == entry.get("wikidata_id") or value_qid in [dt.get("wikidata_id") for dt in entry["data_types"].values()] for entry in dataset):
                if value_qid not in queried_nodes:
                    g.add((WIKIDATA[value_qid], RDF.type, MP.NewNode))
                    g.add((WIKIDATA[value_qid], RDFS.label, Literal(value)))

            g.add((WIKIDATA[qid], URIRef(property_uri), WIKIDATA[value_qid]))
            g.add((URIRef(property_uri), RDFS.label, Literal(result["propertyLabel"]["value"])))

            if label:
                g.add((WIKIDATA[qid], RDFS.label, Literal(label)))
            if value:
                g.add((WIKIDATA[value_qid], RDFS.label, Literal(value)))

            recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth, dataset=dataset)

        if value_uri and "wikidata.org/entity/" in value_uri:
            value_qid = value_uri.split("/")[-1]
            if value_qid not in queried_nodes:
                g.add((WIKIDATA[value_qid], RDF.type, MP.NewNode))
                queried_nodes.add(value_qid)

            g.add((WIKIDATA[qid], URIRef(property_uri), WIKIDATA[value_qid]))
            g.add((URIRef(property_uri), RDFS.label, Literal(result["propertyLabel"]["value"])))

def process_modality_node(modality_qid, modality):
    """Process a modality node with necessary checks and relations"""
    if not any(entry["wikidata_id"] == modality_qid for entry in dataset):
        return

    if (WIKIDATA[modality_qid], RDF.type, MP.Modality) in g:
        return

    if (WIKIDATA[modality_qid], RDF.type, MP.NewNode) in g:
        g.remove((WIKIDATA[modality_qid], RDF.type, MP.NewNode))

    g.add((WIKIDATA[modality_qid], RDF.type, MP.Modality))
    g.add((WIKIDATA[modality_qid], RDFS.label, Literal(modality)))
    recursive_query(modality_qid, label=modality, dataset=dataset)

def process_datatype_node(data_type_qid, data_type, properties):
    """Process a datatype node with necessary checks and relations"""
    if not any(data_type_qid == dt.get("wikidata_id") for entry in dataset for dt in entry["data_types"].values()):
        return

    if (WIKIDATA[data_type_qid], RDF.type, MP.Modality) in g:
        return

    if (WIKIDATA[data_type_qid], RDF.type, MP.NewNode) in g:
        g.remove((WIKIDATA[data_type_qid], RDF.type, MP.NewNode))

    g.add((WIKIDATA[data_type_qid], RDF.type, MP.Datatype))
    g.add((WIKIDATA[data_type_qid], RDFS.label, Literal(data_type)))
    g.add((WIKIDATA[data_type_qid], MP.frequency, Literal(properties.get("frequency"))))
    g.add((WIKIDATA[data_type_qid], MP.QID, Literal(properties.get("wikidata_id"))))
    g.add((WIKIDATA[data_type_qid], MP.wikidata_label, Literal(properties.get("wikidata_label"))))
    g.add((WIKIDATA[data_type_qid], MP.wikidata_description, Literal(properties.get("wikidata_description"))))

    recursive_query(data_type_qid, label=data_type, dataset=dataset)

def build_knowledge_graph(dataset, max_depth=3):
    # print(f"Depth: {depth}")
    """Build the knowledge graph from the dataset"""
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        # Process Modality
        modality_qid = entry.get("wikidata_id")
        if modality_qid:
            print(f"Processing modality: {modality} (QID: {modality_qid})")
            process_modality_node(modality_qid, modality)

            # Process DataTypes related to Modality
            for data_type, properties in data_types.items():
                data_type_qid = properties.get("wikidata_id")
                if data_type_qid:
                    # print(f"Processing datatype: {data_type} (QID: {data_type_qid})")
                    process_datatype_node(data_type_qid, data_type, properties)
                    g.add((WIKIDATA[modality_qid], MP["Modality_Datatype"], WIKIDATA[data_type_qid]))

    # Save the knowledge graph as Turtle and OWL format
    g.serialize(destination="knowledge_graph_with_labels.ttl", format="turtle")
    print("Knowledge graph saved to knowledge_graph_with_labels.ttl")

    g.serialize(destination="knowledge_graph_with_labels.owl", format="xml")
    print("Knowledge graph saved to knowledge_graph_with_labels.owl")

# Load the dataset and build the knowledge graph
with open('final_matched_modalities_data_types_true_label.json', 'r') as f:
    dataset = json.load(f)

build_knowledge_graph(dataset, max_depth=5)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)
Processing modality: 3d model (QID: Q3859833)
Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)
Processing modality: 3d point cloud (QID: Q1899648)
Processing modality: numerical (QID: Q63116)
Processing modality: metadata (QID: Q180160)
Processing modality: link (QID: Q63617815)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q930412)
Processing modality: skeleton (QID: Q186190)
Processing modality: inertial (QID: Q570607)
Processing modality: sensor (QID: Q37453697)
Processing modality: midi (QID: Q98777027)
Processing modality: 3d (QID: Q108276326)
Processing modality: 3d skeletal (QID: Q1813564)
Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q98669280)
Processing modality: 3d scan (QID: Q94701573)
Processing modality: kinematic (QID: Q25215452)
Proce

## Graph Info

In [7]:
pip install rdflib networkx



In [8]:
import rdflib
import networkx as nx
from rdflib import URIRef

# Step 1: Load the knowledge graph
graph = rdflib.Graph()
graph.parse("knowledge_graph_with_labels.ttl", format="turtle")

# Step 2: Construct a directed graph using NetworkX
G = nx.DiGraph()  # Create a directed graph

# Iterate through the RDF graph and add triples to the NetworkX graph
for subj, pred, obj in graph:
    if isinstance(subj, URIRef) and isinstance(obj, URIRef):  # Ensure both subject and object are URIs, not literals
        G.add_edge(subj, obj, label=pred)

In [9]:
# Step 3: Analyze graph properties
# 1. Calculate the in-degrees and out-degrees of the nodes
in_degrees = dict(G.in_degree())   # Get the in-degrees
out_degrees = dict(G.out_degree())  # Get the out-degrees
print("In-degrees:", in_degrees)
print("Out-degrees:", out_degrees)

In-degrees: {rdflib.term.URIRef('http://www.wikidata.org/entity/Q309901'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q627436'): 20, rdflib.term.URIRef('http://www.wikidata.org/entity/Q16743958'): 2, rdflib.term.URIRef('http://masterproject.org/NewNode'): 10838, rdflib.term.URIRef('http://www.wikidata.org/entity/Q37169573'): 0, rdflib.term.URIRef('http://masterproject.org/Modality'): 230, rdflib.term.URIRef('http://www.wikidata.org/entity/Q59156893'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q206290'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q188504'): 3, rdflib.term.URIRef('http://www.wikidata.org/entity/Q105225381'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q1611269'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q22342369'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q106338776'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q501553'): 6, rdflib.term.URIRef('http://www.wikidata.org/entity/Q4683049'): 1, 

In [None]:
# Save in-degrees and out-degrees to a file
with open("node_degrees.txt", "w") as f:
    f.write("In-degrees:\n")
    for node, degree in in_degrees.items():
        f.write(f"{node}: {degree}\n")

    f.write("\nOut-degrees:\n")
    for node, degree in out_degrees.items():
        f.write(f"{node}: {degree}\n")

print("In-degrees and out-degrees saved to node_degrees.txt")

In [16]:
# 1. Sort nodes by in-degree and out-degree
sorted_in_degrees = sorted(in_degrees.items(), key=lambda x: x[1], reverse=True)
sorted_out_degrees = sorted(out_degrees.items(), key=lambda x: x[1], reverse=True)

# 2. Get top 10 nodes by in-degree
top_10_in_degree_nodes = sorted_in_degrees[:10]
top_10_out_degree_nodes = sorted_out_degrees[:10]

# 3. Save the results to a file
with open("top_10_nodes_degrees.txt", "w") as f:
    f.write("Top 10 nodes by in-degree:\n")
    for node, degree in top_10_in_degree_nodes:
        f.write(f"{node}: {degree}\n")

    f.write("\nTop 10 nodes by out-degree:\n")
    for node, degree in top_10_out_degree_nodes:
        f.write(f"{node}: {degree}\n")

print("Top 10 nodes by degree saved to top_10_nodes_degrees.txt")

Top 10 nodes by degree saved to top_10_nodes_degrees.txt


In [17]:
# 1. Sort nodes by in-degree and out-degree
sorted_in_degrees = sorted(in_degrees.items(), key=lambda x: x[1], reverse=True)
sorted_out_degrees = sorted(out_degrees.items(), key=lambda x: x[1], reverse=True)

# 2. Get top 10 nodes by in-degree and out-degree
top_10_in_degree_nodes = sorted_in_degrees[:10]
top_10_out_degree_nodes = sorted_out_degrees[:10]

# 3. Retrieve node type and label
def get_node_type_and_label(node):
    """Retrieve the type and label of the node."""
    node_type = None
    node_label = None

    # Check if node has a type and label
    for subj, pred, obj in graph:
        if subj == node:
            if pred == RDF.type:
                node_type = obj
            elif pred == RDFS.label:
                node_label = obj

    return node_type, node_label

# 4. Print the results to the console
print("Top 10 nodes by in-degree:")
for node, degree in top_10_in_degree_nodes:
    node_type, node_label = get_node_type_and_label(node)
    print(f"Node: {node}, In-degree: {degree}, Type: {node_type}, Label: {node_label}")

print("\nTop 10 nodes by out-degree:")
for node, degree in top_10_out_degree_nodes:
    node_type, node_label = get_node_type_and_label(node)
    print(f"Node: {node}, Out-degree: {degree}, Type: {node_type}, Label: {node_label}")

Top 10 nodes by in-degree:
Node: http://masterproject.org/NewNode, In-degree: 10838, Type: None, Label: None
Node: http://masterproject.org/Datatype, In-degree: 1019, Type: None, Label: None
Node: http://masterproject.org/Modality, In-degree: 230, Type: None, Label: None
Node: http://www.wikidata.org/entity/Q11862829, In-degree: 187, Type: http://masterproject.org/NewNode, Label: academic discipline
Node: http://www.wikidata.org/entity/Q1047113, In-degree: 124, Type: http://masterproject.org/NewNode, Label: field of study
Node: http://www.wikidata.org/entity/Q112826905, In-degree: 105, Type: http://masterproject.org/NewNode, Label: class of anatomical entity
Node: http://www.wikidata.org/entity/Q2267705, In-degree: 95, Type: http://masterproject.org/NewNode, Label: field of study
Node: http://www.wikidata.org/entity/Q1914636, In-degree: 90, Type: http://masterproject.org/Modality, Label: intent label
Node: http://www.wikidata.org/entity/Q5, In-degree: 90, Type: http://masterproject.org

In [10]:
# 2. Analyze connectivity of nodes (check for isolated nodes)
isolated_nodes = list(nx.isolates(G))  # Find all isolated nodes
print("Isolated nodes:", isolated_nodes)

Isolated nodes: []


In [11]:
# 3. Compute the diameter of the graph (only applicable to strongly connected graphs)
if nx.is_strongly_connected(G):
    diameter = nx.diameter(G)  # Compute the graph diameter
    print("Graph diameter:", diameter)
else:
    print("Graph is not strongly connected, skipping diameter calculation.")

Graph is not strongly connected, skipping diameter calculation.


In [12]:
# 4. Check for cycles (detect whether the graph contains directed cycles)
has_cycle = nx.is_directed_acyclic_graph(G)
print("Graph has cycle:", not has_cycle)

Graph has cycle: True


In [13]:
# 5. Centrality analysis (Betweenness centrality, Closeness centrality, etc.)
betweenness_centrality = nx.betweenness_centrality(G)  # Betweenness centrality
closeness_centrality = nx.closeness_centrality(G)      # Closeness centrality

print("Betweenness Centrality:", betweenness_centrality)
print("Closeness Centrality:", closeness_centrality)

Betweenness Centrality: {rdflib.term.URIRef('http://www.wikidata.org/entity/Q309901'): 1.7288754907916355e-05, rdflib.term.URIRef('http://www.wikidata.org/entity/Q627436'): 6.81187867663231e-06, rdflib.term.URIRef('http://www.wikidata.org/entity/Q16743958'): 2.311924010719855e-09, rdflib.term.URIRef('http://masterproject.org/NewNode'): 0.0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q37169573'): 0.0, rdflib.term.URIRef('http://masterproject.org/Modality'): 0.0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q59156893'): 0.00030796055178996336, rdflib.term.URIRef('http://www.wikidata.org/entity/Q206290'): 0.00043698255744100724, rdflib.term.URIRef('http://www.wikidata.org/entity/Q188504'): 0.0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q105225381'): 2.311924010719855e-08, rdflib.term.URIRef('http://www.wikidata.org/entity/Q1611269'): 0.00025190724020803543, rdflib.term.URIRef('http://www.wikidata.org/entity/Q22342369'): 0.0, rdflib.term.URIRef('http://www.wikidata.o

In [14]:
# 6. Compute the shortest path lengths (starting from an example node)
example_node = list(G.nodes())[0]  # Assume we choose the first node
shortest_paths = nx.single_source_shortest_path_length(G, example_node)
print(f"Shortest paths from {example_node}: ", shortest_paths)

Shortest paths from http://www.wikidata.org/entity/Q309901:  {rdflib.term.URIRef('http://www.wikidata.org/entity/Q309901'): 0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q627436'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q595523'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q116783096'): 1, rdflib.term.URIRef('http://masterproject.org/NewNode'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q494756'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q151885'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q268592'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q338990'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q19478619'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q1047113'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q451967'): 2, rdflib.term.URIRef('http://masterproject.org/Datatype'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q15916540'): 2, rdflib.term.URIRef('http://www.

In [15]:
# 7. Graph sparsity analysis
density = nx.density(G)  # Calculate the graph density
print("Graph density:", density)

Graph density: 0.0002597638158234533


## Add Dataset Info

In [6]:
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS
import csv

# Load the existing knowledge graph (TTL file)
g = Graph()
g.parse("knowledge_graph_with_labels.ttl", format="turtle")

# Define the custom namespace for Datasets
MP = Namespace("http://masterproject.org/")
# DATASET = Namespace("http://masterproject.org/dataset/")

# Define the namespaces for Modality and Datatype
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Load the CSV file (datasets_modalities_datatypes.csv)
with open('datasets_modalities_datatypes.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    datasets = list(reader)

# Iterate through each dataset entry to create a new Dataset node
for idx, entry in enumerate(datasets):
    # Create a unique URI for the dataset, e.g., D1, D2, D100
    dataset_did = URIRef(f"D{idx + 1}")
    # print(WIKIDATA[dataset_did])

    # Create Dataset node with dataset_name as the label
    dataset_name = entry["dataset_name"]
    dataset_description = entry["dataset_description"]
    modality = entry["modality"]
    data_type = entry["data_type"]

    g.add((WIKIDATA[dataset_did], RDF.type, MP.Dataset))
    g.add((WIKIDATA[dataset_did], RDFS.label, Literal(dataset_name)))
    g.add((WIKIDATA[dataset_did], MP.dataset_description, Literal(dataset_description)))
    g.add((WIKIDATA[dataset_did], MP.modality, Literal(modality)))
    g.add((WIKIDATA[dataset_did], MP.data_type, Literal(data_type)))

    # Query the Modality and Datatype nodes by their labels
    modality_nodes = list(g.query(f"""
        SELECT ?modality WHERE {{
            ?modality rdf:type <http://masterproject.org/Modality>.
            ?modality rdfs:label "{modality}".
        }}
    """))

    datatype_nodes = list(g.query(f"""
        SELECT ?datatype WHERE {{
            ?datatype rdf:type <http://masterproject.org/Datatype>.
            ?datatype rdfs:label "{data_type}".
        }}
    """))

    # If Modality node is found, create an edge between Dataset and Modality
    for modality_node in modality_nodes:
        modality_uri = modality_node[0]
        g.add((WIKIDATA[dataset_did], MP.Dataset_Modality, modality_uri))

    # If Datatype node is found, create an edge between Dataset and Datatype
    for datatype_node in datatype_nodes:
        datatype_uri = datatype_node[0]
        g.add((WIKIDATA[dataset_did], MP.Dataset_Datatype, datatype_uri))

# Save the updated knowledge graph back to a TTL file
g.serialize(destination="final_knowledge_graph.ttl", format="turtle")
print("Knowledge graph updated and saved")

Knowledge graph updated and saved


## Graph Info

In [18]:
# Step 1: Load the knowledge graph
graph_d = rdflib.Graph()
graph_d.parse("final_knowledge_graph.ttl", format="turtle")

# Step 2: Construct a directed graph using NetworkX
G_d = nx.DiGraph()  # Create a directed graph

# Iterate through the RDF graph and add triples to the NetworkX graph
for subj, pred, obj in graph_d:
    if isinstance(subj, URIRef) and isinstance(obj, URIRef):  # Ensure both subject and object are URIs, not literals
        G_d.add_edge(subj, obj, label=pred)

In [20]:
# 1. Calculate the in-degrees and out-degrees of the nodes
in_degrees = dict(G_d.in_degree())   # Get the in-degrees
out_degrees = dict(G_d.out_degree())  # Get the out-degrees

# 1. Sort nodes by in-degree and out-degree
sorted_in_degrees = sorted(in_degrees.items(), key=lambda x: x[1], reverse=True)
sorted_out_degrees = sorted(out_degrees.items(), key=lambda x: x[1], reverse=True)

# 2. Get top 10 nodes by in-degree and out-degree
top_10_in_degree_nodes = sorted_in_degrees[:10]
top_10_out_degree_nodes = sorted_out_degrees[:10]

# 3. Retrieve node type and label
def get_node_type_and_label(node):
    """Retrieve the type and label of the node."""
    node_type = None
    node_label = None

    # Check if node has a type and label
    for subj, pred, obj in graph_d:
        if subj == node:
            if pred == RDF.type:
                node_type = obj
            elif pred == RDFS.label:
                node_label = obj

    return node_type, node_label

# 4. Print the results to the console
print("Top 10 nodes by in-degree:")
for node, degree in top_10_in_degree_nodes:
    node_type, node_label = get_node_type_and_label(node)
    print(f"Node: {node}, In-degree: {degree}, Type: {node_type}, Label: {node_label}")

print("\nTop 10 nodes by out-degree:")
for node, degree in top_10_out_degree_nodes:
    node_type, node_label = get_node_type_and_label(node)
    print(f"Node: {node}, Out-degree: {degree}, Type: {node_type}, Label: {node_label}")

Top 10 nodes by in-degree:
Node: http://masterproject.org/Dataset, In-degree: 27460, Type: None, Label: None
Node: http://www.wikidata.org/entity/Q234460, In-degree: 13628, Type: http://masterproject.org/Modality, Label: text
Node: http://masterproject.org/NewNode, In-degree: 10838, Type: None, Label: None
Node: http://www.wikidata.org/entity/Q478798, In-degree: 6763, Type: http://masterproject.org/Modality, Label: image
Node: http://www.wikidata.org/entity/Q625525, In-degree: 1923, Type: http://masterproject.org/Modality, Label: movie metadata
Node: http://www.wikidata.org/entity/Q3500685, In-degree: 1165, Type: http://masterproject.org/Modality, Label: audio
Node: http://masterproject.org/Datatype, In-degree: 1019, Type: None, Label: None
Node: http://www.wikidata.org/entity/Q120970430, In-degree: 650, Type: http://masterproject.org/Datatype, Label: dataset name
Node: http://www.wikidata.org/entity/Q63116, In-degree: 532, Type: http://masterproject.org/Modality, Label: numerical
Node

In [23]:
# 2. Analyze connectivity of nodes (check for isolated nodes)
isolated_nodes = list(nx.isolates(G_d))  # Find all isolated nodes
print("Isolated nodes:", isolated_nodes)

Isolated nodes: []


In [21]:
# 3. Compute the diameter of the graph (only applicable to strongly connected graphs)
if nx.is_strongly_connected(G_d):
    diameter = nx.diameter(G_d)  # Compute the graph diameter
    print("Graph diameter:", diameter)
else:
    print("Graph is not strongly connected, skipping diameter calculation.")

Graph is not strongly connected, skipping diameter calculation.


In [26]:
# 5. Centrality analysis (Betweenness centrality, Closeness centrality, etc.)
betweenness_centrality = nx.betweenness_centrality(G)  # Betweenness centrality
closeness_centrality = nx.closeness_centrality(G)      # Closeness centrality

print("Betweenness Centrality:", betweenness_centrality)
print("Closeness Centrality:", closeness_centrality)

Betweenness Centrality: {rdflib.term.URIRef('http://www.wikidata.org/entity/Q309901'): 1.7288754907916355e-05, rdflib.term.URIRef('http://www.wikidata.org/entity/Q627436'): 6.81187867663231e-06, rdflib.term.URIRef('http://www.wikidata.org/entity/Q16743958'): 2.311924010719855e-09, rdflib.term.URIRef('http://masterproject.org/NewNode'): 0.0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q37169573'): 0.0, rdflib.term.URIRef('http://masterproject.org/Modality'): 0.0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q59156893'): 0.00030796055178996336, rdflib.term.URIRef('http://www.wikidata.org/entity/Q206290'): 0.00043698255744100724, rdflib.term.URIRef('http://www.wikidata.org/entity/Q188504'): 0.0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q105225381'): 2.311924010719855e-08, rdflib.term.URIRef('http://www.wikidata.org/entity/Q1611269'): 0.00025190724020803543, rdflib.term.URIRef('http://www.wikidata.org/entity/Q22342369'): 0.0, rdflib.term.URIRef('http://www.wikidata.o

In [28]:
# 1. Sort nodes by betweenness centrality and closeness centrality
sorted_betweenness = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)
sorted_closeness = sorted(closeness_centrality.items(), key=lambda x: x[1], reverse=True)

# 2. Get top 10 nodes by betweenness centrality
top_10_betweenness_nodes = sorted_betweenness[:10]

# 3. Get top 10 nodes by closeness centrality
top_10_closeness_nodes = sorted_closeness[:10]

# 4. Print the top 10 nodes for each along with their labels and types
def get_label_and_type(node):
    """Get the label and type of a node from the RDF graph."""
    # Retrieve the label
    label = None
    for _, _, obj in graph.triples((node, RDFS.label, None)):
        label = obj.toPython()

    # Retrieve the type
    node_type = None
    for _, _, obj in graph.triples((node, RDF.type, None)):
        node_type = obj.toPython()

    return label, node_type

# Print the top 10 nodes by Betweenness Centrality with their labels and types
print("Top 10 nodes by Betweenness Centrality:")
for node, centrality in top_10_betweenness_nodes:
    label, node_type = get_label_and_type(URIRef(node))  # Get label and type from the graph
    print(f"{node} - Label: {label}, Type: {node_type}, Betweenness Centrality: {centrality}")

print("\nTop 10 nodes by Closeness Centrality:")
for node, centrality in top_10_closeness_nodes:
    label, node_type = get_label_and_type(URIRef(node))  # Get label and type from the graph
    print(f"{node} - Label: {label}, Type: {node_type}, Closeness Centrality: {centrality}")

Top 10 nodes by Betweenness Centrality:
http://www.wikidata.org/entity/Q478798 - Label: image, Type: http://masterproject.org/Modality, Betweenness Centrality: 0.07656686495312391
http://www.wikidata.org/entity/Q234460 - Label: text, Type: http://masterproject.org/Modality, Betweenness Centrality: 0.07627436981279838
http://www.wikidata.org/entity/Q860625 - Label: image file, Type: http://masterproject.org/NewNode, Betweenness Centrality: 0.05443242270794689
http://www.wikidata.org/entity/Q1572121 - Label: image file format, Type: http://masterproject.org/NewNode, Betweenness Centrality: 0.05438122467110026
http://www.wikidata.org/entity/Q1924634 - Label: Metafile, Type: http://masterproject.org/NewNode, Betweenness Centrality: 0.054299471336221046
http://www.wikidata.org/entity/Q1351368 - Label: archive file format, Type: http://masterproject.org/NewNode, Betweenness Centrality: 0.05423531928819891
http://www.wikidata.org/entity/Q7397 - Label: software, Type: http://masterproject.org/

In [22]:
# 7. Graph sparsity analysis
density = nx.density(G_d)  # Calculate the graph density
print("Graph density:", density)

Graph density: 6.712814153363479e-05


In [24]:
# 6. Compute the shortest path lengths (starting from an example node)
example_node = list(G_d.nodes())[0]  # Assume we choose the first node
shortest_paths = nx.single_source_shortest_path_length(G_d, example_node)
print(f"Shortest paths from {example_node}: ", shortest_paths)

Shortest paths from http://www.wikidata.org/entity/D9948:  {rdflib.term.URIRef('http://www.wikidata.org/entity/D9948'): 0, rdflib.term.URIRef('http://www.wikidata.org/entity/Q234460'): 1, rdflib.term.URIRef('http://masterproject.org/Dataset'): 1, rdflib.term.URIRef('http://www.wikidata.org/entity/Q173253'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q3375697'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q160151'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q189756'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q107263298'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q424083'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q165436'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q33742'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q653347'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q1766173'): 2, rdflib.term.URIRef('http://www.wikidata.org/entity/Q36224'): 2, rdflib.term.URIRef('http://www.wiki

## Data got from Chapter 5

In [None]:
display_csv_head('modality_wiki.csv')

           modality          wiki_word       qid
0          2d image              image   Q478798
1      2d structure          structure  Q6671777
2     3d annotation         annotation   Q857525
3  3d body skeleton           skeleton     Q7881
4     3d coordinate  coordinate system    Q11210


In [None]:
display_csv_head('datatype_wiki_merged.csv')

          datatype               wiki_word       qid
0        rgb image  Silicon Graphics Image  Q7514956
1       rgbd image  Silicon Graphics Image  Q7514956
2       srgb image  Silicon Graphics Image  Q7514956
3  rgb color image  Silicon Graphics Image  Q7514956
4      depth image                   depth   Q930412


In [None]:
display_csv_head('datasets_modalities_datatypes.csv')

  dataset_name                                dataset_description modality  \
0        MNIST  The **MNIST** database (**Modified National In...    image   
1       CelebA  CelebFaces Attributes dataset contains 202,599...    image   
2       CelebA  CelebFaces Attributes dataset contains 202,599...     text   
3     JFT-300M  **JFT-300M** is an internal Google dataset use...    image   
4     JFT-300M  **JFT-300M** is an internal Google dataset use...     text   

                 data_type  
0  handwritten digit image  
1               face image  
2         facial attribute  
3        web sourced image  
4              image label  
