In [None]:
pip install requests networkx matplotlib SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.3-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.9/564.9 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 rdflib-7.1.3


In [None]:
import requests
import networkx as nx
import matplotlib.pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON
import warnings
warnings.filterwarnings("ignore", message=".*Glyph*")
import time
import json
import logging
from rdflib import Graph, URIRef, Literal, Namespace, RDF

In [None]:
# Wikidata SPARQL endpoint that every query in this notebook is sent to.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Wikidata property IDs (modality and data type relationships) that are
# followed when querying an entity. One row per thematic group.
RELEVANT_PROPERTIES = [
    # Fundamental semantics: instance of, subclass of, part of, facet of
    "P31", "P279", "P361", "P1269",
    # Structure and technology: has part(s), has part(s) of the class,
    # file format, media type, file extension, contains
    "P527", "P2670", "P2701", "P1163", "P1195", "P4330",
    # Application and domain: has use, used by, field of work, main subject
    "P366", "P1535", "P101", "P921",
    # Quality and metadata: has characteristic, characteristic of, data size
    "P1552", "P13044", "P3575",
]

In [None]:
# Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# From label get qid
def get_qid(label):
    """Get the QID of an entity based on its label.

    Sleeps one second, then asks the endpoint for one item (URI starting with
    ``http://www.wikidata.org/entity/Q``) whose English ``rdfs:label`` equals
    ``label`` exactly. The query uses ``LIMIT 1`` without ordering, so when
    several items share the label an arbitrary one of them is returned.

    Args:
        label: Label text to look up. It is embedded in the query as an
            escaped SPARQL string literal.

    Returns:
        The QID string (last path segment of the entity URI), or ``None`` when
        nothing matched.
    """
    time.sleep(1)
    logging.info(f"Fetching QID for label: {label}")

    # Escape the label before embedding it in the query: a raw quote or
    # backslash would terminate the string literal early and either break the
    # query or change its meaning. Backslashes must be escaped first.
    escaped_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()["results"]["bindings"]

    if not bindings:
        logging.warning(f"No QID found for label: {label}")
        return None

    qid = bindings[0]["entity"]["value"].split("/")[-1]
    logging.info(f"Found QID for {label}: {qid}")
    return qid

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set the Wikidata SPARQL endpoint (re-declared here so the cell does not
# depend on the earlier cells that define the same constant)
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Define namespaces:
#   EX       - project vocabulary (node classes and edge predicates)
#   WIKIDATA - Wikidata entity URIs; graph nodes are WIKIDATA[<QID>]
EX = Namespace("http://example.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create the module-level RDF graph that the functions below add triples to,
# and register the prefixes used when it is serialized
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

# Prevent duplicate queries: module-level set of QIDs shared by the
# functions below. Re-running this cell resets both the graph and this set.
queried_nodes = set()

def query_wikidata_properties(qid):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    A QID already present in the module-level ``queried_nodes`` set is not
    requested again and yields an empty list. Otherwise the QID is recorded
    in that set and one SPARQL request is sent to ``SPARQL_ENDPOINT``.

    Args:
        qid: Entity identifier, interpolated into the query as ``wd:<qid>``.

    Returns:
        The ``results.bindings`` list of the JSON response. Each binding
        carries ``property``, ``propertyLabel``, ``value`` and, when an
        English label exists, ``valueLabel``.
    """
    # Each entity is requested at most once per run.
    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    # "P31, wd:P279, ..." - the leading "wd:" is supplied by the template.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    sparql_text = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(sparql_text)
    client.setReturnFormat(JSON)
    return client.query().convert()["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively query and construct the knowledge graph.

    Queries the whitelisted claims of ``qid`` and, for every claim whose value
    is a Wikidata entity, adds an edge ``qid --EX[<property label>]--> value``
    to the module-level graph ``g``, labels both ends, and then expands the
    value entity one level deeper.

    Args:
        qid: QID of the entity to expand.
        label: Optional ``rdfs:label`` for ``qid``; added whenever an edge
            from ``qid`` is added.
        depth: Current recursion depth (0 for the starting entity).
        max_depth: Entities deeper than this are not queried.

    Returns:
        None. The function works by adding triples to ``g``.

    Note:
        Each entity that has not been queried before costs one SPARQL
        request, so large ``max_depth`` values can issue many requests.
    """
    if depth > max_depth:
        return

    subject = WIKIDATA[qid]
    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Only entity-valued claims become edges; literal values (strings,
        # quantities, ...) are not stored as node attributes.
        if "wikidata.org/entity/" not in value_uri:
            continue

        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value = result.get("valueLabel", {}).get("value", value_uri)
        value_qid = value_uri.split("/")[-1]
        target = WIKIDATA[value_qid]

        # Mark the entity as NewNode only if the graph has no type for it yet.
        # It must NOT be added to queried_nodes here: that set is what
        # query_wikidata_properties uses to skip entities, so registering the
        # value before recursing made every recursive call return no results
        # and the graph never grew beyond the first hop.
        if (target, RDF.type, None) not in g:
            g.add((target, RDF.type, EX.NewNode))

        # Only add properties as edges connecting to the target node
        g.add((subject, EX[prop], target))
        if label:
            g.add((subject, RDFS.label, Literal(label)))
        if value:
            g.add((target, RDFS.label, Literal(value)))

        # Recursively query the new entity
        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels_5"):
    """Build the knowledge graph from the dataset and write it to disk.

    For every entry, the modality label is resolved to a QID with ``get_qid``.
    The modality and each of its data types that carries a ``wikidata_id`` are
    added to the module-level graph ``g``, linked by ``EX.Modality_Datatype``,
    and expanded with ``recursive_query``. Entries whose modality has no QID
    are skipped together with their data types.

    Args:
        dataset: Iterable of dicts with keys ``"modality"`` (label string) and
            ``"data_types"`` (mapping of data-type name to a dict that may
            hold ``wikidata_id``, ``frequency``, ``wikidata_label`` and
            ``wikidata_description``).
        max_depth: Passed through to ``recursive_query``.
        output_basename: Path without extension; the graph is written to
            ``<output_basename>.ttl`` (Turtle) and ``<output_basename>.owl``
            (RDF/XML).

    Returns:
        None.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        # Get the QID of the modality; get_qid logs a warning when none exists
        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Add the modality node to the RDF graph and attach a label
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))

        # Recursively query modality properties
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        # Handle data_types
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            # Add the data_type node to the RDF graph and attach a label
            datatype_node = WIKIDATA[data_type_qid]
            g.add((datatype_node, RDF.type, EX.Datatype))
            g.add((datatype_node, RDFS.label, Literal(data_type)))

            # Attach optional attributes only when present, so a missing key
            # does not end up in the graph as a placeholder literal built
            # from None.
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                attribute = properties.get(key)
                if attribute is not None:
                    g.add((datatype_node, EX[key], Literal(attribute)))

            # Add a custom edge between modality and data_type
            g.add((modality_node, EX["Modality_Datatype"], datatype_node))

            # Recursively query data_type properties
            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Save the knowledge graph; the messages report the paths actually written
    turtle_path = f"{output_basename}.ttl"
    g.serialize(destination=turtle_path, format="turtle")
    print(f"Knowledge graph saved to {turtle_path}")

    owl_path = f"{output_basename}.owl"
    g.serialize(destination=owl_path, format="xml")
    print(f"Knowledge graph saved to {owl_path}")

# Load the matched_modalities_data_types.json file.
# The encoding is given explicitly so that non-ASCII labels are read the same
# way regardless of the platform's default text encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=5)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph_with_labels.ttl
Knowledge graph saved to knowledge_graph_with_labels.owl


## Counting Modalities & Data Types

In [None]:
import json

# Load the matched_modalities_data_types.json file.
# The encoding is given explicitly so that non-ASCII labels are read the same
# way regardless of the platform's default text encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Every dataset entry describes exactly one modality
modality_count = len(dataset)

# Data-type names, de-duplicated across all modalities
unique_datatypes = {
    data_type
    for entry in dataset
    for data_type in entry['data_types']
}

# Print the results
print(f"Number of modalities: {modality_count}")
print(f"Number of unique data types: {len(unique_datatypes)}")

Number of modalities: 123
Number of unique data types: 3874


In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Define namespaces:
#   MP       - project vocabulary (node classes and edge predicates)
#   WIKIDATA - Wikidata entity URIs; graph nodes are WIKIDATA[<QID>]
MP = Namespace("http://masterproject.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create the module-level RDF graph that the functions below add triples to,
# and register the prefixes used when it is serialized
g = Graph()
g.bind("mp", MP)
g.bind("wikidata", WIKIDATA)

# Avoid duplicate queries: module-level set of QIDs shared by the functions
# below. Re-running this cell resets both the graph and this set.
queried_nodes = set()

# List of relevant properties: Wikidata property IDs that
# query_wikidata_properties restricts its results to
RELEVANT_PROPERTIES = [
    "P31", "P279", "P361", "P1269", "P527", "P2670", "P2701", "P1163",
    "P1195", "P4330", "P366", "P1535", "P101", "P921", "P1552", "P13044",
    "P3575"
]

def query_wikidata_properties(qid):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    A QID already present in the module-level ``queried_nodes`` set is not
    requested again and yields an empty list. Otherwise the QID is recorded
    in that set and one SPARQL request is sent to ``SPARQL_ENDPOINT``.

    Args:
        qid: Entity identifier, interpolated into the query as ``wd:<qid>``.

    Returns:
        The ``results.bindings`` list of the JSON response. Each binding
        carries ``property``, ``propertyLabel``, ``value`` and, when an
        English label exists, ``valueLabel``.
    """
    # Each entity is requested at most once per run.
    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    # "P31, wd:P279, ..." - the leading "wd:" is supplied by the template.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    sparql_text = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(sparql_text)
    client.setReturnFormat(JSON)
    return client.query().convert()["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively query and construct the knowledge graph.

    Queries the whitelisted claims of ``qid`` and, for every claim whose value
    is a Wikidata entity, adds an edge ``qid --MP[<property label>]--> value``
    to the module-level graph ``g``, labels both ends, and then expands the
    value entity one level deeper.

    Args:
        qid: QID of the entity to expand.
        label: Optional ``rdfs:label`` for ``qid``; added whenever an edge
            from ``qid`` is added.
        depth: Current recursion depth (0 for the starting entity).
        max_depth: Entities deeper than this are not queried.

    Returns:
        None. The function works by adding triples to ``g``.

    Note:
        Each entity that has not been queried before costs one SPARQL
        request, so large ``max_depth`` values can issue many requests.
    """
    if depth > max_depth:
        return

    subject = WIKIDATA[qid]
    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Only entity-valued claims become edges; literal values (strings,
        # quantities, ...) are not stored as node attributes.
        if "wikidata.org/entity/" not in value_uri:
            continue

        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value = result.get("valueLabel", {}).get("value", value_uri)
        value_qid = value_uri.split("/")[-1]
        target = WIKIDATA[value_qid]

        # Mark the entity as NewNode only if the graph has no type for it yet.
        # It must NOT be added to queried_nodes here: that set is what
        # query_wikidata_properties uses to skip entities, so registering the
        # value before recursing made every recursive call return no results
        # and the graph never grew beyond the first hop.
        if (target, RDF.type, None) not in g:
            g.add((target, RDF.type, MP.NewNode))

        # Only create relationships (edges) between nodes
        g.add((subject, MP[prop], target))
        if label:
            g.add((subject, RDFS.label, Literal(label)))
        if value:
            g.add((target, RDFS.label, Literal(value)))

        # Recursively query the new node
        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels"):
    """Build the knowledge graph from the dataset and write it to disk.

    For every entry, the modality label is resolved to a QID with ``get_qid``.
    The modality and each of its data types that carries a ``wikidata_id`` are
    added to the module-level graph ``g``, linked by ``MP.Modality_Datatype``,
    and expanded with ``recursive_query``. Entries whose modality has no QID
    are skipped together with their data types.

    Args:
        dataset: Iterable of dicts with keys ``"modality"`` (label string) and
            ``"data_types"`` (mapping of data-type name to a dict that may
            hold ``wikidata_id``, ``frequency``, ``wikidata_label`` and
            ``wikidata_description``).
        max_depth: Passed through to ``recursive_query``.
        output_basename: Path without extension; the graph is written to
            ``<output_basename>.ttl`` (Turtle) and ``<output_basename>.owl``
            (RDF/XML).

    Returns:
        None.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        # Get QID for modality; get_qid logs a warning when none exists
        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Add modality node to RDF graph and attach a label. A node that was
        # first discovered as a plain NewNode is promoted, mirroring what is
        # done for data types below (removing an absent triple is a no-op).
        modality_node = WIKIDATA[modality_qid]
        g.remove((modality_node, RDF.type, MP.NewNode))
        g.add((modality_node, RDF.type, MP.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))

        # Recursively query modality properties
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        # Handle data_types
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            # If the node exists as NewNode, change its type to Datatype
            datatype_node = WIKIDATA[data_type_qid]
            g.remove((datatype_node, RDF.type, MP.NewNode))
            g.add((datatype_node, RDF.type, MP.Datatype))
            g.add((datatype_node, RDFS.label, Literal(data_type)))
            g.add((datatype_node, MP.QID, Literal(data_type_qid)))

            # Attach optional attributes only when present, so a missing key
            # does not end up in the graph as a placeholder literal built
            # from None.
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                attribute = properties.get(key)
                if attribute is not None:
                    g.add((datatype_node, MP[key], Literal(attribute)))

            # Create custom edge between modality and data_type
            g.add((modality_node, MP["Modality_Datatype"], datatype_node))

            # Recursively query data_type properties
            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Save the knowledge graph as Turtle and OWL (RDF/XML)
    turtle_path = f"{output_basename}.ttl"
    g.serialize(destination=turtle_path, format="turtle")
    print(f"Knowledge graph saved to {turtle_path}")

    owl_path = f"{output_basename}.owl"
    g.serialize(destination=owl_path, format="xml")
    print(f"Knowledge graph saved to {owl_path}")

# Load the matched_modalities_data_types.json file.
# The encoding is given explicitly so that non-ASCII labels are read the same
# way regardless of the platform's default text encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=5)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph_with_labels.ttl
Knowledge graph saved to knowledge_graph_with_labels.owl


In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Define namespaces:
#   MP       - project vocabulary (node classes and edge predicates)
#   WIKIDATA - Wikidata entity URIs; graph nodes are WIKIDATA[<QID>]
MP = Namespace("http://masterproject.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create the module-level RDF graph that the functions below add triples to,
# and register the prefixes used when it is serialized
g = Graph()
g.bind("mp", MP)
g.bind("wikidata", WIKIDATA)

# Avoid duplicate queries: module-level set of QIDs shared by the functions
# below. Re-running this cell resets both the graph and this set.
queried_nodes = set()

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# From label get qid
def get_qid(label):
    """Get the QID of an entity based on its label.

    Sleeps one second, then asks the endpoint for one item (URI starting with
    ``http://www.wikidata.org/entity/Q``) whose English ``rdfs:label`` equals
    ``label`` exactly. The query uses ``LIMIT 1`` without ordering, so when
    several items share the label an arbitrary one of them is returned.

    Args:
        label: Label text to look up. It is embedded in the query as an
            escaped SPARQL string literal.

    Returns:
        The QID string (last path segment of the entity URI), or ``None`` when
        nothing matched.
    """
    time.sleep(1)
    logging.info(f"Fetching QID for label: {label}")

    # Escape the label before embedding it in the query: a raw quote or
    # backslash would terminate the string literal early and either break the
    # query or change its meaning. Backslashes must be escaped first.
    escaped_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()["results"]["bindings"]

    if not bindings:
        logging.warning(f"No QID found for label: {label}")
        return None

    qid = bindings[0]["entity"]["value"].split("/")[-1]
    logging.info(f"Found QID for {label}: {qid}")
    return qid

# Wikidata property IDs (modality and data type relationships) that are
# followed when querying an entity. One row per thematic group.
RELEVANT_PROPERTIES = [
    # Fundamental semantics: instance of, subclass of, part of, facet of
    "P31", "P279", "P361", "P1269",
    # Structure and technology: has part(s), has part(s) of the class,
    # file format, media type, file extension, contains
    "P527", "P2670", "P2701", "P1163", "P1195", "P4330",
    # Application and domain: has use, used by, field of work, main subject
    "P366", "P1535", "P101", "P921",
    # Quality and metadata: has characteristic, characteristic of, data size
    "P1552", "P13044", "P3575",
]

def query_wikidata_properties(qid):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    A QID already present in the module-level ``queried_nodes`` set is not
    requested again and yields an empty list. Otherwise the QID is recorded
    in that set and one SPARQL request is sent to ``SPARQL_ENDPOINT``.

    Args:
        qid: Entity identifier, interpolated into the query as ``wd:<qid>``.

    Returns:
        The ``results.bindings`` list of the JSON response. Each binding
        carries ``property``, ``propertyLabel``, ``value`` and, when an
        English label exists, ``valueLabel``.
    """
    # Each entity is requested at most once per run.
    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    # "P31, wd:P279, ..." - the leading "wd:" is supplied by the template.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    sparql_text = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(sparql_text)
    client.setReturnFormat(JSON)
    return client.query().convert()["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively query and construct the knowledge graph.

    Queries the whitelisted claims of ``qid`` and, for every claim whose value
    is a Wikidata entity, adds an edge ``qid --MP[<property label>]--> value``
    to the module-level graph ``g``, labels both ends, and then expands the
    value entity one level deeper.

    Args:
        qid: QID of the entity to expand.
        label: Optional ``rdfs:label`` for ``qid``; added whenever an edge
            from ``qid`` is added.
        depth: Current recursion depth (0 for the starting entity).
        max_depth: Entities deeper than this are not queried.

    Returns:
        None. The function works by adding triples to ``g``.

    Note:
        Each entity that has not been queried before costs one SPARQL
        request, so large ``max_depth`` values can issue many requests.
    """
    if depth > max_depth:
        return

    subject = WIKIDATA[qid]
    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Only entity-valued claims become edges; literal values (strings,
        # quantities, ...) are not stored as node attributes.
        if "wikidata.org/entity/" not in value_uri:
            continue

        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value = result.get("valueLabel", {}).get("value", value_uri)
        value_qid = value_uri.split("/")[-1]
        target = WIKIDATA[value_qid]

        # Mark the entity as NewNode only if the graph has no type for it yet.
        # It must NOT be added to queried_nodes here: that set is what
        # query_wikidata_properties uses to skip entities, so registering the
        # value before recursing made every recursive call return no results
        # and the graph never grew beyond the first hop.
        if (target, RDF.type, None) not in g:
            g.add((target, RDF.type, MP.NewNode))

        # Only create relationships (edges) between nodes
        g.add((subject, MP[prop], target))
        if label:
            g.add((subject, RDFS.label, Literal(label)))
        if value:
            g.add((target, RDFS.label, Literal(value)))

        # Recursively query the new node
        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels_5_2"):
    """Build the knowledge graph from the dataset and write it to disk.

    For every entry, the modality label is resolved to a QID with ``get_qid``.
    The modality and each of its data types that carries a ``wikidata_id`` are
    added to the module-level graph ``g``, linked by ``MP.Modality_Datatype``,
    and expanded with ``recursive_query``. Entries whose modality has no QID
    are skipped together with their data types.

    Args:
        dataset: Iterable of dicts with keys ``"modality"`` (label string) and
            ``"data_types"`` (mapping of data-type name to a dict that may
            hold ``wikidata_id``, ``frequency``, ``wikidata_label`` and
            ``wikidata_description``).
        max_depth: Passed through to ``recursive_query``.
        output_basename: Path without extension; the graph is written to
            ``<output_basename>.ttl`` (Turtle) and ``<output_basename>.owl``
            (RDF/XML).

    Returns:
        None.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        # Get QID for modality; get_qid logs a warning when none exists
        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Add modality node to RDF graph and attach a label. A node that was
        # first discovered as a plain NewNode is promoted, mirroring what is
        # done for data types below (removing an absent triple is a no-op).
        modality_node = WIKIDATA[modality_qid]
        g.remove((modality_node, RDF.type, MP.NewNode))
        g.add((modality_node, RDF.type, MP.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))

        # Recursively query modality properties
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        # Handle data_types
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            # If the node exists as NewNode, change its type to Datatype
            datatype_node = WIKIDATA[data_type_qid]
            g.remove((datatype_node, RDF.type, MP.NewNode))
            g.add((datatype_node, RDF.type, MP.Datatype))
            g.add((datatype_node, RDFS.label, Literal(data_type)))
            g.add((datatype_node, MP.QID, Literal(data_type_qid)))

            # Attach optional attributes only when present, so a missing key
            # does not end up in the graph as a placeholder literal built
            # from None.
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                attribute = properties.get(key)
                if attribute is not None:
                    g.add((datatype_node, MP[key], Literal(attribute)))

            # Create custom edge between modality and datatype. Both type
            # triples were added just above in this iteration, so no
            # membership check is needed before linking them.
            g.add((modality_node, MP["Modality_Datatype"], datatype_node))

            # Recursively query data_type properties
            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Save the knowledge graph; the messages report the paths actually written
    turtle_path = f"{output_basename}.ttl"
    g.serialize(destination=turtle_path, format="turtle")
    print(f"Knowledge graph saved to {turtle_path}")

    owl_path = f"{output_basename}.owl"
    g.serialize(destination=owl_path, format="xml")
    print(f"Knowledge graph saved to {owl_path}")

# Load the matched_modalities_data_types.json file.
# The encoding is given explicitly so that non-ASCII labels are read the same
# way regardless of the platform's default text encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=5)

ModuleNotFoundError: No module named 'rdflib'

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import logging
import time

# Set up logging: INFO level so the progress messages emitted via
# logging.info in this cell are shown
logging.basicConfig(level=logging.INFO)

# Set up Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Define namespaces:
#   WIKIDATA - entity URIs; used for graph nodes (WIKIDATA[<QID>]) and, in the
#              functions below, also for node classes and attribute predicates
#   WD       - direct-property URIs; used for edges as WD[<property ID>]
# NOTE(review): the SPARQL queries in this notebook write entities as
# "wd:<QID>", whereas here the prefix "wd" is bound to the direct-property
# namespace - confirm this prefix choice is intended for the serialized file.
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
WD = Namespace("http://www.wikidata.org/prop/direct/")

# Create the module-level RDF graph that the functions below add triples to,
# and register the prefixes used when it is serialized
g = Graph()
g.bind("wikidata", WIKIDATA)
g.bind("wd", WD)

# Avoid duplicate queries: module-level set of QIDs shared by the functions
# below. Re-running this cell resets both the graph and this set.
queried_nodes = set()

# List of relevant properties (modality and data type relationships);
# query_wikidata_properties restricts its results to these property IDs
RELEVANT_PROPERTIES = [
    "P31",  # instance of
    "P279",  # subclass of
    "P361",  # part of
    "P1269",  # facet of
    "P527",  # has part(s)
    "P2670",  # has part(s) of the class
    "P2701",  # file format
    "P1163",  # media type
    "P1195",  # file extension
    "P4330",  # contains
    "P366",  # has use
    "P1535",  # used by
    "P101",  # field of work
    "P921",  # main subject
    "P1552",  # has characteristic
    "P13044",  # characteristic of
    "P3575",  # data size
]

def query_wikidata_properties(qid):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    A QID already present in the module-level ``queried_nodes`` set is not
    requested again and yields an empty list. Otherwise the QID is recorded
    in that set and one SPARQL request is sent to ``SPARQL_ENDPOINT``.

    Args:
        qid: Entity identifier, interpolated into the query as ``wd:<qid>``.

    Returns:
        The ``results.bindings`` list of the JSON response. Each binding
        carries ``property``, ``propertyLabel``, ``value`` and, when an
        English label exists, ``valueLabel``.
    """
    # Each entity is requested at most once per run.
    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    # "P31, wd:P279, ..." - the leading "wd:" is supplied by the template.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    sparql_text = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(sparql_text)
    client.setReturnFormat(JSON)
    return client.query().convert()["results"]["bindings"]

def create_or_update_node(qid, node_type, properties=None):
    """
    Create or update a node with the given QID and type.
    Priority: Modality > Datatype > NewNode.

    If the graph ``g`` holds no ``rdf:type`` for the node, ``node_type`` is
    added. If it holds a lower-priority type, that type is replaced by
    ``node_type``. Otherwise the existing type is kept.

    Args:
        qid: QID of the node; the node URI is ``WIKIDATA[qid]``.
        node_type: One of ``WIKIDATA.Modality``, ``WIKIDATA.Datatype`` or
            ``WIKIDATA.NewNode``.
        properties: Optional mapping; each item is stored as the triple
            ``(node, WIKIDATA[key], Literal(value))``.

    Returns:
        None. The function works by modifying ``g``.
    """
    node = WIKIDATA[qid]
    priority = {WIKIDATA.NewNode: 0, WIKIDATA.Datatype: 1, WIKIDATA.Modality: 2}

    # Existence is decided from the graph itself. queried_nodes is deliberately
    # left untouched: query_wikidata_properties uses that set to skip entities,
    # so registering a node here would suppress the query that is supposed to
    # expand it (and a node that was queried but never typed would stay
    # untyped).
    existing_type = g.value(node, RDF.type)
    if existing_type is None:
        g.add((node, RDF.type, node_type))
    elif priority.get(node_type, -1) > priority.get(existing_type, len(priority)):
        # Upgrade only: NewNode -> Datatype/Modality, Datatype -> Modality.
        # Types outside the priority table are never replaced or applied here.
        g.remove((node, RDF.type, existing_type))
        g.add((node, RDF.type, node_type))

    if properties:
        for key, value in properties.items():
            g.add((node, WIKIDATA[key], Literal(value)))

def recursive_query(qid, node_type, depth=0, max_depth=3):
    """Expand the graph outwards from ``qid`` along whitelisted statements.

    Args:
        qid: Entity whose statements are fetched.
        node_type: Type handed to ``create_or_update_node`` when a value that
            is already in ``queried_nodes`` is still typed ``NewNode``.
        depth: Current recursion level.
        max_depth: Recursion stops once ``depth`` exceeds this value.
    """
    if depth > max_depth:
        return

    for binding in query_wikidata_properties(qid):
        # Property ID is the last path segment of the property IRI (e.g. P31).
        property_id = binding["property"]["value"].split("/")[-1]
        target_uri = binding["value"]["value"]
        target_label = binding.get("valueLabel", {}).get("value", target_uri)

        # Only values that are Wikidata entities become nodes.
        if "wikidata.org/entity/" not in target_uri:
            continue

        target_qid = target_uri.split("/")[-1]
        target_node = WIKIDATA[target_qid]

        if target_qid not in queried_nodes:
            create_or_update_node(target_qid, WIKIDATA.NewNode, {"label": target_label})
        elif (target_node, RDF.type, WIKIDATA.NewNode) in g:
            create_or_update_node(target_qid, node_type)

        # Edge: subject --property--> value
        g.add((WIKIDATA[qid], WD[property_id], target_node))

        recursive_query(target_qid, node_type, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3):
    """Populate the graph from ``dataset`` and write it to disk.

    Each entry with a ``wikidata_id`` becomes a Modality node; each of its
    ``data_types`` with a ``wikidata_id`` becomes a Datatype node linked from
    the modality through ``WD["P366"]``. Both kinds of node are expanded with
    ``recursive_query``. The graph is then serialised as Turtle and RDF/XML.
    """
    for entry in dataset:
        modality = entry["modality"]
        modality_qid = entry.get("wikidata_id")
        if not modality_qid:
            continue

        logging.info(f"Processing modality: {modality} (QID: {modality_qid})")

        modality_props = {"label": modality, "QID": modality_qid}
        create_or_update_node(modality_qid, WIKIDATA.Modality, modality_props)
        recursive_query(modality_qid, WIKIDATA.Modality, depth=0, max_depth=max_depth)

        for data_type, properties in entry.get("data_types", {}).items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            datatype_props = {
                "label": data_type,
                "frequency": properties.get("frequency"),
                "QID": data_type_qid,
                "wikidata_label": properties.get("wikidata_label"),
                "wikidata_description": properties.get("wikidata_description"),
            }
            create_or_update_node(data_type_qid, WIKIDATA.Datatype, datatype_props)

            # Modality -> datatype edge, expressed with P366 ("has use").
            g.add((WIKIDATA[modality_qid], WD["P366"], WIKIDATA[data_type_qid]))

            recursive_query(data_type_qid, WIKIDATA.Datatype, depth=0, max_depth=max_depth)

    # Same graph, two serialisations.
    for destination, rdf_format in (("knowledge_graph.ttl", "turtle"), ("knowledge_graph.owl", "xml")):
        g.serialize(destination=destination, format=rdf_format)
        logging.info(f"Knowledge graph saved to {destination}")

# Load the matched_modalities_data_types.json file.
# The encoding is pinned so the read does not depend on the platform's
# locale default.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=5)

ModuleNotFoundError: No module named 'rdflib'

In [None]:
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up Wikidata SPARQL endpoint
# (re-declared in this cell with the same value as the earlier definition;
# it is the URL handed to SPARQLWrapper in query_wikidata_label below)
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

def query_wikidata_label(qid):
    """Query Wikidata for the English label of an entity based on its QID.

    Sleeps for one second before sending the request.

    Args:
        qid: Wikidata identifier, interpolated into the query as ``wd:<qid>``.

    Returns:
        The first English ``rdfs:label`` returned by the endpoint, or
        ``None`` when the result set is empty.
    """
    # The docstring must be the first statement of the body; placed after
    # the sleep it was a discarded expression and __doc__ stayed None.
    time.sleep(1)
    query = f"""
    SELECT ?label WHERE {{
      wd:{qid} rdfs:label ?label.
      FILTER(LANG(?label) = "en")
    }}
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # If label found, return it, otherwise return None
    bindings = results["results"]["bindings"]
    if bindings:
        return bindings[0]["label"]["value"]
    return None

def update_modality_labels(dataset):
    """Attach the Wikidata label of each modality entry as ``wikidata_true_label``.

    Entries are modified in place. An entry without a ``wikidata_id``, or one
    whose QID yields no label, is left unchanged and reported on stdout.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    for record in dataset:
        qid = record.get("wikidata_id")
        if not qid:
            print(f"No wikidata_id found for modality: {record.get('modality')}")
            continue

        # One lookup per modality that has a QID.
        true_label = query_wikidata_label(qid)
        if not true_label:
            print(f"No label found for QID: {qid}")
            continue

        record["wikidata_true_label"] = true_label

    return dataset

# Load the updated_matched_modalities_data_types.json file.
# Encoding is explicit so reading/writing does not depend on the platform's
# locale default.
with open('updated_matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Update modality labels (entries are modified in place and returned)
updated_dataset = update_modality_labels(dataset)

# Save the updated dataset to a new JSON file
with open('updated_modalities_with_true_labels.json', 'w', encoding='utf-8') as f:
    json.dump(updated_dataset, f, indent=4)

print("Updated dataset saved to updated_modalities_with_true_labels.json")

Updated dataset saved to updated_modalities_with_true_labels.json


In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up Wikidata SPARQL endpoint
# (URL handed to SPARQLWrapper by get_qid and query_wikidata_properties in this cell)
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# From label get qid
def get_qid(label):
    """Get the QID of an entity based on its English label.

    Sleeps one second, then asks the endpoint for an item (IRI starting with
    ``http://www.wikidata.org/entity/Q``) whose English ``rdfs:label`` equals
    ``label`` exactly. The query uses ``LIMIT 1`` without ordering, so when
    several items share the label the one returned is whichever the endpoint
    yields first.

    Args:
        label: Label text to look up.

    Returns:
        The QID string (last path segment of the entity IRI), or ``None``
        when nothing matches.
    """
    time.sleep(1)
    logging.info(f"Fetching QID for label: {label}")
    # Escape the characters that would terminate or corrupt the SPARQL
    # string literal; an unescaped '"' in a label made the query invalid.
    safe_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{safe_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if bindings:
        qid = bindings[0]["entity"]["value"].split("/")[-1]
        logging.info(f"Found QID for {label}: {qid}")
        return qid

    logging.warning(f"No QID found for label: {label}")
    return None

# Define namespaces
MP = Namespace("http://masterproject.org/")
# Wikidata entity IRIs live under /entity/ -- this is the form the endpoint
# returns (recursive_query tests for "wikidata.org/entity/"), so nodes minted
# as WIKIDATA[qid] must use the same base to refer to the same resource.
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create RDF graph
g = Graph()
g.bind("mp", MP)
g.bind("wikidata", WIKIDATA)

# Avoid duplicate queries: QIDs already sent to the endpoint by
# query_wikidata_properties.
queried_nodes = set()

# List of relevant properties (Wikidata property IDs allowed in the FILTER
# of query_wikidata_properties)
RELEVANT_PROPERTIES = [
    "P31", "P279", "P361", "P1269", "P527", "P2670", "P2701", "P1163",
    "P1195", "P4330", "P366", "P1535", "P101", "P921", "P1552", "P13044",
    "P3575"]

def query_wikidata_properties(qid):
    """Return the whitelisted statements of ``qid`` from the SPARQL endpoint.

    A QID is only ever fetched once: it is added to ``queried_nodes`` on the
    first call and every later call for it returns an empty list.

    Returns:
        The ``results.bindings`` list of the JSON response, or ``[]``.
    """
    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    # wd:P31, wd:P279, ... for the FILTER below
    allowed = "wd:" + ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN ({allowed}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    endpoint = SPARQLWrapper(SPARQL_ENDPOINT)
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    payload = endpoint.query().convert()
    return payload["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively query and construct the knowledge graph.

    For every whitelisted statement of ``qid`` whose value is a Wikidata
    entity, an edge ``qid --property--> value`` plus ``rdfs:label`` triples
    for both ends are added, and the value is expanded in turn.

    Args:
        qid: Entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level.
        max_depth: Recursion stops once ``depth`` exceeds this value.
    """
    if depth > max_depth:
        return

    # Query entity properties
    results = query_wikidata_properties(qid)
    subject = WIKIDATA[qid]

    for result in results:
        # The predicate IRI is built from the sanitised property *label*
        # (e.g. .../wiki/Property:instance_of), not from the P-number.
        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value_uri = result["value"]["value"]
        value = result.get("valueLabel", {}).get("value", value_uri)
        property_ref = URIRef(f"https://www.wikidata.org/wiki/Property:{prop}")

        # Only values that are Wikidata entities become nodes.
        if "wikidata.org/entity/" not in value_uri:
            continue

        value_qid = value_uri.split("/")[-1]
        target = WIKIDATA[value_qid]

        # Mark nodes that have no type yet as NewNode. The check is made
        # against the graph, and value_qid is deliberately NOT added to
        # queried_nodes here: query_wikidata_properties treats membership in
        # that set as "already fetched" and returns [], which would turn the
        # recursive call below into a no-op.
        if (target, RDF.type, None) not in g:
            g.add((target, RDF.type, MP.NewNode))

        # Relationship edge plus labels for both ends.
        g.add((subject, property_ref, target))
        if label:
            g.add((subject, RDFS.label, Literal(label)))
        if value:
            g.add((target, RDFS.label, Literal(value)))

        # Recursively query the new node
        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3):
    """Build the knowledge graph from the dataset and serialise it.

    Each entry contributes a Modality node and one Datatype node per
    ``data_types`` item that has a ``wikidata_id``; both are expanded with
    ``recursive_query``. The result is written as Turtle and RDF/XML.

    Args:
        dataset: Iterable of dicts with a ``modality`` key and, optionally,
            ``wikidata_id`` and ``data_types`` (name -> metadata dict).
        max_depth: Depth limit forwarded to ``recursive_query``.
    """
    for entry in dataset:
        modality = entry["modality"]
        # Tolerate entries without a data_types mapping.
        data_types = entry.get("data_types") or {}

        # Prefer the QID stored with the entry; the label search is only a
        # fallback because get_qid returns an arbitrary first match (LIMIT 1).
        modality_qid = entry.get("wikidata_id") or get_qid(modality)
        if not modality_qid:
            continue

        print(f"Processing modality: {modality} (QID: {modality_qid})")
        modality_node = WIKIDATA[modality_qid]

        # Add modality node to RDF graph and attach a label
        g.add((modality_node, RDF.type, MP.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))

        # Recursively query modality properties
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        # Handle data_types
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            datatype_node = WIKIDATA[data_type_qid]

            # If the node exists as NewNode, change its type to Datatype
            if (datatype_node, RDF.type, MP.NewNode) in g:
                g.remove((datatype_node, RDF.type, MP.NewNode))

            g.add((datatype_node, RDF.type, MP.Datatype))
            g.add((datatype_node, RDFS.label, Literal(data_type)))
            g.add((datatype_node, MP.QID, Literal(data_type_qid)))

            # Optional metadata: Literal(None) would be stored as the string
            # "None", so absent values are skipped.
            optional_metadata = (
                (MP.frequency, properties.get("frequency")),
                (MP.wikidata_label, properties.get("wikidata_label")),
                (MP.wikidata_description, properties.get("wikidata_description")),
            )
            for predicate, raw_value in optional_metadata:
                if raw_value is not None:
                    g.add((datatype_node, predicate, Literal(raw_value)))

            # Create custom edge between modality and data_type
            g.add((modality_node, MP["Modality_Datatype"], datatype_node))

            # Recursively query data_type properties
            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Save the knowledge graph as Turtle and OWL format
    g.serialize(destination="knowledge_graph_with_labels.ttl", format="turtle")
    print("Knowledge graph saved to knowledge_graph_with_labels.ttl")

    g.serialize(destination="knowledge_graph_with_labels.owl", format="xml")
    print("Knowledge graph saved to knowledge_graph_with_labels.owl")

# Load the updated_matched_modalities_data_types.json file.
# The encoding is pinned so the read does not depend on the platform's
# locale default.
with open('updated_matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=3)

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up Wikidata SPARQL endpoint
# (URL handed to SPARQLWrapper by get_qid and query_wikidata_properties in this cell)
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# From label get qid
def get_qid(label):
    """Get the QID of an entity based on its English label.

    Sleeps one second, then asks the endpoint for an item (IRI starting with
    ``http://www.wikidata.org/entity/Q``) whose English ``rdfs:label`` equals
    ``label`` exactly. Because of ``LIMIT 1`` without ordering, the item
    returned for an ambiguous label is whichever the endpoint yields first.

    Args:
        label: Label text to look up.

    Returns:
        The QID string, or ``None`` when nothing matches.
    """
    time.sleep(1)
    logging.info(f"Fetching QID for label: {label}")
    # Escape characters that would otherwise end or corrupt the SPARQL
    # string literal (a '"' inside a label produced an invalid query).
    escaped_label = str(label)
    for raw, replacement in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        escaped_label = escaped_label.replace(raw, replacement)
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        logging.warning(f"No QID found for label: {label}")
        return None

    qid = bindings[0]["entity"]["value"].split("/")[-1]
    logging.info(f"Found QID for {label}: {qid}")
    return qid

# Define namespaces
# MP holds the project's own classes/predicates (MP.Modality, MP.Datatype,
# MP.NewNode, MP.frequency, ...); WIKIDATA is the base for node IRIs built
# as WIKIDATA[qid].
MP = Namespace("http://masterproject.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create RDF graph (module-level: every function in this cell adds to it)
g = Graph()
g.bind("mp", MP)
g.bind("wikidata", WIKIDATA)

# Avoid duplicate queries: query_wikidata_properties returns [] for any QID
# already contained in this set.
queried_nodes = set()

# List of relevant properties: the Wikidata property IDs allowed by the
# FILTER in query_wikidata_properties.
RELEVANT_PROPERTIES = [
    "P31", "P279", "P361", "P1269", "P527", "P2670", "P2701", "P1163",
    "P1195", "P4330", "P366", "P1535", "P101", "P921", "P1552", "P13044",
    "P3575"]

def query_wikidata_properties(qid):
    """Look up the whitelisted statements of one entity, at most once per QID.

    The first call for a QID registers it in ``queried_nodes`` and runs the
    SPARQL query; repeated calls return an empty list without any request.

    Returns:
        List of result bindings (dicts) from the endpoint's JSON response.
    """
    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    wd_properties = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{wd_properties}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    wrapper = SPARQLWrapper(SPARQL_ENDPOINT)
    wrapper.setQuery(query)
    wrapper.setReturnFormat(JSON)
    return wrapper.query().convert()["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively query and construct the knowledge graph.

    For every whitelisted statement of ``qid`` whose value is a Wikidata
    entity, an edge ``qid --property--> value`` is added, the predicate IRI
    receives the property's label, both end nodes receive their labels, and
    the value is expanded in turn.

    Args:
        qid: Entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level.
        max_depth: Recursion stops once ``depth`` exceeds this value.
    """
    if depth > max_depth:
        return

    # Query entity properties
    results = query_wikidata_properties(qid)
    subject = WIKIDATA[qid]

    for result in results:
        # Property ID (e.g. P31) is the last segment of the property IRI.
        prop = result["property"]["value"].split('/')[-1]
        value_uri = result["value"]["value"]
        value = result.get("valueLabel", {}).get("value", value_uri)
        property_ref = URIRef(f"https://www.wikidata.org/wiki/{prop}")

        # Only genuine entity IRIs become nodes. The looser "wikidata.org/"
        # test also accepted other IRIs on that host (presumably e.g.
        # skolemised "unknown value" IRIs -- TODO confirm) and turned their
        # last path segment into a bogus node under the entity namespace.
        if "wikidata.org/entity/" not in value_uri:
            continue

        value_qid = value_uri.split("/")[-1]
        target = WIKIDATA[value_qid]

        # Mark nodes that have no type yet as NewNode. value_qid is
        # deliberately NOT added to queried_nodes here: that set is the
        # "already fetched" guard of query_wikidata_properties, and adding
        # the QID before recursing made the recursive call return nothing.
        if (target, RDF.type, None) not in g:
            g.add((target, RDF.type, MP.NewNode))

        # Relationship edge.
        g.add((subject, property_ref, target))

        # The property label describes the edge, so it is attached to the
        # predicate IRI -- not to the subject node, which would otherwise
        # collect every property name as one of its own labels.
        g.add((property_ref, RDFS.label, Literal(result["propertyLabel"]["value"])))

        if label:
            g.add((subject, RDFS.label, Literal(label)))
        if value:
            g.add((target, RDFS.label, Literal(value)))

        # Recursively query the new node
        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3):
    """Build the knowledge graph from the dataset and serialise it.

    Entries with a ``wikidata_id`` become Modality nodes; each of their
    ``data_types`` items with a ``wikidata_id`` becomes a Datatype node
    linked through ``MP.Modality_Datatype``. Both are expanded with
    ``recursive_query``. The graph is written as Turtle and RDF/XML.

    Args:
        dataset: Iterable of dicts with a ``modality`` key and, optionally,
            ``wikidata_id`` and ``data_types`` (name -> metadata dict).
        max_depth: Depth limit forwarded to ``recursive_query``.
    """
    for entry in dataset:
        modality = entry["modality"]
        # Tolerate entries without a data_types mapping.
        data_types = entry.get("data_types") or {}

        # Directly use the wikidata_id from the dataset
        modality_qid = entry.get("wikidata_id")
        if not modality_qid:
            continue

        print(f"Processing modality: {modality} (QID: {modality_qid})")
        modality_node = WIKIDATA[modality_qid]

        # Add modality node to RDF graph and attach a label
        g.add((modality_node, RDF.type, MP.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))

        # Recursively query modality properties
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        # Handle data_types
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            datatype_node = WIKIDATA[data_type_qid]

            # If the node exists as NewNode, change its type to Datatype
            if (datatype_node, RDF.type, MP.NewNode) in g:
                g.remove((datatype_node, RDF.type, MP.NewNode))

            g.add((datatype_node, RDF.type, MP.Datatype))
            g.add((datatype_node, RDFS.label, Literal(data_type)))
            g.add((datatype_node, MP.QID, Literal(data_type_qid)))

            # Optional metadata: Literal(None) would be stored as the string
            # "None", so absent values are skipped.
            for predicate, key in (
                (MP.frequency, "frequency"),
                (MP.wikidata_label, "wikidata_label"),
                (MP.wikidata_description, "wikidata_description"),
            ):
                raw_value = properties.get(key)
                if raw_value is not None:
                    g.add((datatype_node, predicate, Literal(raw_value)))

            # Create custom edge between modality and data_type
            g.add((modality_node, MP["Modality_Datatype"], datatype_node))

            # Recursively query data_type properties
            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Save the knowledge graph as Turtle and OWL format
    g.serialize(destination="knowledge_graph_with_labels.ttl", format="turtle")
    print("Knowledge graph saved to knowledge_graph_with_labels.ttl")

    g.serialize(destination="knowledge_graph_with_labels.owl", format="xml")
    print("Knowledge graph saved to knowledge_graph_with_labels.owl")

# Load the updated_modalities_with_true_labels.json file.
# The encoding is pinned so the read does not depend on the platform's
# locale default.
with open('updated_modalities_with_true_labels.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=2)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)
Processing modality: 3d model (QID: Q3859833)
Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)
Processing modality: 3d point cloud (QID: Q1899648)
Processing modality: numerical (QID: Q63116)
Processing modality: metadata (QID: Q180160)
Processing modality: link (QID: Q63617815)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q930412)
Processing modality: skeleton (QID: Q186190)
Processing modality: inertial (QID: Q570607)
Processing modality: sensor (QID: Q37453697)
Processing modality: midi (QID: Q98777027)
Processing modality: 3d (QID: Q108276326)
Processing modality: 3d skeletal (QID: Q1813564)
Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q98669280)
Processing modality: 3d scan (QID: Q94701573)
Processing modality: kinematic (QID: Q25215452)
Proce

## Modify Node Type

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
from rdflib.namespace import URIRef
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Define namespaces
# MP holds the project's own classes/predicates (MP.Modality, MP.Datatype,
# MP.NewNode, ...); WIKIDATA is the base for node IRIs built as WIKIDATA[qid].
MP = Namespace("http://masterproject.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create RDF graph (module-level: every function in this cell adds to it)
g = Graph()
g.bind("mp", MP)
g.bind("wikidata", WIKIDATA)

# Avoid duplicate queries: query_wikidata_properties returns [] for any QID
# already contained in this set.
queried_nodes = set()

# List of relevant properties: the Wikidata property IDs allowed by the
# FILTER in query_wikidata_properties.
RELEVANT_PROPERTIES = [
    "P31", "P279", "P361", "P1269", "P527", "P2670", "P2701", "P1163",
    "P1195", "P4330", "P366", "P1535", "P101", "P921", "P1552", "P13044",
    "P3575"]

def query_wikidata_properties(qid):
    """Run the property query for ``qid`` unless it has been run before.

    ``queried_nodes`` acts as the memo of QIDs already sent to the endpoint:
    a QID found there yields ``[]`` immediately; otherwise it is added and
    the statements restricted to ``RELEVANT_PROPERTIES`` are fetched.

    Returns:
        The list under ``results.bindings`` of the JSON response.
    """
    seen_before = qid in queried_nodes
    if seen_before:
        return []
    queried_nodes.add(qid)

    filter_members = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{filter_members}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    service = SPARQLWrapper(SPARQL_ENDPOINT)
    service.setQuery(query)
    service.setReturnFormat(JSON)
    converted = service.query().convert()
    return converted["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3, dataset=None):
    """Recursively query and construct the knowledge graph.

    For every whitelisted statement of ``qid`` whose value is a Wikidata
    entity, an edge is added, the predicate IRI receives the property label,
    both end nodes receive labels, and the value is expanded in turn.
    Values that the dataset itself declares (as a modality or data type) are
    never typed ``NewNode``; typing them is left to the caller.

    Args:
        qid: Entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level.
        max_depth: Recursion stops once ``depth`` exceeds this value.
        dataset: Optional list of dataset entries used to recognise declared
            nodes. ``None`` is treated as "no declared nodes".
    """
    if depth > max_depth:
        return

    # Query entity properties
    results = query_wikidata_properties(qid)
    if not results:
        return

    # QIDs declared by the dataset, collected once per call instead of
    # rescanning the whole dataset for every result row.
    declared_qids = set()
    for entry in dataset or ():
        declared_qids.add(entry.get("wikidata_id"))
        for data_type_info in (entry.get("data_types") or {}).values():
            declared_qids.add(data_type_info.get("wikidata_id"))
    declared_qids.discard(None)

    subject = WIKIDATA[qid]

    for result in results:
        prop = result["property"]["value"].split('/')[-1]
        value_uri = result["value"]["value"]
        value = result.get("valueLabel", {}).get("value", value_uri)
        property_ref = URIRef(f"https://www.wikidata.org/wiki/{prop}")

        # Only genuine entity IRIs become nodes; other IRIs on the same host
        # would otherwise be turned into bogus nodes under the entity
        # namespace.
        if "wikidata.org/entity/" not in value_uri:
            continue

        value_qid = value_uri.split("/")[-1]
        target = WIKIDATA[value_qid]

        # Process value as NewNode if it is neither declared by the dataset
        # nor already queried. The QID is not added to queried_nodes here:
        # doing so for a node that sits at the depth limit would mark it as
        # fetched without ever fetching it, and a later top-level expansion
        # of the same node would silently return nothing.
        if value_qid not in declared_qids and value_qid not in queried_nodes:
            g.add((target, RDF.type, MP.NewNode))

        g.add((subject, property_ref, target))
        g.add((property_ref, RDFS.label, Literal(result["propertyLabel"]["value"])))

        if label:
            g.add((subject, RDFS.label, Literal(label)))
        if value:
            g.add((target, RDFS.label, Literal(value)))

        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth, dataset=dataset)

def process_modality_node(modality_qid, modality, max_depth=3, entries=None):
    """Type ``modality_qid`` as a Modality node and expand it.

    Nothing happens when the QID is not a modality of the dataset or the
    node is already typed Modality.

    Args:
        modality_qid: QID of the modality.
        modality: Label stored as ``rdfs:label``.
        max_depth: Depth limit forwarded to ``recursive_query``.
        entries: Dataset entries to check against; defaults to the
            module-level ``dataset`` when omitted.
    """
    if entries is None:
        entries = dataset

    # .get(): an entry without "wikidata_id" must not abort the whole build.
    if not any(entry.get("wikidata_id") == modality_qid for entry in entries):
        return

    node = WIKIDATA[modality_qid]
    if (node, RDF.type, MP.Modality) in g:
        return

    if (node, RDF.type, MP.NewNode) in g:
        g.remove((node, RDF.type, MP.NewNode))

    g.add((node, RDF.type, MP.Modality))
    g.add((node, RDFS.label, Literal(modality)))
    recursive_query(modality_qid, label=modality, max_depth=max_depth, dataset=entries)

def process_datatype_node(data_type_qid, data_type, properties, max_depth=3, entries=None):
    """Type ``data_type_qid`` as a Datatype node, attach metadata, expand it.

    Nothing happens when the QID is not a data type of the dataset or the
    node is already typed Modality.

    Args:
        data_type_qid: QID of the data type.
        data_type: Label stored as ``rdfs:label``.
        properties: Metadata dict; ``frequency``, ``wikidata_id``,
            ``wikidata_label`` and ``wikidata_description`` are read from it.
        max_depth: Depth limit forwarded to ``recursive_query``.
        entries: Dataset entries to check against; defaults to the
            module-level ``dataset`` when omitted.
    """
    if entries is None:
        entries = dataset

    declared = any(
        data_type_qid == info.get("wikidata_id")
        for entry in entries
        for info in (entry.get("data_types") or {}).values()
    )
    if not declared:
        return

    node = WIKIDATA[data_type_qid]
    if (node, RDF.type, MP.Modality) in g:
        return

    if (node, RDF.type, MP.NewNode) in g:
        g.remove((node, RDF.type, MP.NewNode))

    g.add((node, RDF.type, MP.Datatype))
    g.add((node, RDFS.label, Literal(data_type)))

    # Literal(None) would be stored as the string "None"; skip absent values.
    for predicate, key in (
        (MP.frequency, "frequency"),
        (MP.QID, "wikidata_id"),
        (MP.wikidata_label, "wikidata_label"),
        (MP.wikidata_description, "wikidata_description"),
    ):
        raw_value = properties.get(key)
        if raw_value is not None:
            g.add((node, predicate, Literal(raw_value)))

    recursive_query(data_type_qid, label=data_type, max_depth=max_depth, dataset=entries)

def build_knowledge_graph(dataset, max_depth=3):
    """Build the knowledge graph from the dataset and serialise it.

    Args:
        dataset: List of entries with ``modality`` and, optionally,
            ``wikidata_id`` and ``data_types``.
        max_depth: Depth limit for the recursive expansion. It is forwarded
            to the node processors; without that the recursion always ran
            with ``recursive_query``'s own default.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry.get("data_types") or {}

        # Process Modality
        modality_qid = entry.get("wikidata_id")
        if not modality_qid:
            continue

        print(f"Processing modality: {modality} (QID: {modality_qid})")
        process_modality_node(modality_qid, modality, max_depth=max_depth, entries=dataset)

        # Process DataTypes related to Modality
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue
            process_datatype_node(data_type_qid, data_type, properties, max_depth=max_depth, entries=dataset)
            g.add((WIKIDATA[modality_qid], MP["Modality_Datatype"], WIKIDATA[data_type_qid]))

    # Save the knowledge graph as Turtle and OWL format
    g.serialize(destination="knowledge_graph_with_labels.ttl", format="turtle")
    print("Knowledge graph saved to knowledge_graph_with_labels.ttl")

    g.serialize(destination="knowledge_graph_with_labels.owl", format="xml")
    print("Knowledge graph saved to knowledge_graph_with_labels.owl")

# Load the dataset and build the knowledge graph.
# The encoding is pinned so the read does not depend on the platform's
# locale default.
with open('updated_modalities_with_true_labels.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

build_knowledge_graph(dataset, max_depth=2)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)
Processing modality: 3d model (QID: Q3859833)
Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)
Processing modality: 3d point cloud (QID: Q1899648)
Processing modality: numerical (QID: Q63116)
Processing modality: metadata (QID: Q180160)
Processing modality: link (QID: Q63617815)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q930412)
Processing modality: skeleton (QID: Q186190)
Processing modality: inertial (QID: Q570607)
Processing modality: sensor (QID: Q37453697)
Processing modality: midi (QID: Q98777027)
Processing modality: 3d (QID: Q108276326)
Processing modality: 3d skeletal (QID: Q1813564)
Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q98669280)
Processing modality: 3d scan (QID: Q94701573)
Processing modality: kinematic (QID: Q25215452)
Proce

## Supplementary Database Information

In [None]:
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS
import csv

# Load the existing knowledge graph (TTL file)
g = Graph()
g.parse("knowledge_graph_with_labels.ttl", format="turtle")

# Custom namespace for the project's classes and predicates
MP = Namespace("http://masterproject.org/")

# Namespace under which Modality/Datatype nodes were created.
# NOTE(review): Dataset nodes (D1, D2, ...) are minted in this namespace too,
# although they are not Wikidata entities -- kept as-is so existing IRIs do
# not change; confirm whether a dedicated dataset namespace is wanted.
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Single source of truth for the output path, so the success message cannot
# drift from the file that is actually written.
OUTPUT_PATH = "updated_knowledge_graph_q.ttl"


def _nodes_with_label(graph, node_type, label):
    """Return the subjects typed ``node_type`` whose rdfs:label equals ``label``.

    Uses the graph API instead of a string-built SPARQL query, so labels
    containing quotes or backslashes cannot break (or alter) the lookup.
    """
    wanted = Literal(label)
    return [
        subject
        for subject in graph.subjects(RDFS.label, wanted)
        if (subject, RDF.type, node_type) in graph
    ]


# Load the CSV file (datasets_modalities_datatypes.csv);
# newline='' is what the csv module expects for files it parses itself.
with open('datasets_modalities_datatypes.csv', 'r', encoding='utf-8', newline='') as f:
    datasets = list(csv.DictReader(f))

# Iterate through each dataset entry to create a new Dataset node
for idx, entry in enumerate(datasets, start=1):
    # Unique node per CSV row: D1, D2, ..., D100
    dataset_node = WIKIDATA[f"D{idx}"]

    dataset_name = entry["dataset_name"]
    dataset_description = entry["dataset_description"]
    modality = entry["modality"]
    data_type = entry["data_type"]

    # Dataset node with dataset_name as the label, plus its raw CSV fields
    g.add((dataset_node, RDF.type, MP.Dataset))
    g.add((dataset_node, RDFS.label, Literal(dataset_name)))
    g.add((dataset_node, MP.dataset_description, Literal(dataset_description)))
    g.add((dataset_node, MP.modality, Literal(modality)))
    g.add((dataset_node, MP.data_type, Literal(data_type)))

    # Look up the Modality and Datatype nodes by their labels (the matches
    # are materialised as lists before the graph is modified below)
    modality_nodes = _nodes_with_label(g, MP.Modality, modality)
    datatype_nodes = _nodes_with_label(g, MP.Datatype, data_type)

    # If Modality nodes are found, link the Dataset to each of them
    for modality_uri in modality_nodes:
        g.add((dataset_node, MP.Dataset_Modality, modality_uri))

    # If Datatype nodes are found, link the Dataset to each of them
    for datatype_uri in datatype_nodes:
        g.add((dataset_node, MP.Dataset_Datatype, datatype_uri))

# Save the updated knowledge graph back to a TTL file
g.serialize(destination=OUTPUT_PATH, format="turtle")
print(f"Knowledge graph updated and saved as {OUTPUT_PATH}")

Knowledge graph updated and saved as updated_knowledge_graph.ttl
