In [None]:
pip install requests networkx matplotlib SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.3-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.9/564.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 rdflib-7.1.3


In [None]:
import requests
import networkx as nx
import matplotlib.pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON
import warnings
warnings.filterwarnings("ignore", message=".*Glyph*")
import time
import json
import logging

## Check for Duplicates Between Modality and Datatypes

In [None]:
# Load "merged_modalities_data_types.json".
# The encoding is passed explicitly so the read does not depend on the platform default.
with open('merged_modalities_data_types.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Every lower-cased data type seen across all entries
data_types_set = set()

# Report entries whose modality name is also listed among their own data types
for entry in data:
    modality = entry['modality'].lower()
    data_types = [dt.lower() for dt in entry['data_types']]

    # The comparison is case-insensitive because both sides were lower-cased
    if modality in data_types:
        print(f"Duplicate found: {modality} is present in data_types {data_types}")

    # Collect the data types for further checking
    data_types_set.update(data_types)

Duplicate found: text is present in data_types ['graph based annotation', 'beats annotation', 'wikipedia topic', 'wikipedia page', 'molecular description', 'magazine text', 'sign label', 'pixel wise segmentation mask', 'doi identifier', 'attack category label', 'output file', 'free text argument', 'sleep diary', 'anonymized response', 'road attribute', 'internet censorship test request', 'text to speech script', 'resolution', 'educational course material', 'experiment design', 'meeting note', 'watermark text', 'natural language caption', 'sample efficient task learning dialogue', 'condition', 'hierarchical label', 'assembly instruction', 'syntactic annotation', 'website url', 'positional relationship sentence', 'problem setup', 'case summary', 'finite clause embedding verb', 'sms message', 'safety reply', 'bt flows label', 'consumer review', 'invite link network node', 'literature survey', 'social media caption', 'action adverb pair', 'photo metadata', 'academic title', 'legal article'

### Remove Duplicate Words from Datatypes

In [None]:
# Load "merged_modalities_data_types.json".
# The encoding is passed explicitly so the read does not depend on the platform default.
with open('merged_modalities_data_types.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Drop every data type that merely repeats the name of its own modality
for entry in data:
    modality = entry['modality'].lower()
    data_types = [dt.lower() for dt in entry['data_types']]

    if modality in data_types:
        print(f"Removing duplicate: {modality} from data_types {data_types}")
        # NOTE(review): only entries that contained a duplicate are rewritten with
        # lower-cased data types; every other entry keeps its original casing.
        # Confirm whether this asymmetry is intended.
        entry['data_types'] = [dt for dt in data_types if dt != modality]

# Save the cleaned data to a separate file; the input file is left untouched
with open('merged_modalities_data_types_cleaned.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

Removing duplicate: text from data_types ['graph based annotation', 'beats annotation', 'wikipedia topic', 'wikipedia page', 'molecular description', 'magazine text', 'sign label', 'pixel wise segmentation mask', 'doi identifier', 'attack category label', 'output file', 'free text argument', 'sleep diary', 'anonymized response', 'road attribute', 'internet censorship test request', 'text to speech script', 'resolution', 'educational course material', 'experiment design', 'meeting note', 'watermark text', 'natural language caption', 'sample efficient task learning dialogue', 'condition', 'hierarchical label', 'assembly instruction', 'syntactic annotation', 'website url', 'positional relationship sentence', 'problem setup', 'case summary', 'finite clause embedding verb', 'sms message', 'safety reply', 'bt flows label', 'consumer review', 'invite link network node', 'literature survey', 'social media caption', 'action adverb pair', 'photo metadata', 'academic title', 'legal article', 'con

## Verify the QIDs for All Identified Modalities and Datatypes

In [None]:
# Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# From label get qid
def get_qid(label):
    """Get the QID of an entity based on its English label.

    The label is escaped before it is placed inside the SPARQL string literal,
    so labels containing double quotes, backslashes or line breaks no longer
    yield a malformed query.

    :param label: Label to look up; matched exactly against ``rdfs:label`` with
        the ``@en`` language tag.
    :return: QID of the first matching entity (e.g. ``"Q42"``), or ``None``
        when the query returns no binding.
    """
    time.sleep(1)  # wait one second before each request
    logging.info(f"Fetching QID for label: {label}")

    # Backslash must be escaped first, otherwise the escapes added for the
    # other characters would themselves be doubled.
    escaped_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        logging.warning(f"No QID found for label: {label}")
        return None

    # The entity URI ends in the QID: http://www.wikidata.org/entity/Q42 -> Q42
    qid = bindings[0]["entity"]["value"].split("/")[-1]
    logging.info(f"Found QID for {label}: {qid}")
    return qid

In [None]:
# load the file -> "wikidata_matched_full_NoIso.json"
def check_labels_in_wikidata(json_file):
    """Look up a QID for every 'wikidata_label' stored in a JSON file.

    Entries are taken from the file's "clustered_types" list. A warning is
    logged for each entry without a label and for each label that resolves to
    no QID. Nothing is returned.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    # A missing "clustered_types" key is treated like an empty list
    for record in payload.get("clustered_types", []):
        wikidata_label = record.get("wikidata_label")

        if not wikidata_label:
            logging.warning("No 'wikidata_label' found in item.")
            continue

        if not get_qid(wikidata_label):
            logging.warning(f"Label '{wikidata_label}' does not have a corresponding QID.")

# Run the QID check over every 'wikidata_label' of the Wikidata-matched file.
json_file = "wikidata_matched_full_NoIso.json"
check_labels_in_wikidata(json_file)



In [None]:
# From qid get label
def get_label(qid):
    """Return the English label of the Wikidata entity identified by ``qid``.

    :param qid: Entity identifier such as "Q725252".
    :return: The English label, or ``None`` when the query returns no binding.
    """
    time.sleep(1)  # Respect rate limits
    logging.info(f"Fetching label for QID: {qid}")

    query = f"""
    SELECT ?label WHERE {{
      wd:{qid} rdfs:label ?label.
      FILTER(LANG(?label) = "en")
    }}
    LIMIT 1
    """

    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    rows = client.query().convert()["results"]["bindings"]

    if not rows:
        logging.warning(f"No label found for QID: {qid}")
        return None

    label = rows[0]["label"]["value"]
    logging.info(f"Found label for {qid}: {label}")
    return label

In [None]:
# Spot-check the QID -> label lookup with a single QID.
get_label("Q725252")

'satellite imagery'

## Merge Modality and Datatypes into One File

In [None]:
def extract_datatypes(input_file, output_file):
    """
    Collect the "datatype" field of every clustered type and store the list as JSON.

    :param input_file: Path to the input JSON file; it must contain a "clustered_types" list
    :param output_file: Path of the JSON file that receives the list of datatypes
    """
    with open(input_file, "r", encoding="utf-8") as source:
        clustered_types = json.load(source)["clustered_types"]

    # One datatype per entry, in file order
    datatypes = []
    for entry in clustered_types:
        datatypes.append(entry["datatype"])

    with open(output_file, "w", encoding="utf-8") as target:
        json.dump(datatypes, target, indent=4)

    print(f"Extracted datatypes saved to {output_file}")

# Example usage: write the datatypes found in the Wikidata-matched file to their own JSON list
input_file = "wikidata_matched_full_NoIso_QIDok.json"  # Path to the input JSON file
output_file = "datatypes_wikidata_matched_full.json"  # Path to the output JSON file
extract_datatypes(input_file, output_file)

Extracted datatypes saved to datatypes_wikidata_matched_full.json


In [None]:
def filter_and_save_modalities(input_modalities_file, input_datatypes_file, output_file):
    """
    Keep, for every modality, only the data types listed in the datatypes file and
    write the result as JSON. Modalities left without any data type are dropped.

    :param input_modalities_file: Path to the merged modalities file (e.g., "merged_modalities_data_types.json")
    :param input_datatypes_file: Path to the datatypes file (e.g., "datatypes.json")
    :param output_file: Path to the output file (e.g., "filtered_modalities.json")
    """
    with open(input_modalities_file, "r", encoding="utf-8") as modalities_fh:
        modality_entries = json.load(modalities_fh)

    # Membership tests against a set instead of the raw list
    with open(input_datatypes_file, "r", encoding="utf-8") as datatypes_fh:
        allowed = set(json.load(datatypes_fh))

    kept = []
    for item in modality_entries:
        title = item["modality"]
        matching = [name for name in item["data_types"] if name in allowed]
        if not matching:
            continue  # nothing valid left for this modality
        kept.append({"modality": title, "data_types": matching})

    with open(output_file, "w", encoding="utf-8") as out_fh:
        json.dump(kept, out_fh, indent=4)

    print(f"Filtered modalities saved to {output_file}")

# Example usage
# NOTE(review): a later cell redefines filter_and_save_modalities and writes to the
# same output path, so the file produced here is subsequently replaced.
input_modalities_file = "merged_modalities_data_types_cleaned.json"  # Path to the merged modalities JSON file
input_datatypes_file = "datatypes_wikidata_matched_full.json"  # Path to the datatypes JSON file
output_file = "filtered_modalities_datatypes_full.json"  # Path to the output filtered file

filter_and_save_modalities(input_modalities_file, input_datatypes_file, output_file)

Filtered modalities saved to filtered_modalities_datatypes_full.json




> [  
    {  
        "modality": "image",  
        "data_types": [  
            "hair segmentation mask",  
            "visual slam groundtruth location",  
            "hair image",   
            ……



In [None]:
import json

def filter_and_save_modalities(input_modalities_file, input_datatypes_file, output_file):
    """
    Restrict the data types of every modality to those listed in the datatypes file
    and write the result as JSON. Every modality is kept, even when none of its
    data types remain.

    :param input_modalities_file: Path to the merged modalities file (e.g., "merged_modalities_data_types.json")
    :param input_datatypes_file: Path to the datatypes file (e.g., "datatypes.json")
    :param output_file: Path to the output file (e.g., "filtered_modalities.json")
    """
    with open(input_modalities_file, "r", encoding="utf-8") as modalities_fh:
        modality_entries = json.load(modalities_fh)

    # Membership tests against a set instead of the raw list
    with open(input_datatypes_file, "r", encoding="utf-8") as datatypes_fh:
        allowed = set(json.load(datatypes_fh))

    # One output record per input modality; the data_types list may be empty
    filtered = [
        {
            "modality": item["modality"],
            "data_types": [name for name in item["data_types"] if name in allowed],
        }
        for item in modality_entries
    ]

    with open(output_file, "w", encoding="utf-8") as out_fh:
        json.dump(filtered, out_fh, indent=4)

    print(f"Filtered modalities saved to {output_file}")

# Example usage
# NOTE: this writes to the same output path as the earlier filtering cell and
# therefore replaces that cell's result.
input_modalities_file = "merged_modalities_data_types_cleaned.json"  # Path to the merged modalities JSON file
input_datatypes_file = "datatypes_wikidata_matched_full.json"  # Path to the datatypes JSON file
output_file = "filtered_modalities_datatypes_full.json"  # Path to the output filtered file

filter_and_save_modalities(input_modalities_file, input_datatypes_file, output_file)

Filtered modalities saved to filtered_modalities_datatypes_full.json


## A Merged Version with All Property Info

In [None]:
import json

# Read the Wikidata matches and the cleaned modality -> data-type mapping
with open('wikidata_matched_full_NoIso_QIDok.json', 'r', encoding='utf-8') as f:
    wikidata_data = json.load(f)

with open('merged_modalities_data_types_cleaned.json', 'r', encoding='utf-8') as f:
    modalities_data = json.load(f)

# Index the matched entries by datatype name
clustered_types = wikidata_data.get('clustered_types', [])
wikidata_dict = dict((item['datatype'], item) for item in clustered_types)

output = []

for modality_item in modalities_data:
    modality_output = {
        'modality': modality_item['modality'],
        'data_types': {},
    }

    # Copy the Wikidata properties of every data type that has a match
    for data_type in modality_item['data_types']:
        if data_type not in wikidata_dict:
            continue
        matched_data = wikidata_dict[data_type]
        modality_output['data_types'][data_type] = {
            key: matched_data[key]
            for key in ('frequency', 'wikidata_id', 'wikidata_label', 'wikidata_description')
        }

    # Modalities without any matched data type are left out
    if modality_output['data_types']:
        output.append(modality_output)

with open('matched_modalities_data_types.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=4, ensure_ascii=False)

[  
    {  
        "modality": "image",  
        "data_types": {  
            "hair segmentation mask": {  
                "frequency": 1,  
                "wikidata_id": "Q28472",  
                "wikidata_label": "hair",  
                "wikidata_description": "protein filament that grows from follicles found in the dermis, or skin"  
            },  
            "visual slam groundtruth location": {  
                "frequency": 1,  
                "wikidata_id": "Q4110915",  
                "wikidata_label": "Visual odometry",  
                "wikidata_description": ""  
            },  
            ......

In [None]:
# Read the Wikidata matches and the cleaned modality -> data-type mapping
with open('wikidata_matched_full_NoIso_QIDok.json', 'r', encoding='utf-8') as f:
    wikidata_data = json.load(f)

with open('merged_modalities_data_types_cleaned.json', 'r', encoding='utf-8') as f:
    modalities_data = json.load(f)

# Index the matched entries by datatype name
clustered_types = wikidata_data.get('clustered_types', [])
wikidata_dict = dict((item['datatype'], item) for item in clustered_types)

output = []

for modality_item in modalities_data:
    # Every modality gets an output record, even if no data type matches
    modality_output = {
        'modality': modality_item['modality'],
        'data_types': {},
    }

    for data_type in modality_item['data_types']:
        if data_type not in wikidata_dict:
            continue
        matched_data = wikidata_dict[data_type]
        modality_output['data_types'][data_type] = {
            key: matched_data[key]
            for key in ('frequency', 'wikidata_id', 'wikidata_label', 'wikidata_description')
        }

    output.append(modality_output)

# Write the merged structure to a new JSON file
with open('matched_modalities_data_types.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=4, ensure_ascii=False)

print("Output saved to matched_modalities_data_types.json")

Output saved to matched_modalities_data_types.json


In [None]:
# Load matched_modalities_data_types.json
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Data-type mappings of the modalities that have at least one data type
filled_mappings = [
    mapping
    for mapping in (modality_item.get('data_types', {}) for modality_item in data)
    if mapping
]

# Counters for modalities and datatypes
total_modality_count = len(data)
non_empty_modality_count = len(filled_mappings)
datatype_count = sum(len(mapping) for mapping in filled_mappings)
# A data type shared by several modalities is counted once here
unique_datatypes = set().union(*(mapping.keys() for mapping in filled_mappings))

# Print the results
print(f"Number of Modalities (total): {total_modality_count}")
print(f"Number of Modalities (with non-empty data_types): {non_empty_modality_count}")
print(f"Number of Datatypes: {datatype_count}")
print(f"Number of Unique Datatypes: {len(unique_datatypes)}")

Number of Modalities (total): 261
Number of Modalities (with non-empty data_types): 123
Number of Datatypes: 4276
Number of Unique Datatypes: 3874


## Merge QID for Modalities

In [None]:
import csv
import json

# Step 1: Load modality_wiki.csv into a dictionary keyed by modality name.
# The file is opened with newline='' as the csv module requires for its readers.
modality_wiki_dict = {}

with open('modality_wiki.csv', 'r', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f):
        modality_wiki_dict[row['modality']] = {
            'wikidata_label': row['wiki_word'],
            'wikidata_id': row['qid']
        }

# Step 2: Load the matched_modalities_data_types.json file
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Step 3: Add wikidata_label and wikidata_id to every modality listed in the CSV;
# modalities that are absent from the CSV are left unchanged.
for modality_item in data:
    wiki_entry = modality_wiki_dict.get(modality_item['modality'])
    if wiki_entry is not None:
        modality_item['wikidata_label'] = wiki_entry['wikidata_label']
        modality_item['wikidata_id'] = wiki_entry['wikidata_id']

# Step 4: Save the updated data to a new JSON file
with open('updated_matched_modalities_data_types.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [None]:
# Load updated_matched_modalities_data_types.json
with open('updated_matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Data-type mappings of the modalities that have at least one data type
filled_mappings = [
    mapping
    for mapping in (modality_item.get('data_types', {}) for modality_item in data)
    if mapping
]

# Counters for modalities and datatypes
total_modality_count = len(data)
non_empty_modality_count = len(filled_mappings)
datatype_count = sum(len(mapping) for mapping in filled_mappings)
# A data type shared by several modalities is counted once here
unique_datatypes = set().union(*(mapping.keys() for mapping in filled_mappings))

# Print the results
print(f"Number of Modalities (total): {total_modality_count}")
print(f"Number of Modalities (with non-empty data_types): {non_empty_modality_count}")
print(f"Number of Datatypes: {datatype_count}")
print(f"Number of Unique Datatypes: {len(unique_datatypes)}")

Number of Modalities (total): 261
Number of Modalities (with non-empty data_types): 123
Number of Datatypes: 4276
Number of Unique Datatypes: 3874


## Recursive Query Wikidata

**Prevent repeated queries of nodes**

Output: Triples (Label Modality, Datatype, Others)

In [None]:
def get_qid(label):
    """Get the QID of an entity based on its English label.

    The label is escaped before it is placed inside the SPARQL string literal,
    so labels containing double quotes, backslashes or line breaks no longer
    yield a malformed query.

    :param label: Label to look up; matched exactly against ``rdfs:label`` with
        the ``@en`` language tag.
    :return: QID of the first matching entity (e.g. ``"Q42"``), or ``None``
        when the query returns no binding.
    """
    time.sleep(1)  # wait one second before each request
    logging.info(f"Fetching QID for label: {label}")

    # Backslash must be escaped first, otherwise the escapes added for the
    # other characters would themselves be doubled.
    escaped_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        logging.warning(f"No QID found for label: {label}")
        return None

    # The entity URI ends in the QID: http://www.wikidata.org/entity/Q42 -> Q42
    qid = bindings[0]["entity"]["value"].split("/")[-1]
    logging.info(f"Found QID for {label}: {qid}")
    return qid

## Construct the Knowledge Graph

In [None]:
# Endpoint that all SPARQL queries in this notebook are sent to
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# List of relevant properties (modality and data type relationships)
# query_wikidata_properties() joins these IDs into the FILTER of its query, so only
# claims made with one of these properties are returned.
RELEVANT_PROPERTIES = [
    # Fundamental Semantics
    "P31",  # instance of
    "P279",  # subclass of
    "P361",  # part of
    "P1269",  # facet of

    # Structure and Technology
    "P527",  # has part(s)
    "P2670",  # has part(s) of the class
    "P2701",  # file format
    "P1163",  # media type
    "P1195",  # file extension
    "P4330",  # contains

    # Application and Domain
    "P366",  # has use
    "P1535",  # used by
    "P101",  # field of work
    "P921",  # main subject


    # Quality and Metadata
    "P1552",  # has characteristic
    "P13044",  # characteristic of
    "P3575",  # data size
]

## OWL

In [None]:
import json
import time
import logging
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Literal, Namespace, RDF

# Set the Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"


# Define namespaces
# EX supplies the classes and predicates created by this notebook;
# WIKIDATA turns a QID into the URI of the corresponding entity.
EX = Namespace("http://example.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create an RDF graph
# Module-level graph that recursive_query() and build_knowledge_graph() add triples to.
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

# Prevent duplicate queries
# Holds every QID already passed to query_wikidata_properties(); re-running this
# cell resets both the graph and this set.
queried_nodes = set()

def get_qid(label):
    """Get the QID of an entity based on its English label.

    The label is escaped before it is placed inside the SPARQL string literal,
    so labels containing double quotes, backslashes or line breaks no longer
    yield a malformed query.

    :param label: Label to look up; matched exactly against ``rdfs:label`` with
        the ``@en`` language tag.
    :return: QID of the first matching entity (e.g. ``"Q42"``), or ``None``
        when the query returns no binding.
    """
    time.sleep(1)  # wait one second before each request
    logging.info(f"Fetching QID for label: {label}")

    # Backslash must be escaped first, otherwise the escapes added for the
    # other characters would themselves be doubled.
    escaped_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        logging.warning(f"No QID found for label: {label}")
        return None

    # The entity URI ends in the QID: http://www.wikidata.org/entity/Q42 -> Q42
    qid = bindings[0]["entity"]["value"].split("/")[-1]
    logging.info(f"Found QID for {label}: {qid}")
    return qid

def query_wikidata_properties(qid):
    """Fetch the claims of one Wikidata entity that use a relevant property.

    Each QID is queried at most once: a QID already present in ``queried_nodes``
    yields an empty list without a request being sent.

    :param qid: Entity identifier such as "Q42".
    :return: List of SPARQL result bindings (possibly empty).
    """
    if qid in queried_nodes:
        return []  # handled by an earlier call
    queried_nodes.add(qid)

    # "P31", "P279" -> "P31, wd:P279"; the leading "wd:" is part of the query text
    property_list = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_list}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query().convert()
    return response["results"]["bindings"]

def recursive_query(qid, depth=0, max_depth=3):
    """Recursively query Wikidata and add the discovered edges to the graph ``g``.

    For every result whose value is a Wikidata entity, the queried entity is
    typed as ``EX.Entity``, an edge named after the property label is added,
    and the value entity is queried in turn.

    :param qid: QID of the entity to expand.
    :param depth: Current recursion depth; callers start at 0.
    :param max_depth: Entities at depths 0..max_depth (inclusive) are queried.
    :return: None; the function only adds triples to ``g``.
    """
    if depth > max_depth:
        return

    subject = WIKIDATA[qid]

    for result in query_wikidata_properties(qid):
        # Property labels become predicate names: spaces and parentheses -> "_"
        prop = result["propertyLabel"]["value"].translate(str.maketrans(" ()", "___"))
        value_uri = result["value"]["value"]

        # Values that are not Wikidata entities are neither stored nor followed
        if "wikidata.org/entity/" not in value_uri:
            continue

        value_qid = value_uri.split("/")[-1]

        # Add nodes to the RDF graph
        g.add((subject, RDF.type, EX.Entity))
        g.add((subject, EX[prop], WIKIDATA[value_qid]))

        # Recursively query
        recursive_query(value_qid, depth + 1, max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_path="knowledge_graph_3.owl"):
    """Build a knowledge graph from a data set and save it as an RDF/XML file.

    For every entry whose modality label resolves to a QID, the modality and its
    data types (those carrying a "wikidata_id") are added to the graph ``g`` and
    expanded through ``recursive_query``. Entries whose modality has no QID are
    skipped together with their data types.

    :param dataset: Iterable of dicts with a "modality" label and a "data_types"
        mapping of data-type name -> property dict.
    :param max_depth: Depth limit handed to ``recursive_query``.
    :param output_path: File the graph is serialized to; the default is the
        path that was previously hard-coded.
    :return: None.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        # Get the QID of modality
        modality_qid = get_qid(modality)
        if not modality_qid:
            continue

        print(f"Processing modality: {modality} (QID: {modality_qid})")
        modality_node = WIKIDATA[modality_qid]

        # Add modality node to RDF graph
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, EX.label, Literal(modality)))

        # Recursively query the properties of modality
        recursive_query(modality_qid, depth=0, max_depth=max_depth)

        # Handle data_types
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue  # data types without a QID get no node

            data_type_node = WIKIDATA[data_type_qid]

            # Add a data_type node and append its attributes to the RDF graph
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, EX.label, Literal(data_type)))
            g.add((data_type_node, EX.frequency, Literal(properties.get("frequency"))))
            g.add((data_type_node, EX.wikidata_label, Literal(properties.get("wikidata_label"))))
            g.add((data_type_node, EX.wikidata_description, Literal(properties.get("wikidata_description"))))

            # Add custom edges of modality and data_type
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            # Recursively query the attributes of data_type
            recursive_query(data_type_qid, depth=0, max_depth=max_depth)

    # Save the knowledge graph as an OWL file and report the path actually written
    g.serialize(destination=output_path, format="xml")
    print(f"Knowledge graph saved to {output_path}")

# The file is written as UTF-8 with ensure_ascii=False, so it is read back with
# the same explicit encoding instead of the platform default.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build a knowledge graph
build_knowledge_graph(dataset, max_depth=3)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph.owl


In [None]:
# Verify whether Modality in the JSON file can find the QID
def validate_modalities_qid(file_path):
    """Check which modalities of a JSON file resolve to a Wikidata QID.

    The outcome is returned to the caller; previously both lists were built
    and then discarded.

    :param file_path: Path to a JSON file holding a list of entries; entries
        without a "modality" value are ignored.
    :return: Tuple ``(found_modalities, not_found_modalities)`` where the first
        item is a list of ``{"modality": ..., "qid": ...}`` dicts and the second
        a list of modality names for which no QID was found.
    """
    # Explicit encoding: the read must not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    found_modalities = []
    not_found_modalities = []

    for entry in data:
        modality = entry.get("modality")
        if not modality:
            continue

        qid = get_qid(modality)
        if qid:
            found_modalities.append({"modality": modality, "qid": qid})
        else:
            not_found_modalities.append(modality)

    return found_modalities, not_found_modalities

# Run the QID check over every modality of the merged file.
validate_modalities_qid('matched_modalities_data_types.json')



In [None]:
import json
import time
import logging
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Literal, Namespace, RDF

# Set the Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"


# Define namespaces
# EX supplies the classes and predicates created by this notebook;
# WIKIDATA turns a QID into the URI of the corresponding entity.
EX = Namespace("http://example.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Create an RDF graph
# Rebinding g here starts from an empty graph again.
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

# Prevent duplicate queries
# Holds every QID already passed to query_wikidata_properties(); it starts empty
# each time this cell runs.
queried_nodes = set()

def get_qid(label):
    """Return the QID of a Wikidata item whose English label equals ``label``.

    Args:
        label: Exact English label to search for.

    Returns:
        The QID string (last path segment of the entity URI) of the first
        matching item, or ``None`` when the query returns no bindings.
    """
    # Pause before every lookup (presumably to avoid hammering the endpoint).
    time.sleep(1)
    logging.info(f"Fetching QID for label: {label}")

    # Escape characters that would terminate or corrupt the SPARQL string
    # literal, so an arbitrary label cannot break or alter the query.
    escaped_label = (
        label.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if bindings:
        # The entity URI ends in the QID, e.g. ".../entity/Q42".
        qid = bindings[0]["entity"]["value"].split("/")[-1]
        logging.info(f"Found QID for {label}: {qid}")
        return qid

    logging.warning(f"No QID found for label: {label}")
    return None

def query_wikidata_properties(qid):
    """Fetch the claims of one Wikidata entity, limited to RELEVANT_PROPERTIES.

    A QID is queried at most once: it is recorded in ``queried_nodes`` before
    the request is sent, and any later call for the same QID returns an empty
    list without contacting the endpoint.

    Returns the list of SPARQL result bindings (``property``, ``propertyLabel``,
    ``value`` and, when available, ``valueLabel``).
    """
    already_queried = qid in queried_nodes
    if already_queried:
        return []
    queried_nodes.add(qid)

    # Joined so that the "wd:" prefix written in the query covers the first ID.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query().convert()
    return response["results"]["bindings"]

def recursive_query(qid, depth=0, max_depth=3):
    """Expand ``qid`` through its entity-valued claims and add them to ``g``.

    Args:
        qid: Wikidata ID of the entity to expand.
        depth: Current recursion depth; the top-level call uses 0.
        max_depth: Recursion stops once ``depth`` exceeds this value, so
            entities at depths 0..max_depth inclusive are queried.
    """
    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Only values that are Wikidata entities become edges; other values
        # (plain literals, external URLs) are ignored.
        if "wikidata.org/entity/" not in value_uri:
            continue

        # Turn the property label into a usable local name for the EX namespace.
        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value_qid = value_uri.split("/")[-1]

        g.add((WIKIDATA[qid], RDF.type, EX.Entity))
        g.add((WIKIDATA[qid], EX[prop], WIKIDATA[value_qid]))

        recursive_query(value_qid, depth + 1, max_depth)

def build_knowledge_graph(dataset, max_depth=3, destination="knowledge_graph_2.owl", rdf_format="xml"):
    """Build the RDF knowledge graph from ``dataset`` and write it to disk.

    For every entry whose modality label resolves to a QID, the modality is
    added as an ``EX.Modality`` node, each data type carrying a "wikidata_id"
    is added as an ``EX.Datatype`` node linked from the modality through
    ``EX.Modality_Datatype``, and both are expanded with ``recursive_query``.
    Entries whose modality has no QID are skipped together with their data
    types.

    Args:
        dataset: Iterable of dicts with a "modality" string and a "data_types"
            mapping of data-type name -> attribute dict (keys read here:
            "wikidata_id", "frequency", "wikidata_label",
            "wikidata_description").
        max_depth: Depth limit forwarded to ``recursive_query``.
        destination: Output file path handed to ``g.serialize``.
        rdf_format: Serialization format name handed to ``g.serialize``.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, EX.label, Literal(modality)))

        recursive_query(modality_qid, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, EX.label, Literal(data_type)))

            # Copy the descriptive attributes, skipping absent ones so that a
            # missing value is never written as a Literal built from None.
            for attribute in ("frequency", "wikidata_label", "wikidata_description"):
                attribute_value = properties.get(attribute)
                if attribute_value is not None:
                    g.add((data_type_node, EX[attribute], Literal(attribute_value)))

            # Custom edge tying the data type to its modality.
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, depth=0, max_depth=max_depth)

    g.serialize(destination=destination, format=rdf_format)
    # Report the path that was actually written.
    print(f"Knowledge graph saved to {destination}")


# Load the matched modality / data-type records.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph with max_depth=2
build_knowledge_graph(dataset, max_depth=2)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph.owl


In [None]:
import json
import time
import logging
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Literal, Namespace, RDF

# Wikidata SPARQL endpoint; every SPARQLWrapper client created in this cell targets it.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"


# Namespaces used to mint URIs:
#   EX       - project-local classes and predicates (e.g. EX.Modality, EX.Datatype)
#   WIKIDATA - Wikidata entity URIs, indexed by QID
EX = Namespace("http://example.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Module-level RDF graph that the functions below add triples to.
# Re-running this cell rebinds `g` to a fresh, empty graph.
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

# QIDs already passed to query_wikidata_properties(); used to skip repeat queries.
queried_nodes = set()

def get_qid(label):
    """Return the QID of a Wikidata item whose English label equals ``label``.

    Args:
        label: Exact English label to search for.

    Returns:
        The QID string (last path segment of the entity URI) of the first
        matching item, or ``None`` when the query returns no bindings.
    """
    # Pause before every lookup (presumably to avoid hammering the endpoint).
    time.sleep(1)
    logging.info(f"Fetching QID for label: {label}")

    # Escape characters that would terminate or corrupt the SPARQL string
    # literal, so an arbitrary label cannot break or alter the query.
    escaped_label = (
        label.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if bindings:
        # The entity URI ends in the QID, e.g. ".../entity/Q42".
        qid = bindings[0]["entity"]["value"].split("/")[-1]
        logging.info(f"Found QID for {label}: {qid}")
        return qid

    logging.warning(f"No QID found for label: {label}")
    return None

def query_wikidata_properties(qid):
    """Fetch the claims of one Wikidata entity, limited to RELEVANT_PROPERTIES.

    A QID is queried at most once: it is recorded in ``queried_nodes`` before
    the request is sent, and any later call for the same QID returns an empty
    list without contacting the endpoint.

    Returns the list of SPARQL result bindings (``property``, ``propertyLabel``,
    ``value`` and, when available, ``valueLabel``).
    """
    already_queried = qid in queried_nodes
    if already_queried:
        return []
    queried_nodes.add(qid)

    # Joined so that the "wd:" prefix written in the query covers the first ID.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query().convert()
    return response["results"]["bindings"]

def recursive_query(qid, depth=0, max_depth=3):
    """Expand ``qid`` through its entity-valued claims and add them to ``g``.

    Args:
        qid: Wikidata ID of the entity to expand.
        depth: Current recursion depth; the top-level call uses 0.
        max_depth: Recursion stops once ``depth`` exceeds this value, so
            entities at depths 0..max_depth inclusive are queried.
    """
    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Only values that are Wikidata entities become edges; other values
        # (plain literals, external URLs) are ignored.
        if "wikidata.org/entity/" not in value_uri:
            continue

        # Turn the property label into a usable local name for the EX namespace.
        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value_qid = value_uri.split("/")[-1]

        g.add((WIKIDATA[qid], RDF.type, EX.Entity))
        g.add((WIKIDATA[qid], EX[prop], WIKIDATA[value_qid]))

        recursive_query(value_qid, depth + 1, max_depth)

def build_knowledge_graph(dataset, max_depth=3, destination="knowledge_graph_1.owl", rdf_format="xml"):
    """Build the RDF knowledge graph from ``dataset`` and write it to disk.

    For every entry whose modality label resolves to a QID, the modality is
    added as an ``EX.Modality`` node, each data type carrying a "wikidata_id"
    is added as an ``EX.Datatype`` node linked from the modality through
    ``EX.Modality_Datatype``, and both are expanded with ``recursive_query``.
    Entries whose modality has no QID are skipped together with their data
    types.

    Args:
        dataset: Iterable of dicts with a "modality" string and a "data_types"
            mapping of data-type name -> attribute dict (keys read here:
            "wikidata_id", "frequency", "wikidata_label",
            "wikidata_description").
        max_depth: Depth limit forwarded to ``recursive_query``.
        destination: Output file path handed to ``g.serialize``.
        rdf_format: Serialization format name handed to ``g.serialize``.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, EX.label, Literal(modality)))

        recursive_query(modality_qid, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, EX.label, Literal(data_type)))

            # Copy the descriptive attributes, skipping absent ones so that a
            # missing value is never written as a Literal built from None.
            for attribute in ("frequency", "wikidata_label", "wikidata_description"):
                attribute_value = properties.get(attribute)
                if attribute_value is not None:
                    g.add((data_type_node, EX[attribute], Literal(attribute_value)))

            # Custom edge tying the data type to its modality.
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, depth=0, max_depth=max_depth)

    g.serialize(destination=destination, format=rdf_format)
    # Report the path that was actually written.
    print(f"Knowledge graph saved to {destination}")


# Load the matched modality / data-type records.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph with max_depth=1
build_knowledge_graph(dataset, max_depth=1)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph.owl


In [None]:
def recursive_query(qid, depth=0, max_depth=3):
    """Walk the entity-valued claims reachable from ``qid`` up to ``max_depth``.

    This variant only issues the property queries and recurses into the
    entities they point to; it adds no triples to the graph itself.
    NOTE(review): confirm that leaving the graph untouched here is intended.

    Args:
        qid: Wikidata ID of the entity to expand.
        depth: Current recursion depth; the top-level call uses 0.
        max_depth: Recursion stops once ``depth`` exceeds this value.
    """
    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Follow only values that are themselves Wikidata entities.
        if "wikidata.org/entity/" in value_uri:
            value_qid = value_uri.split("/")[-1]
            recursive_query(value_qid, depth + 1, max_depth)

def build_knowledge_graph(dataset, max_depth=3, destination="knowledge_graph_3.owl", rdf_format="xml"):
    """Build the RDF knowledge graph from ``dataset`` and write it to disk.

    For every entry whose modality label resolves to a QID, the modality is
    added as an ``EX.Modality`` node, each data type carrying a "wikidata_id"
    is added as an ``EX.Datatype`` node linked from the modality through
    ``EX.Modality_Datatype``, and ``recursive_query`` is called for both.
    Entries whose modality has no QID are skipped together with their data
    types.

    Args:
        dataset: Iterable of dicts with a "modality" string and a "data_types"
            mapping of data-type name -> attribute dict (keys read here:
            "wikidata_id", "frequency", "wikidata_label",
            "wikidata_description").
        max_depth: Depth limit forwarded to ``recursive_query``.
        destination: Output file path handed to ``g.serialize``.
        rdf_format: Serialization format name handed to ``g.serialize``.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Modality node, typed as EX.Modality.
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, EX.label, Literal(modality)))

        recursive_query(modality_qid, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            # Data-type node, typed as EX.Datatype.
            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, EX.label, Literal(data_type)))

            # Copy the descriptive attributes, skipping absent ones so that a
            # missing value is never written as a Literal built from None.
            for attribute in ("frequency", "wikidata_label", "wikidata_description"):
                attribute_value = properties.get(attribute)
                if attribute_value is not None:
                    g.add((data_type_node, EX[attribute], Literal(attribute_value)))

            # Custom edge tying the data type to its modality.
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, depth=0, max_depth=max_depth)

    g.serialize(destination=destination, format=rdf_format)
    # Report the path that was actually written.
    print(f"Knowledge graph saved to {destination}")

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json

# Wikidata SPARQL endpoint; the SPARQLWrapper client created in this cell targets it.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Namespaces used to mint URIs:
#   EX       - project-local classes and predicates (e.g. EX.Modality, EX.Datatype)
#   WIKIDATA - Wikidata entity URIs, indexed by QID
EX = Namespace("http://example.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Module-level RDF graph that the functions below add triples to.
# Re-running this cell rebinds `g` to a fresh, empty graph.
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

# QIDs already passed to query_wikidata_properties(); used to skip repeat queries.
queried_nodes = set()

def query_wikidata_properties(qid):
    """Fetch the claims of one Wikidata entity, limited to RELEVANT_PROPERTIES.

    A QID is queried at most once: it is recorded in ``queried_nodes`` before
    the request is sent, and any later call for the same QID returns an empty
    list without contacting the endpoint.

    Returns the list of SPARQL result bindings (``property``, ``propertyLabel``,
    ``value`` and, when available, ``valueLabel``).
    """
    already_queried = qid in queried_nodes
    if already_queried:
        return []
    queried_nodes.add(qid)

    # Joined so that the "wd:" prefix written in the query covers the first ID.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query().convert()
    return response["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Expand ``qid`` through its entity-valued claims, labelling both ends.

    Args:
        qid: Wikidata ID of the entity to expand.
        label: Optional label attached to ``qid`` via ``RDFS.label`` whenever
            an edge leaving it is added.
        depth: Current recursion depth; the top-level call uses 0.
        max_depth: Recursion stops once ``depth`` exceeds this value.
    """
    if depth > max_depth:
        return

    for row in query_wikidata_properties(qid):
        # Property label turned into a local name for the EX namespace.
        predicate_name = (
            row["propertyLabel"]["value"]
            .replace(" ", "_")
            .replace("(", "_")
            .replace(")", "_")
        )
        # Falls back to the raw value when no English label was returned.
        target_label = row.get("valueLabel", {}).get("value", row["value"]["value"])
        target_uri = row["value"]["value"]

        # Values that are not Wikidata entities produce no edge.
        if "wikidata.org/entity/" not in target_uri:
            continue

        target_qid = target_uri.split("/")[-1]
        source_node = WIKIDATA[qid]
        target_node = WIKIDATA[target_qid]

        g.add((source_node, EX[predicate_name], target_node))
        if label:
            g.add((source_node, RDFS.label, Literal(label)))
        if target_label:
            g.add((target_node, RDFS.label, Literal(target_label)))

        recursive_query(target_qid, label=target_label, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, destination="knowledge_graph_with_labels_1.owl", rdf_format="xml"):
    """Build the labelled RDF knowledge graph from ``dataset`` and write it to disk.

    For every entry whose modality label resolves to a QID, the modality is
    added as an ``EX.Modality`` node with an ``RDFS.label``, each data type
    carrying a "wikidata_id" is added as a labelled ``EX.Datatype`` node
    linked from the modality through ``EX.Modality_Datatype``, and both are
    expanded with ``recursive_query``. Entries whose modality has no QID are
    skipped together with their data types.

    Args:
        dataset: Iterable of dicts with a "modality" string and a "data_types"
            mapping of data-type name -> attribute dict (keys read here:
            "wikidata_id", "frequency", "wikidata_label",
            "wikidata_description").
        max_depth: Depth limit forwarded to ``recursive_query``.
        destination: Output file path handed to ``g.serialize``.
        rdf_format: Serialization format name handed to ``g.serialize``.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))

        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))

            # Copy the descriptive attributes, skipping absent ones so that a
            # missing value is never written as a Literal built from None.
            for attribute in ("frequency", "wikidata_label", "wikidata_description"):
                attribute_value = properties.get(attribute)
                if attribute_value is not None:
                    g.add((data_type_node, EX[attribute], Literal(attribute_value)))

            # Custom edge tying the data type to its modality.
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    g.serialize(destination=destination, format=rdf_format)
    # Report the path that was actually written.
    print(f"Knowledge graph saved to {destination}")


# Load the matched modality / data-type records.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph with max_depth=1
build_knowledge_graph(dataset, max_depth=1)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph_with_labels.owl


## Turtle

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json

# Wikidata SPARQL endpoint; the SPARQLWrapper client created in this cell targets it.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Namespaces used to mint URIs:
#   EX       - project-local classes and predicates (e.g. EX.Modality, EX.Datatype)
#   WIKIDATA - Wikidata entity URIs, indexed by QID
EX = Namespace("http://example.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# Module-level RDF graph that the functions below add triples to.
# Re-running this cell rebinds `g` to a fresh, empty graph.
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

# QIDs already passed to query_wikidata_properties(); used to skip repeat queries.
queried_nodes = set()

def query_wikidata_properties(qid):
    """Fetch the claims of one Wikidata entity, limited to RELEVANT_PROPERTIES.

    A QID is queried at most once: it is recorded in ``queried_nodes`` before
    the request is sent, and any later call for the same QID returns an empty
    list without contacting the endpoint.

    Returns the list of SPARQL result bindings (``property``, ``propertyLabel``,
    ``value`` and, when available, ``valueLabel``).
    """
    already_queried = qid in queried_nodes
    if already_queried:
        return []
    queried_nodes.add(qid)

    # Joined so that the "wd:" prefix written in the query covers the first ID.
    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query().convert()
    return response["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Expand ``qid`` through its entity-valued claims, labelling both ends.

    Args:
        qid: Wikidata ID of the entity to expand.
        label: Optional label attached to ``qid`` via ``RDFS.label`` whenever
            an edge leaving it is added.
        depth: Current recursion depth; the top-level call uses 0.
        max_depth: Recursion stops once ``depth`` exceeds this value.
    """
    if depth > max_depth:
        return

    for row in query_wikidata_properties(qid):
        # Property label turned into a local name for the EX namespace.
        predicate_name = (
            row["propertyLabel"]["value"]
            .replace(" ", "_")
            .replace("(", "_")
            .replace(")", "_")
        )
        # Falls back to the raw value when no English label was returned.
        target_label = row.get("valueLabel", {}).get("value", row["value"]["value"])
        target_uri = row["value"]["value"]

        # Values that are not Wikidata entities produce no edge.
        if "wikidata.org/entity/" not in target_uri:
            continue

        target_qid = target_uri.split("/")[-1]
        source_node = WIKIDATA[qid]
        target_node = WIKIDATA[target_qid]

        g.add((source_node, EX[predicate_name], target_node))
        if label:
            g.add((source_node, RDFS.label, Literal(label)))
        if target_label:
            g.add((target_node, RDFS.label, Literal(target_label)))

        recursive_query(target_qid, label=target_label, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels_1"):
    """Build the modality/data-type knowledge graph and write it to disk.

    For every dataset entry whose modality resolves to a QID via ``get_qid``,
    adds an ``EX.Modality`` node, one ``EX.Datatype`` node per data type that
    has a ``wikidata_id``, an ``EX.Modality_Datatype`` edge between them, and
    whatever ``recursive_query`` adds for each of those entities. Entries
    without a modality QID are skipped together with their data types. The
    graph is then serialised as Turtle (``.ttl``) and RDF/XML (``.owl``).

    Args:
        dataset: Iterable of dicts with a ``"modality"`` string and a
            ``"data_types"`` dict mapping data-type name to a property dict.
        max_depth: Passed to ``recursive_query`` for every starting entity.
        output_basename: Path prefix (without extension) of the two output
            files.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Modality node with its label, then its own Wikidata neighbourhood
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))

            # Only record metadata that is present; a missing value would
            # otherwise be stored as the literal text "None".
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                metadata_value = properties.get(key)
                if metadata_value is not None:
                    g.add((data_type_node, EX[key], Literal(metadata_value)))

            # Custom edge linking the modality to its data type
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Write both serialisations and report the paths that were actually written
    for extension, rdf_format in ((".ttl", "turtle"), (".owl", "xml")):
        destination = f"{output_basename}{extension}"
        g.serialize(destination=destination, format=rdf_format)
        print(f"Knowledge graph saved to {destination}")


# Load the dataset file. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build and save the knowledge graph, expanding entities up to max_depth=1
build_knowledge_graph(dataset, max_depth=1)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph_with_labels.owl


In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json

# QIDs already handled, so the same entity is not requested twice
queried_nodes = set()

# Wikidata's public SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# WIKIDATA holds entity IRIs; EX holds the custom terms used in the graph
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
EX = Namespace("http://example.org/")

# Fresh RDF graph with both prefixes registered
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

def query_wikidata_properties(qid, max_retries=3):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    Sends a SPARQL query to ``SPARQL_ENDPOINT`` selecting every direct claim
    of ``qid`` whose property is listed in ``RELEVANT_PROPERTIES``, along with
    English labels for the property and the value.

    Args:
        qid: Wikidata entity ID that is interpolated into the query.
        max_retries: Number of times an HTTP 429 response is retried before
            the error is re-raised.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response, or ``[]``
        when ``qid`` is already in ``queried_nodes``.

    Raises:
        urllib.error.HTTPError: Any HTTP error other than a retried 429, or a
            429 that persists after ``max_retries`` retries.
    """
    import time
    from urllib.error import HTTPError

    if qid in queried_nodes:
        return []

    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{", wd:".join(RELEVANT_PROPERTIES)}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            results = sparql.query().convert()
            break
        except HTTPError as err:
            # 429 is the rate-limit signal: wait (honouring Retry-After when
            # it is a plain number of seconds) instead of hammering the endpoint.
            if err.code != 429 or attempt >= retries:
                raise
            retry_after = str((err.headers or {}).get("Retry-After") or "").strip()
            time.sleep(int(retry_after) if retry_after.isdigit() else 2 ** attempt)

    # Mark the node only after a successful response, so a failed request
    # does not permanently hide the entity from later attempts.
    queried_nodes.add(qid)
    return results["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Grow the graph outward from ``qid`` along its entity-valued claims.

    Each binding returned by ``query_wikidata_properties`` whose value is a
    Wikidata entity IRI becomes an ``EX[<property label>]`` edge from ``qid``
    to that entity. Both endpoints receive ``rdfs:label`` triples when a label
    is available, and the target entity is then expanded one level deeper.
    Bindings with non-entity values are ignored.

    Args:
        qid: Wikidata ID of the entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level; nothing happens once it exceeds
            ``max_depth``.
        max_depth: Deepest level that is still queried.
    """
    if depth > max_depth:
        return

    for binding in query_wikidata_properties(qid):
        # Turn the property label into a local name: spaces and parentheses become "_"
        edge_name = binding["propertyLabel"]["value"].translate(str.maketrans(" ()", "___"))
        target_uri = binding["value"]["value"]
        # Fall back to the raw value when no English label was returned
        target_label = binding.get("valueLabel", {}).get("value", target_uri)

        if "wikidata.org/entity/" not in target_uri:
            continue

        target_qid = target_uri.rsplit("/", 1)[-1]
        g.add((WIKIDATA[qid], EX[edge_name], WIKIDATA[target_qid]))
        if label:
            g.add((WIKIDATA[qid], RDFS.label, Literal(label)))
        if target_label:
            g.add((WIKIDATA[target_qid], RDFS.label, Literal(target_label)))

        # Continue the expansion from the entity just linked
        recursive_query(target_qid, label=target_label, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels_2"):
    """Build the modality/data-type knowledge graph and write it to disk.

    For every dataset entry whose modality resolves to a QID via ``get_qid``,
    adds an ``EX.Modality`` node, one ``EX.Datatype`` node per data type that
    has a ``wikidata_id``, an ``EX.Modality_Datatype`` edge between them, and
    whatever ``recursive_query`` adds for each of those entities. Entries
    without a modality QID are skipped together with their data types. The
    graph is then serialised as Turtle (``.ttl``) and RDF/XML (``.owl``).

    Args:
        dataset: Iterable of dicts with a ``"modality"`` string and a
            ``"data_types"`` dict mapping data-type name to a property dict.
        max_depth: Passed to ``recursive_query`` for every starting entity.
        output_basename: Path prefix (without extension) of the two output
            files.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Modality node with its label, then its own Wikidata neighbourhood
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))

            # Only record metadata that is present; a missing value would
            # otherwise be stored as the literal text "None".
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                metadata_value = properties.get(key)
                if metadata_value is not None:
                    g.add((data_type_node, EX[key], Literal(metadata_value)))

            # Custom edge linking the modality to its data type
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Write both serialisations and report the paths that were actually written
    for extension, rdf_format in ((".ttl", "turtle"), (".owl", "xml")):
        destination = f"{output_basename}{extension}"
        g.serialize(destination=destination, format=rdf_format)
        print(f"Knowledge graph saved to {destination}")


# Load the dataset file. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build and save the knowledge graph, expanding entities up to max_depth=2
build_knowledge_graph(dataset, max_depth=2)

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json

# QIDs already handled, so the same entity is not requested twice
queried_nodes = set()

# Wikidata's public SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# WIKIDATA holds entity IRIs; EX holds the custom terms used in the graph
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
EX = Namespace("http://example.org/")

# Fresh RDF graph with both prefixes registered
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

def query_wikidata_properties(qid, max_retries=3):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    Sends a SPARQL query to ``SPARQL_ENDPOINT`` selecting every direct claim
    of ``qid`` whose property is listed in ``RELEVANT_PROPERTIES``, along with
    English labels for the property and the value.

    Args:
        qid: Wikidata entity ID that is interpolated into the query.
        max_retries: Number of times an HTTP 429 response is retried before
            the error is re-raised.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response, or ``[]``
        when ``qid`` is already in ``queried_nodes``.

    Raises:
        urllib.error.HTTPError: Any HTTP error other than a retried 429, or a
            429 that persists after ``max_retries`` retries.
    """
    import time
    from urllib.error import HTTPError

    if qid in queried_nodes:
        return []

    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{", wd:".join(RELEVANT_PROPERTIES)}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            results = sparql.query().convert()
            break
        except HTTPError as err:
            # 429 is the rate-limit signal: wait (honouring Retry-After when
            # it is a plain number of seconds) instead of hammering the endpoint.
            if err.code != 429 or attempt >= retries:
                raise
            retry_after = str((err.headers or {}).get("Retry-After") or "").strip()
            time.sleep(int(retry_after) if retry_after.isdigit() else 2 ** attempt)

    # Mark the node only after a successful response, so a failed request
    # does not permanently hide the entity from later attempts.
    queried_nodes.add(qid)
    return results["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Grow the graph outward from ``qid`` along its entity-valued claims.

    Each binding returned by ``query_wikidata_properties`` whose value is a
    Wikidata entity IRI becomes an ``EX[<property label>]`` edge from ``qid``
    to that entity. Both endpoints receive ``rdfs:label`` triples when a label
    is available, and the target entity is then expanded one level deeper.
    Bindings with non-entity values are ignored.

    Args:
        qid: Wikidata ID of the entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level; nothing happens once it exceeds
            ``max_depth``.
        max_depth: Deepest level that is still queried.
    """
    if depth > max_depth:
        return

    for binding in query_wikidata_properties(qid):
        # Turn the property label into a local name: spaces and parentheses become "_"
        edge_name = binding["propertyLabel"]["value"].translate(str.maketrans(" ()", "___"))
        target_uri = binding["value"]["value"]
        # Fall back to the raw value when no English label was returned
        target_label = binding.get("valueLabel", {}).get("value", target_uri)

        if "wikidata.org/entity/" not in target_uri:
            continue

        target_qid = target_uri.rsplit("/", 1)[-1]
        g.add((WIKIDATA[qid], EX[edge_name], WIKIDATA[target_qid]))
        if label:
            g.add((WIKIDATA[qid], RDFS.label, Literal(label)))
        if target_label:
            g.add((WIKIDATA[target_qid], RDFS.label, Literal(target_label)))

        # Continue the expansion from the entity just linked
        recursive_query(target_qid, label=target_label, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels_3"):
    """Build the modality/data-type knowledge graph and write it to disk.

    For every dataset entry whose modality resolves to a QID via ``get_qid``,
    adds an ``EX.Modality`` node, one ``EX.Datatype`` node per data type that
    has a ``wikidata_id``, an ``EX.Modality_Datatype`` edge between them, and
    whatever ``recursive_query`` adds for each of those entities. Entries
    without a modality QID are skipped together with their data types. The
    graph is then serialised as Turtle (``.ttl``) and RDF/XML (``.owl``).

    Args:
        dataset: Iterable of dicts with a ``"modality"`` string and a
            ``"data_types"`` dict mapping data-type name to a property dict.
        max_depth: Passed to ``recursive_query`` for every starting entity.
        output_basename: Path prefix (without extension) of the two output
            files.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Modality node with its label, then its own Wikidata neighbourhood
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))

            # Only record metadata that is present; a missing value would
            # otherwise be stored as the literal text "None".
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                metadata_value = properties.get(key)
                if metadata_value is not None:
                    g.add((data_type_node, EX[key], Literal(metadata_value)))

            # Custom edge linking the modality to its data type
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Write both serialisations and report the paths that were actually written
    for extension, rdf_format in ((".ttl", "turtle"), (".owl", "xml")):
        destination = f"{output_basename}{extension}"
        g.serialize(destination=destination, format=rdf_format)
        print(f"Knowledge graph saved to {destination}")


# Load the dataset file. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build and save the knowledge graph, expanding entities up to max_depth=3
build_knowledge_graph(dataset, max_depth=3)

## with P

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import time
import logging
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Literal, Namespace, RDF

# QIDs already handled, so the same entity is not requested twice
queried_nodes = set()

# Wikidata's public SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# WIKIDATA holds entity IRIs; EX holds the custom terms used in the graph
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
EX = Namespace("http://example.org/")

# Fresh RDF graph with both prefixes registered
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

def query_wikidata_properties(qid, max_retries=3):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    Sends a SPARQL query to ``SPARQL_ENDPOINT`` selecting every direct claim
    of ``qid`` whose property is listed in ``RELEVANT_PROPERTIES``, along with
    English labels for the property and the value.

    Args:
        qid: Wikidata entity ID that is interpolated into the query.
        max_retries: Number of times an HTTP 429 response is retried before
            the error is re-raised.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response, or ``[]``
        when ``qid`` is already in ``queried_nodes``.

    Raises:
        urllib.error.HTTPError: Any HTTP error other than a retried 429, or a
            429 that persists after ``max_retries`` retries.
    """
    import time
    from urllib.error import HTTPError

    if qid in queried_nodes:
        return []

    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{", wd:".join(RELEVANT_PROPERTIES)}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            results = sparql.query().convert()
            break
        except HTTPError as err:
            # 429 is the rate-limit signal: wait (honouring Retry-After when
            # it is a plain number of seconds) instead of hammering the endpoint.
            if err.code != 429 or attempt >= retries:
                raise
            retry_after = str((err.headers or {}).get("Retry-After") or "").strip()
            time.sleep(int(retry_after) if retry_after.isdigit() else 2 ** attempt)

    # Mark the node only after a successful response, so a failed request
    # does not permanently hide the entity from later attempts.
    queried_nodes.add(qid)
    return results["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively expand ``qid`` and add its entity-valued claims to the graph.

    For each binding from ``query_wikidata_properties`` whose value is a
    Wikidata entity IRI, adds an ``EX[<property label>]`` edge from ``qid`` to
    that entity, types the entity as ``EX.NewNode`` if the graph holds no
    ``rdf:type`` for it yet, records available labels, and recurses into it.
    Bindings with non-entity values are ignored.

    Args:
        qid: Wikidata ID of the entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level; nothing happens once it exceeds
            ``max_depth``.
        max_depth: Deepest level that is still queried.
    """
    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value_uri = result["value"]["value"]
        # Fall back to the raw value when no English label was returned
        value = result.get("valueLabel", {}).get("value", value_uri)

        if "wikidata.org/entity/" not in value_uri:
            continue
        value_qid = value_uri.split("/")[-1]

        # Decide "new" from the graph itself. Recording the QID in
        # queried_nodes here made query_wikidata_properties return [] for
        # every discovered entity, so the recursion never went past depth 0.
        if (WIKIDATA[value_qid], RDF.type, None) not in g:
            g.add((WIKIDATA[value_qid], RDF.type, EX.NewNode))

        # The property becomes an edge between the two entity nodes
        g.add((WIKIDATA[qid], EX[prop], WIKIDATA[value_qid]))
        if label:
            g.add((WIKIDATA[qid], RDFS.label, Literal(label)))
        if value:
            g.add((WIKIDATA[value_qid], RDFS.label, Literal(value)))

        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels_3"):
    """Build the modality/data-type knowledge graph and write it to disk.

    For every dataset entry whose modality resolves to a QID via ``get_qid``,
    adds an ``EX.Modality`` node, one ``EX.Datatype`` node per data type that
    has a ``wikidata_id``, an ``EX.Modality_Datatype`` edge between them, and
    whatever ``recursive_query`` adds for each of those entities. Entries
    without a modality QID are skipped together with their data types. The
    graph is then serialised as Turtle (``.ttl``) and RDF/XML (``.owl``).

    Args:
        dataset: Iterable of dicts with a ``"modality"`` string and a
            ``"data_types"`` dict mapping data-type name to a property dict.
        max_depth: Passed to ``recursive_query`` for every starting entity.
        output_basename: Path prefix (without extension) of the two output
            files.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Modality node with its label, then its own Wikidata neighbourhood
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))

            # Only record metadata that is present; a missing value would
            # otherwise be stored as the literal text "None".
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                metadata_value = properties.get(key)
                if metadata_value is not None:
                    g.add((data_type_node, EX[key], Literal(metadata_value)))

            # Custom edge linking the modality to its data type
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Write both serialisations and report the paths that were actually written
    for extension, rdf_format in ((".ttl", "turtle"), (".owl", "xml")):
        destination = f"{output_basename}{extension}"
        g.serialize(destination=destination, format=rdf_format)
        print(f"Knowledge graph saved to {destination}")

# Load the dataset file. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build and save the knowledge graph, expanding entities up to max_depth=3
build_knowledge_graph(dataset, max_depth=3)

Processing modality: image (QID: Q478798)


HTTPError: HTTP Error 403: Forbidden

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# QIDs already handled, so the same entity is not requested twice
queried_nodes = set()

# Wikidata's public SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# WIKIDATA holds entity IRIs; EX holds the custom terms used in the graph
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
EX = Namespace("http://example.org/")

# Fresh RDF graph with both prefixes registered
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

def query_wikidata_properties(qid, max_retries=3):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    Sends a SPARQL query to ``SPARQL_ENDPOINT`` selecting every direct claim
    of ``qid`` whose property is listed in ``RELEVANT_PROPERTIES``, along with
    English labels for the property and the value.

    Args:
        qid: Wikidata entity ID that is interpolated into the query.
        max_retries: Number of times an HTTP 429 response is retried before
            the error is re-raised.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response, or ``[]``
        when ``qid`` is already in ``queried_nodes``.

    Raises:
        urllib.error.HTTPError: Any HTTP error other than a retried 429, or a
            429 that persists after ``max_retries`` retries.
    """
    import time
    from urllib.error import HTTPError

    if qid in queried_nodes:
        return []

    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{", wd:".join(RELEVANT_PROPERTIES)}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            results = sparql.query().convert()
            break
        except HTTPError as err:
            # 429 is the rate-limit signal: wait (honouring Retry-After when
            # it is a plain number of seconds) instead of hammering the endpoint.
            if err.code != 429 or attempt >= retries:
                raise
            retry_after = str((err.headers or {}).get("Retry-After") or "").strip()
            time.sleep(int(retry_after) if retry_after.isdigit() else 2 ** attempt)

    # Mark the node only after a successful response, so a failed request
    # does not permanently hide the entity from later attempts.
    queried_nodes.add(qid)
    return results["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively expand ``qid`` and add its entity-valued claims to the graph.

    For each binding from ``query_wikidata_properties`` whose value is a
    Wikidata entity IRI, adds an ``EX[<property label>]`` edge from ``qid`` to
    that entity, types the entity as ``EX.NewNode`` if the graph holds no
    ``rdf:type`` for it yet, records available labels, and recurses into it.
    Bindings with non-entity values are ignored.

    Args:
        qid: Wikidata ID of the entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level; nothing happens once it exceeds
            ``max_depth``.
        max_depth: Deepest level that is still queried.
    """
    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value_uri = result["value"]["value"]
        # Fall back to the raw value when no English label was returned
        value = result.get("valueLabel", {}).get("value", value_uri)

        if "wikidata.org/entity/" not in value_uri:
            continue
        value_qid = value_uri.split("/")[-1]

        # Decide "new" from the graph itself. Recording the QID in
        # queried_nodes here made query_wikidata_properties return [] for
        # every discovered entity, so the recursion never went past depth 0.
        if (WIKIDATA[value_qid], RDF.type, None) not in g:
            g.add((WIKIDATA[value_qid], RDF.type, EX.NewNode))

        # The property becomes an edge between the two entity nodes
        g.add((WIKIDATA[qid], EX[prop], WIKIDATA[value_qid]))
        if label:
            g.add((WIKIDATA[qid], RDFS.label, Literal(label)))
        if value:
            g.add((WIKIDATA[value_qid], RDFS.label, Literal(value)))

        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3, output_basename="knowledge_graph_with_labels_2"):
    """Build the modality/data-type knowledge graph and write it to disk.

    For every dataset entry whose modality resolves to a QID via ``get_qid``,
    adds an ``EX.Modality`` node, one ``EX.Datatype`` node per data type that
    has a ``wikidata_id``, an ``EX.Modality_Datatype`` edge between them, and
    whatever ``recursive_query`` adds for each of those entities. Entries
    without a modality QID are skipped together with their data types. The
    graph is then serialised as Turtle (``.ttl``) and RDF/XML (``.owl``).

    Args:
        dataset: Iterable of dicts with a ``"modality"`` string and a
            ``"data_types"`` dict mapping data-type name to a property dict.
        max_depth: Passed to ``recursive_query`` for every starting entity.
        output_basename: Path prefix (without extension) of the two output
            files.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Modality node with its label, then its own Wikidata neighbourhood
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))

            # Only record metadata that is present; a missing value would
            # otherwise be stored as the literal text "None".
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                metadata_value = properties.get(key)
                if metadata_value is not None:
                    g.add((data_type_node, EX[key], Literal(metadata_value)))

            # Custom edge linking the modality to its data type
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Write both serialisations and report the paths that were actually written
    for extension, rdf_format in ((".ttl", "turtle"), (".owl", "xml")):
        destination = f"{output_basename}{extension}"
        g.serialize(destination=destination, format=rdf_format)
        print(f"Knowledge graph saved to {destination}")

# Load the dataset file. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default encoding.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build and save the knowledge graph, expanding entities up to max_depth=2
build_knowledge_graph(dataset, max_depth=2)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph_with_labels.ttl
Knowledge graph saved to knowledge_graph_with_labels.owl


In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# QIDs already handled, so the same entity is not requested twice
queried_nodes = set()

# Wikidata's public SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# WIKIDATA holds entity IRIs; EX holds the custom terms used in the graph
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
EX = Namespace("http://example.org/")

# Fresh RDF graph with both prefixes registered
g = Graph()
g.bind("ex", EX)
g.bind("wikidata", WIKIDATA)

def query_wikidata_properties(qid, max_retries=3):
    """Fetch the whitelisted direct claims of one Wikidata entity.

    Sends a SPARQL query to ``SPARQL_ENDPOINT`` selecting every direct claim
    of ``qid`` whose property is listed in ``RELEVANT_PROPERTIES``, along with
    English labels for the property and the value.

    Args:
        qid: Wikidata entity ID that is interpolated into the query.
        max_retries: Number of times an HTTP 429 response is retried before
            the error is re-raised.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response, or ``[]``
        when ``qid`` is already in ``queried_nodes``.

    Raises:
        urllib.error.HTTPError: Any HTTP error other than a retried 429, or a
            429 that persists after ``max_retries`` retries.
    """
    import time
    from urllib.error import HTTPError

    if qid in queried_nodes:
        return []

    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{", wd:".join(RELEVANT_PROPERTIES)}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            results = sparql.query().convert()
            break
        except HTTPError as err:
            # 429 is the rate-limit signal: wait (honouring Retry-After when
            # it is a plain number of seconds) instead of hammering the endpoint.
            if err.code != 429 or attempt >= retries:
                raise
            retry_after = str((err.headers or {}).get("Retry-After") or "").strip()
            time.sleep(int(retry_after) if retry_after.isdigit() else 2 ** attempt)

    # Mark the node only after a successful response, so a failed request
    # does not permanently hide the entity from later attempts.
    queried_nodes.add(qid)
    return results["results"]["bindings"]

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively expand ``qid`` and add its entity-valued claims to the graph.

    For each binding from ``query_wikidata_properties`` whose value is a
    Wikidata entity IRI, adds an ``EX[<property label>]`` edge from ``qid`` to
    that entity, types the entity as ``EX.NewNode`` if the graph holds no
    ``rdf:type`` for it yet, records available labels, and recurses into it.
    Bindings with non-entity values are ignored.

    Args:
        qid: Wikidata ID of the entity to expand.
        label: Optional label recorded for ``qid`` whenever an edge is added.
        depth: Current recursion level; nothing happens once it exceeds
            ``max_depth``.
        max_depth: Deepest level that is still queried.
    """
    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        prop = result["propertyLabel"]["value"].replace(" ", "_").replace("(", "_").replace(")", "_")
        value_uri = result["value"]["value"]
        # Fall back to the raw value when no English label was returned
        value = result.get("valueLabel", {}).get("value", value_uri)

        if "wikidata.org/entity/" not in value_uri:
            continue
        value_qid = value_uri.split("/")[-1]

        # Decide "new" from the graph itself. Recording the QID in
        # queried_nodes here made query_wikidata_properties return [] for
        # every discovered entity, so the recursion never went past depth 0.
        if (WIKIDATA[value_qid], RDF.type, None) not in g:
            g.add((WIKIDATA[value_qid], RDF.type, EX.NewNode))

        # The property becomes an edge between the two entity nodes
        g.add((WIKIDATA[qid], EX[prop], WIKIDATA[value_qid]))
        if label:
            g.add((WIKIDATA[qid], RDFS.label, Literal(label)))
        if value:
            g.add((WIKIDATA[value_qid], RDFS.label, Literal(value)))

        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3):
    """Build the RDF knowledge graph from ``dataset`` and write it to disk.

    For every entry the modality label is resolved to a QID with ``get_qid``.
    Entries without a match are skipped together with their data types.
    Matched modalities and their data types are typed, labelled, linked with
    an ``EX:Modality_Datatype`` edge and expanded with ``recursive_query``.

    Args:
        dataset: Iterable of dicts with a ``"modality"`` label and a
            ``"data_types"`` mapping of data-type label -> attribute dict
            (``wikidata_id``, ``frequency``, ``wikidata_label``,
            ``wikidata_description``).
        max_depth: Passed through to ``recursive_query``.

    Returns:
        None. The graph ``g`` is serialized as Turtle and RDF/XML.
    """
    ttl_path = "knowledge_graph_with_labels_1.ttl"
    owl_path = "knowledge_graph_with_labels_1.owl"

    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        modality_qid = get_qid(modality)
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Add the modality node with its label, then expand it.
        modality_node = WIKIDATA[modality_qid]
        g.add((modality_node, RDF.type, EX.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]
            g.add((data_type_node, RDF.type, EX.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))

            # Skip missing attributes: Literal(None) would be stored as the
            # string "None" rather than being omitted.
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                attribute = properties.get(key)
                if attribute is not None:
                    g.add((data_type_node, EX[key], Literal(attribute)))

            # Custom edge linking the modality to its data type.
            g.add((modality_node, EX["Modality_Datatype"], data_type_node))

            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # The messages are derived from the real destinations so they cannot drift
    # from the file names that are actually written.
    g.serialize(destination=ttl_path, format="turtle")
    print(f"Knowledge graph saved to {ttl_path}")

    g.serialize(destination=owl_path, format="xml")
    print(f"Knowledge graph saved to {owl_path}")

# Load the matched modality / data-type records. The encoding is given
# explicitly because open() otherwise falls back to a locale-dependent default.
with open('matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build a knowledge graph
build_knowledge_graph(dataset, max_depth=1)

Processing modality: image (QID: Q478798)
Processing modality: text (QID: Q234460)




Processing modality: video (QID: Q625525)
Processing modality: audio (QID: Q3500685)




Processing modality: metadata (QID: Q180160)
Processing modality: structured data (QID: Q26813700)
Processing modality: annotation (QID: Q857525)
Processing modality: depth (QID: Q181591)
Processing modality: skeleton (QID: Q7881)




Processing modality: sensor (QID: Q167676)




Processing modality: tabular (QID: Q13402984)
Processing modality: graph (QID: Q141488)




Processing modality: network (QID: Q109406)




Processing modality: table (QID: Q14748)
Processing modality: time series (QID: Q186588)
Processing modality: simulation (QID: Q45045)
Processing modality: lidar (QID: Q504027)
Processing modality: radar (QID: Q47528)
Processing modality: numeric (QID: Q3249849)
Processing modality: signal (QID: Q174984)
Processing modality: data (QID: Q42848)
Processing modality: point cloud (QID: Q1899648)
Processing modality: vector (QID: Q107994)
Processing modality: label (QID: Q202581)
Processing modality: chemical structure (QID: Q500256)




Processing modality: motion capture (QID: Q676252)




Processing modality: geolocation (QID: Q123349905)
Processing modality: temporal (QID: Q2472680)
Processing modality: sequence (QID: Q133250)
Processing modality: trajectory (QID: Q193139)
Processing modality: neuroimaging (QID: Q551875)
Processing modality: environment (QID: Q1572519)




Processing modality: pose (QID: Q1671968)
Processing modality: visualization (QID: Q451553)
Processing modality: geospatial (QID: Q122074678)




Processing modality: binary (QID: Q58483123)
Processing modality: interaction (QID: Q52948)
Processing modality: eye tracking (QID: Q970687)
Processing modality: social network (QID: Q2715623)




Processing modality: event (QID: Q1349920)
Processing modality: control (QID: Q338754)
Processing modality: synthetic data (QID: Q7662746)




Processing modality: code (QID: Q188889)




Processing modality: knowledge graph (QID: Q33002955)
Processing modality: matrix (QID: Q44337)
Processing modality: motion (QID: Q79782)




Processing modality: game (QID: Q11410)




Processing modality: web (QID: Q1427141)
Processing modality: log (QID: Q12029485)
Processing modality: model (QID: Q1941828)




Processing modality: feature vector (QID: Q1921842)




Processing modality: spatial (QID: Q122075505)
Processing modality: visual (QID: Q4014836)




Processing modality: software (QID: Q7397)
Processing modality: sketch (QID: Q5078274)




Processing modality: feature (QID: Q93586)




Processing modality: tactile (QID: Q124134995)




Processing modality: notebook (QID: Q43013)
Processing modality: emotion (QID: Q95969875)




Processing modality: biomarker (QID: Q864574)




Processing modality: other (QID: Q1433373)
Processing modality: database (QID: Q8513)
Processing modality: box (QID: Q188075)
Processing modality: animation (QID: Q11425)




Processing modality: odometry (QID: Q2014717)
Processing modality: blockchain (QID: Q20514253)




Processing modality: vr (QID: Q29716068)




Processing modality: data file (QID: Q5227290)
Processing modality: biosignal (QID: Q644240)
Knowledge graph saved to knowledge_graph_with_labels.ttl
Knowledge graph saved to knowledge_graph_with_labels.owl


## Full Dataset Query

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import logging

# Wikidata SPARQL endpoint that every query in this cell is sent to
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Wikidata properties followed when a node is expanded
# (modality and data type relationships)
RELEVANT_PROPERTIES = [
    # instance of, subclass of, part of, facet of
    "P31", "P279", "P361", "P1269",
    # Structure and Technology: has part(s), has part(s) of the class,
    # file format, media type, file extension, contains
    "P527", "P2670", "P2701", "P1163", "P1195", "P4330",
    # Application and Domain: has use, used by, field of work, main subject
    "P366", "P1535", "P101", "P921",
    # Quality and Metadata: has characteristic, characteristic of, data size
    "P1552", "P13044", "P3575",
]

# Namespaces: WIKIDATA for entity nodes, P for node types and predicates
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
P = Namespace("https://www.wikidata.org/wiki/Property/")

# RDF graph that all functions of this cell write into
g = Graph()
g.bind("wikidata", WIKIDATA)
g.bind("p", P)

# Bookkeeping used to avoid duplicate queries: maps a QID either to the marker
# "Queried" or to the node type it was created with.
queried_nodes = dict()

def query_wikidata_properties(qid):
    """Fetch the RELEVANT_PROPERTIES statements of a single Wikidata entity.

    A QID that is already present in ``queried_nodes`` is not fetched again and
    yields an empty list. Otherwise it is recorded as ``"Queried"`` before the
    request is sent.

    Args:
        qid: Wikidata QID of the entity to look up.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response (bindings for
        ``property``, ``propertyLabel``, ``value`` and, where available,
        ``valueLabel``), or ``[]`` for an already known QID.
    """
    if qid in queried_nodes:
        return []
    queried_nodes[qid] = "Queried"  # remember that this node has been handled

    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """

    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query().convert()
    return response["results"]["bindings"]

def create_node(qid, node_type, label=None, properties=None):
    """Add a typed node for ``qid`` to the graph the first time it is seen.

    Nothing happens when ``qid`` is already present in ``queried_nodes``.
    Otherwise the node is typed, ``queried_nodes[qid]`` is set to ``node_type``
    and the optional label and literal attributes are attached. Because
    ``query_wikidata_properties`` skips every QID found in ``queried_nodes``,
    a node created here is not fetched by a later query.

    Args:
        qid: Wikidata QID of the node.
        node_type: RDF type to assign (e.g. ``P.Modality``).
        label: Optional ``rdfs:label`` value.
        properties: Optional mapping of attribute name -> value; each pair is
            stored as ``P[name]`` with a literal object.
    """
    if qid in queried_nodes:
        return

    node = WIKIDATA[qid]
    g.add((node, RDF.type, node_type))
    queried_nodes[qid] = node_type

    if label:
        g.add((node, RDFS.label, Literal(label)))

    for attribute, attribute_value in (properties or {}).items():
        g.add((node, P[attribute], Literal(attribute_value)))

def recursive_query(qid, node_type, label=None, depth=0, max_depth=3):
    """Expand the graph around ``qid`` by following its entity-valued properties.

    For every entity-valued result the value is registered as ``P.NewNode``
    through ``create_node`` (a no-op for nodes that are already known) and an
    edge ``qid --P[property id]--> value`` is added. The property's English
    label is kept as ``rdfs:label`` of the predicate.

    Predicates are built from the property *ID* (``P31``), not from its label
    (``instance of``): labels contain spaces and yield URIs that rdflib cannot
    serialize as Turtle.

    Args:
        qid: Wikidata QID of the node to expand.
        node_type: Type of the root being expanded; edges are only added for
            ``P.Modality`` and ``P.Datatype`` roots.
        label: Unused; kept for call compatibility.
        depth: Current recursion depth (0 for the root call).
        max_depth: Expansion stops once ``depth`` exceeds this value.

    Returns:
        None. Only the module-level graph ``g`` is mutated, plus
        ``queried_nodes`` through the helpers.
    """
    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Only Wikidata entities become nodes; literal values are ignored.
        if "wikidata.org/entity/" not in value_uri:
            continue

        prop_id = result["property"]["value"].split("/")[-1]  # e.g. "P31"
        prop_label = result["propertyLabel"]["value"]
        value = result.get("valueLabel", {}).get("value", value_uri)
        value_qid = value_uri.split("/")[-1]

        # Modality and Datatype roots were handled by two identical branches;
        # they are merged here. create_node() itself ignores known QIDs, so the
        # former membership / "NewNode" pre-check added nothing.
        if node_type in (P.Modality, P.Datatype):
            create_node(value_qid, P.NewNode, value)
            g.add((P[prop_id], RDFS.label, Literal(prop_label)))
            g.add((WIKIDATA[qid], P[prop_id], WIKIDATA[value_qid]))

        # Recursively query the new node
        recursive_query(value_qid, node_type, value, depth=depth + 1, max_depth=max_depth)

def _add_dataset_node(qid, node_type, label, properties):
    """Type a dataset-provided node and attach its label and literal attributes."""
    node = WIKIDATA[qid]

    # A node first met as a property value is only a NewNode placeholder:
    # promote it and drop the placeholder marker so its own properties are
    # still fetched by the following recursive_query() call.
    g.remove((node, RDF.type, P.NewNode))
    if queried_nodes.get(qid) == P.NewNode:
        del queried_nodes[qid]

    g.add((node, RDF.type, node_type))
    if label:
        g.add((node, RDFS.label, Literal(label)))
    for key, value in properties.items():
        # Literal(None) would be stored as the string "None"; omit instead.
        if value is not None:
            g.add((node, P[key], Literal(value)))


def build_knowledge_graph(dataset, max_depth=3):
    """Build the knowledge graph from the dataset and write it to disk.

    Each entry contributes one Modality node (identified by the entry's
    ``wikidata_id``) and one Datatype node per data type that has a
    ``wikidata_id``. Both kinds of node are expanded with ``recursive_query``.
    Entries or data types without a QID are skipped.

    Args:
        dataset: Iterable of dicts with ``"modality"`` (label),
            ``"wikidata_id"`` (QID), optionally ``"wikidata_label"``, and a
            ``"data_types"`` mapping of label -> attribute dict.
        max_depth: Passed through to ``recursive_query``.

    Returns:
        None. The graph ``g`` is serialized as Turtle and RDF/XML.
    """
    ttl_path = "knowledge_graph_with_labels_5_2.ttl"
    owl_path = "knowledge_graph_with_labels_5_2.owl"

    for entry in dataset:
        modality = entry["modality"]
        # The QID lives under "wikidata_id"; "modality" is the human-readable
        # label and is not a valid entity identifier.
        modality_qid = entry.get("wikidata_id")
        if not modality_qid:
            continue

        # Process Modality. The node is written directly instead of through
        # create_node(), which would record it in queried_nodes and make the
        # query below return nothing.
        _add_dataset_node(modality_qid, P.Modality, modality, {
            "QID": modality_qid,
            "wikidata_label": entry.get("wikidata_label", ""),
        })
        recursive_query(modality_qid, P.Modality, modality, depth=0, max_depth=max_depth)

        # Process DataTypes
        for data_type, properties in entry["data_types"].items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            _add_dataset_node(data_type_qid, P.Datatype, data_type, {
                "frequency": properties.get("frequency", ""),
                "QID": data_type_qid,
                "wikidata_label": properties.get("wikidata_label"),
                "wikidata_description": properties.get("wikidata_description", ""),
            })

            # Relationship between Modality and Datatype
            # (example relation kept from the original: P279 is "subclass of")
            g.add((WIKIDATA[modality_qid], P["P279"], WIKIDATA[data_type_qid]))

            # Recursively query Datatype properties
            recursive_query(data_type_qid, P.Datatype, data_type, depth=0, max_depth=max_depth)

    # Save the knowledge graph as Turtle and OWL (RDF/XML). The messages are
    # derived from the real destinations so they match the written files.
    g.serialize(destination=ttl_path, format="turtle")
    print(f"Knowledge graph saved to {ttl_path}")

    g.serialize(destination=owl_path, format="xml")
    print(f"Knowledge graph saved to {owl_path}")

# Load the updated matched_modalities_data_types.json file. The encoding is
# explicit because open() otherwise uses a locale-dependent default.
with open('updated_matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=3)



KeyboardInterrupt: 

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import logging
import time

# Show INFO-level progress messages
logging.basicConfig(level=logging.INFO)

# Wikidata SPARQL endpoint that every query in this cell is sent to
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Namespaces: WIKIDATA for entity nodes (and this cell's custom terms),
# WD for direct-claim predicates
WIKIDATA = Namespace("http://www.wikidata.org/entity/")
WD = Namespace("http://www.wikidata.org/prop/direct/")

# RDF graph that all functions of this cell write into
g = Graph()
g.bind("wikidata", WIKIDATA)
g.bind("wd", WD)

# QIDs that have already been handled (used to avoid duplicate queries)
queried_nodes = set()

# Wikidata properties followed when a node is expanded
# (modality and data type relationships)
RELEVANT_PROPERTIES = [
    # instance of, subclass of, part of, facet of
    "P31", "P279", "P361", "P1269",
    # has part(s), has part(s) of the class, file format, media type,
    # file extension, contains
    "P527", "P2670", "P2701", "P1163", "P1195", "P4330",
    # has use, used by, field of work, main subject
    "P366", "P1535", "P101", "P921",
    # has characteristic, characteristic of, data size
    "P1552", "P13044", "P3575",
]

def query_wikidata_properties(qid):
    """Return the RELEVANT_PROPERTIES statements of one Wikidata entity.

    Each QID is looked up at most once: a QID already contained in
    ``queried_nodes`` yields ``[]`` without a request. A new QID is added to
    the set before the query is sent.

    Args:
        qid: Wikidata QID of the entity to look up.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response, or ``[]``
        for an already known QID.
    """
    already_seen = qid in queried_nodes
    if already_seen:
        return []
    queried_nodes.add(qid)

    wanted = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{wanted}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """

    endpoint = SPARQLWrapper(SPARQL_ENDPOINT)
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    payload = endpoint.query().convert()
    return payload["results"]["bindings"]

def create_or_update_node(qid, node_type, properties=None):
    """Create a node for ``qid`` or raise its type to a higher-priority one.

    Priority: Modality > Datatype > NewNode. A node without any ``rdf:type``
    gets ``node_type``. A typed node is only changed when ``node_type`` ranks
    higher than its current type.

    Whether a node exists is decided from the graph, not from
    ``queried_nodes``. That set is owned by ``query_wikidata_properties`` and
    means "already fetched". Registering a node there on creation made every
    later query for it return nothing, so no node was ever expanded.

    Args:
        qid: Wikidata QID of the node.
        node_type: Requested RDF type (``WIKIDATA.Modality``,
            ``WIKIDATA.Datatype`` or ``WIKIDATA.NewNode``).
        properties: Optional mapping of attribute name -> value, stored as
            ``WIKIDATA[name]`` literals. ``None`` values are skipped because
            ``Literal(None)`` would be stored as the string "None".
    """
    priority = {WIKIDATA.NewNode: 0, WIKIDATA.Datatype: 1, WIKIDATA.Modality: 2}
    node = WIKIDATA[qid]

    existing_type = g.value(node, RDF.type)
    if existing_type is None:
        g.add((node, RDF.type, node_type))
    elif priority.get(node_type, -1) > priority.get(existing_type, -1):
        g.remove((node, RDF.type, existing_type))
        g.add((node, RDF.type, node_type))

    if properties:
        for key, value in properties.items():
            if value is not None:
                g.add((node, WIKIDATA[key], Literal(value)))

def recursive_query(qid, node_type, depth=0, max_depth=1):
    """Expand the graph around ``qid`` by following its entity-valued properties.

    Every result contributes the property's label (``rdfs:label`` on the
    ``WD`` predicate). Entity-valued results additionally contribute an edge
    ``qid --WD[prop]--> value`` and a recursive expansion of the value. Values
    that have no type yet are created as ``WIKIDATA.NewNode``.

    Args:
        qid: Wikidata QID of the node to expand.
        node_type: Type of the root being expanded. It is only handed down the
            recursion: neighbours are no longer promoted to the root's type,
            because being related to a Modality does not make an entity one.
        depth: Current recursion depth (0 for the root call).
        max_depth: Expansion stops once ``depth`` exceeds this value.

    Returns:
        None. Only the module-level graph ``g`` is mutated.
    """
    if depth > max_depth:
        return

    # Query entity properties for ALL RELEVANT_PROPERTIES
    for result in query_wikidata_properties(qid):
        prop = result["property"]["value"].split("/")[-1]  # property ID, e.g. P31
        value_uri = result["value"]["value"]
        value_label = result.get("valueLabel", {}).get("value", value_uri)

        # Keep the human-readable name of the predicate.
        prop_label = result["propertyLabel"]["value"]
        g.add((WD[prop], RDFS.label, Literal(prop_label)))

        # Only Wikidata entities become nodes; literal values are ignored.
        if "wikidata.org/entity/" not in value_uri:
            continue
        value_qid = value_uri.split("/")[-1]

        g.add((WIKIDATA[qid], WD[prop], WIKIDATA[value_qid]))

        # Untyped values become NewNode placeholders. Nodes that already carry
        # a type (and their dataset-provided label) are left untouched.
        if g.value(WIKIDATA[value_qid], RDF.type) is None:
            create_or_update_node(value_qid, WIKIDATA.NewNode, {"label": value_label})

        # Recursive exploration
        recursive_query(value_qid, node_type, depth=depth+1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=1):
    """Build the knowledge graph from the dataset and write it to disk.

    Each entry with a ``wikidata_id`` contributes a Modality node, and each of
    its data types with a ``wikidata_id`` contributes a Datatype node linked to
    the modality. Both kinds of node are expanded with ``recursive_query``.

    Args:
        dataset: Iterable of dicts with ``"modality"`` (label),
            ``"wikidata_id"`` (QID) and an optional ``"data_types"`` mapping of
            label -> attribute dict.
        max_depth: Passed through to ``recursive_query``.

    Returns:
        None. The graph ``g`` is serialized as Turtle and RDF/XML.
    """
    for entry in dataset:
        modality = entry["modality"]
        modality_qid = entry.get("wikidata_id")
        if not modality_qid:
            continue

        logging.info(f"Processing modality: {modality} (QID: {modality_qid})")

        # Create modality node
        create_or_update_node(modality_qid, WIKIDATA.Modality, {
            "label": modality,
            "QID": modality_qid
        })
        recursive_query(modality_qid, WIKIDATA.Modality, depth=0, max_depth=max_depth)

        # Process data_types
        for data_type, properties in entry.get("data_types", {}).items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            attributes = {
                "label": data_type,
                "frequency": properties.get("frequency"),
                "QID": data_type_qid,
                "wikidata_label": properties.get("wikidata_label"),
                "wikidata_description": properties.get("wikidata_description"),
            }
            # Drop missing attributes: Literal(None) would be stored as the
            # string "None" instead of being omitted.
            attributes = {key: value for key, value in attributes.items() if value is not None}

            # Create datatype node
            create_or_update_node(data_type_qid, WIKIDATA.Datatype, attributes)

            # Link the data type to its modality, as the other versions of this
            # builder do; without this edge the dataset's modality/data-type
            # grouping is not represented in the graph.
            g.add((WIKIDATA[modality_qid], WIKIDATA["Modality_Datatype"], WIKIDATA[data_type_qid]))

            recursive_query(data_type_qid, WIKIDATA.Datatype, depth=0, max_depth=max_depth)

    # Save the knowledge graph as Turtle and OWL format
    g.serialize(destination="knowledge_graph.ttl", format="turtle")
    logging.info("Knowledge graph saved to knowledge_graph.ttl")

    g.serialize(destination="knowledge_graph.owl", format="xml")
    logging.info("Knowledge graph saved to knowledge_graph.owl")

# Load the updated_matched_modalities_data_types.json file. The encoding is
# explicit because open() otherwise uses a locale-dependent default.
with open('updated_matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=3)

In [None]:
%pip install -q SPARQLWrapper==2.0.0



In [None]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint the example query below is sent to
endpoint_url = "https://query.wikidata.org/sparql"

# Example query: every item that is an instance of (P31) cat (Q146), together
# with its label as resolved by the wikibase:label service.
query = """#Cats
SELECT ?item ?itemLabel
WHERE
{
  ?item wdt:P31 wd:Q146. # Must be a cat
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],mul,en". } # Helps get the label in your language, if not, then default for all languages, then en language
}"""


def get_results(endpoint_url, query):
    """Run ``query`` against ``endpoint_url`` and return the converted JSON result.

    The request identifies itself with a user agent built from the running
    Python major/minor version.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper.query().convert()`` yields for the JSON
        return format.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    major, minor = sys.version_info[0], sys.version_info[1]
    user_agent = "WDQS-example Python/%s.%s" % (major, minor)

    client = SPARQLWrapper(endpoint_url, agent=user_agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


results = get_results(endpoint_url, query)

# Show a short preview only: printing the whole response floods the cell
# output with every binding returned by the endpoint.
bindings = results["results"]["bindings"]
print(f"{len(bindings)} bindings; showing the first 5:")
for result in bindings[:5]:
    print(result)

{'head': {'vars': ['item', 'itemLabel']}, 'results': {'bindings': [{'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q378619'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'CC'}}, {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q498787'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Muezza'}}, {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q677525'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Orangey'}}, {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q893453'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Unsinkable Sam'}}, {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1050083'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Catmando'}}, {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1185550'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Oscar'}}, {'item': {'type': 'uri', 'value

## Property as Label

In [None]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up Wikidata SPARQL endpoint
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# From label get qid
def get_qid(label):
    """Get the QID of an entity based on its English label.

    The query asks for one entity (``LIMIT 1``) whose ``rdfs:label`` equals
    ``label`` in English and whose URI starts with the Wikidata item prefix.
    The function sleeps one second before each request.

    Args:
        label: English label to look up.

    Returns:
        The QID string of the returned entity, or ``None`` when the query has
        no results.
    """
    time.sleep(1)
    logging.info(f"Fetching QID for label: {label}")

    # Escape the characters that would otherwise end or corrupt the SPARQL
    # string literal (labels come straight from the dataset file).
    escaped_label = (
        label.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?entity WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
      FILTER (STRSTARTS(STR(?entity), "http://www.wikidata.org/entity/Q"))
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        logging.warning(f"No QID found for label: {label}")
        return None

    qid = bindings[0]["entity"]["value"].split("/")[-1]
    logging.info(f"Found QID for {label}: {qid}")
    return qid

# Namespaces: MP for this project's own types and predicates,
# WIKIDATA for entity nodes
MP = Namespace("http://masterproject.org/")
WIKIDATA = Namespace("http://www.wikidata.org/entity/")

# RDF graph that all functions of this cell write into
g = Graph()
g.bind("mp", MP)
g.bind("wikidata", WIKIDATA)

# QIDs that have already been handled (used to avoid duplicate queries)
queried_nodes = set()

# Wikidata properties followed when a node is expanded
RELEVANT_PROPERTIES = [
    "P31",     # instance of
    "P279",    # subclass of
    "P361",    # part of
    "P1269",   # facet of
    "P527",    # has part(s)
    "P2670",   # has part(s) of the class
    "P2701",   # file format
    "P1163",   # media type
    "P1195",   # file extension
    "P4330",   # contains
    "P366",    # has use
    "P1535",   # used by
    "P101",    # field of work
    "P921",    # main subject
    "P1552",   # has characteristic
    "P13044",  # characteristic of
    "P3575",   # data size
]

def query_wikidata_properties(qid, max_retries=5):
    """Query Wikidata properties for a given entity.

    A QID already contained in ``queried_nodes`` yields ``[]`` without a
    request. Each request is preceded by a pause (one second at first). When
    the endpoint answers HTTP 429 (Too Many Requests), the request is repeated
    after the delay given in the ``Retry-After`` header. If that header is
    missing or not a number, the delay doubles each time.

    Args:
        qid: Wikidata QID of the entity to look up.
        max_retries: How many times a 429 response is retried before the error
            is re-raised.

    Returns:
        The ``results.bindings`` list of the SPARQL JSON response, or ``[]``
        for an already known QID.

    Raises:
        urllib.error.HTTPError: For non-429 HTTP errors passed on by
            SPARQLWrapper, or for 429 once the retries are used up.
    """
    from urllib.error import HTTPError

    if qid in queried_nodes:
        return []
    queried_nodes.add(qid)

    property_filter = ", wd:".join(RELEVANT_PROPERTIES)
    query = f"""
    SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
      wd:{qid} ?p ?value.
      ?property wikibase:directClaim ?p.
      FILTER (?property IN (wd:{property_filter}))
      OPTIONAL {{ ?value rdfs:label ?valueLabel. FILTER(LANG(?valueLabel) = "en") }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    # NOTE(review): no custom user agent is set here; the example cell above
    # passes agent=... to SPARQLWrapper - consider doing the same.
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    delay = 1
    try:
        for attempt in range(max_retries + 1):
            time.sleep(delay)
            try:
                results = sparql.query().convert()
                return results["results"]["bindings"]
            except HTTPError as err:
                if err.code != 429 or attempt == max_retries:
                    raise
                retry_after = err.headers.get("Retry-After") if err.headers else None
                if retry_after and retry_after.strip().isdigit():
                    delay = int(retry_after.strip())
                else:
                    delay *= 2
    except Exception:
        # The entity was not fetched: unmark it so a later call can try again.
        queried_nodes.discard(qid)
        raise

def recursive_query(qid, label=None, depth=0, max_depth=3):
    """Recursively query Wikidata and add the entity relationships to the graph.

    Every entity-valued result of ``query_wikidata_properties(qid)`` becomes an
    edge ``qid --<Property:Pxx>--> value`` in the module-level graph ``g``; the
    value is then expanded with ``depth + 1``.

    ``queried_nodes`` is left to ``query_wikidata_properties``, which uses it to
    fetch each entity once. Adding a value to that set here (as the previous
    version did) made the recursive call return no results, so nothing below
    the root was ever expanded. Whether a value still needs the ``MP.NewNode``
    placeholder type is now decided from the graph itself.

    Args:
        qid: Wikidata QID of the node to expand.
        label: Optional label of ``qid``; attached as ``rdfs:label`` whenever an
            edge starting at ``qid`` is added.
        depth: Current recursion depth (0 for the root call).
        max_depth: Expansion stops once ``depth`` exceeds this value.

    Returns:
        None. Only ``g`` is mutated here.
    """
    # URIRef is not among the names imported at the top of this cell.
    from rdflib import URIRef

    if depth > max_depth:
        return

    for result in query_wikidata_properties(qid):
        value_uri = result["value"]["value"]

        # Only Wikidata *entities* become nodes. The looser "wikidata.org/"
        # test also matched other wikidata.org URIs.
        if "wikidata.org/entity/" not in value_uri:
            continue

        prop = result["property"]["value"].split('/')[-1]
        value = result.get("valueLabel", {}).get("value", value_uri)
        value_qid = value_uri.split("/")[-1]

        # Full URL of the property page (wikidata.org/wiki/Property:PXX)
        property_uri = URIRef(f"https://www.wikidata.org/wiki/Property:{prop}")

        subject_node = WIKIDATA[qid]
        value_node = WIKIDATA[value_qid]

        # Values without any type yet are marked as NewNode placeholders.
        if (value_node, RDF.type, None) not in g:
            g.add((value_node, RDF.type, MP.NewNode))

        # Relationships are stored as edges between nodes, not as attributes.
        g.add((subject_node, property_uri, value_node))
        if label:
            g.add((subject_node, RDFS.label, Literal(label)))
        if value:
            g.add((value_node, RDFS.label, Literal(value)))

        # Recursively query the new node
        recursive_query(value_qid, label=value, depth=depth + 1, max_depth=max_depth)

def build_knowledge_graph(dataset, max_depth=3):
    """Build the knowledge graph from the dataset and write it to disk.

    Each entry with a ``wikidata_id`` contributes a Modality node, and each of
    its data types with a ``wikidata_id`` contributes a Datatype node linked
    through ``MP:Modality_Datatype``. Both kinds of node are expanded with
    ``recursive_query``. A node first met as a property value loses its
    ``MP.NewNode`` placeholder type when it turns out to be a modality or a
    data type.

    Args:
        dataset: Iterable of dicts with ``"modality"`` (label),
            ``"wikidata_id"`` (QID) and a ``"data_types"`` mapping of label ->
            attribute dict.
        max_depth: Passed through to ``recursive_query``.

    Returns:
        None. The graph ``g`` is serialized as Turtle and RDF/XML.
    """
    for entry in dataset:
        modality = entry["modality"]
        data_types = entry["data_types"]

        # The QID is taken directly from the dataset.
        modality_qid = entry.get("wikidata_id")
        if not modality_qid:
            continue
        print(f"Processing modality: {modality} (QID: {modality_qid})")

        # Add the modality node, replacing a NewNode placeholder if one exists
        # (previously only data types were cleaned up this way).
        modality_node = WIKIDATA[modality_qid]
        g.remove((modality_node, RDF.type, MP.NewNode))
        g.add((modality_node, RDF.type, MP.Modality))
        g.add((modality_node, RDFS.label, Literal(modality)))

        # Recursively query modality properties
        recursive_query(modality_qid, label=modality, depth=0, max_depth=max_depth)

        # Handle data_types
        for data_type, properties in data_types.items():
            data_type_qid = properties.get("wikidata_id")
            if not data_type_qid:
                continue

            data_type_node = WIKIDATA[data_type_qid]

            # If the node exists as NewNode, change its type to Datatype
            g.remove((data_type_node, RDF.type, MP.NewNode))
            g.add((data_type_node, RDF.type, MP.Datatype))
            g.add((data_type_node, RDFS.label, Literal(data_type)))
            g.add((data_type_node, MP.QID, Literal(data_type_qid)))

            # Skip missing attributes: Literal(None) would be stored as the
            # string "None" rather than being omitted.
            for key in ("frequency", "wikidata_label", "wikidata_description"):
                attribute = properties.get(key)
                if attribute is not None:
                    g.add((data_type_node, MP[key], Literal(attribute)))

            # Create custom edge between modality and data_type
            g.add((modality_node, MP["Modality_Datatype"], data_type_node))

            # Recursively query data_type properties
            recursive_query(data_type_qid, label=data_type, depth=0, max_depth=max_depth)

    # Save the knowledge graph as Turtle and OWL format
    g.serialize(destination="knowledge_graph_with_labels.ttl", format="turtle")
    print("Knowledge graph saved to knowledge_graph_with_labels.ttl")

    g.serialize(destination="knowledge_graph_with_labels.owl", format="xml")
    print("Knowledge graph saved to knowledge_graph_with_labels.owl")

# Load the updated_matched_modalities_data_types.json file. The encoding is
# explicit because open() otherwise uses a locale-dependent default.
with open('updated_matched_modalities_data_types.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Build the knowledge graph
build_knowledge_graph(dataset, max_depth=2)

Processing modality: image (QID: Q478798)


HTTPError: HTTP Error 429: Too Many Requests