In [1]:
import pandas as pd

from rdflib import Graph, URIRef, Literal, Namespace

from rdflib.namespace import XSD, RDF, RDFS, OWL, SKOS, PROV, DC


In [2]:
# Open the workbook once so both sheets are read through the same handle
xls = pd.ExcelFile('TIDO_ORSD.xlsx')


def _promote_second_row_to_header(sheet):
    """Use the row at position 1 as column labels and keep only the rows below it.

    The returned frame has a fresh 0-based index and no name on its column axis.
    """
    sheet.columns = sheet.iloc[1]
    body = sheet[2:].reset_index(drop=True)
    return body.rename_axis(None, axis=1)


# Class sheet and relation sheet share the same layout, so both get the same treatment
df_class = _promote_second_row_to_header(pd.read_excel(xls, 'Class definitions'))
df_property = _promote_second_row_to_header(pd.read_excel(xls, 'Relation definitions'))

# Show the relation table as the cell's output
df_property

Unnamed: 0,Relation,Label(s),Definition,Domain,Range,"Status (Proposed, Accepted, Rejected, Deprecated)",Superseded by,Comments,Definition derived from,"Priority \n(High, Medium, Low)",Related to identifed concept
0,tido:hasContext,has context,A relationship indicating how one piece of inf...,tido:PieceOfInformation,tido:PieceOfInformation,,,,,,TIDO-C08
1,tido:disputes,disputes,A relationship indicating how a piece of infor...,tido:PieceOfInformation,tido:Hypothesis,,,,https://www.merriam-webster.com/dictionary/dis...,,TIDO-C28
2,tido:supports,supports,A relationship indicating how a piece of infor...,tido:PieceOfInformation,tido:Hypothesis,,,,https://www.merriam-webster.com/dictionary/sup...,,TIDO-C30
3,tido:providesInsightsInto,provides insight into,The super-property of tido:supports and tido:d...,tido:PieceOfInformation,tido:Hypothesis,,,,,,TIDO-C31
4,tido:assumes,assumes,A relationship indicating how an option would ...,tido:Option,tido:Hypothesis,,,,https://www.merriam-webster.com/dictionary/assume,,
5,tido:answers,answers,A piece of information can be used to answer a...,tido:PieceOfInformation,tido:RQ,,,,,,
6,tido:questions,questions,A research question can be raised by a piece o...,tido:RQ,tido:PieceOfInformation,,,,,,
7,tido:informs,informs,A relationship that indicates which options wo...,tido:Option,tido:RQ,,,,,,
8,tido:hasConsideration,has consideration,An option can have multiple considerations tha...,tido:Option,tido:Consideration,,,,,,
9,tido:wasSelectedBy,was selected by,An option can be selected during a resolution ...,tido:Option,tido:Resolution,,,,,,


In [3]:

# Create an RDF graph
g = Graph()

# RDFS and SKOS are imported from rdflib.namespace; only TIDO is defined here
TIDO = Namespace("https://w3id.org/tido#")

# Bind the TIDO namespace to the graph
g.bind("tido", TIDO)

# Start from the existing ontology when it is present; otherwise build from scratch
try:
    g.parse("TIDO.ttl", format="turtle")
except FileNotFoundError:
    print("TIDO.ttl not found. A new file will be created.")


def _local_name(curie):
    """Return the part of a prefixed name after the first ':' (e.g. 'tido:Option' -> 'Option').

    Surrounding whitespace is removed; a value without a prefix is returned as is
    instead of raising IndexError.
    """
    return str(curie).strip().split(':', 1)[-1]


def _add_term_annotations(graph, frame, uri_column, reference_predicate):
    """Add definition, reference and label annotations for every term listed in `frame`.

    Args:
        graph: rdflib graph that receives the triples.
        frame: DataFrame with the columns `uri_column`, 'Definition',
            'Definition derived from' and 'Label(s)'.
        uri_column: name of the column holding the prefixed term name.
        reference_predicate: predicate used for the 'Definition derived from' value.

    Rows whose term cell is empty are skipped. Adding a triple that is already in
    the graph is a no-op, so no explicit duplicate check is made before `add`.
    """
    for _, row in frame.iterrows():
        term = row[uri_column]
        if pd.isna(term):
            continue
        subject = TIDO[_local_name(term)]

        definition = row['Definition']
        if pd.notna(definition):
            graph.add((subject, RDFS.comment, Literal(definition)))

        reference = row['Definition derived from']
        if pd.notna(reference):
            graph.add((subject, reference_predicate, Literal(reference)))

        labels = row['Label(s)']
        if pd.notna(labels):
            for raw_label in str(labels).split(','):
                label = raw_label.strip()
                if not label:
                    # A trailing or doubled comma would otherwise yield an empty label
                    continue
                # The check is against rdfs:label, not skos:altLabel: a text that is
                # already the term's rdfs:label is not repeated as an alternative label.
                if (subject, RDFS.label, Literal(label)) not in graph:
                    graph.add((subject, SKOS.altLabel, Literal(label)))


# NOTE(review): classes record 'Definition derived from' with rdfs:isDefinedBy while
# relations use rdfs:seeAlso -- kept as in the original; confirm the asymmetry is intended.
_add_term_annotations(g, df_class, 'Class URI', RDFS.isDefinedBy)
_add_term_annotations(g, df_property, 'Relation', RDFS.seeAlso)

# Serialize the graph to Turtle
rdf_output = g.serialize(format='turtle')

# Save the RDF output; the encoding is explicit so non-ASCII text does not depend
# on the platform's default encoding
with open("TIDO_enriched.ttl", "w", encoding="utf-8") as f:
    f.write(rdf_output)

# Print the RDF output
print(rdf_output)

@prefix : <https://w3id.org/tido#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix mod: <https://w3id.org/mod#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix urref: <http://eturwg.c4i.gmu.edu/files/ontologies/URREF_v5_dev.owl#> .
@prefix vann: <http://purl.org/vocab/vann/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

dcterms:created a owl:AnnotationProperty .

dcterms:description a owl:AnnotationProperty .

dcterms:license a owl:AnnotationProperty .

dcterms:title a owl:AnnotationProperty .

vann:preferredNamespacePrefix a owl:AnnotationProperty .

vann:preferredNamespaceUri a owl:AnnotationProperty .

owl:versionIRI a owl:AnnotationProperty .

owl:versionInfo a owl:AnnotationProperty .

<https://w3id.org/tido> a owl:Ontology ;
    dcterms:created "2025-05-07"^^xsd:date ;
    dcterms:description "The TI

In [2]:
# Episode transcripts to load; the commented-out episodes are not read
df_files = [
    'transcriptions/dataframes/De_Dienst_ep_1.csv',
    'transcriptions/dataframes/De_Dienst_ep_2.csv',
    # 'transcriptions/dataframes/De_Dienst_ep_3.csv',
    # 'transcriptions/dataframes/De_Dienst_ep_4.csv',
    # 'transcriptions/dataframes/De_Dienst_ep_5.csv',
    # 'transcriptions/dataframes/De_Dienst_ep_6.csv',
]

# CSV column name -> RDF predicate; the two text columns are paired with a language tag
column_to_rdf_mapping = {
    'start': PROV.startedAtTime,
    'end': PROV.endedAtTime,
    'speaker': PROV.wasAssociatedWith,
    'transcription': (RDF.value, "@nl"),
    'translation': (RDF.value, "@en"),
}

# Load each semicolon-separated file into its own frame
all_dataframes = []
for csv_path in df_files:
    all_dataframes.append(pd.read_csv(csv_path, sep=';'))

# Stack the frames under one continuous index and discard the 'Unnamed: 0'
# column when it is present
combined_di_dataframe = pd.concat(all_dataframes, ignore_index=True).drop(
    columns=['Unnamed: 0'], errors='ignore'
)

# Show the combined table
print(combined_di_dataframe)

                     seg_id                start                  end  \
0      De_Dienst_ep_1_seg_0  2025-01-01 00:00:00  2025-01-01 00:00:21   
1      De_Dienst_ep_1_seg_1  2025-01-01 00:00:30  2025-01-01 00:01:11   
2      De_Dienst_ep_1_seg_2  2025-01-01 00:01:11  2025-01-01 00:01:25   
3      De_Dienst_ep_1_seg_3  2025-01-01 00:01:27  2025-01-01 00:01:43   
4      De_Dienst_ep_1_seg_4  2025-01-01 00:01:43  2025-01-01 00:01:44   
..                      ...                  ...                  ...   
223  De_Dienst_ep_2_seg_117  2025-01-02 00:23:50  2025-01-02 00:24:03   
224  De_Dienst_ep_2_seg_118  2025-01-02 00:24:04  2025-01-02 00:24:09   
225  De_Dienst_ep_2_seg_119  2025-01-02 00:24:13  2025-01-02 00:24:33   
226  De_Dienst_ep_2_seg_120  2025-01-02 00:24:37  2025-01-02 00:25:07   
227  De_Dienst_ep_2_seg_121  2025-01-02 00:25:07  2025-01-02 00:25:20   

      speaker                                      transcription  \
0        Bart   De eerste keer dat ik een telefoon tap,

In [7]:
from rdflib import Graph, Namespace

# Namespaces used below; RDF, RDFS and SKOS are not redefined in this cell
TIDO = Namespace("https://w3id.org/tido#")            # ontology terms
TIDO_case = Namespace("https://w3id.org/tido/case#")  # case-level resources
PROV = Namespace("http://www.w3.org/ns/prov#")        # provenance terms

def add_triple(graph, triple):
    """
    Insert `triple` into `graph` unless the graph already contains it.

    `graph` only needs to support the `in` operator and an `add` method.
    Returns None in either case.
    """
    already_present = triple in graph
    if already_present:
        return
    graph.add(triple)

# Graph for the enriched TIDO ontology, with the "tido" prefix bound
g_TIDO = Graph()
g_TIDO.bind("tido", TIDO)

# Read the enriched ontology; a missing file is reported rather than raised
try:
    g_TIDO.parse("TIDO_enriched.ttl", format="turtle")
except FileNotFoundError:
    print("TIDO_enriched.ttl not found.")
else:
    print("TIDO_enriched.ttl loaded successfully.")


# Identifiers of the DI files to process.
# Currently disabled: 'di03', 'di04', 'di05', 'di06', 'di07', 'di08', 'di09', 'di10'
DIs = ['di01', 'di02']

def _agent_slug(raw_name):
    """Turn a free-text speaker name into an identifier-safe slug.

    Whitespace is trimmed, spaces become underscores, every character that is
    neither alphanumeric nor an underscore is dropped, and runs of underscores
    (including leading and trailing ones) are collapsed to a single one.

    Returns:
        The slug in its original letter case, or '' when nothing usable remains.
    """
    underscored = raw_name.strip().replace(" ", "_")
    kept = ''.join(ch for ch in underscored if ch.isalnum() or ch == '_')
    # Dropping the empty parts is what collapses "a__b" to "a_b" and trims the
    # edges; a plain '_'.join(kept.split('_')) returns the string unchanged.
    return '_'.join(part for part in kept.split('_') if part)


def _xsd_datetime(value):
    """Build an xsd:dateTime literal in ISO 8601 form ('T' between date and time)."""
    return Literal(pd.Timestamp(value).isoformat(), datatype=XSD.dateTime)


for di in DIs:
    di_ns = Namespace(f"https://w3id.org/tido/{di}#")
    source_path = f"./DIs/original/tido_{di}.ttl"

    # Fresh graph per DI, with prefixes bound for the Turtle output
    g_DI = Graph()
    g_DI.bind(di, di_ns)
    g_DI.bind("tido", TIDO)
    g_DI.bind("tido_case", TIDO_case)

    # Load the original DI file into the graph
    try:
        g_DI.parse(source_path, format="turtle")
        print(f"{source_path} loaded successfully.")
    except FileNotFoundError:
        print(f"{source_path} not found.")
        # Nothing to extend: skip this DI instead of writing a graph without data
        continue

    # Basic rdfs:subClassOf inference: every instance of a direct subclass of
    # tido:Activity is also typed as tido:Activity.
    activity_subclasses = [
        subclass
        for subclass, _, _ in g_TIDO.triples((None, RDFS.subClassOf, TIDO.Activity))
    ]
    for subclass in activity_subclasses:
        # Snapshot the matches first because the graph is modified inside the loop
        for instance, _, _ in list(g_DI.triples((None, RDF.type, subclass))):
            add_triple(g_DI, (instance, RDF.type, TIDO.Activity))

    # Enrich every activity with the data of the transcript segment named by its label
    activities = [subj for subj, _, _ in g_DI.triples((None, RDF.type, TIDO.Activity))]
    for s in activities:
        segment_labels = [obj for _, _, obj in g_DI.triples((s, RDFS.label, None))]
        for o1 in segment_labels:
            # The label text is used as the seg_id key into combined_di_dataframe
            seg_id = str(o1)
            seg_row = combined_di_dataframe[combined_di_dataframe['seg_id'] == seg_id]
            if seg_row.empty:
                raise ValueError(f"No matching row found for seg_id: {seg_id}")
            segment = seg_row.iloc[0]  # first matching row

            # prov:startedAtTime / prov:endedAtTime from the segment boundaries
            start_time = segment['start']
            if pd.notna(start_time):
                add_triple(g_DI, (s, PROV.startedAtTime, _xsd_datetime(start_time)))

            end_time = segment['end']
            if pd.notna(end_time):
                add_triple(g_DI, (s, PROV.endedAtTime, _xsd_datetime(end_time)))

            # One agent per comma-separated speaker name; a missing speaker is skipped
            agent_names = segment['speaker']
            if pd.notna(agent_names):
                for raw_name in str(agent_names).split(','):
                    agent_name = _agent_slug(raw_name)
                    if not agent_name:
                        # An empty slug would produce the bare namespace IRI
                        continue
                    agent_uri = TIDO_case[agent_name.lower()]
                    add_triple(g_DI, (agent_uri, RDF.type, TIDO.Agent))
                    # Label: only the first letter of the slug is upper-cased
                    add_triple(g_DI, (agent_uri, RDFS.label, Literal(agent_name.capitalize())))
                    add_triple(g_DI, (s, PROV.wasAssociatedWith, agent_uri))

            # Dutch transcription and English translation as language-tagged rdf:value
            transcription = segment['transcription']
            if pd.notna(transcription):
                add_triple(g_DI, (s, RDF.value, Literal(transcription, lang="nl")))

            translation = segment['translation']
            if pd.notna(translation):
                add_triple(g_DI, (s, RDF.value, Literal(translation, lang="en")))

            # Every activity is linked to the single hard-coded case resource
            add_triple(g_DI, (s, TIDO.contributesTo, TIDO_case["case_1"]))

    # Serialize the graph to Turtle
    rdf_output = g_DI.serialize(format='turtle')

    # Save the RDF output; the encoding is explicit because the graph holds
    # Dutch text and must not depend on the platform's default encoding
    with open(f"DIs/extended/tido_{di}.ttl", "w", encoding="utf-8") as f:
        f.write(rdf_output)

    # Print the RDF output
    # print(rdf_output)

            # NOTE: no explicit `if triple not in g_DI` check is needed before adding;
            # add_triple() already skips triples that exist in the graph.


TIDO_enriched.ttl loaded successfully.
./DIs/original/tido_di01.ttl loaded successfully.
./DIs/original/tido_di02.ttl loaded successfully.
