# RDF Creation

In this document I continue the pipeline by generating a file of RDF triples. The input is the file produced in the previous step, "IbsenStage_with_uris.json", and the graph is built with `rdflib`. As explained in the main documentation, each theatre event becomes a `schema:TheaterEvent`, linked to a play (`schema:Play`) and a venue (`schema:EventVenue`).
Where available, the script links works and venues to Wikidata URIs via `schema:sameAs`. It also adds normalized performance dates (`xsd:date`), event names, and internal IDs. If the venue’s URI isn’t available, the city’s Wikidata URI is used as a fallback (`schema:location`).

In [None]:
import json, re, math, datetime
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, DCTERMS, XSD

# Locate the project root: the first directory, starting at the current
# working directory and moving upwards, that contains "Ibsenstage_staged".
# If no directory qualifies, the top of the chain (the filesystem root) is
# used, and the existence check below reports the missing file.
cwd = Path.cwd()
search_chain = (cwd, *cwd.parents)
root = next(
    (folder for folder in search_chain if (folder / "Ibsenstage_staged").exists()),
    search_chain[-1],
)

# Input file and output location; both must already exist.
input_path = root.joinpath("Ibsenstage_staged", "IbsenStage_with_uris.json")
output_dir = root.joinpath("Ibsenstage_curated")
if not input_path.exists():
    raise FileNotFoundError(f"Cannot find file: {input_path}")
if not output_dir.exists():
    raise FileNotFoundError(f"Cannot find directory: {output_dir}")
output_path = output_dir.joinpath("ibsenstage_triplets.ttl")

# Namespaces used in the graph
SCHEMA = Namespace("http://schema.org/")
WD = Namespace("https://www.wikidata.org/entity/")
IBSEN = Namespace("https://ibsenstage.hf.uio.no/pages/")

# Create the graph and register the prefixes used when serializing
g = Graph()
for prefix, namespace in (
    ("schema", SCHEMA),
    ("dcterms", DCTERMS),
    ("wd", WD),
    ("xsd", XSD),
):
    g.bind(prefix, namespace)

# Normalize dates into valid xsd:date format, or fallback to string literals
def normalize_date(date_str):
    """Turn a raw date value into a ``(lexical form, datatype)`` pair.

    Accepted shapes are ``YYYY-MM-DD``, ``YYYY-MM`` and ``YYYY``. Missing
    month/day parts are filled with ``01``. The resulting calendar date is
    validated in every case, so an impossible value such as ``1900-13`` or
    ``1900-02-30`` is never typed as ``xsd:date``.

    Args:
        date_str: Raw value from the record (usually a string).

    Returns:
        ``(None, None)`` when the value is missing (falsy or a NaN float);
        ``(iso_date, XSD.date)`` when the value is a valid (possibly padded)
        date; ``(stripped_text, None)`` otherwise, so the caller can keep the
        original text as a plain literal.
    """
    if not date_str:
        return None, None
    # json.loads maps a bare NaN token to float('nan'), which is truthy.
    if isinstance(date_str, float) and math.isnan(date_str):
        return None, None

    s = str(date_str).strip()
    m = re.fullmatch(r"(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?", s)
    if m is None:
        return s, None  # Fallback to literal string

    # Absent month/day groups default to 1 (first month / first day).
    year, month, day = (int(part) if part is not None else 1 for part in m.groups())
    try:
        checked = datetime.date(year, month, day)  # Validates the date
    except ValueError:
        return s, None
    return checked.isoformat(), XSD.date

# --- Helpers for cleaning raw JSON values ----------------------------------

# A Wikidata item identifier: the letter Q followed by digits.
_QID_PATTERN = re.compile(r"Q\d+")


def _clean_id(value):
    """Return a record ID as a clean string, or None when it is missing.

    Numbers are rendered without a decimal part (``12.0`` -> ``"12"``).
    ``None``, NaN/infinite floats and blank strings count as missing, so no
    URI such as ``.../venue/None`` is ever minted.
    """
    if value is None:
        return None
    if isinstance(value, float):
        return str(int(value)) if math.isfinite(value) else None
    if isinstance(value, int):
        return str(value)
    text = str(value).strip()
    return text or None


def _wikidata_ref(raw):
    """Return the Wikidata entity URI for a raw QID or Wikidata URL, else None.

    Accepts either a bare QID (``"Q123"``) or any URL whose last path segment
    is a QID. Non-string values (None, NaN) and values that do not end in a
    well-formed QID yield None instead of raising or producing a bogus URI.
    """
    if not isinstance(raw, str):
        return None
    qid = raw.strip().rstrip("/").rsplit("/", 1)[-1]
    return WD[qid] if _QID_PATTERN.fullmatch(qid) else None


# Load data from JSON
data = json.loads(input_path.read_text(encoding="utf-8"))
print(f"Processing {len(data)} records…")

# Loop through each record
for i, rec in enumerate(data):
    if i % 1000 == 0:
        print(f"Processed {i}")

    eid = _clean_id(rec.get("eventid"))  # Unique event ID
    if eid is None:
        # Without an event ID there is no stable URI to attach triples to.
        print(f"Skipping record {i}: missing eventid")
        continue
    wid = _clean_id(rec.get("workid"))   # Work (play) ID
    vid = _clean_id(rec.get("venueid"))  # Venue ID

    # Wikidata URIs (None when absent or malformed)
    work_ref = _wikidata_ref(rec.get("workURI"))
    venue_ref = _wikidata_ref(rec.get("venueURI"))
    city_ref = _wikidata_ref(rec.get("cityURI"))

    # Internal URIs; work and venue nodes exist only when their ID is known
    event_res = IBSEN[f"event/{eid}"]
    work_res = IBSEN[f"work/{wid}"] if wid else None
    venue_res = IBSEN[f"venue/{vid}"] if vid else None

    # Event node: type, internal identifier, name
    g.add((event_res, RDF.type, SCHEMA.TheaterEvent))
    g.add((event_res, DCTERMS.identifier, Literal(eid)))
    name = rec.get("eventname_normalized") or rec.get("eventname")
    if name:
        g.add((event_res, SCHEMA.name, Literal(name)))

    # First performance date: typed xsd:date when valid, plain literal otherwise.
    # NOTE(review): the predicate is kept as in the existing output; confirm
    # schema:firstPerformance is the intended property for an event date.
    first_date = rec.get("first_date")
    if first_date:
        lex, dt = normalize_date(first_date)
        if lex:
            lit = Literal(lex, datatype=dt) if dt else Literal(lex)
            g.add((event_res, SCHEMA.firstPerformance, lit))

    # Link event to work via schema:workPerformed. The Wikidata entity is
    # preferred; the local work node is the fallback so that events whose
    # work has no Wikidata URI are still connected to a play.
    if work_ref is not None:
        g.add((event_res, SCHEMA.workPerformed, work_ref))
    elif work_res is not None:
        g.add((event_res, SCHEMA.workPerformed, work_res))

    # Add event city as location (uses Wikidata QID)
    if city_ref is not None:
        g.add((event_res, SCHEMA.location, city_ref))

    # Add work (play) as schema:Play
    if work_res is not None:
        g.add((work_res, RDF.type, SCHEMA.Play))
        title = rec.get("worktitle")
        if title:
            g.add((work_res, SCHEMA.name, Literal(title)))
        g.add((work_res, DCTERMS.identifier, Literal(wid)))
        if work_ref is not None:
            g.add((work_res, SCHEMA.sameAs, work_ref))  # Link to Wikidata

    # Add venue as schema:EventVenue and attach it to the event; without the
    # schema:location triple the venue nodes are unreachable from any event.
    if venue_res is not None:
        g.add((venue_res, RDF.type, SCHEMA.EventVenue))
        g.add((event_res, SCHEMA.location, venue_res))
        vname = rec.get("venuename", "")
        if vname:
            g.add((venue_res, SCHEMA.name, Literal(vname)))
        g.add((venue_res, DCTERMS.identifier, Literal(vid)))
        if venue_ref is not None:
            g.add((venue_res, SCHEMA.sameAs, venue_ref))  # Link local venue to Wikidata

# Serialize graph to Turtle format. The wd: prefix is bound on the graph, so
# rdflib normally declares it itself; add it by hand only when it is absent,
# to avoid a duplicate @prefix line.
ttl_output = g.serialize(format="turtle")
if "@prefix wd:" not in ttl_output:
    ttl_output = "@prefix wd: <https://www.wikidata.org/entity/> .\n" + ttl_output

# Save to file
output_path.write_text(ttl_output, encoding="utf-8")
print(f"Saved {len(g)} triples to {output_path.resolve()}")


Processing 4924 records…
Processed 0
Processed 1000
Processed 2000
Processed 3000
Processed 4000
✅ Saved 30442 triples to C:\Users\Cristiano (CC)\Desktop\Cristiano-June25\OsloMet\Masterstudium i bibliotek- og informasjonsvitenskap - deltid\MBIB4140 - Metadata og interoperabilitet\2ndre sjansen\Ibsenstage_curated\ibsenstage_triplets.ttl


# Serialization in RDF/XML

The next step is to serialize the same triples in RDF/XML as well. Serialization turns the RDF graph into a file format that semantic web tools can use to store, share, or read data. While the Turtle file obtained in the previous cell is concise and easy for humans to read and debug, RDF/XML is the original W3C-standard serialization for RDF and remains widely supported by legacy systems, ontology tools, and triplestores. It is more formal, integrates easily with XML-based workflows, allows validation using standard XML tools, and ensures maximum interoperability in institutional contexts.

In [None]:
# Target file for the RDF/XML version, next to the Turtle file
rdfxml_path = output_dir.joinpath("ibsenstage_triplets.rdf")

# Render the graph as RDF/XML (widely supported by semantic web tools and
# triple stores) and write it out as UTF-8 text
rdfxml_output = g.serialize(format="xml")
with rdfxml_path.open("w", encoding="utf-8") as rdfxml_file:
    rdfxml_file.write(rdfxml_output)

# Report the triple count and the absolute location of the file
print(
    f"Saved {len(g)} triples to:",
    f" - {rdfxml_path.resolve()}",
    sep="\n",
)

✅ Saved 30442 triples to:
   - C:\Users\Cristiano (CC)\Desktop\Cristiano-June25\OsloMet\Masterstudium i bibliotek- og informasjonsvitenskap - deltid\MBIB4140 - Metadata og interoperabilitet\2ndre sjansen\Ibsenstage_curated\ibsenstage_triplets.rdf


# Validating

The final step is verifying that the RDF file is well-formed. To do so, I first check locally for parsing or syntax issues, and then make the RDF accessible via a URL.

In [None]:
# Check if the RDF is well-formed by parsing the written file back in.
from rdflib import Graph

# Read the exact file produced by the RDF/XML cell (rdfxml_path) rather than a
# path relative to the current working directory. A missing file raises here,
# so it is not misreported as invalid RDF.
rdfxml_text = rdfxml_path.read_text(encoding="utf-8")

# Use a separate graph object: rebinding `g` would replace the graph built
# earlier, and re-running a serialization cell afterwards would then write
# whatever this check happened to load (nothing at all after a failed parse).
check_graph = Graph()
try:
    check_graph.parse(data=rdfxml_text, format="xml")
    print("RDF/XML is valid.")
except Exception as e:
    # Notebook-level report: show any parser error instead of a traceback.
    print("RDF/XML is invalid:")
    print(e)


✅ RDF/XML is valid.


The whole project has been uploaded to the repository `MBIB4140_2` on GitHub and is publicly available at <https://github.com/Uhyret/MBIB4140_2>.