## Prerequisites pt.2

In [3]:
import json, re, math, datetime
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, DCTERMS, XSD

# RDF Creation

In this document I will proceed in the pipeline by generating a file containing RDF triplets. I start from the previous file, "IbsenStage_with_uris.json". In this step i use `rdflib`. As explained in the main documentation, each theatre event becomes a `schema:TheaterEvent`, linked to a play (`schema:Play`) and a venue (`schema:EventVenue`).
Where available, the script links works and venues to Wikidata URIs via `schema:sameAs`. It also adds normalized performance dates (`xsd:date`), event names, and internal IDs. If the venue’s URI isn’t available, the city’s Wikidata URI is used as a fallback (`schema:location`).

In [None]:
# I locate the "Ibsenstage_staged" folder containing the input data
cwd = Path.cwd()
root = cwd
while not (root / "Ibsenstage_staged").exists() and root.parent != root:
    root = root.parent

# I set up file paths for reading JSON input and writing RDF output
input_path = root / "Ibsenstage_staged" / "IbsenStage_with_uris.json"
if not input_path.exists():
    raise FileNotFoundError(f"Cannot find file: {input_path}")
output_dir = root / "Ibsenstage_curated"
if not output_dir.exists():
    raise FileNotFoundError(f"Cannot find directory: {output_dir}")
output_path = output_dir / "ibsenstage_triplets.ttl"

# here I create RDF namespaces
SCHEMA = Namespace("http://schema.org/")          
WD = Namespace("https://www.wikidata.org/entity/")
IBSEN = Namespace("https://ibsenstage.hf.uio.no/pages/") # The custom namespace for Ibsen data

# create an empty RDF graph and register our namespaces with it
g = Graph()
g.bind("schema", SCHEMA)
g.bind("dcterms", DCTERMS)
g.bind("wd", WD)
g.bind("xsd", XSD)

# function to clean up messy date formats and make them valid for RDF
def normalize_date(date_str) -> tuple[str | None, type | None]:
    if not date_str:
        return None, None
    s = str(date_str).strip()
    
    # check if it's already a complete date (YYYY-MM-DD)
    m = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", s)
    if m:
        y, mo, d = map(int, m.groups())
        try:
            datetime.date(y, mo, d)  # Verify it's a real date
            return s, XSD.date
        except ValueError:
            return s, None
    
    # then I fill in missing parts: "1879-03" becomes "1879-03-01"
    if re.match(r"^\d{4}-\d{2}$", s):
        return s + "-01", XSD.date
    
    # here "1879" becomes "1879-01-01"
    if re.match(r"^\d{4}$", s):
        return s + "-01-01", XSD.date

    # if it doesn't match any known format, return as it is    
    return s, None

# Load the theater data from our JSON file
data = json.loads(input_path.read_text(encoding="utf-8"))
print(f"Loaded {len(data)} theater performance records")

Loaded 4924 theater performance records


In [7]:
# Function to process a single theater record and add RDF triples
def process_record(rec, graph):
    # extract the basic IDs that identify performances, plays, and venues
    eid = str(rec.get("eventid"))
    wid = rec.get("workid")
    vid = str(rec.get("venueid"))

    # convert work ID to string, or None if missing/invalid
    wid = str(int(wid)) if wid and not math.isnan(wid) else None

    # look for Wikidata references that link my data to external knowledge
    # these URIs connect local theater data to global knowledge bases
    work_ref = None
    raw_work = rec.get("workURI")  # Wikidata URI for the play
    if raw_work:
        # extract just the QID part (i.e. "Q1234") from the full URI
        qid = raw_work if raw_work.startswith("Q") else raw_work.rsplit("/", 1)[-1]
        work_ref = WD[qid]

    venue_ref = None
    raw_venue = rec.get("venueURI")  # Wikidata URI for the venue
    if raw_venue:
        qid_v = raw_venue if raw_venue.startswith("Q") else raw_venue.rsplit("/", 1)[-1]
        venue_ref = WD[qid_v]

    # create internal URIs for my data using the custom namespace
    event_res = IBSEN[f"event/{eid}"]
    work_res = IBSEN[f"work/{wid}"] if wid else None
    venue_res = IBSEN[f"venue/{vid}"]

    # declare events as a theater performance
    graph.add((event_res, RDF.type, SCHEMA.TheaterEvent))
    
    # add the performance name/title
    name = rec.get("eventname_normalized") or rec.get("eventname")
    if name:
        graph.add((event_res, SCHEMA.name, Literal(name)))

    # add the date of first performance, using cleaned-up date format
    first_date = rec.get("first_date")
    if first_date:
        lex, dt = normalize_date(first_date)
        if lex:
            # create a properly typed date literal for RDF
            lit = Literal(lex, datatype=dt) if dt else Literal(lex)
            graph.add((event_res, SCHEMA.firstPerformance, lit))

    # link the performances to the Wikidata entry for the play
    if work_ref:
        graph.add((event_res, SCHEMA.workPerformed, work_ref))

    # add the city where this performance took place
    raw_city = rec.get("cityURI")
    if raw_city:
        qid_c = raw_city if raw_city.startswith("Q") else raw_city.rsplit("/", 1)[-1]
        graph.add((event_res, SCHEMA.location, WD[qid_c]))

    # add DCterms internal identifier for tracking
    graph.add((event_res, DCTERMS.identifier, Literal(eid)))

    # create a separate entry for plays and declare it as a schema:Play
    if work_res:
        graph.add((work_res, RDF.type, SCHEMA.Play))
        title = rec.get("worktitle")
        if title:
            graph.add((work_res, SCHEMA.name, Literal(title)))  # add play title
        graph.add((work_res, DCTERMS.identifier, Literal(wid))) # add ID
        if work_ref:
            # link local play entries to Wikidata entries
            graph.add((work_res, SCHEMA.sameAs, work_ref))

    # create entries for the venue
    if venue_ref:
        # link local venue to its Wikidata entry
        graph.add((venue_res, SCHEMA.sameAs, venue_ref))
    if venue_res:
        graph.add((venue_res, RDF.type, SCHEMA.EventVenue))  # declare it as a venue
        vname = rec.get("venuename", "")
        if vname:
            graph.add((venue_res, SCHEMA.name, Literal(vname)))  # add venue name
        graph.add((venue_res, DCTERMS.identifier, Literal(vid))) # add ID

# process a small batch to verify if RDF structure works correctly
sample_size = 50
print(f"Processing first {sample_size} records to verify structure...")

# loop through each theater performance record in sample
for i, rec in enumerate(data[:sample_size]):
    process_record(rec, g)

print(f"Generated {len(g)} RDF triples from {sample_size} records")

# show a sample of what my RDF data looks like
print("\nSample RDF triples:")
for i, (s, p, o) in enumerate(g):
    if i >= 10:
        break
    print(f"  {s} {p} {o}")
    
print(f"\n... and {len(g) - 10} more triples")

Processing first 50 records to verify structure...
Generated 469 RDF triples from 50 records

Sample RDF triples:
  https://ibsenstage.hf.uio.no/pages/venue/15014 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://schema.org/EventVenue
  https://ibsenstage.hf.uio.no/pages/event/85934 http://purl.org/dc/terms/identifier 85934
  https://ibsenstage.hf.uio.no/pages/work/8528 http://purl.org/dc/terms/identifier 8528
  https://ibsenstage.hf.uio.no/pages/event/85772 http://schema.org/name Een poppenhuis
  https://ibsenstage.hf.uio.no/pages/event/85768 http://schema.org/workPerformed https://www.wikidata.org/entity/Q1434818
  https://ibsenstage.hf.uio.no/pages/event/85612 http://schema.org/name Rosmersholm
  https://ibsenstage.hf.uio.no/pages/event/85765 http://schema.org/name Gengangere
  https://ibsenstage.hf.uio.no/pages/venue/12427 http://purl.org/dc/terms/identifier 12427
  https://ibsenstage.hf.uio.no/pages/work/8544 http://schema.org/sameAs https://www.wikidata.org/entity/Q1217608
 

In [8]:
# here I process the remaining records
print(f"Processing remaining {len(data) - 50} records...")

for i, rec in enumerate(data[50:], start=50):
    if i % 1000 == 0:
        print(f"Processed {i}/{len(data)} records ({len(g)} triples so far)")
    
    # reuse the processing function
    process_record(rec, g)

print(f"Completed processing all {len(data)} records")

# then convert RDF graph to Turtle format and save
ttl_output = g.serialize(format="turtle")
ttl_output = "@prefix wd: <https://www.wikidata.org/entity/> .\n" + ttl_output

output_path.write_text(ttl_output, encoding="utf-8")
print(f"Saved {len(g)} RDF triples to {output_path.resolve()}")

# check how many theater events, plays, and venues I have in the final graph
events = sum(1 for s, p, o in g if p == RDF.type and o == SCHEMA.TheaterEvent)
works = sum(1 for s, p, o in g if p == RDF.type and o == SCHEMA.Play)
venues = sum(1 for s, p, o in g if p == RDF.type and o == SCHEMA.EventVenue)

print(f"\nFinal RDF graph includes:")
print(f"  {events} theater events")
print(f"  {works} different plays")  
print(f"  {venues} theater venues")

Processing remaining 4874 records...
Processed 1000/4924 records (7407 triples so far)
Processed 2000/4924 records (13468 triples so far)
Processed 3000/4924 records (20276 triples so far)
Processed 4000/4924 records (26273 triples so far)
Completed processing all 4924 records
Saved 31739 RDF triples to C:\Users\Cristiano (CC)\Desktop\Cristiano-June25\OsloMet\Masterstudium i bibliotek- og informasjonsvitenskap - deltid\MBIB4140 - Metadata og interoperabilitet\2ndre sjansen\MBIB4140_2\Ibsenstage_curated\ibsenstage_triplets.ttl

Final RDF graph includes:
  4924 theater events
  30 different plays
  1400 theater venues


# Serialization in RDF/XML

The next step is serializing the triplets obtained also in RDF/XML. Serialization changes the RDF graph into a file format that semantic web tools can use to store, share, or read data. While the Turtle file obtained in the previous cell is concise and easy for humans to read and debug, RDF/XML is the original W3C-standard serialization for RDF and remains widely supported by legacy systems, ontology tools, and triplestores. It is more formal, and it integrates easily with XML-based workflows, allows validation using standard XML tools, and ensures maximum interoperability in institutional contexts.

In [None]:
# define output path
rdfxml_path = output_dir / "ibsenstage_triplets.rdf"

# serialize to RDF/XML
rdfxml_output = g.serialize(format="xml")
rdfxml_path.write_text(rdfxml_output, encoding="utf-8")

print(f"Saved {len(g)} triples to:")
print(f" - {rdfxml_path.resolve()}")

✅ Saved 30442 triples to:
   - C:\Users\Cristiano (CC)\Desktop\Cristiano-June25\OsloMet\Masterstudium i bibliotek- og informasjonsvitenskap - deltid\MBIB4140 - Metadata og interoperabilitet\2ndre sjansen\Ibsenstage_curated\ibsenstage_triplets.rdf


# Validating

The final step is verifying that the RDF file is well-formed. To do so, I will first check if there are any parsing or syntaxt issues locally, then I will make the RDF accessible via an URL

In [None]:
#finally I check if the RDF is well-formed
g = Graph()
try:
    g.parse("Ibsenstage_curated/ibsenstage_triplets.rdf", format="xml")
    print("RDF/XML is valid.")
except Exception as e:
    print("RDF/XML is invalid:")
    print(e)


✅ RDF/XML is valid.


The whole project has been uploaded in the repository `MBIB4140_2` on GitHub, and it is publicly available at the link: [https://github.com/Uhyret/MBIB4140_2] 