# Dataset Nanopublication Generator (Corrected)

Creates dataset description nanopublications following FAIR principles.

**Template:** [Dataset Description Template](https://w3id.org/np/RAuVB37yyAuAlgusrUAoG84JI4_EfrEqIkpEZYDpSz3d8)

In [None]:
# Path of the JSON configuration that is loaded further down; it is expected to
# hold a 'nanopublications' list and an optional 'metadata' mapping.
CONFIG_FILE = "../config/vbae208/vbae208_dataset.json"
# Directory that receives one "<id>.trig" file per configured nanopublication
# (created on demand before writing).
OUTPUT_DIR = "../output/dataset"

In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone
from rdflib import Dataset, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, FOAF

# RDF vocabularies used while building the nanopublication graphs.
NP = Namespace("http://www.nanopub.org/nschema#")  # nanopub structure (Head graph)
DCT = Namespace("http://purl.org/dc/terms/")  # Dublin Core terms (license, created, ...)
NT = Namespace("https://w3id.org/np/o/ntemplate/")  # links a nanopub to its templates
NPX = Namespace("http://purl.org/nanopub/x/")  # nanopub extension terms (type, introduces, ...)
PROV = Namespace("http://www.w3.org/ns/prov#")  # provenance attribution
ORCID = Namespace("https://orcid.org/")  # base IRI for author identifiers
DCAT = Namespace("http://www.w3.org/ns/dcat#")  # dataset class and access URL

# Template nanopublications recorded in the pubinfo graph of every generated
# nanopub: one assertion template, one provenance template, two pubinfo templates.
DATASET_TEMPLATE = URIRef("https://w3id.org/np/RAuVB37yyAuAlgusrUAoG84JI4_EfrEqIkpEZYDpSz3d8")
PROV_TEMPLATE = URIRef("https://w3id.org/np/RA7lSq6MuK_TIC6JMSHvLtee3lpLoZDOqLJCLXevnrPoU")
PUBINFO_TEMPLATE_1 = URIRef("https://w3id.org/np/RA0J4vUn_dekg-U1kK3AOEt02p9mT2WO03uGxLDec1jLw")
PUBINFO_TEMPLATE_2 = URIRef("https://w3id.org/np/RAukAcWHRDlkqxk7H2XNSegc1WnHI569INvNr-xdptDGI")

print("✓ Setup complete")

In [None]:
# Read the whole configuration file as UTF-8 text and parse it as JSON.
config = json.loads(Path(CONFIG_FILE).read_text(encoding='utf-8'))

# 'metadata' is optional in the file; 'nanopublications' is required.
metadata = config.get('metadata', {})
nanopub_count = len(config['nanopublications'])
print(f"✓ Loaded {nanopub_count} dataset nanopubs")

In [None]:
def create_dataset_nanopub(np_config, metadata):
    """Build one dataset-description nanopublication as an rdflib ``Dataset``.

    Four named graphs (Head, assertion, provenance, pubinfo) are created under
    the temporary nanopub namespace ``http://purl.org/nanopub/temp/np/``.

    Args:
        np_config: Mapping describing a single dataset. ``label`` is required;
            ``dataset_uri``, ``description``, ``access_url`` and
            ``license_uri`` are optional and skipped when missing or empty.
            Without ``dataset_uri`` the dataset is identified by the temporary
            IRI ``.../temp/np/dataset``.
        metadata: Mapping shared by all nanopubs. ``creator_orcid`` and
            ``creator_name`` are required. ``creator_orcid`` may be a bare
            ORCID iD or a full ``https://orcid.org/...`` URL. ``is_part_of``
            (a mapping with a ``uri`` key) is optional and may be ``None``.

    Returns:
        A ``(ds, label)`` tuple: the populated ``Dataset`` and the label
        ``"Dataset: <label>"`` that was also stored in the pubinfo graph.

    Raises:
        KeyError: If ``label``, ``creator_orcid`` or ``creator_name`` is
            missing.
    """
    temp_base = "http://purl.org/nanopub/temp/np/"
    TEMP_NP = Namespace(temp_base)

    this_np = URIRef(temp_base)
    head_graph = TEMP_NP['Head']
    assertion_graph = TEMP_NP['assertion']
    provenance_graph = TEMP_NP['provenance']
    pubinfo_graph = TEMP_NP['pubinfo']

    # Accept either a bare ORCID iD or a full ORCID URL. Without this, a full
    # URL would be appended to the ORCID namespace and yield a doubled prefix.
    orcid_id = metadata['creator_orcid']
    if isinstance(orcid_id, str):
        orcid_id = orcid_id.strip()
        for url_prefix in ("https://orcid.org/", "http://orcid.org/"):
            if orcid_id.startswith(url_prefix):
                orcid_id = orcid_id[len(url_prefix):]
                break
    author_uri = ORCID[orcid_id]

    # Dataset subject: the configured IRI, or a temporary one inside the nanopub.
    configured_uri = np_config.get('dataset_uri')
    dataset_uri = URIRef(configured_uri) if configured_uri else TEMP_NP['dataset']

    ds = Dataset()
    prefix_bindings = (
        ("this", temp_base),
        ("sub", TEMP_NP),
        ("np", NP),
        ("dct", DCT),
        ("nt", NT),
        ("npx", NPX),
        ("xsd", XSD),
        ("rdfs", RDFS),
        ("orcid", ORCID),
        ("prov", PROV),
        ("foaf", FOAF),
        ("dcat", DCAT),
    )
    for prefix, namespace in prefix_bindings:
        ds.bind(prefix, namespace)

    # HEAD: ties the nanopub IRI to its three content graphs.
    head = ds.graph(head_graph)
    head.add((this_np, RDF.type, NP.Nanopublication))
    head.add((this_np, NP.hasAssertion, assertion_graph))
    head.add((this_np, NP.hasProvenance, provenance_graph))
    head.add((this_np, NP.hasPublicationInfo, pubinfo_graph))

    # ASSERTION: the dataset description itself.
    assertion = ds.graph(assertion_graph)
    assertion.add((dataset_uri, RDF.type, DCAT.Dataset))
    assertion.add((dataset_uri, RDFS.label, Literal(np_config['label'])))

    if np_config.get('description'):
        assertion.add((dataset_uri, RDFS.comment, Literal(np_config['description'])))

    if np_config.get('access_url'):
        assertion.add((dataset_uri, DCAT.accessURL, URIRef(np_config['access_url'])))

    if np_config.get('license_uri'):
        assertion.add((dataset_uri, DCT.license, URIRef(np_config['license_uri'])))

    # `or {}` also covers an explicit JSON null, which .get(key, {}) would
    # return as None and then fail on the nested .get().
    is_part_of = metadata.get('is_part_of') or {}
    if is_part_of.get('uri'):
        assertion.add((dataset_uri, DCT.isPartOf, URIRef(is_part_of['uri'])))

    # PROVENANCE: who the assertion is attributed to.
    provenance = ds.graph(provenance_graph)
    provenance.add((assertion_graph, PROV.wasAttributedTo, author_uri))

    # PUBINFO: metadata about the nanopublication itself.
    pubinfo = ds.graph(pubinfo_graph)
    pubinfo.add((author_uri, FOAF.name, Literal(metadata['creator_name'])))

    # UTC timestamp at second precision; the millisecond field is fixed at 000.
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
    pubinfo.add((this_np, DCT.created, Literal(now, datatype=XSD.dateTime)))
    pubinfo.add((this_np, DCT.creator, author_uri))
    pubinfo.add((this_np, DCT.license, URIRef("https://creativecommons.org/licenses/by/4.0/")))
    pubinfo.add((this_np, NPX.wasCreatedAt, URIRef("https://sciencelive4all.org/")))
    pubinfo.add((this_np, NPX.hasNanopubType, DCAT.Dataset))

    # Only an explicitly configured dataset IRI is declared as introduced.
    if configured_uri:
        pubinfo.add((this_np, NPX.introduces, dataset_uri))

    label = f"Dataset: {np_config['label']}"
    pubinfo.add((this_np, RDFS.label, Literal(label)))

    pubinfo.add((this_np, NT.wasCreatedFromTemplate, DATASET_TEMPLATE))
    pubinfo.add((this_np, NT.wasCreatedFromProvenanceTemplate, PROV_TEMPLATE))
    pubinfo.add((this_np, NT.wasCreatedFromPubinfoTemplate, PUBINFO_TEMPLATE_1))
    pubinfo.add((this_np, NT.wasCreatedFromPubinfoTemplate, PUBINFO_TEMPLATE_2))

    return ds, label

print("✓ Function defined")

In [None]:
# Resolve the output directory once and make sure it exists before writing.
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

# Build every configured nanopub and write it as "<id>.trig"; the written
# paths are collected so they can be counted and previewed afterwards.
generated_files = []
for np_config in config['nanopublications']:
    ds, label = create_dataset_nanopub(np_config, metadata)
    output_file = output_dir / f"{np_config['id']}.trig"
    ds.serialize(destination=str(output_file), format='trig')
    generated_files.append(output_file)
    print(f"✓ Generated: {output_file}")

print(f"\nTotal: {len(generated_files)} nanopublications")

In [None]:
# Show the first generated file, if any were produced.
if generated_files:
    print(f"Preview of {generated_files[0]}:\n")
    print("=" * 80)
    # Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) can mis-decode or fail on non-ASCII labels and descriptions.
    with open(generated_files[0], 'r', encoding='utf-8') as f:
        print(f.read())