# Dataset Nanopublication Generator

This notebook generates nanopublications for datasets from a JSON configuration file.

## Dataset Nanopublications
Dataset nanopublications describe research datasets in line with the FAIR principles. Each one covers:
- Dataset identification and description
- Access URLs and formats
- Licensing and attribution
- Domain and language metadata

In [None]:
import json
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path.cwd().parent))

from nanopub_utils import (
    NanopubGenerator, load_config, save_nanopub,
    make_uri, make_literal, validate_required_fields,
    generate_nanopub_uri, PREFIXES
)

In [None]:
class DatasetNanopubGenerator(NanopubGenerator):
    """Generator for dataset nanopublications.

    Combines the file-level ``metadata`` section of a configuration with one
    entry of its ``nanopublications`` list and renders the assertion graph
    that describes that dataset.
    """

    def __init__(self, config: dict, nanopub_config: dict):
        """Merge shared metadata with one nanopublication entry.

        Args:
            config: The whole configuration. Its ``metadata`` mapping and
                ``template_uri`` value are read.
            nanopub_config: Settings for a single dataset nanopublication.
                Its keys override same-named ``metadata`` keys.
        """
        # Later keys win: entry values override shared metadata, then
        # 'template_uri' and 'label' are set explicitly.
        # `or {}` tolerates an explicit `"metadata": null` in the JSON.
        # NOTE(review): 'label' is replaced by 'nanopub_label' when that key
        # is present. If the base class exposes this dict as self.config,
        # generate_assertion() uses it as the dataset's rdfs:label too —
        # confirm that is intended.
        merged_config = {
            **(config.get('metadata') or {}),
            **nanopub_config,
            'template_uri': config.get('template_uri'),
            'label': nanopub_config.get('nanopub_label', nanopub_config.get('label', 'Dataset'))
        }
        super().__init__(merged_config)

        # Register the namespace prefixes this generator adds.
        for prefix in ('fdof', 'fair', 'dcat', 'schema'):
            self.add_prefix(prefix)

    def generate_assertion(self) -> str:
        """Generate the dataset assertion graph.

        The dataset is identified by ``dataset_uri`` from the configuration,
        or by ``<nanopub_uri>#dataset`` when that key is missing or empty.
        Optional properties are emitted only when their configuration value
        is truthy. Creators without an ``orcid`` value are not emitted.

        Returns:
            The ``<sub_prefix>:assertion { ... }`` block as a single string,
            one statement per line.
        """
        cfg = self.config

        # Use the configured dataset URI, or fall back to a local one.
        dataset_uri = cfg.get('dataset_uri') or f"{self.nanopub_uri}#dataset"
        subject = f"<{dataset_uri}>"

        lines = [
            f"{self.sub_prefix}:assertion {{",
            # Declare the dataset as a FAIR Digital Object and a Dataset.
            f"  {subject} a fdof:FAIRDigitalObject ;",
            "    a <https://w3id.org/fair/ff/terms/Dataset> ;",
            f"    rdfs:label {make_literal(cfg.get('label', 'Unnamed Dataset'))} .",
        ]

        # (config key, predicate, object is a URI?) in output order.
        optional_statements = (
            ('description', 'rdfs:comment', False),
            ('access_url', 'dcat:accessURL', True),
            ('format', 'fdof:hasEncodingFormat', False),
            ('license_uri', 'dct:license', True),
            ('domain', 'dct:subject', False),
            ('language', 'dct:language', False),
        )
        for key, predicate, is_uri in optional_statements:
            value = cfg.get(key)
            if value:
                obj = f"<{value}>" if is_uri else make_literal(value)
                lines.append(f"  {subject} {predicate} {obj} .")

        # Add creators. `or []` tolerates an explicit `"creators": null`.
        for creator in cfg.get('creators') or []:
            orcid = creator.get('orcid')
            if not orcid:
                continue
            # A bare iD is written as a prefixed name. A full URL cannot be a
            # prefixed-name local part, so it is written as an IRI reference.
            if orcid.startswith(('http://', 'https://')):
                creator_ref = f"<{orcid}>"
            else:
                creator_ref = f"orcid:{orcid}"
            lines.append(f"  {subject} dct:creator {creator_ref} .")
            if creator.get('name'):
                lines.append(f"  {creator_ref} foaf:name {make_literal(creator['name'])} .")

        # Add version
        version = cfg.get('version')
        if version:
            lines.append(f"  {subject} dct:hasVersion {make_literal(version)} .")

        # Add related publication; anything not starting with 'http' is
        # treated as a DOI and resolved through doi.org.
        related_pub = cfg.get('related_publication')
        if related_pub:
            pub_uri = related_pub if related_pub.startswith('http') else f"https://doi.org/{related_pub}"
            lines.append(f"  {subject} dct:isReferencedBy <{pub_uri}> .")

        # Add metadata reference to this nanopub
        lines.append(f"  {subject} fdof:hasMetadata this: .")

        lines.append("}")
        return "\n".join(lines)

In [None]:
# Input and output locations
CONFIG_FILE = "../config/vbae208_dataset.json"  # Point this at another JSON file to switch configs
OUTPUT_DIR = "../output/dataset"

# Make sure the output directory (and any missing parents) exists
output_path = Path(OUTPUT_DIR)
output_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the JSON configuration and summarise what will be generated
config = load_config(CONFIG_FILE)

source_paper = config['metadata']['source_paper']
print(f"Source paper: {source_paper['title']}")
print(f"DOI: {source_paper['doi']}")

planned = config['nanopublications']
print(f"Number of dataset nanopublications to generate: {len(planned)}")
print()

# One numbered entry per planned nanopublication
for position, entry in enumerate(planned, start=1):
    print(f"{position}. {entry['label']}")
    print(f"   Access URL: {entry.get('access_url', 'N/A')}")

In [None]:
# Build one .trig file per configured nanopublication
generated_files = []

for entry in config['nanopublications']:
    # Render this entry through the dataset generator
    trig_text = DatasetNanopubGenerator(config, entry).generate()

    # Write it under OUTPUT_DIR, named after the entry's id
    target = f"{OUTPUT_DIR}/{entry['id']}.trig"
    save_nanopub(trig_text, target)
    generated_files.append(target)

    print(f"Generated: {target}")

print(f"\nTotal generated: {len(generated_files)} nanopublications")

In [None]:
# Show the contents of the first generated file, if any were produced
if generated_files:
    first_file = generated_files[0]
    print(f"Preview of {first_file}:\n")
    print("=" * 80)
    print(Path(first_file).read_text())

## Next Steps

1. Review the generated `.trig` files in the output directory
2. Sign and publish using Nanodash or nanopub-java
3. To generate nanopublications for a different paper, create a new JSON config file and point `CONFIG_FILE` at it