# Dataset Nanopublication Generator

Creates dataset description nanopublications from a JSON configuration file.

**Template:** [Dataset Description Template](https://w3id.org/np/RAuVB37yyAuAlgusrUAoG84JI4_EfrEqIkpEZYDpSz3d8)

## Dataset Nanopublications
Each nanopublication documents one dataset following the FAIR principles, covering:
- Dataset identification and description
- Access URLs and formats
- Licensing and attribution

In [1]:
import json
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path.cwd().parent))

from nanopub_utils import (
    NanopubGenerator, load_config, save_nanopub,
    make_literal, PREFIXES
)

In [2]:
# Dataset-specific constants, grouped by the vocabulary each IRI belongs to.

# Nanopublication template that the generated dataset nanopubs point at.
DATASET_TEMPLATE = "https://w3id.org/np/RAuVB37yyAuAlgusrUAoG84JI4_EfrEqIkpEZYDpSz3d8"

# DCAT (http://www.w3.org/ns/dcat#)
DCAT_DATASET = "http://www.w3.org/ns/dcat#Dataset"
DCAT_ACCESS_URL = "http://www.w3.org/ns/dcat#accessURL"

# Dublin Core terms (http://purl.org/dc/terms/)
DCT_LICENSE = "http://purl.org/dc/terms/license"
DCT_IS_PART_OF = "http://purl.org/dc/terms/isPartOf"

# RDF Schema (http://www.w3.org/2000/01/rdf-schema#)
RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
# NOTE(review): despite the DCT_ prefix in its name, this constant holds the
# rdfs:comment IRI. Confirm whether that is what the template expects or
# whether dct:description was intended.
DCT_DESCRIPTION = "http://www.w3.org/2000/01/rdf-schema#comment"

class DatasetNanopubGenerator(NanopubGenerator):
    """Generator for dataset nanopublications.

    Combines the file-level ``metadata`` section with a single entry of the
    ``nanopublications`` list and renders an assertion graph that types the
    dataset as ``dcat:Dataset``.
    """

    # Dataset label (without the "Dataset: " prefix) seen by the most recent
    # generate_assertion() call; None until the first call. Used to keep
    # repeated calls from stacking the prefix.
    _raw_label = None

    def __init__(self, config: dict, nanopub_config: dict):
        """Build the merged configuration and initialise the base generator.

        Args:
            config: Full parsed configuration; only its optional ``metadata``
                mapping is used here.
            nanopub_config: One entry of the ``nanopublications`` list. Its
                keys override same-named ``metadata`` keys.
        """
        # Later entries win: per-nanopub settings override shared metadata,
        # and the template URI is always the dataset template.
        merged_config = {
            **config.get('metadata', {}),
            **nanopub_config,
            'template_uri': DATASET_TEMPLATE,
        }
        super().__init__(merged_config)

        # Add nanopub type for Dataset
        self.add_nanopub_type(DCAT_DATASET)

        # isPartOf goes in assertion for Dataset (links the dataset entity)
        self.is_part_of_in_assertion = True

    def generate_assertion(self) -> str:
        """Generate the dataset assertion graph.

        Reads ``label`` (required) plus the optional ``description``,
        ``dataset_uri``, ``access_url``, ``license_uri`` and ``is_part_of``
        keys from ``self.config``. As a side effect, ``self.config['label']``
        is replaced by ``'Dataset: <label>'``.

        The method is safe to call more than once: if ``self.config['label']``
        still holds the value written by a previous call, the original
        dataset label is reused instead of being prefixed a second time.

        Returns:
            The assertion graph as TriG text.

        Raises:
            KeyError: If ``label`` is missing from the configuration.
        """
        current_label = self.config['label']
        written_by_previous_call = (
            self._raw_label is not None
            and current_label == f'Dataset: {self._raw_label}'
        )
        if written_by_previous_call:
            label = self._raw_label
        else:
            # First call, or the caller assigned a fresh label since.
            label = current_label
            self._raw_label = label

        description = self.config.get('description', '')
        dataset_uri = self.config.get('dataset_uri')
        access_url = self.config.get('access_url')
        license_uri = self.config.get('license_uri')
        is_part_of = self.config.get('is_part_of')

        # Set nanopub label
        self.config['label'] = f'Dataset: {label}'

        # Determine the dataset subject: an external URI when configured,
        # otherwise a local name under this nanopub's sub-prefix.
        if dataset_uri:
            subject = f'<{dataset_uri}>'
            self.set_introduces(dataset_uri)
        else:
            subject = f'{self.sub_prefix}:dataset'

        # Collect predicates; the type and label are always present.
        predicates = [
            f'    a <{DCAT_DATASET}>',
            f'    <{RDFS_LABEL}> {make_literal(label)}',
        ]

        if description:
            predicates.append(f'    <{DCT_DESCRIPTION}> {make_literal(description)}')

        if access_url:
            predicates.append(f'    <{DCAT_ACCESS_URL}> <{access_url}>')

        if license_uri:
            predicates.append(f'    <{DCT_LICENSE}> <{license_uri}>')

        # isPartOf (link dataset to systematic review)
        if is_part_of and is_part_of.get('uri'):
            predicates.append(f'    <{DCT_IS_PART_OF}> <{is_part_of["uri"]}>')

        # Every predicate but the last ends with ';', the last one with ' .'.
        predicate_block = ';\n'.join(predicates) + ' .'

        return '\n'.join([
            f'{self.sub_prefix}:assertion {{',
            f'  {subject}',
            predicate_block,
            '}',
        ])

In [3]:
# Configuration
# Point CONFIG_FILE at the dataset config to process. Only one assignment is
# kept active; the other config previously listed here is:
#   "../config/vbae208/vbae208_dataset.json"
CONFIG_FILE = "../config/clenet2025/clenet2025_dataset.json"
OUTPUT_DIR = "../output/dataset"

# Create output directory (including missing parents; no error if it exists)
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [4]:
# Load the configuration and print a short summary of what will be generated.
config = load_config(CONFIG_FILE)

metadata = config['metadata']
print(f"Source paper: {metadata['source_paper']['title']}")

np_entries = config['nanopublications']
print(f"Number of dataset nanopublications to generate: {len(np_entries)}")

# The parent review is optional; only mention it when configured.
parent_review = metadata.get('is_part_of')
if parent_review:
    print(f"Part of: {parent_review['label']}")
print()

# Numbered list of the dataset labels, starting at 1.
for position, entry in enumerate(np_entries, start=1):
    print(f"{position}. {entry['label']}")

Source paper: QOMIC: quantum optimization for motif identification
Number of dataset nanopublications to generate: 2
Part of: Quantum Computing Applications for Biodiversity Research - Systematic Review

1. TRRUST - Human Transcriptional Regulatory Network Database
2. QOMIC Synthetic Benchmark Networks


In [5]:
# Generate one .trig file per configured nanopublication and remember the paths.
generated_files = []

for entry in config['nanopublications']:
    # Render first, then derive the target path from the entry's id.
    rendered = DatasetNanopubGenerator(config, entry).generate()
    destination = f"{OUTPUT_DIR}/{entry['id']}.trig"

    save_nanopub(rendered, destination)
    generated_files.append(destination)

    print(f"Generated: {destination}")

print(f"\nTotal generated: {len(generated_files)} nanopublications")

Generated: ../output/dataset/dataset_trrust.trig
Generated: ../output/dataset/dataset_qomic_synthetic.trig

Total generated: 2 nanopublications


In [6]:
# Show the contents of the first generated file, if anything was generated.
if generated_files:
    first_file = generated_files[0]
    print(f"Preview of {first_file}:\n")
    print("=" * 80)
    # NOTE(review): read with the platform default encoding, as before;
    # confirm it matches how save_nanopub writes the file.
    print(Path(first_file).read_text())

Preview of ../output/dataset/dataset_trrust.trig:

@prefix this: <https://w3id.org/np/RA22bd15979d73bbb0e1612e5f89df9acaaa35e2d8cbb> .
@prefix sub: <https://w3id.org/np/RA22bd15979d73bbb0e1612e5f89df9acaaa35e2d8cbb/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix np: <http://www.nanopub.org/nschema#> .
@prefix npx: <http://purl.org/nanopub/x/> .
@prefix nt: <https://w3id.org/np/o/ntemplate/> .
@prefix orcid: <https://orcid.org/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

sub:Head {
  this: a np:Nanopublication ;
    np:hasAssertion sub:assertion ;
    np:hasProvenance sub:provenance ;
    np:hasPublicationInfo sub:pubinfo .
}

sub:assertion {
  <https://www.grnpedia.org/trrust/>
    a <http://www.w3.org/ns/dcat#Dataset>;
    <http://www.w3.org/2000/01/rdf-schema#label> … [preview output truncated]

## JSON Config Structure

```json
{
  "metadata": {
    "source_paper": { "title": "...", "doi": "..." },
    "creator_orcid": "0000-0000-0000-0000",
    "creator_name": "Your Name",
    "is_part_of": {
      "uri": "https://w3id.org/np/YOUR-REVIEW-URI",
      "label": "Your Systematic Review Title"
    }
  },
  "nanopublications": [
    {
      "id": "dataset_mydata",
      "label": "My Dataset Name",
      "description": "Description of the dataset",
      "dataset_uri": "https://doi.org/10.xxxx/dataset",
      "access_url": "https://example.com/download"
    }
  ]
}
```

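The `is_part_of` entry creates a `dct:isPartOf` triple in the **assertion**, linking the dataset entity to your systematic review; the triple is only emitted when `is_part_of.uri` is set. Each nanopublication entry may also include an optional `license_uri`, which is emitted as a `dct:license` triple.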