# In [4]:
#!/usr/bin/env python3
import os
import re
import unicodedata
import json
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL

# -----------------------------------------------------------------------------
# 1. Setup paths and namespaces
# -----------------------------------------------------------------------------
# Every input and output path is resolved against the current working directory.
path = str(Path.cwd())

# Annotated papers to convert. Alternative inputs that have been used:
#   json_file = os.path.join(path, "train_gold.json")
#   vertex_file = os.path.join(path, "train_vertexsets.json")
json_file = os.path.join(path, "train_platinum.json")
# Per-sentence records (pmid, sent_id, sentence, entities) read further below.
tokenized_file = os.path.join(path, "tokenized_sentences_with_entities.json")

# Output directory for the serialized graph; created if it does not exist yet.
save_path = os.path.join(path, "rdf")
Path(save_path).mkdir(parents=True, exist_ok=True)

# Namespaces
# GUTBRAIN is the namespace for resources (instances); GUTPROP is the namespace
# for schema terms (classes and properties).
GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

# Class URIs, all minted inside the schema namespace.
PAPER_CLASS = GUTPROP["Paper"]
MENTION_CLASS = GUTPROP["Mention"]
PAPER_ABSTRACT = GUTPROP["PaperAbstract"]
PAPER_TITLE = GUTPROP["PaperTitle"]
PAPER_COLLECTION = GUTPROP["PaperCollection"]
PROJECT = GUTPROP["Project"]
SAMPLE = GUTPROP["Sample"]
SENTENCE = GUTPROP["Sentence"]

# -----------------------------------------------------------------------------
# 2. Load the JSON paper data
# -----------------------------------------------------------------------------
# `data` maps each paper id (a JSON object key) to that paper's record.
with open(json_file, encoding="utf-8") as paper_file:
    data = json.load(paper_file)
#print("Loaded JSON paper IDs:", list(data.keys()))

# -----------------------------------------------------------------------------
# 3. Mapping dictionaries (keys must be in Title Case, except the acronym "DDF")
# -----------------------------------------------------------------------------
# Base URLs shared by the mapping tables below.
_GUT_SCHEMA = "https://hereditary.dei.unipd.it/ontology/gutbrain/schema/"
_BRAINTEASER_SCHEMA = "https://w3id.org/brainteaser/ontology/schema/"
_CONCEPT_SCHEME_BASE = _GUT_SCHEMA + "conceptScheme/"

# Entity label -> ontology class assigned to entities carrying that label.
label_mapping = {
    label: URIRef(base + local_name)
    for label, base, local_name in (
        ("Anatomical Location",   _BRAINTEASER_SCHEMA, "AnatomicalSite"),
        ("Animal",                _GUT_SCHEMA,         "Animal"),
        ("Biomedical Technique",  _GUT_SCHEMA,         "BiomedicalTechnique"),
        ("Bacteria",              _GUT_SCHEMA,         "Species"),
        ("Chemical",              _GUT_SCHEMA,         "Chemical"),
        ("Dietary Supplement",    _GUT_SCHEMA,         "DietarySupplement"),
        ("DDF",                   _BRAINTEASER_SCHEMA, "DiseaseDisorderOrFinding"),
        ("Drug",                  _GUT_SCHEMA,         "Drug"),
        ("Food",                  _GUT_SCHEMA,         "Food"),
        ("Gene",                  _BRAINTEASER_SCHEMA, "Gene"),
        ("Human",                 _GUT_SCHEMA,         "Human"),
        ("Microbiome",            _GUT_SCHEMA,         "Microbiome"),
        ("Statistical Technique", _GUT_SCHEMA,         "StatisticalTechnique"),
    )
}

# Scheme key -> SKOS concept scheme the entity is placed in. The key is the
# entity label after alias_mapping has been applied.
concept_scheme_mapping = {
    label: URIRef(_CONCEPT_SCHEME_BASE + local_name)
    for label, local_name in (
        ("Anatomical Location",   "AnatomicSite"),
        ("Animal",                "Animal"),
        ("Human",                 "Human"),
        ("Drug",                  "Drug"),
        ("Gene",                  "Gene"),
        ("Dietary Supplement",    "dietarySupplement"),
        ("DDF",                   "diseaseDisorderOrFindingTaxonomy"),
        ("Metabolite",            "Metabolite"),
        ("Species",               "Species"),
        ("Food",                  "Food"),
        ("Chemical",              "Chemical"),
        ("Biomedical Technique",  "BiomedicalTechnique"),
        ("Microbiome",            "Microbiome"),
        ("Statistical Technique", "StatisticalTechnique"),
    )
}

# Labels whose concept scheme is filed under a different key.
alias_mapping = dict(Bacteria="Species")

# -----------------------------------------------------------------------------
# 4. Initialize the RDF graph and bind namespaces
# -----------------------------------------------------------------------------
g = Graph()
for prefix, namespace in (
    ("gutbrain", GUTBRAIN),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("skos", SKOS),
    ("owl", OWL),
    ("gutprop", GUTPROP),
):
    g.bind(prefix, namespace)

g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))

# Object properties, each labelled with its own local name.
# "locatedIn" links mentions with sentence nodes.
# NOTE(review): "contains" is declared in the resource namespace (GUTBRAIN)
# while every other property lives in GUTPROP; the paper loop uses the same
# GUTBRAIN.contains URI, so it is kept as-is here. Confirm whether intended.
for prop_namespace, prop_name in (
    (GUTPROP, "partOf"),
    (GUTPROP, "hasTitle"),
    (GUTPROP, "hasAbstract"),
    (GUTPROP, "containedIn"),
    (GUTBRAIN, "contains"),
    (GUTPROP, "composedOf"),
    (GUTPROP, "locatedIn"),
):
    prop_uri = prop_namespace[prop_name]
    g.add((prop_uri, RDF.type, OWL.ObjectProperty))
    g.add((prop_uri, RDFS.label, Literal(prop_name, datatype=XSD.string)))

# Datatype properties, likewise labelled with their local name.
for prop_name in (
    "paperId",
    "paperAnnotator",
    "paperYear",
    "paperJournal",
    "paperAuthor",
    "numberOfRunsFound",
    "NCBITaxonID",
    "sdRelativeAbundance",
    "medianRelativeAbundance",
    "meanRelativeAbundance",
    "scientificName",
):
    prop_uri = GUTPROP[prop_name]
    g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
    g.add((prop_uri, RDFS.label, Literal(prop_name, datatype=XSD.string)))

# Declare every concept scheme once, labelled with the mapping key(s) that
# point at it. Keys are grouped by URI in a single pass instead of re-scanning
# the whole mapping for each scheme.
labels_by_scheme = {}
for scheme_key, scheme_uri in concept_scheme_mapping.items():
    labels_by_scheme.setdefault(scheme_uri, []).append(scheme_key)

for scheme_uri, keys in labels_by_scheme.items():
    # The keys are used verbatim: they are already Title Case, and calling
    # str.title() on them would mangle the acronym "DDF" into "Ddf".
    label_text = ", ".join(keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

# The collection the papers belong to is inferred from the input file name.
input_name = os.path.basename(json_file)
is_train_platinum = "train_platinum" in input_name

# Gold-collection counterpart, disabled while the platinum file is the input:
#   is_train_gold = "train_gold" in os.path.basename(json_file)
#   if is_train_gold:
#       gold_collection_uri = URIRef(GUTBRAIN["goldCollection"])
#       label_text = "goldCollection"
#       g.add((gold_collection_uri, RDF.type, PAPER_COLLECTION))
#       g.add((gold_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

if is_train_platinum:
    label_text = "platinumCollection"
    platinum_collection_uri = GUTBRAIN[label_text]
    for collection_triple in (
        (platinum_collection_uri, RDF.type, PAPER_COLLECTION),
        (platinum_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)),
    ):
        g.add(collection_triple)

# -----------------------------------------------------------------------------
# Utility functions
# -----------------------------------------------------------------------------
def create_uri_fragment(text):
    """Turn free text into a string usable as the local part of a resource URI.

    ``<...>`` tags are removed, the remainder is passed through
    ``normalize_text``, and every character that is not kept by the character
    class below is replaced with an underscore.
    """
    without_tags = re.sub(r'<[^>]*>', '', text)
    normalized = normalize_text(without_tags)
    # Kept as-is: word characters (letters, digits, underscore), hyphens and
    # the Greek block U+0370-U+03FF. Anything else becomes "_".
    return re.sub(r'[^\w\u0370-\u03FF-]', '_', normalized)

def to_camel_case(s):
    """Convert a phrase to lowerCamelCase.

    Characters that are neither word characters nor whitespace are dropped,
    the rest is split on whitespace, the first word is lower-cased and each
    following word is title-cased. Blank input yields an empty string.
    """
    words = re.sub(r'[^\w\s]', '', s).split()
    if not words:
        return ""
    head, *tail = words
    return head.lower() + ''.join(map(str.title, tail))

def normalize_text(text):
    """Return *text* in Unicode Normalization Form C (canonical composition)."""
    composed = unicodedata.normalize("NFC", text)
    return composed

# -----------------------------------------------------------------------------
# 5. Process each paper (each key in the JSON represents a paper)
# -----------------------------------------------------------------------------
# Optional paper metadata as (JSON key, RDF property, literal datatype); a
# triple is emitted only when the key is present with a non-null value.
_OPTIONAL_PAPER_METADATA = (
    ("annotator", GUTPROP.paperAnnotator, XSD.string),
    ("year", GUTPROP.paperYear, XSD.gYear),
    ("journal", GUTPROP.paperJournal, XSD.string),
    ("author", GUTPROP.paperAuthor, XSD.string),
)


def _clean_span(raw_text):
    """Trim *raw_text* and remove any ``<...>`` tags it contains."""
    return re.sub(r'<[^>]*>', '', raw_text.strip()).strip()


# For every paper: create the paper node and its metadata, one node per
# annotated entity, title/abstract nodes, and one triple per annotated relation.
for paper_id, paper_data in data.items():
    paper_uri = GUTBRAIN[f"paper_{paper_id}"]
    g.add((paper_uri, RDF.type, PAPER_CLASS))

    if is_train_platinum:
        g.add((paper_uri, GUTPROP.partOf, platinum_collection_uri))
        g.add((platinum_collection_uri, GUTBRAIN.contains, paper_uri))

    # Gold-collection counterpart (enable together with is_train_gold):
    #if is_train_gold:
    #    g.add((paper_uri, GUTPROP.partOf, gold_collection_uri))
    #    g.add((gold_collection_uri, GUTBRAIN.contains, paper_uri))

    metadata = paper_data.get("metadata", {})
    full_title = metadata.get("title", None)
    full_abstract = metadata.get("abstract", None)

    # Numeric ids become xsd:integer. A non-numeric id is kept as xsd:string
    # rather than being emitted as an ill-typed xsd:integer literal.
    try:
        paper_id_literal = Literal(int(paper_id), datatype=XSD.integer)
    except ValueError:
        paper_id_literal = Literal(paper_id, datatype=XSD.string)
    g.add((paper_uri, GUTPROP.paperId, paper_id_literal))

    for meta_key, meta_prop, meta_datatype in _OPTIONAL_PAPER_METADATA:
        meta_value = metadata.get(meta_key, None)
        if meta_value is not None:
            g.add((paper_uri, meta_prop, Literal(meta_value, datatype=meta_datatype)))

    # Entity spans per location; used below only as a fallback when the
    # metadata carries no title/abstract.
    title_texts = []
    abstract_texts = []

    for i, entity in enumerate(paper_data.get("entities", [])):
        raw_label = entity.get("label", "").strip()
        # "DDF" is an acronym key; str.title() would turn it into "Ddf".
        label_title = "DDF" if raw_label.lower() == "ddf" else raw_label.title()

        cleaned_text_span = _clean_span(entity.get("text_span", ""))

        location_lower = entity.get("location", "").strip().lower()
        if location_lower == "title":
            title_texts.append(cleaned_text_span)
        elif location_lower == "abstract":
            abstract_texts.append(cleaned_text_span)

        if label_title in label_mapping:
            # Known label: the node URI is derived from the text span alone, so
            # the same span in different papers resolves to the same node.
            entity_node = GUTBRAIN[create_uri_fragment(cleaned_text_span)]
            g.add((entity_node, RDF.type, label_mapping[label_title]))
            g.add((entity_node, RDF.type, SKOS.Concept))
            g.add((entity_node, RDFS.label, Literal(cleaned_text_span, datatype=XSD.string)))

            scheme_key = alias_mapping.get(label_title, label_title)
            if scheme_key in concept_scheme_mapping:
                g.add((entity_node, SKOS.inScheme, concept_scheme_mapping[scheme_key]))
        else:
            # Unknown label: record a generic mention node keyed by paper id
            # and entity index, carrying the raw annotation details.
            entity_node = GUTBRAIN[f"entity_{paper_id}_{i}"]
            g.add((entity_node, RDF.type, MENTION_CLASS))
            g.add((entity_node, RDF.type, SKOS.Concept))
            g.add((entity_node, RDFS.label, Literal(label_title, datatype=XSD.string)))
            g.add((entity_node, GUTPROP.annotator, Literal(metadata.get("annotator", "unknown"), datatype=XSD.string)))
            g.add((entity_node, GUTPROP.location, Literal(entity.get("location", ""), datatype=XSD.string)))
            g.add((entity_node, GUTPROP.text_span, Literal(cleaned_text_span, datatype=XSD.string)))

    if full_title is None and title_texts:
        full_title = " ".join(title_texts)
    if full_abstract is None and abstract_texts:
        full_abstract = " ".join(abstract_texts)

    if full_title:
        title_uri = GUTBRAIN[f"title_{paper_id}"]
        g.add((title_uri, RDF.type, PAPER_TITLE))
        g.add((title_uri, RDFS.comment, Literal(full_title, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasTitle, title_uri))

    if full_abstract:
        abstract_uri = GUTBRAIN[f"abstract_{paper_id}"]
        g.add((abstract_uri, RDF.type, PAPER_ABSTRACT))
        g.add((abstract_uri, RDFS.comment, Literal(full_abstract, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasAbstract, abstract_uri))

    # Relations: endpoint URIs are rebuilt from the text spans. An endpoint
    # with no triples in the graph only triggers a warning; the relation
    # triple is still added.
    for relation in paper_data.get("relations", []):
        cleaned_subj_text = _clean_span(relation.get("subject_text_span", ""))
        subj_uri = GUTBRAIN[create_uri_fragment(cleaned_subj_text)]
        # Membership test with wildcards stops at the first matching triple.
        if (subj_uri, None, None) not in g:
            print(f"Warning: Subject not recognized: {subj_uri}. Info: '{cleaned_subj_text}'")

        cleaned_obj_text = _clean_span(relation.get("object_text_span", ""))
        obj_uri = GUTBRAIN[create_uri_fragment(cleaned_obj_text)]
        if (obj_uri, None, None) not in g:
            print(f"Warning: Object not recognized: {obj_uri}. Info: '{cleaned_obj_text}'")

        # The predicate text becomes a camelCase object property in the schema
        # namespace, declared on the fly (the graph ignores duplicate triples).
        pred_text_clean = to_camel_case(relation.get("predicate", "").strip())
        pred_uri = GUTPROP[pred_text_clean]
        g.add((pred_uri, RDF.type, OWL.ObjectProperty))
        g.add((pred_uri, RDFS.label, Literal(pred_text_clean, datatype=XSD.string)))
        g.add((subj_uri, pred_uri, obj_uri))


# -----------------------------------------------------------------------------
# 6. Link tokenized sentences to titles/abstracts and to entity mentions
# -----------------------------------------------------------------------------
# Mention nodes are numbered sequentially; one node is created per distinct
# canonical entity string and reused for every sentence that string occurs in.
mention_counter = 1
tokenized_mentions = {}

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid = entry.get("pmid", "").strip()
    sent_id = entry.get("sent_id")
    sentence_text = entry.get("sentence", "").strip()

    sentence_uri = GUTBRAIN[f"sentence_{pmid}_{sent_id}"]
    g.add((sentence_uri, RDF.type, SENTENCE))
    g.add((sentence_uri, RDFS.comment, Literal(sentence_text, datatype=XSD.string)))

    # Sentence 0 is attached to the paper's title node, every other sentence
    # to its abstract node.
    section = "title" if sent_id == 0 else "abstract"
    section_uri = GUTBRAIN[f"{section}_{pmid}"]
    g.add((sentence_uri, GUTPROP.partOf, section_uri))
    g.add((section_uri, GUTPROP.composedOf, sentence_uri))

    for ent_text in entry.get("entities", []):
        # Clean the text the same way the paper loop cleans entity spans
        # (tags removed, whitespace trimmed) so both loops derive the same
        # URI for the same entity; skip strings that end up empty.
        cleaned_ent_text = re.sub(r'<[^>]*>', '', ent_text).strip()
        if not cleaned_ent_text:
            continue
        canonical = create_uri_fragment(cleaned_ent_text)

        mention_node = tokenized_mentions.get(canonical)
        if mention_node is None:
            mention_node = GUTBRAIN[f"mention_{mention_counter}"]
            mention_counter += 1
            g.add((mention_node, RDF.type, MENTION_CLASS))
            tokenized_mentions[canonical] = mention_node

        g.add((mention_node, GUTPROP.locatedIn, sentence_uri))
        g.add((GUTBRAIN[canonical], GUTPROP.containedIn, mention_node))

# -----------------------------------------------------------------------------
# 7. Serialize and print the graph in Turtle format
# -----------------------------------------------------------------------------
# Render the whole graph as Turtle text and write it into the output directory.
output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
Path(output_file).write_text(ttl_output, encoding="utf-8")

print(f"The RDF graph has been saved in {output_file}")


# Output: The RDF graph has been saved in C:\Users\samue\OneDrive\Desktop\ThesisPiron\rdf\gutbrain_entities.ttl
