In [66]:
#!/usr/bin/env python3
import os
import csv
import re
from pathlib import Path
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL

# -----------------------------------------------------------------------------
# 1. Paths and namespaces
# -----------------------------------------------------------------------------
# Inputs and outputs are resolved against the current working directory.
path = str(Path.cwd())
csv_file = os.path.join(path, "train_platinum_entities.csv")  # the CSV must be located here
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

# Namespace for the individuals created from the CSV
GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
# Schema namespace; the two class URIs below are terms of it
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

PAPER_CLASS, MENTION_CLASS = GUTPROP.Paper, GUTPROP.Mention

# -----------------------------------------------------------------------------
# 2. Read the CSV, auto-detecting its delimiter
# -----------------------------------------------------------------------------
# Only the first 1024 characters are handed to the sniffer.
with open(csv_file, "r", encoding="utf-8") as f:
    sample = f.read(1024)

try:
    dialect = csv.Sniffer().sniff(sample)
except csv.Error:
    delimiter = ","  # fallback
    print("Impossibile rilevare il delimitatore, uso la virgola ','.")
else:
    delimiter = dialect.delimiter
    print(f"Rilevato delimitatore: '{delimiter}'")

df = pd.read_csv(csv_file, delimiter=delimiter)
# Column names are stripped of surrounding whitespace before any lookup.
df.columns = [name.strip() for name in df.columns]
print("Colonne lette dal CSV:", df.columns.tolist())

# -----------------------------------------------------------------------------
# 3. Mapping dictionaries (keys are in Title format)
# -----------------------------------------------------------------------------
# URI prefixes shared by the entries below.
_GUTBRAIN_SCHEMA = "https://hereditary.dei.unipd.it/ontology/gutbrain/schema/"
_BRAINTEASER_SCHEMA = "https://w3id.org/brainteaser/ontology/schema/"
_CONCEPT_SCHEMES = _GUTBRAIN_SCHEMA + "conceptScheme/"

# Title-formatted annotation label -> URI of the class its instances are typed with.
label_mapping = {
    label: URIRef(prefix + local_name)
    for label, prefix, local_name in [
        ("Anatomical Location",   _BRAINTEASER_SCHEMA, "AnatomicalSite"),
        ("Animal",                _GUTBRAIN_SCHEMA,    "Animal"),
        ("Biomedical Technique",  _GUTBRAIN_SCHEMA,    "BiomedicalTechnique"),
        ("Bacteria",              _GUTBRAIN_SCHEMA,    "Species"),
        ("Chemical",              _GUTBRAIN_SCHEMA,    "Chemical"),
        ("Dietary Supplement",    _GUTBRAIN_SCHEMA,    "DietarySupplement"),
        ("Ddf",                   _BRAINTEASER_SCHEMA, "DiseaseDisorderOrFinding"),
        ("Drug",                  _GUTBRAIN_SCHEMA,    "Drug"),
        ("Food",                  _GUTBRAIN_SCHEMA,    "Food"),
        ("Gene",                  _BRAINTEASER_SCHEMA, "Gene"),
        ("Human",                 _GUTBRAIN_SCHEMA,    "Human"),
        ("Microbiome",            _GUTBRAIN_SCHEMA,    "Microbiome"),
        ("Statistical Technique", _GUTBRAIN_SCHEMA,    "StatisticalTechnique"),
    ]
}

# Title-formatted key -> URI of the concept scheme its concepts belong to.
concept_scheme_mapping = {
    label: URIRef(_CONCEPT_SCHEMES + local_name)
    for label, local_name in [
        ("Anatomical Location",   "AnatomicSite"),
        ("Animal",                "Animal"),
        ("Human",                 "Human"),
        ("Drug",                  "Drug"),
        ("Gene",                  "Gene"),
        ("Dietary Supplement",    "dietarySupplement"),
        ("Ddf",                   "diseaseDisorderOrFindingTaxonomy"),
        ("Metabolite",            "Metabolite"),
        ("Species",               "Species"),
        ("Food",                  "Food"),
        ("Chemical",              "Chemical"),
        ("Biomedical Technique",  "BiomedicalTechnique"),
        ("Microbiome",            "Microbiome"),
        ("Statistical Technique", "StatisticalTechnique"),
    ]
}

# Alias dictionary: maps an alternative label to the concept-scheme key to use for it.
alias_mapping = dict(Bacteria="Species")

# -----------------------------------------------------------------------------
# 4. Initialize the RDF graph and bind the namespace prefixes
# -----------------------------------------------------------------------------
g = Graph()
for prefix, namespace in (
    ("gutbrain", GUTBRAIN),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("skos", SKOS),
    ("owl", OWL),
    ("gutprop", GUTPROP),
):
    g.bind(prefix, namespace)

# -----------------------------------------------------------------------------
# 4.a: Declare skos:inScheme as an OWL ObjectProperty
# -----------------------------------------------------------------------------
g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))

# -----------------------------------------------------------------------------
# 4.b: Create one individual per Concept Scheme
# -----------------------------------------------------------------------------
# Group the mapping keys by the scheme URI they point to, so that a scheme
# shared by several keys gets a single label listing all of them.
keys_by_scheme = {}
for key, uri in concept_scheme_mapping.items():
    keys_by_scheme.setdefault(uri, []).append(key)

for scheme_uri, keys in keys_by_scheme.items():
    label_text = ", ".join(k.title() for k in keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

def create_uri_fragment(text):
    """Return *text* with every space and every comma removed."""
    return text.translate({ord(" "): None, ord(","): None})
    
# -----------------------------------------------------------------------------
# 5. Walk the CSV rows: mapped annotations become typed SKOS concepts attached
#    to a Concept Scheme, unmapped annotations become Mention individuals
# -----------------------------------------------------------------------------
for idx, row in df.iterrows():
    # Every cell is stringified and stripped before use.
    pmid      = str(row["pmid"]).strip()
    annotator = str(row["annotator"]).strip()
    start_idx = str(row["start_idx"]).strip()
    end_idx   = str(row["end_idx"]).strip()
    location  = str(row["location"]).strip()
    text_span = str(row["text_span"]).strip()
    raw_label = str(row["label"]).strip()

    # Bring the label to Title format so it matches the mapping keys
    # (e.g. "bacteria" -> "Bacteria").
    label_title = raw_label.title()

    # Drop any HTML tag from the text span.
    cleaned_text_span = re.sub(r'<[^>]*>', '', text_span).strip()

    # Create (or refer to) the paper the annotation belongs to.
    paper_uri = URIRef(GUTBRAIN[f"paper_{pmid}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))

    if label_title in label_mapping:
        class_uri = label_mapping[label_title]
        instance_name = cleaned_text_span.replace(" ", "_")
        instance_uri = URIRef(GUTBRAIN[instance_name])

        g.add((instance_uri, RDF.type, class_uri))
        g.add((instance_uri, RDF.type, SKOS.Concept))
        g.add((instance_uri, RDFS.label, Literal(cleaned_text_span, datatype=XSD.string)))

        # The concept-scheme key is the label itself unless an alias is defined.
        scheme_key = alias_mapping.get(label_title, label_title)
        if scheme_key in concept_scheme_mapping:
            g.add((instance_uri, SKOS.inScheme, concept_scheme_mapping[scheme_key]))

        #g.add((paper_uri, GUTPROP.containedIn, instance_uri))
    else:
        # Unmapped annotation: keep it as a Mention.
        # The URI carries location and offsets in addition to the pmid;
        # keyed on the pmid alone, all unmapped annotations of a paper would
        # collapse into a single node holding several start/end/text values.
        mention_key = re.sub(r'[^\w-]', '_', f"{pmid}_{location}_{start_idx}_{end_idx}")
        mention_uri = URIRef(GUTBRAIN[f"mention_{mention_key}"])
        g.add((mention_uri, RDF.type, MENTION_CLASS))
        g.add((mention_uri, RDF.type, SKOS.Concept))
        # Properties are terms of the schema namespace (GUTPROP), like the
        # Paper/Mention classes; GUTBRAIN is reserved for individuals.
        g.add((mention_uri, GUTPROP.annotator, Literal(annotator, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.location, Literal(location, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.text_span, Literal(cleaned_text_span, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.label, Literal(label_title, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.start_idx, Literal(int(start_idx), datatype=XSD.integer)))
        g.add((mention_uri, GUTPROP.end_idx, Literal(int(end_idx), datatype=XSD.integer)))
        g.add((paper_uri, GUTPROP.hasMention, mention_uri))

# -----------------------------------------------------------------------------
# 6. Serialize the RDF graph as Turtle and write it to disk
# -----------------------------------------------------------------------------
output_file = os.path.join(save_path, "gutbrain_entities.ttl")
g.serialize(destination=output_file, format="turtle")
print(f"Il grafo RDF è stato salvato in {output_file}")

Rilevato delimitatore: '|'
Colonne lette dal CSV: ['pmid', 'annotator', 'start_idx', 'end_idx', 'location', 'text_span', 'label']
Il grafo RDF è stato salvato in C:\Users\samue\OneDrive\Desktop\ThesisPiron\rdf\gutbrain_entities.ttl


In [8]:
#CSV
#!/usr/bin/env python3
import os
import csv
import re
from pathlib import Path
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL

# -----------------------------------------------------------------------------
# 1. Paths and namespaces
# -----------------------------------------------------------------------------
# Inputs and outputs are resolved against the current working directory.
path = str(Path.cwd())
csv_file = os.path.join(path, "train_platinum_entities.csv")  # entities file
relations_csv = os.path.join(path, "train_platinum_relations.csv")  # relations file
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

# Namespace for the individuals created from the CSV
GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
# Namespace for the properties and the types (schema)
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

PAPER_CLASS, MENTION_CLASS = GUTPROP.Paper, GUTPROP.Mention

# -----------------------------------------------------------------------------
# 2. Read the entities CSV, auto-detecting its delimiter
# -----------------------------------------------------------------------------
# Only the first 1024 characters are handed to the sniffer.
with open(csv_file, "r", encoding="utf-8") as f:
    sample = f.read(1024)

try:
    dialect = csv.Sniffer().sniff(sample)
except csv.Error:
    delimiter = ","
    print("Impossibile rilevare il delimitatore per entità, uso la virgola ','.")
else:
    delimiter = dialect.delimiter
    print(f"Rilevato delimitatore per entità: '{delimiter}'")

df = pd.read_csv(csv_file, delimiter=delimiter)
# Column names are stripped of surrounding whitespace before any lookup.
df.columns = [name.strip() for name in df.columns]
print("Colonne lette dal CSV delle entità:", df.columns.tolist())

# -----------------------------------------------------------------------------
# 3. Mapping dictionaries (keys are in Title format, except the acronym "DDF")
# -----------------------------------------------------------------------------
# URI prefixes shared by the entries below.
_GUTBRAIN_SCHEMA = "https://hereditary.dei.unipd.it/ontology/gutbrain/schema/"
_BRAINTEASER_SCHEMA = "https://w3id.org/brainteaser/ontology/schema/"
_CONCEPT_SCHEMES = _GUTBRAIN_SCHEMA + "conceptScheme/"

# Annotation label -> URI of the class its instances are typed with.
label_mapping = {
    label: URIRef(prefix + local_name)
    for label, prefix, local_name in [
        ("Anatomical Location",   _BRAINTEASER_SCHEMA, "AnatomicalSite"),
        ("Animal",                _GUTBRAIN_SCHEMA,    "Animal"),
        ("Biomedical Technique",  _GUTBRAIN_SCHEMA,    "BiomedicalTechnique"),
        ("Bacteria",              _GUTBRAIN_SCHEMA,    "Species"),
        ("Chemical",              _GUTBRAIN_SCHEMA,    "Chemical"),
        ("Dietary Supplement",    _GUTBRAIN_SCHEMA,    "DietarySupplement"),
        ("DDF",                   _BRAINTEASER_SCHEMA, "DiseaseDisorderOrFinding"),
        ("Drug",                  _GUTBRAIN_SCHEMA,    "Drug"),
        ("Food",                  _GUTBRAIN_SCHEMA,    "Food"),
        ("Gene",                  _BRAINTEASER_SCHEMA, "Gene"),
        ("Human",                 _GUTBRAIN_SCHEMA,    "Human"),
        ("Microbiome",            _GUTBRAIN_SCHEMA,    "Microbiome"),
        ("Statistical Technique", _GUTBRAIN_SCHEMA,    "StatisticalTechnique"),
    ]
}

# Key -> URI of the concept scheme its concepts belong to.
concept_scheme_mapping = {
    label: URIRef(_CONCEPT_SCHEMES + local_name)
    for label, local_name in [
        ("Anatomical Location",   "AnatomicSite"),
        ("Animal",                "Animal"),
        ("Human",                 "Human"),
        ("Drug",                  "Drug"),
        ("Gene",                  "Gene"),
        ("Dietary Supplement",    "dietarySupplement"),
        ("DDF",                   "diseaseDisorderOrFindingTaxonomy"),
        ("Metabolite",            "Metabolite"),
        ("Species",               "Species"),
        ("Food",                  "Food"),
        ("Chemical",              "Chemical"),
        ("Biomedical Technique",  "BiomedicalTechnique"),
        ("Microbiome",            "Microbiome"),
        ("Statistical Technique", "StatisticalTechnique"),
    ]
}

# Alternative label -> concept-scheme key to use for it.
alias_mapping = dict(Bacteria="Species")

# -----------------------------------------------------------------------------
# 4. Initialize the RDF graph and bind the namespace prefixes
# -----------------------------------------------------------------------------
g = Graph()
g.bind("gutbrain", GUTBRAIN)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("owl", OWL)
g.bind("gutprop", GUTPROP)

# -----------------------------------------------------------------------------
# 4.a: Declare skos:inScheme as an OWL ObjectProperty
# -----------------------------------------------------------------------------
g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))

# -----------------------------------------------------------------------------
# 4.b: Create one individual per Concept Scheme
# -----------------------------------------------------------------------------
for scheme_uri in set(concept_scheme_mapping.values()):
    # All the mapping keys that point to this scheme URI.
    keys = [k for k, v in concept_scheme_mapping.items() if v == scheme_uri]
    # The keys are used verbatim: they are already in display form, and
    # str.title() would turn the acronym key "DDF" into "Ddf".
    label_text = ", ".join(keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

def create_uri_fragment(text):
    """Turn *text* into a URI fragment.

    ``<...>`` tags are removed first; afterwards every character that is not
    an ASCII letter, an ASCII digit, an underscore or a hyphen (spaces,
    commas, parentheses, backslashes, ...) is replaced by an underscore.
    """
    tag_free = re.compile(r'<[^>]*>').sub('', text)
    return re.compile(r'[^a-zA-Z0-9_-]').sub('_', tag_free)


def to_camel_case(s):
    """Convert a whitespace-separated string to camelCase.

    The first word is lower-cased, every following word goes through
    ``str.title`` and the pieces are joined without a separator.
    An empty or whitespace-only string yields ``""``.
    """
    # re.split always yields at least one item, so the unpacking cannot fail.
    head, *tail = re.split(r'\s+', s.strip())
    return head.lower() + ''.join(map(str.title, tail))

# -----------------------------------------------------------------------------
# 5. Walk the entities CSV: mapped annotations become typed SKOS concepts
#    attached to a Concept Scheme, unmapped annotations become Mentions
# -----------------------------------------------------------------------------
for idx, row in df.iterrows():
    # Every cell is stringified and stripped before use.
    pmid      = str(row["pmid"]).strip()
    annotator = str(row["annotator"]).strip()
    start_idx = str(row["start_idx"]).strip()
    end_idx   = str(row["end_idx"]).strip()
    location  = str(row["location"]).strip()
    text_span = str(row["text_span"]).strip()
    raw_label = str(row["label"]).strip()

    # "DDF" is kept as an acronym: str.title() would produce "Ddf", which is
    # not a key of the mapping dictionaries.
    label_title = "DDF" if raw_label.lower() == "ddf" else raw_label.title()

    # Drop any HTML tag from the text span.
    # Backslashes are deliberately left untouched: doubling them here would
    # end up in the rdfs:label value, and would make the URI fragment differ
    # from the one the relations step derives from the same, undoubled span
    # (each backslash becomes one underscore in create_uri_fragment).
    cleaned_text_span = re.sub(r'<[^>]*>', '', text_span).strip()

    paper_uri = URIRef(GUTBRAIN[f"paper_{pmid}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))

    if label_title in label_mapping:
        class_uri = label_mapping[label_title]
        instance_fragment = create_uri_fragment(cleaned_text_span)
        instance_uri = URIRef(GUTBRAIN[instance_fragment])

        g.add((instance_uri, RDF.type, class_uri))
        g.add((instance_uri, RDF.type, SKOS.Concept))
        g.add((instance_uri, RDFS.label, Literal(cleaned_text_span, datatype=XSD.string)))

        # The concept-scheme key is the label itself unless an alias is defined.
        scheme_key = alias_mapping.get(label_title, label_title)
        if scheme_key in concept_scheme_mapping:
            g.add((instance_uri, SKOS.inScheme, concept_scheme_mapping[scheme_key]))

        #g.add((paper_uri, GUTPROP.containedIn, instance_uri))
    else:
        # Unmapped annotation: keep it as a Mention.
        # The URI carries location and offsets in addition to the pmid;
        # keyed on the pmid alone, all unmapped annotations of a paper would
        # collapse into a single node holding several start/end/text values.
        mention_fragment = create_uri_fragment(
            f"mention_{pmid}_{location}_{start_idx}_{end_idx}"
        )
        mention_uri = URIRef(GUTBRAIN[mention_fragment])
        g.add((mention_uri, RDF.type, MENTION_CLASS))
        g.add((mention_uri, RDF.type, SKOS.Concept))
        g.add((mention_uri, GUTPROP.annotator, Literal(annotator, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.location, Literal(location, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.text_span, Literal(cleaned_text_span, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.label, Literal(label_title, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.start_idx, Literal(int(start_idx), datatype=XSD.integer)))
        g.add((mention_uri, GUTPROP.end_idx, Literal(int(end_idx), datatype=XSD.integer)))
        g.add((paper_uri, GUTPROP.hasMention, mention_uri))

# -----------------------------------------------------------------------------
# 6. Read the relations CSV and merge it into the RDF graph
# -----------------------------------------------------------------------------
df_rel = pd.read_csv(relations_csv, delimiter="|")
df_rel.columns = [name.strip() for name in df_rel.columns]
print("Colonne lette dal CSV delle relazioni:", df_rel.columns.tolist())

# The last header is read as 'object_label,,'; give it its clean name.
df_rel = df_rel.rename(columns={'object_label,,': 'object_label'})
print("Colonne lette dal CSV delle relazioni:", df_rel.columns.tolist())


def _span_to_uri(raw_span):
    """Return ``(uri, cleaned_text)`` for one raw text-span cell.

    The cell is stringified, stripped, cleared of ``<...>`` tags and turned
    into a resource URI through ``create_uri_fragment``.
    """
    cleaned = re.sub(r'<[^>]*>', '', str(raw_span).strip()).strip()
    return URIRef(GUTBRAIN[create_uri_fragment(cleaned)]), cleaned


for idx, row in df_rel.iterrows():
    # Subject: warn when the graph has no triple with this URI as subject.
    subj_uri, cleaned_subj_text = _span_to_uri(row["subject_text_span"])
    if (subj_uri, None, None) not in g:
        print(f"Warning: Subject not recognized: {subj_uri}. Info: '{cleaned_subj_text}'")

    # Object: same check as for the subject.
    obj_uri, cleaned_obj_text = _span_to_uri(row["object_text_span"])
    if (obj_uri, None, None) not in g:
        print(f"Warning: Object not recognized: {obj_uri}. Info: '{cleaned_obj_text}'")

    # Predicate: camelCased and declared as an OWL ObjectProperty in the schema namespace.
    pred_text = str(row["predicate"]).strip()
    pred_text_clean = to_camel_case(pred_text)
    pred_uri = URIRef(GUTPROP[pred_text_clean])
    print(f"Predicate: '{pred_text}' -> '{pred_text_clean}'")
    g.add((pred_uri, RDF.type, OWL.ObjectProperty))
    g.add((pred_uri, RDFS.label, Literal(pred_text_clean, datatype=XSD.string)))

    # The relation itself is added even when a warning was printed above.
    g.add((subj_uri, pred_uri, obj_uri))

# Serialize the graph as Turtle and write it to disk.
output_file = os.path.join(save_path, "gutbrain_entities.ttl")
g.serialize(destination=output_file, format="turtle")
print(f"Il grafo RDF è stato salvato in {output_file}")

Rilevato delimitatore per entità: '|'
Colonne lette dal CSV delle entità: ['pmid', 'annotator', 'start_idx', 'end_idx', 'location', 'text_span', 'label']
Colonne lette dal CSV delle relazioni: ['pmid', 'annotator', 'subject_start_idx', 'subject_end_idx', 'subject_location', 'subject_text_span', 'subject_label', 'predicate', 'object_start_idx', 'object_end_idx', 'object_location', 'object_text_span', 'object_label,,']
Colonne lette dal CSV delle relazioni: ['pmid', 'annotator', 'subject_start_idx', 'subject_end_idx', 'subject_location', 'subject_text_span', 'subject_label', 'predicate', 'object_start_idx', 'object_end_idx', 'object_location', 'object_text_span', 'object_label']
Predicate: 'target' -> 'target'
Predicate: 'target' -> 'target'
Predicate: 'target' -> 'target'
Predicate: 'impact' -> 'impact'
Predicate: 'change effect' -> 'changeEffect'
Predicate: 'impact' -> 'impact'
Predicate: 'impact' -> 'impact'
Predicate: 'change effect' -> 'changeEffect'
Predicate: 'impact' -> 'impact'


In [2]:
#!/usr/bin/env python3
import os
import re
import unicodedata
import json
from pathlib import Path
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL

# -----------------------------------------------------------------------------
# 1. Setup paths and namespaces
# -----------------------------------------------------------------------------
# Inputs and outputs are resolved against the current working directory.
path = str(Path.cwd())
json_file = os.path.join(path, "train_platinum.json")   # JSON file containing both entities and relations
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

# Namespace for individuals created from the JSON data
GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
# Namespace for schema classes and object properties
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

# Schema classes, all terms of the GUTPROP namespace
PAPER_CLASS       = GUTPROP.Paper
MENTION_CLASS     = GUTPROP.Mention
PAPER_ABSTRACT    = GUTPROP.PaperAbstract
PAPER_TITLE       = GUTPROP.PaperTitle
PAPER_COLLECTION  = GUTPROP.PaperCollection
PROJECT           = GUTPROP.Project
SAMPLE            = GUTPROP.Sample
SENTENCE          = GUTPROP.Sentence

# -----------------------------------------------------------------------------
# 2. Load the JSON file
# -----------------------------------------------------------------------------
# The whole file is read as UTF-8 text and parsed in one go.
data = json.loads(Path(json_file).read_text(encoding="utf-8"))
print("Loaded JSON paper IDs:", [*data.keys()])

# -----------------------------------------------------------------------------
# 3. Mapping dictionaries (keys are in Title case, except the acronym "DDF")
# -----------------------------------------------------------------------------
# URI prefixes shared by the entries below.
_GUTBRAIN_SCHEMA = "https://hereditary.dei.unipd.it/ontology/gutbrain/schema/"
_BRAINTEASER_SCHEMA = "https://w3id.org/brainteaser/ontology/schema/"
_CONCEPT_SCHEMES = _GUTBRAIN_SCHEMA + "conceptScheme/"

# Entity label -> URI of the class its instances are typed with.
label_mapping = {
    label: URIRef(prefix + local_name)
    for label, prefix, local_name in [
        ("Anatomical Location",   _BRAINTEASER_SCHEMA, "AnatomicalSite"),
        ("Animal",                _GUTBRAIN_SCHEMA,    "Animal"),
        ("Biomedical Technique",  _GUTBRAIN_SCHEMA,    "BiomedicalTechnique"),
        ("Bacteria",              _GUTBRAIN_SCHEMA,    "Species"),
        ("Chemical",              _GUTBRAIN_SCHEMA,    "Chemical"),
        ("Dietary Supplement",    _GUTBRAIN_SCHEMA,    "DietarySupplement"),
        ("DDF",                   _BRAINTEASER_SCHEMA, "DiseaseDisorderOrFinding"),
        ("Drug",                  _GUTBRAIN_SCHEMA,    "Drug"),
        ("Food",                  _GUTBRAIN_SCHEMA,    "Food"),
        ("Gene",                  _BRAINTEASER_SCHEMA, "Gene"),
        ("Human",                 _GUTBRAIN_SCHEMA,    "Human"),
        ("Microbiome",            _GUTBRAIN_SCHEMA,    "Microbiome"),
        ("Statistical Technique", _GUTBRAIN_SCHEMA,    "StatisticalTechnique"),
    ]
}
# Key -> URI of the concept scheme its concepts belong to.
concept_scheme_mapping = {
    label: URIRef(_CONCEPT_SCHEMES + local_name)
    for label, local_name in [
        ("Anatomical Location",   "AnatomicSite"),
        ("Animal",                "Animal"),
        ("Human",                 "Human"),
        ("Drug",                  "Drug"),
        ("Gene",                  "Gene"),
        ("Dietary Supplement",    "dietarySupplement"),
        ("DDF",                   "diseaseDisorderOrFindingTaxonomy"),
        ("Metabolite",            "Metabolite"),
        ("Species",               "Species"),
        ("Food",                  "Food"),
        ("Chemical",              "Chemical"),
        ("Biomedical Technique",  "BiomedicalTechnique"),
        ("Microbiome",            "Microbiome"),
        ("Statistical Technique", "StatisticalTechnique"),
    ]
}
# Alternative label -> concept-scheme key to use for it.
alias_mapping = dict(Bacteria="Species")

# -----------------------------------------------------------------------------
# 4. Initialize the RDF graph and bind namespaces
# -----------------------------------------------------------------------------
g = Graph()
g.bind("gutbrain", GUTBRAIN)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("owl", OWL)
g.bind("gutprop", GUTPROP)

# Declare skos:inScheme as an ObjectProperty
g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))

# Declare new object properties for collection, title, and abstract linking
g.add((GUTPROP.partOf, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.partOf, RDFS.label, Literal("partOf", datatype=XSD.string)))
g.add((GUTPROP.hasTitle, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasTitle, RDFS.label, Literal("hasTitle", datatype=XSD.string)))
g.add((GUTPROP.hasAbstract, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasAbstract, RDFS.label, Literal("hasAbstract", datatype=XSD.string)))

# Create individuals for concept schemes
for scheme_uri in set(concept_scheme_mapping.values()):
    # All the mapping keys that point to this scheme URI.
    keys = [k for k, v in concept_scheme_mapping.items() if v == scheme_uri]
    # The keys are used verbatim: they are already in display form, and
    # str.title() would turn the acronym key "DDF" into "Ddf".
    label_text = ", ".join(keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

# If the dataset is train_platinum, create an instance of the Platinum Collection.
# NOTE: platinum_collection_uri is only defined when is_train_platinum is true,
# so any later use must be guarded by the same flag.
is_train_platinum = "train_platinum" in os.path.basename(json_file)
if is_train_platinum:
    platinum_collection_uri = URIRef(GUTBRAIN["platinumCollection"])
    g.add((platinum_collection_uri, RDF.type, PAPER_COLLECTION))

# -----------------------------------------------------------------------------
# Utility functions
# -----------------------------------------------------------------------------
def create_uri_fragment(text):
    """Turn *text* into a URI fragment.

    ``<...>`` tags are removed, the remainder is passed through
    ``normalize_text``, and every character that is neither a word character,
    a character in the range U+0370-U+03FF (the Greek block) nor a hyphen is
    replaced by an underscore.
    """
    tag_free = re.sub(r'<[^>]*>', '', text)
    return re.sub(r'[^\w\u0370-\u03FF-]', '_', normalize_text(tag_free))

def to_camel_case(s):
    """Convert a free-text phrase to camelCase.

    Characters that are neither word characters nor whitespace are dropped
    first. The first remaining word is lower-cased, every following word goes
    through ``str.title`` and the pieces are joined without a separator.
    A string with no word characters yields ``""``.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', s)
    # re.split always yields at least one item, so the unpacking cannot fail.
    first_word, *other_words = re.split(r'\s+', without_punctuation.strip())
    capitalised = [word.title() for word in other_words]
    return first_word.lower() + ''.join(capitalised)

def normalize_text(text):
    """Return *text* converted to Unicode Normalization Form C (NFC)."""
    nfc_text = unicodedata.normalize('NFC', text)
    return nfc_text

# -----------------------------------------------------------------------------
# 5. Process each paper (each key in JSON represents a paper)
# -----------------------------------------------------------------------------
def _clean_span(raw_text):
    """Return an annotated span with outer whitespace and HTML-like tags removed."""
    return re.sub(r'<[^>]*>', '', raw_text.strip()).strip()


for paper_id, paper_data in data.items():
    # Create a paper individual
    paper_uri = URIRef(GUTBRAIN[f"paper_{paper_id}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))

    # If dataset is train_platinum, link the paper to the Platinum Collection instance using partOf
    if is_train_platinum:
        g.add((paper_uri, GUTPROP.partOf, platinum_collection_uri))

    # A title / abstract individual is created only when at least one entity
    # of this paper reports that location.
    entities = paper_data.get("entities", [])
    locations = {entity.get("location", "").strip().lower() for entity in entities}
    if "title" in locations:
        title_uri = URIRef(GUTBRAIN[f"title_{paper_id}"])
        g.add((title_uri, RDF.type, PAPER_TITLE))
        g.add((paper_uri, GUTPROP.hasTitle, title_uri))
    if "abstract" in locations:
        abstract_uri = URIRef(GUTBRAIN[f"abstract_{paper_id}"])
        g.add((abstract_uri, RDF.type, PAPER_ABSTRACT))
        g.add((paper_uri, GUTPROP.hasAbstract, abstract_uri))

    # Process entities from this paper
    for entity_index, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        # "DDF" is an acronym, so it must not be title-cased to "Ddf".
        label_title = "DDF" if raw_label.lower() == "ddf" else raw_label.title()
        cleaned_text_span = _clean_span(entity.get("text_span", ""))

        if label_title in label_mapping:
            instance_fragment = create_uri_fragment(cleaned_text_span)
            if not instance_fragment:
                # An empty fragment would turn the bare namespace URI into an
                # individual, so such entities are skipped.
                print(f"Warning: Skipping '{label_title}' entity with empty text span in paper {paper_id}")
                continue
            instance_uri = URIRef(GUTBRAIN[instance_fragment])

            class_uri = label_mapping[label_title]
            g.add((instance_uri, RDF.type, class_uri))
            g.add((instance_uri, RDF.type, SKOS.Concept))
            g.add((instance_uri, RDFS.label, Literal(cleaned_text_span, datatype=XSD.string)))

            # Resolve label aliases before looking up the concept scheme.
            scheme_key = label_title
            if label_title in alias_mapping:
                scheme_key = alias_mapping[label_title]
            if scheme_key in concept_scheme_mapping:
                scheme_uri = concept_scheme_mapping[scheme_key]
                g.add((instance_uri, SKOS.inScheme, scheme_uri))
            # NOTE: recognized entities are not linked back to the paper here.
        else:
            # Create a mention individual for unrecognized labels. The entity
            # index is part of the URI so that several unrecognized entities of
            # the same paper do not collapse into a single individual.
            mention_uri = URIRef(GUTBRAIN[f"mention_{paper_id}_{entity_index}"])
            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDF.type, SKOS.Concept))
            annotator = paper_data.get("metadata", {}).get("annotator", "unknown")
            g.add((mention_uri, GUTPROP.annotator, Literal(annotator, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.location, Literal(entity.get("location", ""), datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.text_span, Literal(cleaned_text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.label, Literal(label_title, datatype=XSD.string)))
            g.add((paper_uri, GUTPROP.hasMention, mention_uri))

    # Process relations from this paper
    for relation in paper_data.get("relations", []):
        cleaned_subj_text = _clean_span(relation.get("subject_text_span", ""))
        subj_fragment = create_uri_fragment(cleaned_subj_text)

        cleaned_obj_text = _clean_span(relation.get("object_text_span", ""))
        obj_fragment = create_uri_fragment(cleaned_obj_text)

        pred_text = relation.get("predicate", "").strip()
        pred_text_clean = to_camel_case(pred_text)

        if not (subj_fragment and obj_fragment and pred_text_clean):
            # Any empty part would resolve to the bare namespace URI and
            # produce a meaningless triple.
            print(
                f"Warning: Skipping incomplete relation in paper {paper_id}: "
                f"'{cleaned_subj_text}' -[{pred_text}]-> '{cleaned_obj_text}'"
            )
            continue

        # Existence checks use `in`, which stops at the first matching triple
        # instead of materializing all of them.
        subj_uri = URIRef(GUTBRAIN[subj_fragment])
        if (subj_uri, None, None) not in g:
            print(f"Warning: Subject not recognized: {subj_uri}. Info: '{cleaned_subj_text}'")

        obj_uri = URIRef(GUTBRAIN[obj_fragment])
        if (obj_uri, None, None) not in g:
            print(f"Warning: Object not recognized: {obj_uri}. Info: '{cleaned_obj_text}'")

        pred_uri = URIRef(GUTPROP[pred_text_clean])
        print(f"Predicate: '{pred_text}' -> '{pred_text_clean}'")
        g.add((pred_uri, RDF.type, OWL.ObjectProperty))
        g.add((pred_uri, RDFS.label, Literal(pred_text_clean, datatype=XSD.string)))

        g.add((subj_uri, pred_uri, obj_uri))

# -----------------------------------------------------------------------------
# 6.b (Optional) Process any ternary relations if needed...
# -----------------------------------------------------------------------------
# (You could add loops for binary_tag_based_relations, ternary_tag_based_relations,
#  and ternary_mention_based_relations if required, following a similar approach.)

# -----------------------------------------------------------------------------
# 7. Serialize and print the graph in Turtle format
# -----------------------------------------------------------------------------
# Render the whole graph as Turtle text, then write it as UTF-8 into the
# output directory.
output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, mode="w", encoding="utf-8") as ttl_handle:
    ttl_handle.write(ttl_output)
print(f"The RDF graph has been saved in {output_file}")


Loaded JSON paper IDs: ['38068763', '35965349', '34870091', '28158162', '34172092', '37845499', '37371676', '37574818', '37571393', '37841274', '37485660', '31955786', '34098340', '38350463', '29352709', '33511258', '33422110', '34985325', '36550591', '30459574', '38026003', '33194817', '29022384', '29857583', '34758889', '37881577', '36984505', '32979562', '34961418', '25034760', '33067915', '33271210', '36794003', '38132705', '36900437', '34603341', '34422393', '35914559', '38422755', '37228957', '30717162', '31248424', '37469436', '31179435', '37995075', '35326429', '31083360', '38010793', '31685046', '34444820', '34092293', '37927130', '35432226', '36757367', '36493975', '37213508', '33046051', '38204948', '31952911', '29023380', '28572752', '36346385', '32459708', '33177907', '38089822', '31646148', '23981537', '37657622', '36760344', '33722869', '34776854', '28976454', '31053995', '38576868', '37511699', '37464164', '37368331', '37396336', '36517709', '37978477', '33713734', '348