In [66]:
#!/usr/bin/env python3
import os
import csv
import re
from pathlib import Path
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL

# -----------------------------------------------------------------------------
# 1. Setup dei percorsi e dei namespace
# -----------------------------------------------------------------------------
path = str(Path(os.path.abspath(os.getcwd())).absolute())
csv_file = os.path.join(path, "train_platinum_entities.csv")  # Assicurati che il file sia in questo percorso
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

# Namespace per gli individuali creati dal CSV
GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

PAPER_CLASS   = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Paper")
MENTION_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Mention")

# -----------------------------------------------------------------------------
# 2. Lettura del CSV con rilevamento automatico del delimitatore
# -----------------------------------------------------------------------------
with open(csv_file, "r", encoding="utf-8") as f:
    sample = f.read(1024)
    f.seek(0)
    try:
        dialect = csv.Sniffer().sniff(sample)
        delimiter = dialect.delimiter
        print(f"Rilevato delimitatore: '{delimiter}'")
    except csv.Error:
        delimiter = ","  # fallback
        print("Impossibile rilevare il delimitatore, uso la virgola ','.")

df = pd.read_csv(csv_file, delimiter=delimiter)
df.columns = [col.strip() for col in df.columns]
print("Colonne lette dal CSV:", df.columns.tolist())

# -----------------------------------------------------------------------------
# 3. Dizionari di mapping (le chiavi sono in formato Title)
# -----------------------------------------------------------------------------
# Mapping per le label che associano la label (formattata come Title) all'URI della classe
label_mapping = {
    "Anatomical Location":   URIRef("https://w3id.org/brainteaser/ontology/schema/AnatomicalSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Animal"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/BiomedicalTechnique"),
    "Bacteria":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Chemical"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/DietarySupplement"),
    "Ddf":                   URIRef("https://w3id.org/brainteaser/ontology/schema/DiseaseDisorderOrFinding"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Drug"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Food"),
    "Gene":                  URIRef("https://w3id.org/brainteaser/ontology/schema/Gene"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Human"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/StatisticalTechnique")
}

# Mapping per i concept scheme: le chiavi devono essere in formato Title.
concept_scheme_mapping = {
    "Anatomical Location":   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/AnatomicSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Animal"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Human"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Drug"),
    "Gene":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Gene"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/dietarySupplement"),
    "Ddf":                   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/diseaseDisorderOrFindingTaxonomy"),
    "Metabolite":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Metabolite"),
    "Species":               URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Species"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Food"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Chemical"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/BiomedicalTechnique"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/StatisticalTechnique")
}

# Dizionario alias: per mappare eventuali termini alternativi alla chiave di concept scheme
alias_mapping = {
    "Bacteria": "Species"
}

# -----------------------------------------------------------------------------
# 4. Inizializza il grafo RDF e imposta il binding dei namespace
# -----------------------------------------------------------------------------
g = Graph()
g.bind("gutbrain", GUTBRAIN)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("owl", OWL)
g.bind("gutprop", GUTPROP)

# -----------------------------------------------------------------------------
# 4.a: Dichiarazione di skos:inScheme come ObjectProperty (OWL)
# -----------------------------------------------------------------------------
g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))

# -----------------------------------------------------------------------------
# 4.b: Crea gli individuali per i Concept Scheme
# -----------------------------------------------------------------------------
for scheme_uri in set(concept_scheme_mapping.values()):
    # Trova tutte le chiavi che mappano a questo URI
    keys = [k for k, v in concept_scheme_mapping.items() if v == scheme_uri]
    # Costruisci l'etichetta: usa .title() per capitalizzare
    label_text = ", ".join(k.title() for k in keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

def create_uri_fragment(text):
    # Rimuovi spazi e virgole
    return re.sub(r'[ ,]', '', text)
    
# -----------------------------------------------------------------------------
# 5. Itera su ogni riga del CSV per creare le istanze e associarle al Concept Scheme
# -----------------------------------------------------------------------------
for idx, row in df.iterrows():
    pmid      = str(row["pmid"]).strip()
    annotator = str(row["annotator"]).strip()
    start_idx = str(row["start_idx"]).strip()
    end_idx   = str(row["end_idx"]).strip()
    location  = str(row["location"]).strip()
    text_span = str(row["text_span"]).strip()
    raw_label = str(row["label"]).strip()

    # Converti la label in formato Title per uniformarla (es. "bacteria" -> "Bacteria")
    label_title = raw_label.title()

    # Rimuove eventuali tag HTML dal campo text_span
    cleaned_text_span = re.sub(r'<[^>]*>', '', text_span).strip()

    # Crea o riferisci l'istanza del paper associato
    paper_uri = URIRef(GUTBRAIN[f"paper_{pmid}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))
    
    if label_title in label_mapping:
        class_uri = label_mapping[label_title]
        instance_name = cleaned_text_span.replace(" ", "_")
        instance_uri = URIRef(GUTBRAIN[instance_name])
        
        g.add((instance_uri, RDF.type, class_uri))
        g.add((instance_uri, RDF.type, SKOS.Concept))
        g.add((instance_uri, RDFS.label, Literal(cleaned_text_span, datatype=XSD.string)))
        
        # Determina la chiave da usare per il Concept Scheme: usa l'alias se esiste
        scheme_key = label_title
        if label_title in alias_mapping:
            scheme_key = alias_mapping[label_title]
        
        if scheme_key in concept_scheme_mapping:
            scheme_uri = concept_scheme_mapping[scheme_key]
            g.add((instance_uri, SKOS.inScheme, scheme_uri))
        
        #g.add((paper_uri, GUTPROP.containedIn, instance_uri))
    else:
        # Gestione come una Mention per le annotazioni non mappate
        mention_uri = URIRef(GUTBRAIN[f"mention_{pmid}"])
        g.add((mention_uri, RDF.type, MENTION_CLASS))
        g.add((mention_uri, RDF.type, SKOS.Concept))
        g.add((mention_uri, GUTBRAIN.annotator, Literal(annotator, datatype=XSD.string)))
        g.add((mention_uri, GUTBRAIN.location, Literal(location, datatype=XSD.string)))
        g.add((mention_uri, GUTBRAIN.text_span, Literal(cleaned_text_span, datatype=XSD.string)))
        g.add((mention_uri, GUTBRAIN.label, Literal(label_title, datatype=XSD.string)))
        g.add((mention_uri, GUTBRAIN.start_idx, Literal(int(start_idx), datatype=XSD.integer)))
        g.add((mention_uri, GUTBRAIN.end_idx, Literal(int(end_idx), datatype=XSD.integer)))
        g.add((paper_uri, GUTBRAIN.hasMention, mention_uri))

# -----------------------------------------------------------------------------
# 6. Serializza il grafo RDF in formato Turtle e salva su file
# -----------------------------------------------------------------------------
output_file = os.path.join(save_path, "gutbrain_entities.ttl")
g.serialize(destination=output_file, format="turtle")
print(f"Il grafo RDF è stato salvato in {output_file}")

Rilevato delimitatore: '|'
Colonne lette dal CSV: ['pmid', 'annotator', 'start_idx', 'end_idx', 'location', 'text_span', 'label']
Il grafo RDF è stato salvato in C:\Users\samue\OneDrive\Desktop\ThesisPiron\rdf\gutbrain_entities.ttl


In [102]:
#!/usr/bin/env python3
import os
import csv
import re
from pathlib import Path
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL

# -----------------------------------------------------------------------------
# 1. Setup dei percorsi e dei namespace
# -----------------------------------------------------------------------------
path = str(Path(os.path.abspath(os.getcwd())).absolute())
csv_file = os.path.join(path, "train_platinum_entities.csv")  # File delle entità
relations_csv = os.path.join(path, "train_platinum_relations.csv")  # File delle relazioni
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

# Namespace per gli individuali creati dal CSV
GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
# Namespace per le proprietà e per i tipi (schema)
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

PAPER_CLASS   = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Paper")
MENTION_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Mention")

# -----------------------------------------------------------------------------
# 2. Lettura del CSV delle entità con rilevamento automatico del delimitatore
# -----------------------------------------------------------------------------
with open(csv_file, "r", encoding="utf-8") as f:
    sample = f.read(1024)
    f.seek(0)
    try:
        dialect = csv.Sniffer().sniff(sample)
        delimiter = dialect.delimiter
        print(f"Rilevato delimitatore per entità: '{delimiter}'")
    except csv.Error:
        delimiter = ","  
        print("Impossibile rilevare il delimitatore per entità, uso la virgola ','.")

df = pd.read_csv(csv_file, delimiter=delimiter)
df.columns = [col.strip() for col in df.columns]
print("Colonne lette dal CSV delle entità:", df.columns.tolist())

# -----------------------------------------------------------------------------
# 3. Dizionari di mapping (le chiavi sono in formato Title)
# -----------------------------------------------------------------------------
label_mapping = {
    "Anatomical Location":   URIRef("https://w3id.org/brainteaser/ontology/schema/AnatomicalSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Animal"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/BiomedicalTechnique"),
    "Bacteria":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Chemical"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/DietarySupplement"),
    "DDF":                   URIRef("https://w3id.org/brainteaser/ontology/schema/DiseaseDisorderOrFinding"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Drug"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Food"),
    "Gene":                  URIRef("https://w3id.org/brainteaser/ontology/schema/Gene"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Human"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/StatisticalTechnique")
}

concept_scheme_mapping = {
    "Anatomical Location":   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/AnatomicSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Animal"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Human"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Drug"),
    "Gene":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Gene"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/dietarySupplement"),
    "DDF":                   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/diseaseDisorderOrFindingTaxonomy"),
    "Metabolite":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Metabolite"),
    "Species":               URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Species"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Food"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Chemical"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/BiomedicalTechnique"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/StatisticalTechnique")
}

alias_mapping = {
    "Bacteria": "Species"
}

# -----------------------------------------------------------------------------
# 4. Inizializza il grafo RDF e imposta il binding dei namespace
# -----------------------------------------------------------------------------
g = Graph()
g.bind("gutbrain", GUTBRAIN)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("owl", OWL)
g.bind("gutprop", GUTPROP)

# -----------------------------------------------------------------------------
# 4.a: Dichiarazione di skos:inScheme come ObjectProperty (OWL)
# -----------------------------------------------------------------------------
g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))

# -----------------------------------------------------------------------------
# 4.b: Crea gli individuali per i Concept Scheme
# -----------------------------------------------------------------------------
for scheme_uri in set(concept_scheme_mapping.values()):
    keys = [k for k, v in concept_scheme_mapping.items() if v == scheme_uri]
    label_text = ", ".join(k.title() for k in keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

# Funzione per creare un frammento URI senza spazi, virgole e parentesi
def create_uri_fragment(text):
    # Rimuove i tag HTML, se presenti
    cleaned = re.sub(r'<[^>]*>', '', text)
    # Sostituisce / rimuove caratteri non validi (ad es. backslash, parentesi, virgole, spazi)
    # Es.: sostituisci tutti i caratteri NON a-zA-Z0-9_- con underscore
    cleaned = re.sub(r'[^a-zA-Z0-9_-]', '_', cleaned)
    return cleaned


# Definizione della funzione per convertire una stringa in camelCase
def to_camel_case(s):
    parts = re.split(r'\s+', s.strip())
    if not parts:
        return ""
    return parts[0].lower() + ''.join(word.title() for word in parts[1:])

# -----------------------------------------------------------------------------
# 5. Itera sul CSV delle entità per creare le istanze e associarle al Concept Scheme
# -----------------------------------------------------------------------------
for idx, row in df.iterrows():
    pmid      = str(row["pmid"]).strip()
    annotator = str(row["annotator"]).strip()
    start_idx = str(row["start_idx"]).strip()
    end_idx   = str(row["end_idx"]).strip()
    location  = str(row["location"]).strip()
    text_span = str(row["text_span"]).strip()
    raw_label = str(row["label"]).strip()

    if raw_label.lower() == "ddf":
        label_title = "DDF"
    else:
        label_title = raw_label.title()
    
    cleaned_text_span = re.sub(r'<[^>]*>', '', text_span).strip()
    cleaned_text_span = cleaned_text_span.replace('\\', '\\\\') 

    paper_uri = URIRef(GUTBRAIN[f"paper_{pmid}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))
    
    if label_title in label_mapping:
        class_uri = label_mapping[label_title]
        instance_fragment = create_uri_fragment(cleaned_text_span)
        instance_uri = URIRef(GUTBRAIN[instance_fragment])
        
        g.add((instance_uri, RDF.type, class_uri))
        g.add((instance_uri, RDF.type, SKOS.Concept))
        g.add((instance_uri, RDFS.label, Literal(cleaned_text_span, datatype=XSD.string)))
        
        scheme_key = label_title
        if label_title in alias_mapping:
            scheme_key = alias_mapping[label_title]
        
        if scheme_key in concept_scheme_mapping:
            scheme_uri = concept_scheme_mapping[scheme_key]
            g.add((instance_uri, SKOS.inScheme, scheme_uri))
        
        #g.add((paper_uri, GUTPROP.containedIn, instance_uri))
    else:
        mention_uri = URIRef(GUTBRAIN[f"mention_{pmid}"])
        g.add((mention_uri, RDF.type, MENTION_CLASS))
        g.add((mention_uri, RDF.type, SKOS.Concept))
        g.add((mention_uri, GUTPROP.annotator, Literal(annotator, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.location, Literal(location, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.text_span, Literal(cleaned_text_span, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.label, Literal(label_title, datatype=XSD.string)))
        g.add((mention_uri, GUTPROP.start_idx, Literal(int(start_idx), datatype=XSD.integer)))
        g.add((mention_uri, GUTPROP.end_idx, Literal(int(end_idx), datatype=XSD.integer)))
        g.add((paper_uri, GUTPROP.hasMention, mention_uri))

# -----------------------------------------------------------------------------
# 6. Lettura del CSV delle relazioni e integrazione nel grafo RDF
# -----------------------------------------------------------------------------
df_rel = pd.read_csv(relations_csv, delimiter="|")
df_rel.columns = [col.strip() for col in df_rel.columns]
print("Colonne lette dal CSV delle relazioni:", df_rel.columns.tolist())

df_rel.rename(columns={'object_label,,': 'object_label'}, inplace=True)
print("Colonne lette dal CSV delle relazioni:", df_rel.columns.tolist())

for idx, row in df_rel.iterrows():
    # Per il soggetto:
    subj_text = str(row["subject_text_span"]).strip()
    cleaned_subj_text = re.sub(r'<[^>]*>', '', subj_text).strip()
    subj_fragment = create_uri_fragment(cleaned_subj_text)
    subj_uri = URIRef(GUTBRAIN[subj_fragment])
    
    # Per l'oggetto:
    obj_text = str(row["object_text_span"]).strip()
    cleaned_obj_text = re.sub(r'<[^>]*>', '', obj_text).strip()
    obj_fragment = create_uri_fragment(cleaned_obj_text)
    print(obj_fragment)
    obj_uri = URIRef(GUTBRAIN[obj_fragment])
    
    # Per il predicato:
    pred_text = str(row["predicate"]).strip()
    pred_text_clean = to_camel_case(pred_text)
    pred_uri = URIRef(GUTPROP[pred_text_clean])
    
    g.add((subj_uri, pred_uri, obj_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
g.serialize(destination=output_file, format="turtle")
print(f"Il grafo RDF è stato salvato in {output_file}")

Rilevato delimitatore per entità: '|'
Colonne lette dal CSV delle entità: ['pmid', 'annotator', 'start_idx', 'end_idx', 'location', 'text_span', 'label']
Colonne lette dal CSV delle relazioni: ['pmid', 'annotator', 'subject_start_idx', 'subject_end_idx', 'subject_location', 'subject_text_span', 'subject_label', 'predicate', 'object_start_idx', 'object_end_idx', 'object_location', 'object_text_span', 'object_label,,']
Colonne lette dal CSV delle relazioni: ['pmid', 'annotator', 'subject_start_idx', 'subject_end_idx', 'subject_location', 'subject_text_span', 'subject_label', 'predicate', 'object_start_idx', 'object_end_idx', 'object_location', 'object_text_span', 'object_label']
patients
people
bariatric_patients
gut_microbiome
inflammation
poultry_gut_microbiome
gut_microbiome
inflammation
gut_microbiome
inflammation
poultry_gut_microbiome
Birds
bird
patients
patients
patients
psychosis
schizophrenia
patients
patients
patients
healthy_controls
patients
patients
Human_Migraine_Headache
g