In [1]:
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher, get_close_matches

def parse_mesh_descriptors(xml_path):
    """Parse the MeSH XML and return a list of descriptors."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el   = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui   = ui_el.text
        name = name_el.text
        tree_nums = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

MESH_XML = 'desc2025.xml' #from the folder
descriptors = parse_mesh_descriptors(MESH_XML)
print(f"Parsed {len(descriptors)} descriptors")
#print(descriptors)

Parsed 30956 descriptors


In [3]:
def get_bacteria_taxonomy(descriptors):
    """
    Find the descriptor with name 'Bacteria', 
    then collect every descriptor under that tree-number prefix.
    """
    prefix = None #bacteria is B03
    for d in descriptors:
        if d['name'].lower() == 'bacteria' and d['tree_numbers']:
            prefix = d['tree_numbers'][0]
            break
    if not prefix:
        raise RuntimeError("Could not find 'Bacteria' in descriptors")
    tax = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == prefix or tn.startswith(prefix + '.'):
                tax[tn] = {'ui':d['ui'], 'name':d['name']}
                break
    return tax

bacteria_tax = get_bacteria_taxonomy(descriptors)
#print(f"{len(bacteria_tax)} bacterial MeSH nodes")

In [4]:
def build_name_index(taxonomy):
    """
    Build a dict: lower-case name -> list of (tree#, ui, canonical name)
    """
    idx = {}
    for tree_num, info in taxonomy.items():
        key = info['name'].lower()
        idx.setdefault(key, []).append((tree_num, info['ui'], info['name']))
    return idx

name_index = build_name_index(bacteria_tax)
print(f"Indexed {len(name_index)} bacterial names")

Indexed 859 bacterial names


In [5]:
from difflib import SequenceMatcher

def find_mesh_match(input_name, name_index, n=5, cutoff=0.6):
    """
    Return matches for input_name among the MeSH bacterial names.
    Exact matches come first with score=1.0.
    Then fuzzy matches (score computed via SequenceMatcher.ratio).
    """
    key = input_name.lower()
    results = []
    if key in name_index:
        for tree_num, ui, name in name_index[key]:
            results.append((name, ui, tree_num, 1.0))
        return results

    all_names = list(name_index.keys())
    #print(all_names)
    
    close = get_close_matches(key, all_names, n=n, cutoff=cutoff)
    for cname in close:
        score = SequenceMatcher(None, key, cname).ratio()
        for tree_num, ui, name in name_index[cname]:
            results.append((name, ui, tree_num, score))
    results.sort(key=lambda x: x[3], reverse=True)
    return results


In [7]:
import re
import json

def strip_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)
path = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\data\train_platinum\train_platinum.json"
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

bacteria_terms = set()
for record in data.values():
    for ent in record.get("entities", []):
        if ent.get("label", "").lower() == "bacteria":
            raw = ent["text_span"]
            clean = strip_html_tags(raw).strip()
            if clean:
                bacteria_terms.add(clean)

bacteria_terms.add(clean)

for term in sorted(bacteria_terms):
    print(f"Query: {term}")
    matches = find_mesh_match(term, name_index)
    if not matches:
        print("no matches")
    else:
        for name, ui, tree, score in matches[:5]:
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score={score:.2f}")
    print()

Query: A. muciniphila
  • Acidiphilium                   UI=D041801  Tree=B03.440.400.425.100.110  score=0.62

Query: Acidaminococcus intestini
  • Acidaminococcus                UI=D045850  Tree=B03.353.250   score=0.75
  • Micrococcus luteus             UI=D016982  Tree=B03.510.024.850.500.500  score=0.60

Query: Actinobacteria
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=1.00

Query: Actinobacteriota
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=0.93
  • Acidobacteria                  UI=D061271  Tree=B03.026       score=0.83
  • Cyanobacteria                  UI=D000458  Tree=B03.280       score=0.76
  • Acinetobacter                  UI=D000150  Tree=B03.440.400.425.537.050  score=0.76
  • Acetobacter                    UI=D000091  Tree=B03.440.400.425.100.100  score=0.74

Query: Acute/chronic insomnia-related signature bacteria
no matches

Query: Agathobaculum
  • Thiobacillus                   UI=D013855  Tree=B03.440.400.450.

In [1]:
import xml.etree.ElementTree as ET

def parse_mesh_descriptors(xml_path):
    """
    Parse the MeSH XML and return a list of dicts:
      { 'ui': DescriptorUI,
        'name': DescriptorName,
        'tree_numbers': [treeNum1, treeNum2, ...]
      }
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui = ui_el.text
        name = name_el.text
        tree_numbers = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({
            'ui': ui,
            'name': name,
            'tree_numbers': tree_numbers
        })
    return descriptors

def get_bacteria_taxonomy(xml_path):
    """
    From the full MeSH descriptors file, extract all descriptors
    under the 'Bacteria' branch (tree number B03).
    Returns a dict: { tree_number: { 'ui': ..., 'name': ... }, ... }
    """
    descriptors = parse_mesh_descriptors(xml_path)

    bacteria_prefix = None
    for d in descriptors:
        if d['name'] == 'Bacteria':
            bacteria_prefix = d['tree_numbers'][0]
            break

    if not bacteria_prefix:
        raise RuntimeError("Couldn't find a descriptor named 'Bacteria' in the file.")

    taxonomy = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == bacteria_prefix or tn.startswith(bacteria_prefix + '.'):
                taxonomy[tn] = {
                    'ui': d['ui'],
                    'name': d['name']
                }
                break

    return taxonomy

if __name__ == '__main__':
    xml_file = 'desc2025.xml'
    bacteria_tax = get_bacteria_taxonomy(xml_file)

    for tree_num in sorted(bacteria_tax):
        info = bacteria_tax[tree_num]
        print(f"{tree_num:10s}  {info['name']:30s}  ({info['ui']})")

B03         Bacteria                        (D001419)
B03.026     Acidobacteria                   (D061271)
B03.054     Agricultural Inoculants         (D059827)
B03.110     Atypical Bacterial Forms        (D001295)
B03.110.422  L Forms                         (D007740)
B03.110.761  Spheroplasts                    (D013104)
B03.120     Bacteria, Aerobic               (D001420)
B03.130     Bacteria, Anaerobic             (D001421)
B03.135     Bacteria, Thermoduric           (D000072280)
B03.250     Chlorobi                        (D019414)
B03.250.140  Chlorobium                      (D041883)
B03.275     Chloroflexi                     (D041862)
B03.275.150  Chloroflexus                    (D041861)
B03.275.575  Dehalococcoides                 (D000082942)
B03.280     Cyanobacteria                   (D000458)
B03.280.100  Anabaena                        (D017033)
B03.280.100.150  Anabaena cylindrica             (D046868)
B03.280.100.900  Anabaena variabilis             (D046870)
B03.28

In [2]:
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher, get_close_matches

def parse_mesh_descriptors(xml_path):
    """Parse the MeSH XML and return a list of descriptors."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el   = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui   = ui_el.text
        name = name_el.text
        tree_nums = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

MESH_XML = 'desc2025.xml' #from the folder
descriptors = parse_mesh_descriptors(MESH_XML)
print(f"Parsed {len(descriptors)} descriptors")
#print(descriptors)

Parsed 30956 descriptors


In [3]:
def get_bacteria_taxonomy(descriptors):
    """
    Find the descriptor with name 'Bacteria', 
    then collect every descriptor under that tree-number prefix.
    """
    prefix = None #bacteria is B03
    for d in descriptors:
        if d['name'].lower() == 'bacteria' and d['tree_numbers']:
            prefix = d['tree_numbers'][0]
            break
    if not prefix:
        raise RuntimeError("Could not find 'Bacteria' in descriptors")
    tax = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == prefix or tn.startswith(prefix + '.'):
                tax[tn] = {'ui':d['ui'], 'name':d['name']}
                break
    return tax

bacteria_tax = get_bacteria_taxonomy(descriptors)
#print(f"{len(bacteria_tax)} bacterial MeSH nodes")

In [4]:
def build_name_index(taxonomy):
    """
    Build a dict: lower-case name -> list of (tree#, ui, canonical name)
    """
    idx = {}
    for tree_num, info in taxonomy.items():
        key = info['name'].lower()
        idx.setdefault(key, []).append((tree_num, info['ui'], info['name']))
    return idx

name_index = build_name_index(bacteria_tax)
print(f"Indexed {len(name_index)} bacterial names")

Indexed 859 bacterial names


In [5]:
from difflib import SequenceMatcher

def find_mesh_match(input_name, name_index, n=5, cutoff=0.6):
    """
    Return matches for input_name among the MeSH bacterial names.
    Exact matches come first with score=1.0.
    Then fuzzy matches (score computed via SequenceMatcher.ratio).
    """
    key = input_name.lower()
    results = []
    if key in name_index:
        for tree_num, ui, name in name_index[key]:
            results.append((name, ui, tree_num, 1.0))
        return results

    all_names = list(name_index.keys())
    #print(all_names)
    
    close = get_close_matches(key, all_names, n=n, cutoff=cutoff)
    for cname in close:
        score = SequenceMatcher(None, key, cname).ratio()
        for tree_num, ui, name in name_index[cname]:
            results.append((name, ui, tree_num, score))
    results.sort(key=lambda x: x[3], reverse=True)
    return results


In [6]:
import re, json, numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_taxonomy_tree(path):
    row_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = row_re.match(ln)
            if m:
                label, tid = m.groups()
                depth      = len(ln) - len(ln.lstrip())
                rows.append((label, tid, depth))
    return rows

TAX_FILE = "bacteria_tree1.txt"
rows     = load_taxonomy_tree(TAX_FILE)

exact_ix  = defaultdict(list)
for lbl, tid, depth in rows:
    exact_ix[lbl.lower()].append((lbl, tid, depth))

labels_only  = [r[0] for r in rows]
vec          = TfidfVectorizer(stop_words="english")
mat          = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v   = vec.transform([term])
    sc  = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr: break
        lbl, tid, d = rows[i]
        out.append((lbl, tid, d, sc[i]))
    return out

abbr_re = re.compile(r"^([A-Z])\.\s+([A-Za-z_-]+)$")
def preprocess(term):
    """toglie _ e spazio finale/iniziale"""
    return term.replace('_', ' ').strip()

def genus_abbrev_lookup(term):
    """
    Se il termine è tipo 'L. plantarum' cerca qualunque label che:
      • finisca con ' plantarum'
      • abbia un genere che comincia con 'L'
    """
    m = abbr_re.match(term)
    if not m:
        return []
    initial, species = m.groups()
    species = species.lower()
    hits = []
    for lbl, tid, d in rows:
        if lbl.lower().endswith(' ' + species) and lbl[0].upper() == initial:
            hits.append((lbl, tid, d, 1.00))
    return hits

# ---------------------------------------------------------------------------
# 2)  CARICA LE QUERY DAL JSON ----------------------------------------------
def strip_html(text):   # copia dalla tua versione originale
    return re.sub(r"<[^>]+>", "", text)

with open(r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\data\train_platinum\train_platinum.json", encoding="utf-8") as fh:
    js = json.load(fh)

queries = { strip_html(e["text_span"]).strip()
            for rec in js.values()
            for e in rec.get("entities", [])
            if e.get("label","").lower() == "bacteria" }

# ---------------------------------------------------------------------------
# 3)  MATCHING ---------------------------------------------------------------
for term_raw in sorted(queries):
    term = preprocess(term_raw)
    print(f"Query: {term_raw}")

    # 3-a   scorciatoia “L.” / “_”  -----------------------------------------
    hits = genus_abbrev_lookup(term)
    if hits:
        for l,t,d,s in hits: print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (abbr)")
        print(); continue

    # 3-b   exact (case-insensitive)  ---------------------------------------
    ex = exact_ix.get(term.lower(), [])
    if ex:
        for l,t,d in ex: print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score=1.00 (exact)")
        print(); continue

    # 3-c   cosine fallback  -------------------------------------------------
    cos = top_cosine(term)
    if not cos:
        matches = find_mesh_match(term, name_index)
        for name, ui, tree, score in matches[:5]:
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score={score:.2f}")
            #print("  no matches\n")
    else:
        for l,t,d,s in cos:
            print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f}")
        print()

Query: A. muciniphila
  • Akkermansia muciniphila                  ID=NCBITaxon_239935 depth=16 score=1.00 (abbr)

Query: Acidaminococcus intestini
  • Acidaminococcus intestini                ID=NCBITaxon_187327 depth=14 score=1.00 (exact)

Query: Actinobacteria
  • Actinobacteria bacterium HGW-Actinobacteria-9 ID=NCBITaxon_2013654 depth=8  score=0.87
  • Actinobacteria bacterium HGW-Actinobacteria-8 ID=NCBITaxon_2013653 depth=8  score=0.87
  • Actinobacteria bacterium HGW-Actinobacteria-7 ID=NCBITaxon_2013652 depth=8  score=0.87
  • Actinobacteria bacterium HGW-Actinobacteria-6 ID=NCBITaxon_2013651 depth=8  score=0.87
  • Actinobacteria bacterium HGW-Actinobacteria-5 ID=NCBITaxon_2013650 depth=8  score=0.87

Query: Actinobacteriota
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=0.93
  • Acidobacteria                  UI=D061271  Tree=B03.026       score=0.83
  • Cyanobacteria                  UI=D000458  Tree=B03.280       score=0.76
  • Acinetobacter        

In [57]:
#!/usr/bin/env python3
import os
import re
import unicodedata
import json
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

# -----------------------------------------------------------------------------
# 1. Setup paths and namespaces
# -----------------------------------------------------------------------------
path = str(Path(os.path.abspath(os.getcwd())).absolute())
json_file = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\data\train_platinum\train_platinum.json"
#json_file = os.path.join(path, "train_gold.json")
#vertex_file = os.path.join(path, "train_vertexsets.json")
tokenized_file = os.path.join(path, "tokenized_sentences_with_entitiesv2.json")
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

PAPER_CLASS       = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Paper")
MENTION_CLASS     = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Mention")
PAPER_ABSTRACT    = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperAbstract")
PAPER_TITLE       = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperTitle")
PAPER_COLLECTION  = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperCollection")
PROJECT           = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Project")
SAMPLE            = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Sample")
SENTENCE          = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Sentence")

# -----------------------------------------------------------------------------
# 2. Load the JSON paper data
# -----------------------------------------------------------------------------
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)
# -----------------------------------------------------------------------------
# 3. Mapping dictionaries (keys must be in Title case)
# -----------------------------------------------------------------------------
label_mapping = {
    "Anatomical Location":   URIRef("https://w3id.org/brainteaser/ontology/schema/AnatomicalSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Animal"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/BiomedicalTechnique"),
    "Bacteria":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Chemical"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/DietarySupplement"),
    "DDF":                   URIRef("https://w3id.org/brainteaser/ontology/schema/DiseaseDisorderOrFinding"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Drug"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Food"),
    "Gene":                  URIRef("https://w3id.org/brainteaser/ontology/schema/Gene"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Human"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/StatisticalTechnique")
}
concept_scheme_mapping = {
    "Anatomical Location":   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/AnatomicSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Animal"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Human"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Drug"),
    "Gene":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Gene"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/dietarySupplement"),
    "DDF":                   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/diseaseDisorderOrFindingTaxonomy"),
    "Metabolite":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Metabolite"),
    "Bacteria":               URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Bacteria"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Food"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Chemical"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/BiomedicalTechnique"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/StatisticalTechnique")
}

# -----------------------------------------------------------------------------
# 4. Initialize the RDF graph and bind namespaces
# -----------------------------------------------------------------------------
BACTERIA_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species")
FAMILY_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Family")
OBO_BASE = "http://purl.obolibrary.org/obo/"
MESH_BASE = "https://www.ncbi.nlm.nih.gov/mesh/"
BACTERIA_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Bacteria")

g = Graph()
g.bind("gutbrain", GUTBRAIN)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("owl", OWL)
g.bind("gutprop", GUTPROP)

g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))
g.add((SKOS.broaderTransitive, RDF.type, OWL.ObjectProperty))

g.add((GUTPROP.partOf, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.partOf, RDFS.label, Literal("partOf", datatype=XSD.string)))
g.add((GUTPROP.hasTitle, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasTitle, RDFS.label, Literal("hasTitle", datatype=XSD.string)))
g.add((GUTPROP.hasAbstract, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasAbstract, RDFS.label, Literal("hasAbstract", datatype=XSD.string)))

g.add((GUTPROP.containedIn, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.containedIn, RDFS.label, Literal("containedIn", datatype=XSD.string)))
g.add((GUTBRAIN.contains, RDF.type, OWL.ObjectProperty))
g.add((GUTBRAIN.contains, RDFS.label, Literal("contains", datatype=XSD.string)))
g.add((GUTPROP.composedOf, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.composedOf, RDFS.label, Literal("composedOf", datatype=XSD.string)))

g.add((GUTPROP.locatedIn, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.locatedIn, RDFS.label, Literal("locatedIn", datatype=XSD.string)))

g.add((GUTPROP.paperId, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperId, RDFS.label, Literal("paperId", datatype=XSD.string)))
g.add((GUTPROP.paperAnnotator, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperAnnotator, RDFS.label, Literal("paperAnnotator", datatype=XSD.string)))
g.add((GUTPROP.paperYear, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperYear, RDFS.label, Literal("paperYear", datatype=XSD.string)))
g.add((GUTPROP.paperJournal, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperJournal, RDFS.label, Literal("paperJournal", datatype=XSD.string)))
g.add((GUTPROP.paperAuthor, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperAuthor, RDFS.label, Literal("paperAuthor", datatype=XSD.string)))
g.add((GUTPROP.numberOfRunsFound, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.numberOfRunsFound, RDFS.label, Literal("numberOfRunsFound", datatype=XSD.string)))
g.add((GUTPROP.NCBITaxonID, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.NCBITaxonID, RDFS.label, Literal("NCBITaxonID", datatype=XSD.string)))
g.add((GUTPROP.sdRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.sdRelativeAbundance, RDFS.label, Literal("sdRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.medianRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.medianRelativeAbundance, RDFS.label, Literal("medianRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.meanRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.meanRelativeAbundance, RDFS.label, Literal("meanRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.scientificName, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.scientificName, RDFS.label, Literal("scientificName", datatype=XSD.string)))
g.add((GUTPROP.hasMentionText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasMentionText, RDFS.label, Literal("hasMentionText", datatype=XSD.string)))
g.add((GUTPROP.hasSentenceText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasSentenceText, RDFS.label, Literal("hasSentenceText", datatype=XSD.string)))
g.add((GUTPROP.hasTitleText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasTitleText, RDFS.label, Literal("hasTitleText", datatype=XSD.string)))
g.add((GUTPROP.hasAbstractText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasAbstractText, RDFS.label, Literal("hasAbstractText", datatype=XSD.string)))

for scheme_uri in set(concept_scheme_mapping.values()):
    keys = [k for k, v in concept_scheme_mapping.items() if v == scheme_uri]
    label_text = ", ".join(k.title() for k in keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

is_train_platinum = "train_platinum" in os.path.basename(json_file)
#is_train_gold = "train_gold" in os.path.basename(json_file)

#if is_train_gold:
 #   gold_collection_uri = URIRef(GUTBRAIN["goldCollection"])
 #   label_text = "goldCollection"
  #  g.add((gold_collection_uri, RDF.type, PAPER_COLLECTION))
   # g.add((gold_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))
    
if is_train_platinum:
    platinum_collection_uri = URIRef(GUTBRAIN["platinumCollection"])
    label_text = "platinumCollection"
    g.add((platinum_collection_uri, RDF.type, PAPER_COLLECTION))
    g.add((platinum_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

# -----------------------------------------------------------------------------
# Utility functions
# -----------------------------------------------------------------------------
def create_uri_fragment(text):
    cleaned = re.sub(r'<[^>]*>', '', text)
    cleaned = normalize_text(cleaned)
    # Allow word characters, digits, underscores, hyphens, and Greek characters (U+0370 to U+03FF)
    cleaned = re.sub(r'[^\w\u0370-\u03FF-]', '_', cleaned)
    return cleaned

def to_camel_case(s):
    s = re.sub(r'[^\w\s]', '', s)
    parts = re.split(r'\s+', s.strip())
    if not parts:
        return ""
    return parts[0].lower() + ''.join(word.title() for word in parts[1:])

def normalize_text(text):
    return unicodedata.normalize('NFC', text)

def preprocess(term):
    return term.replace('_', ' ').strip()

def normalize_to_ascii(s: str) -> str:
    # 1) Decomponi i caratteri accentati in base + segni diacritici
    nfkd = unicodedata.normalize('NFKD', s)
    # 2) Filtra solo i caratteri ASCII
    ascii_bytes = nfkd.encode('ascii', 'ignore')
    # 3) Ritorna la stringa decodificata
    return ascii_bytes.decode('ascii')

mention_counter   = 1
tokenized_mentions = {}
missing_mentions = []
created = {}
label2uri = {}
created["Bacteria"] = URIRef(f"{OBO_BASE}NCBITaxon_2")
g.add((created["Bacteria"], RDF.type, FAMILY_CLASS))
g.add((created["Bacteria"], RDF.type, SKOS.Concept))
g.add((created["Bacteria"], RDFS.label, Literal("Bacteria", datatype=XSD.string)))
g.add((created["Bacteria"], SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
#created["Bifidobacterium"] = URIRef(f"{MESH_BASE}D001644")
#g.add((created["Bifidobacterium"], RDF.type, BACTERIA_CLASS))
#g.add((created["Bifidobacterium"], RDF.type, SKOS.Concept))
#g.add((created["Bifidobacterium"], RDFS.label, Literal("Bifidobacterium", datatype=XSD.string)))
#g.add((created["Bifidobacterium"], SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))

# -----------------------------------------------------------------------------
# 5. Process each paper (each key in the JSON represents a paper)
# -----------------------------------------------------------------------------
for paper_id, paper_data in data.items():
    paper_uri = URIRef(GUTBRAIN[f"paper_{paper_id}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))
    
    if is_train_platinum:
        g.add((paper_uri, GUTPROP.partOf, platinum_collection_uri))
        g.add((platinum_collection_uri, GUTBRAIN.contains, paper_uri))

    #if is_train_gold:
    #    g.add((paper_uri, GUTPROP.partOf, gold_collection_uri))
    #    g.add((gold_collection_uri, GUTBRAIN.contains, paper_uri))
    
    # Each paper gets its own mention node
    #paper_mention = URIRef(GUTBRAIN[f"mention_{paper_id}"])
    #g.add((paper_mention, RDF.type, MENTION_CLASS))
    #g.add((paper_uri, GUTPROP.hasMention, paper_mention))
    
    metadata = paper_data.get("metadata", {})
    full_title = metadata.get("title", None)
    full_abstract = metadata.get("abstract", None)
    try:
        paper_id_val = int(paper_id)
    except ValueError:
        paper_id_val = paper_id
    paper_annotator = metadata.get("annotator", None)
    paper_year = metadata.get("year", None)
    paper_journal = metadata.get("journal", None)
    paper_author = metadata.get("author", None)
    
    g.add((paper_uri, GUTPROP.paperId, Literal(paper_id_val, datatype=XSD.integer)))
    if paper_annotator is not None:
        g.add((paper_uri, GUTPROP.paperAnnotator, Literal(paper_annotator, datatype=XSD.string)))
    if paper_year is not None:
        g.add((paper_uri, GUTPROP.paperYear, Literal(paper_year, datatype=XSD.gYear)))
    if paper_journal is not None:
        g.add((paper_uri, GUTPROP.paperJournal, Literal(paper_journal, datatype=XSD.string)))
    if paper_author is not None:
        g.add((paper_uri, GUTPROP.paperAuthor, Literal(paper_author, datatype=XSD.string)))
    
    title_texts = []
    abstract_texts = []
    mention_created = False
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        label_title = raw_label.title() if raw_label.lower() != "ddf" else "DDF"
        
        text_span = entity.get("text_span", "").strip()
        cleaned_text = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(cleaned_text)
        
        if label_title == "Bacteria":
            if cleaned_text_span.lower().endswith(" bacteria"):
                term_raw = "Bacteria"
            elif cleaned_text_span.lower().endswith(" Bifidobacterium"):
                term_raw = "Bifidobacterium"
            else:
                term_raw = cleaned_text_span
                
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {uri}\n")
                continue
                
            hits = genus_abbrev_lookup(term)
            if hits:
                for l,t,d,s in hits: 
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (abbr)")
                    entity_uri = URIRef(f"{OBO_BASE}{t}")
                    created[term_raw] = entity_uri
                    label2uri[label_name.lower()] = entity_uri
                    g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(label_name, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["Bacteria"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_{mention_counter}", datatype=XSD.string)))
                    #mention_counter += 1
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    continue

            ex = exact_ix.get(term.lower(), [])
            if ex:
                for label_name, taxon_id, depth in ex:
                    print(f"  • {label_name:40s} ID={taxon_id:15s} depth={depth:<2d} (exact)")
                    entity_uri = URIRef(f"{OBO_BASE}{taxon_id}")
                    created[term_raw] = entity_uri
                    label2uri[label_name.lower()] = entity_uri
                    g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(label_name, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["Bacteria"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_{mention_counter}", datatype=XSD.string)))
                    #mention_counter += 1
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            if not cos:
                matches = find_mesh_match(term, name_index)
                if matches: 
                    for name, ui, tree, score in matches[:1]:
                        name_key = name.lower()
                        reused = False
                        if name_key in label2uri:
                            entity_uri = label2uri[name_key]
                            print(f"  → Reusing existing URI by label: {entity_uri}\n")
                            reused = True
                            continue
                        if reused:
                            break
                            
                        print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  scoreMESH={score:.2f}")
                        name_uri = URIRef(f"{MESH_BASE}{ui}")
                        created[term_raw] = name_uri
                        label2uri[name_key] = name_uri
                        g.add((name_uri, RDF.type, BACTERIA_CLASS))
                        g.add((name_uri, RDF.type, SKOS.Concept))
                        g.add((name_uri, RDFS.label, Literal(name, datatype=XSD.string)))
                        g.add((name_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                        g.add((name_uri, SKOS.broaderTransitive, created["Bacteria"]))
                        mention_uri = URIRef(GUTBRAIN[term_raw])
                        g.add((mention_uri, RDF.type, MENTION_CLASS))
                        g.add((mention_uri, RDFS.label, Literal(f"mention_{mention_counter}", datatype=XSD.string)))
                        #mention_counter += 1
                        g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                        g.add((name_uri, GUTPROP.containedIn, mention_uri))
                        tokenized_mentions[term_raw] = mention_uri
                        continue
                else:
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_{mention_counter}", datatype=XSD.string)))
                    #mention_counter += 1
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((created["Bacteria"], GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    
            else:
                for l,t,d,s in cos[:1]:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f}")
                    name_uri = URIRef(f"{OBO_BASE}{t}")
                    created[term_raw] = name_uri
                    label2uri[label_name.lower()] = name_uri
                    g.add((name_uri, RDF.type, BACTERIA_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(l, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["Bacteria"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_{mention_counter}", datatype=XSD.string)))
                    mention_counter += 1
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                print()
                
        #mention_uri = URIRef(GUTBRAIN[f"mention_{mention_counter}"])
        #mention_counter += 1
        #g.add((mention_uri, RDF.type, MENTION_CLASS))
        #g.add((mention_uri, GUTPROP.hasMentionText, Literal(cleaned_text_span, datatype=XSD.string)))
        #canonical = create_uri_fragment(cleaned_text_span)
        #tokenized_mentions[canonical] = mention_uri
        #continue
        
        location_lower = entity.get("location", "").strip().lower()
        if location_lower == "title":
            title_texts.append(cleaned_text_span)
        elif location_lower == "abstract":
            abstract_texts.append(cleaned_text_span)
        
        #if label_title in label_mapping:
            #entity_node = URIRef(GUTBRAIN[instance_fragment])
            #class_uri = label_mapping[label_title]
            #g.add((entity_node, RDF.type, class_uri))
            #g.add((entity_node, RDF.type, SKOS.Concept))
            #g.add((entity_node, RDFS.label, Literal(cleaned_text_span, datatype=XSD.string)))

            #if label_title in concept_scheme_mapping:
                #scheme_uri = concept_scheme_mapping[label_title]
                #g.add((entity_node, SKOS.inScheme, scheme_uri))

        else:
            entity_node = URIRef(GUTBRAIN[f"entity_{paper_id}_{i}"])
            g.add((entity_node, RDF.type, MENTION_CLASS))
            g.add((entity_node, RDF.type, SKOS.Concept))
            g.add((entity_node, RDFS.label, Literal(label_title, datatype=XSD.string)))
            g.add((entity_node, GUTPROP.annotator, Literal(metadata.get("annotator", "unknown"), datatype=XSD.string)))
            g.add((entity_node, GUTPROP.location, Literal(entity.get("location", ""), datatype=XSD.string)))
            g.add((entity_node, GUTPROP.text_span, Literal(cleaned_text_span, datatype=XSD.string)))
        
        #g.add((entity_node, GUTPROP.containedIn, paper_mention))
    
    
    if full_title is None and title_texts:
        full_title = " ".join(title_texts)
    if full_abstract is None and abstract_texts:
        full_abstract = " ".join(abstract_texts)
    
    if full_title:
        title_uri = URIRef(GUTBRAIN[f"title_{paper_id}"])
        g.add((title_uri, RDF.type, PAPER_TITLE))
        g.add((title_uri, GUTPROP.hasTitleText, Literal(full_title, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasTitle, title_uri))
    
    if full_abstract:
        abstract_uri = URIRef(GUTBRAIN[f"abstract_{paper_id}"])
        g.add((abstract_uri, RDF.type, PAPER_ABSTRACT))
        g.add((abstract_uri, GUTPROP.hasAbstractText, Literal(full_abstract, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasAbstract, abstract_uri))
    
    # Process relations
    #relations = paper_data.get("relations", [])
    #for relation in relations:
        #subj_text = relation.get("subject_text_span", "").strip()
        #cleaned_subj_text = re.sub(r'<[^>]*>', '', subj_text).strip()
        #subj_fragment = create_uri_fragment(cleaned_subj_text)
        #subj_uri = URIRef(GUTBRAIN[subj_fragment])
        #if not list(g.triples((subj_uri, None, None))):
           # print(f"Warning: Subject not recognized: {subj_uri}. Info: '{cleaned_subj_text}'")
        
        #obj_text = relation.get("object_text_span", "").strip()
        #cleaned_obj_text = re.sub(r'<[^>]*>', '', obj_text).strip()
        #obj_fragment = create_uri_fragment(cleaned_obj_text)
        #obj_uri = URIRef(GUTBRAIN[obj_fragment])
        #if not list(g.triples((obj_uri, None, None))):
            #print(f"Warning: Object not recognized: {obj_uri}. Info: '{cleaned_obj_text}'")
        
        #pred_text = relation.get("predicate", "").strip()
        #pred_text_clean = to_camel_case(pred_text)
        #pred_uri = URIRef(GUTPROP[pred_text_clean])
        #print(f"Predicate: '{pred_text}' -> '{pred_text_clean}'")
        #g.add((pred_uri, RDF.type, OWL.ObjectProperty))
        #g.add((pred_uri, RDFS.label, Literal(pred_text_clean, datatype=XSD.string)))
       # g.add((subj_uri, pred_uri, obj_uri))

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue # o None, se ti serve

        if label != "bacteria":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical)
        #print(cleaned_text_span)
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

        #entity_uri = URIRef(GUTBRAIN[canonical])
        #g.add((entity_uri, GUTPROP.containedIn, mention_node))
        #lit = Literal(ent_text, datatype=XSD.string)
        #if (mention_node, GUTPROP.hasMentionText, lit) not in g:
            #g.add((mention_node, GUTPROP.hasMentionText, lit))
print("\nDettaglio di tokenized_mentions:")
for canonical, mention_uri in tokenized_mentions.items():
    print(f"{canonical!r} → {mention_uri}")
# -----------------------------------------------------------------------------
# 7. Serialize and print the graph in Turtle format
# -----------------------------------------------------------------------------
output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

Query: Veillonella
  • Veillonella                              ID=NCBITaxon_29465 depth=12 (exact)
  • Veillonella                              ID=NCBITaxon_29465 depth=12 score=1.00

Query: Roseburia
  • Roseburia                                ID=NCBITaxon_841   depth=12 (exact)
  • Roseburia                                ID=NCBITaxon_841   depth=12 score=1.00

Query: Christensenellaceae R-7 group
  • Christensenellaceae                      ID=NCBITaxon_990719 depth=10 score=0.87

Query: Subdoligranulum
  • Subdoligranulum                          ID=NCBITaxon_292632 depth=12 (exact)
  • Subdoligranulum                          ID=NCBITaxon_292632 depth=12 score=1.00

Query: Oscillibacter
  • Oscillibacter                            ID=NCBITaxon_459786 depth=12 (exact)
  • Oscillibacter                            ID=NCBITaxon_459786 depth=12 score=1.00

Query: UCG-005
  • Ruminococcaceae bacterium UCG-005        ID=NCBITaxon_3068309 depth=14 score=0.87

Query: acetate-producing ba

In [10]:
from rdflib.namespace import RDFS

def labels_equal(uri1, uri2, graph):
    # Estrae tutti i label di uri1 e uri2
    labels1 = [str(lbl) for lbl in graph.objects(uri1, RDFS.label)]
    labels2 = [str(lbl) for lbl in graph.objects(uri2, RDFS.label)]
    # Confronta ogni coppia
    return any(l1 == l2 for l1 in labels1 for l2 in labels2)

# Esempio di utilizzo:
ncbi_uri = URIRef(f"{OBO_BASE}NCBITaxon_239934")       # l'individual da NCBITaxon
mesh_uri = URIRef(f"{MESH_BASE}D000086102")          # sostituisci con il tuo UI MeSH per Clostridium

if labels_equal(ncbi_uri, mesh_uri, g):
    print("I label coincidono!")
else:
    print("I label sono diversi.")


I label coincidono!


In [20]:
# Funzione per contare le righe che iniziano con "Query:" in un file di testo
def count_queries(file_path):
    """
    Conta il numero di righe che iniziano con 'Query:' in un file.
    
    Args:
        file_path (str): Percorso del file di testo da analizzare.
    
    Returns:
        int: Numero di query trovate.
    """
    count = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.lstrip().startswith("Query:"):
                count += 1
    return count

# Sostituisci con i percorsi dei tuoi file
file1 = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\file1.txt"
file2 = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\file2.txt"

n1 = count_queries(file1)
n2 = count_queries(file2)

print(f"{file1}: {n1} query")
print(f"{file2}: {n2} query")

if n1 == n2:
    print("⚖️ I due file hanno lo stesso numero di query.")
else:
    print("❌ I due file hanno un numero diverso di query.")


C:\Users\samue\OneDrive\Desktop\ThesisPiron\file1.txt: 210 query
C:\Users\samue\OneDrive\Desktop\ThesisPiron\file2.txt: 416 query
❌ I due file hanno un numero diverso di query.
