In [4]:
import json
import nltk
from nltk.tokenize import sent_tokenize

# Scarica i dati necessari per il tokenizzatore (se non già presenti)
nltk.download('punkt')

def process_json_file(input_file, output_file):
    # Carica il file JSON di input
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    output_sentences = []

    for paper_id, entry in data.items():
        metadata = entry.get("metadata", {})
        title = metadata.get("title", "")
        abstract = metadata.get("abstract", "")
        
        combined_text = title + " " + abstract
        sentences = sent_tokenize(combined_text)
        
        # Lista di dict {text_span, label}
        ner_entities = [
            {"text_span": ent["text_span"], "label": ent["label"]}
            for ent in entry.get("entities", [])
            if ent.get("text_span")
        ]
        
        for idx, sentence in enumerate(sentences):
            found = []
            for ent in ner_entities:
                ts = ent["text_span"]
                if ts in sentence:
                    found.append(ent)
            # rimuovo duplicati basati su (text_span, label)
            unique = { (e["text_span"], e["label"]): e for e in found }
            
            output_sentences.append({
                "pmid": paper_id,
                "sent_id": idx,
                "sentence": sentence,
                "entities": list(unique.values())
            })
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_sentences, f, indent=2, ensure_ascii=False)
    
    print(f"Le frasi processate sono state salvate in {output_file}")

# Usa i tuoi percorsi
input_file = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\data\train_platinum\train_platinum.json"
output_file = 'tokenized_sentences_with_entitiesv2.json'

process_json_file(input_file, output_file)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Le frasi processate sono state salvate in tokenized_sentences_with_entitiesv2.json


In [1]:
import requests

def download_file(url, save_path):
    # Stream the file in chunks
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

url = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.xml"
save_path = "desc2025.xml"
download_file(url, save_path)
print(f"File downloaded to {save_path}")

File downloaded to desc2025.xml


In [3]:
import xml.etree.ElementTree as ET

def parse_mesh_descriptors(xml_path):
    """
    Parse the MeSH XML and return a list of dicts:
      { 'ui': DescriptorUI,
        'name': DescriptorName,
        'tree_numbers': [treeNum1, treeNum2, ...]
      }
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui = ui_el.text
        name = name_el.text
        tree_numbers = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({
            'ui': ui,
            'name': name,
            'tree_numbers': tree_numbers
        })
    return descriptors

def get_bacteria_taxonomy(xml_path):
    """
    From the full MeSH descriptors file, extract all descriptors
    under the 'Bacteria' branch (tree number B03).
    Returns a dict: { tree_number: { 'ui': ..., 'name': ... }, ... }
    """
    descriptors = parse_mesh_descriptors(xml_path)

    bacteria_prefix = None
    for d in descriptors:
        if d['name'] == 'Bacteria':
            bacteria_prefix = d['tree_numbers'][0]
            break

    if not bacteria_prefix:
        raise RuntimeError("Couldn't find a descriptor named 'Bacteria' in the file.")

    taxonomy = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == bacteria_prefix or tn.startswith(bacteria_prefix + '.'):
                taxonomy[tn] = {
                    'ui': d['ui'],
                    'name': d['name']
                }
                break

    return taxonomy

if __name__ == '__main__':
    xml_file = 'desc2025.xml'
    bacteria_tax = get_bacteria_taxonomy(xml_file)

    for tree_num in sorted(bacteria_tax):
        info = bacteria_tax[tree_num]
        print(f"{tree_num:10s}  {info['name']:30s}  ({info['ui']})")

B03         Bacteria                        (D001419)
B03.026     Acidobacteria                   (D061271)
B03.054     Agricultural Inoculants         (D059827)
B03.110     Atypical Bacterial Forms        (D001295)
B03.110.422  L Forms                         (D007740)
B03.110.761  Spheroplasts                    (D013104)
B03.120     Bacteria, Aerobic               (D001420)
B03.130     Bacteria, Anaerobic             (D001421)
B03.135     Bacteria, Thermoduric           (D000072280)
B03.250     Chlorobi                        (D019414)
B03.250.140  Chlorobium                      (D041883)
B03.275     Chloroflexi                     (D041862)
B03.275.150  Chloroflexus                    (D041861)
B03.275.575  Dehalococcoides                 (D000082942)
B03.280     Cyanobacteria                   (D000458)
B03.280.100  Anabaena                        (D017033)
B03.280.100.150  Anabaena cylindrica             (D046868)
B03.280.100.900  Anabaena variabilis             (D046870)
B03.28

In [4]:
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher, get_close_matches

def parse_mesh_descriptors(xml_path):
    """Parse the MeSH XML and return a list of descriptors."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el   = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui   = ui_el.text
        name = name_el.text
        tree_nums = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

MESH_XML = 'desc2025.xml' #from the folder
descriptors = parse_mesh_descriptors(MESH_XML)
print(f"Parsed {len(descriptors)} descriptors")
#print(descriptors)

Parsed 30956 descriptors


In [5]:
def get_bacteria_taxonomy(descriptors):
    """
    Find the descriptor with name 'Bacteria', 
    then collect every descriptor under that tree-number prefix.
    """
    prefix = None #bacteria is B03
    for d in descriptors:
        if d['name'].lower() == 'bacteria' and d['tree_numbers']:
            prefix = d['tree_numbers'][0]
            break
    if not prefix:
        raise RuntimeError("Could not find 'Bacteria' in descriptors")
    tax = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == prefix or tn.startswith(prefix + '.'):
                tax[tn] = {'ui':d['ui'], 'name':d['name']}
                break
    return tax

bacteria_tax = get_bacteria_taxonomy(descriptors)
#print(f"{len(bacteria_tax)} bacterial MeSH nodes")

In [6]:
def build_name_index(taxonomy):
    """
    Build a dict: lower-case name -> list of (tree#, ui, canonical name)
    """
    idx = {}
    for tree_num, info in taxonomy.items():
        key = info['name'].lower()
        idx.setdefault(key, []).append((tree_num, info['ui'], info['name']))
    return idx

name_index = build_name_index(bacteria_tax)
print(f"Indexed {len(name_index)} bacterial names")

Indexed 859 bacterial names


In [7]:
from difflib import SequenceMatcher

def find_mesh_match(input_name, name_index, n=5, cutoff=0.6):
    """
    Return matches for input_name among the MeSH bacterial names.
    Exact matches come first with score=1.0.
    Then fuzzy matches (score computed via SequenceMatcher.ratio).
    """
    key = input_name.lower()
    results = []
    if key in name_index:
        for tree_num, ui, name in name_index[key]:
            results.append((name, ui, tree_num, 1.0))
        return results

    all_names = list(name_index.keys())
    #print(all_names)
    
    close = get_close_matches(key, all_names, n=n, cutoff=cutoff)
    for cname in close:
        score = SequenceMatcher(None, key, cname).ratio()
        for tree_num, ui, name in name_index[cname]:
            results.append((name, ui, tree_num, score))
    results.sort(key=lambda x: x[3], reverse=True)
    return results


In [48]:
import re
import json

def strip_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)

with open("train_platinum.json", "r", encoding="utf-8") as f:
    data = json.load(f)

bacteria_terms = set()
for record in data.values():
    for ent in record.get("entities", []):
        if ent.get("label", "").lower() == "bacteria":
            raw = ent["text_span"]
            clean = strip_html_tags(raw).strip()
            if clean:
                bacteria_terms.add(clean)

bacteria_terms.add(clean)

for term in sorted(bacteria_terms):
    print(f"Query: {term}")
    matches = find_mesh_match(term, name_index)
    if not matches:
        print("no matches")
    else:
        for name, ui, tree, score in matches[:5]:
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score={score:.2f}")
    print()

Query: A. muciniphila
  • Acidiphilium                   UI=D041801  Tree=B03.440.400.425.100.110  score=0.62

Query: Acidaminococcus intestini
  • Acidaminococcus                UI=D045850  Tree=B03.353.250   score=0.75
  • Micrococcus luteus             UI=D016982  Tree=B03.510.024.850.500.500  score=0.60

Query: Actinobacteria
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=1.00

Query: Actinobacteriota
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=0.93
  • Acidobacteria                  UI=D061271  Tree=B03.026       score=0.83
  • Cyanobacteria                  UI=D000458  Tree=B03.280       score=0.76
  • Acinetobacter                  UI=D000150  Tree=B03.440.400.425.537.050  score=0.76
  • Acetobacter                    UI=D000091  Tree=B03.440.400.425.100.100  score=0.74

Query: Acute/chronic insomnia-related signature bacteria
no matches

Query: Agathobaculum
  • Thiobacillus                   UI=D013855  Tree=B03.440.400.450.

In [1]:
import re, json
from pathlib import Path
from collections import defaultdict

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def strip_html_tags(text: str) -> str:
    return re.sub(r"<[^>]+>", "", text)

def load_taxonomy_tree(tree_path: str):
    """
    Parse lines like
        "    Agathobaculum faecis [NCBITaxon_1904561]"
    and build a list of (label, tax_id, depth).
    """
    line_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows    = []

    with open(tree_path, encoding="utf-8") as fh:
        for line in fh:
            m = line_re.match(line)
            if not m:
                continue
            label, tax_id = m.groups()
            depth         = len(line) - len(line.lstrip())
            rows.append((label, tax_id, depth))

    return rows


# ---------------------------------------------------------------------------
# 1)  Build the taxonomy list and a quick exact-match index
# ---------------------------------------------------------------------------

TREE_FILE = "bacteria_tree1.txt"      # adjust if needed
tax_rows  = load_taxonomy_tree(TREE_FILE)

exact_index = defaultdict(list)
for lbl, tax_id, depth in tax_rows:
    exact_index[lbl.lower()].append((lbl, tax_id, depth))

# ---------------------------------------------------------------------------
# 2)  Pre-compute TF-IDF vectors for cosine similarity
# ---------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

labels_only       = [row[0] for row in tax_rows]
vectorizer        = TfidfVectorizer(stop_words="english")
taxon_matrix      = vectorizer.fit_transform(labels_only)   # sparse CSR
# Keep lookup from matrix row -> (label, tax_id, depth)
row2meta = tax_rows

def top_cosine_matches(term, k=5, min_score=0.15):
    """Return the k best cosine-similarity hits above min_score."""
    q_vec      = vectorizer.transform([term])
    scores     = cosine_similarity(q_vec, taxon_matrix).ravel()
    best_idx   = np.argsort(scores)[::-1]   # descending
    results    = []
    for idx in best_idx[:k]:
        score = scores[idx]
        if score < min_score:
            break
        lbl, tid, depth = row2meta[idx]
        results.append((lbl, tid, depth, score))
    return results


# ---------------------------------------------------------------------------
# 3)  pull the query terms from the JSON annotations
# ---------------------------------------------------------------------------
with open(r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\data\train_platinum\train_platinum.json", encoding="utf-8") as f:
    records = json.load(f)

bacteria_terms = set()
for rec in records.values():
    for ent in rec.get("entities", []):
        if ent.get("label", "").lower() == "bacteria":
            raw   = ent["text_span"]
            clean = strip_html_tags(raw).strip()
            if clean:
                bacteria_terms.add(clean)
# ---------------------------------------------------------------------------
# 4)  match & print
# ---------------------------------------------------------------------------
print(bacteria_terms)
for term in sorted(bacteria_terms):
    print(f"Query: {term}")

    # ---- exact ↓ ----------------------------------------------------------
    hits = exact_index.get(term.lower(), [])
    if hits:
        for lbl, tid, depth in hits:
            print(f"  • {lbl:40s} ID={tid:15s} depth={depth:<2d}  score=1.00 (exact)")
        print()
        continue

    # ---- cosine fallback ↓ -----------------------------------------------
    cos_hits = top_cosine_matches(term, k=5, min_score=0.75)
    if not cos_hits:
        matches = find_mesh_match(term, name_index)
        for name, ui, tree, score in matches[:5]:
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score={score:.2f}")
            #print("  no matches\n")
    else:
        for lbl, tid, depth, score in cos_hits:
            print(f"  • {lbl:40s} ID={tid:15s} depth={depth:<2d}  score={score:.2f}")
        print()


{'Odoribacter', 'Haemophilus', 'Streptococcus thermophilus', 'coliform', 'Bifidobacterium adolescentis', 'Lachnospiraceae family', 'Lacticaseibacillus paracasei', 'Parabacteroides', 'Clostridium sp. BR31', 'Salmonella', 'short-chain fatty acid producing bacteria', 'Ruminococcaceae Incertae Sedis', 'Eggerthellaceae', 'gut Christensenellaceae family', 'hypnotic psychobiotic strain', 'Parasutterella', 'genus_Eubacterium', 'Psychobiotics', 'intestinal bacteria', 'Desulfovibrio', 'Lactobacillaceae', 'Intestinibacter', 'UCG-005', 'Blautia', 'short-chain fatty acid (SCFA)-producing bacteria', 'Lactobacillus plantarum R1012', 'Faecalibacterium', 'Corynebacterium', 'Enterobacter cloacae', 'probiotics', 'Bifidobacteriaceae', 'Fusicatenibacter saccharivorans', 'Gut Bacteria', 'S24-7', 'genus_Subdoligranulum', 'Lachnostridium', 'Lactobacillus plantarum CCFM8661', 'Helicobacter pylori', 'Oscillospira', 'xylanophilum_group', 'UCG009', 'Enterococcus', 'Christensenellaceae R-7 group', 'Megasphaera', '

NameError: name 'find_mesh_match' is not defined

In [1]:
import re
import json
import numpy as np
import xml.etree.ElementTree as ET
from difflib import get_close_matches, SequenceMatcher
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# --- 1) parse MeSH descriptors and build only the 'Bacteria' subtree ---
def parse_mesh_descriptors(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el   = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui   = ui_el.text
        name = name_el.text
        tree_nums = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

def get_bacteria_taxonomy(descriptors):
    # find the 'Bacteria' root (TreeNumber prefix B03)
    prefix = None
    for d in descriptors:
        if d['name'].lower() == 'bacteria' and d['tree_numbers']:
            prefix = d['tree_numbers'][0]
            break
    if not prefix:
        raise RuntimeError("Could not find 'Bacteria' descriptor")
    tax = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == prefix or tn.startswith(prefix + '.'):
                tax[tn] = {'ui':d['ui'], 'name':d['name']}
                break
    return tax

def build_name_index(taxonomy):
    idx = {}
    for tree_num, info in taxonomy.items():
        key = info['name'].lower()
        idx.setdefault(key, []).append((tree_num, info['ui'], info['name']))
    return idx

MESH_XML = 'desc2025.xml'
descriptors   = parse_mesh_descriptors(MESH_XML)
bacteria_tax  = get_bacteria_taxonomy(descriptors)
name_index    = build_name_index(bacteria_tax)

# --- 2) load bacterial terms from your JSON ---
def strip_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)

with open("train_platinum.json", "r", encoding="utf-8") as f:
    data = json.load(f)

bacteria_terms = sorted({
    strip_html_tags(ent["text_span"]).strip()
    for rec in data.values()
    for ent in rec.get("entities", [])
    if ent.get("label","").lower() == "bacteria"
})

# --- 3) prepare MeSH names list aligned with name_index ---
mesh_names   = sorted({info[2] for infos in name_index.values() for info in infos})
mesh_triples = [name_index[name.lower()][0] for name in mesh_names]  # (tree,ui,name)

# --- 4) embed both lists ---
model = SentenceTransformer('all-mpnet-base-v2')
mesh_embeds = model.encode(mesh_names,   convert_to_tensor=False)  # (M, D)
bact_embeds = model.encode(bacteria_terms, convert_to_tensor=False)  # (N, D)

# --- 5) compute M×N cosine-similarity ---
sims = cosine_similarity(mesh_embeds, bact_embeds)  # shape=(M,N)

# --- 6) for each bacteria, exact-match or top 5 by similarity ---
for j, bact in enumerate(bacteria_terms):
    print(f"Query: {bact}")
    key = bact.lower()
    if key in name_index:
        # exact match: list all tree/UI for this name
        for tree, ui, name in name_index[key]:
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score=1.00")
    else:
        # no exact match: find top 5 mesh indices
        col = sims[:, j]
        top5 = np.argsort(col)[::-1][:5]
        for i in top5:
            name       = mesh_names[i]
            tree, ui,_ = mesh_triples[i]
            score      = col[i]
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score={score:.2f}")
    print()


FileNotFoundError: [Errno 2] No such file or directory: 'train_platinum.json'

In [50]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-4.1.0




In [2]:
!pip install owlready2

Collecting owlready2
  Downloading owlready2-0.47.tar.gz (27.3 MB)
     ---------------------------------------- 0.0/27.3 MB ? eta -:--:--
     ---------------------------------------- 0.3/27.3 MB ? eta -:--:--
      --------------------------------------- 0.5/27.3 MB 1.5 MB/s eta 0:00:18
      --------------------------------------- 0.5/27.3 MB 1.5 MB/s eta 0:00:18
     - -------------------------------------- 0.8/27.3 MB 1.1 MB/s eta 0:00:25
     - -------------------------------------- 1.0/27.3 MB 1.0 MB/s eta 0:00:27
     - -------------------------------------- 1.3/27.3 MB 1.0 MB/s eta 0:00:26
     -- ------------------------------------- 1.6/27.3 MB 1.0 MB/s eta 0:00:25
     -- ------------------------------------- 1.6/27.3 MB 1.0 MB/s eta 0:00:25
     -- ------------------------------------- 1.8/27.3 MB 1.0 MB/s eta 0:00:26
     --- ------------------------------------ 2.1/27.3 MB 1.0 MB/s eta 0:00:25
     --- ------------------------------------ 2.4/27.3 MB 1.0 MB/s eta 0:00:25



In [1]:
from owlready2 import *

# 1.  LOAD THE ONTOLOGY ------------------------------------------------------
WORLD = World()
onto  = WORLD.get_ontology(
           r"C:\Users\samue\OneDrive\Desktop\foodon.owl"
        ).load()

# 2.  PICK THE ROOT CLASS ----------------------------------------------------
root = onto.search_one(iri = "*NCIT_C62695")   # Bacteria

# 3.  WALK AND WRITE ---------------------------------------------------------
out_path  = "food_tree.txt"     # file to create
max_depth = None                    # None = no depth-limit   (use an int if you
                                   #        want to cut it off, e.g. 10)

with open(out_path, "w", encoding="utf-8") as fh:

    def walk(cls, depth=0):
        # depth-limit (works even when max_depth is None)
        if max_depth is not None and depth > max_depth:
            return

        label = cls.label.first() or cls.name
        fh.write("  " * depth + f"{label} [{cls.name}]\n")

        for sub in cls.subclasses():
            walk(sub, depth + 1)

    # ---- start the traversal ----
    walk(root)

print(f"Done! Taxonomy saved to {out_path}")

Done! Taxonomy saved to food_tree.txt


In [1]:
# Cell 1: Imports, paths, and function definition

import gzip
from rdflib import Graph, RDFS

# Path al tuo file OWL compresso
gz_path = r"C:\Users\samue\OneDrive\Desktop\chebi.owl.gz"
# Path al file di destinazione per i risultati
output_path = r"C:\Users\samue\OneDrive\Desktop\chebi_labels.txt"

def save_chebi_labels(gz_path: str, output_path: str):
    """
    Streams and parses un file OWL gzippato,
    poi salva ogni URI di classe CHEBI e la sua rdfs:label in output_path.
    """
    g = Graph()
    print(f"Parsing '{gz_path}' (quest’operazione potrebbe richiedere alcuni minuti)...")
    with gzip.open(gz_path, 'rt', encoding='utf-8') as fh:
        g.parse(source=fh, format='xml')
    print(f"Scrivo i risultati in '{output_path}'...")
    with open(output_path, 'w', encoding='utf-8') as out:
        # intestazione
        out.write("URI\tName\n")
        # estrai tutte le triple (soggetto, rdfs:label, oggetto)
        for subj, _, label in g.triples((None, RDFS.label, None)):
            uri = str(subj)
            if uri.startswith("http://purl.obolibrary.org/obo/CHEBI_"):
                out.write(f"{uri}\t{label}\n")
    print("Fatto!")

if __name__ == "__main__":
    save_chebi_labels(gz_path, output_path)

Parsing 'C:\Users\samue\OneDrive\Desktop\chebi.owl.gz' (quest’operazione potrebbe richiedere alcuni minuti)...
Scrivo i risultati in 'C:\Users\samue\OneDrive\Desktop\chebi_labels.txt'...
Fatto!


In [4]:
from owlready2 import get_ontology, Thing

# 1) Carica il file OWL
onto = get_ontology(r"C:\Users\samue\OneDrive\Desktop\ohmi.owl").load()

def write_taxonomy(node, file_handle, level=0, visited=None):
    """
    Scrive su file_handle la riga per 'node' (label [ID]) con indentazione
    e poi richiama sé stessa per ciascuna sottoclasse diretta.
    Usa 'visited' per evitare cicli in caso di ereditarietà multipla.
    """
    if visited is None:
        visited = set()
    if node in visited:
        return
    visited.add(node)

    indent = "  " * level
    # prova a prendere rdfs:label, altrimenti fallback al nome della classe
    label = node.label[0] if node.label else node.name
    file_handle.write(f"{indent}{label} [{node.name}]\n")

    # ordina le sottoclassi per label o nome
    children = sorted(
        node.subclasses(),
        key=lambda x: (x.label[0] if x.label else x.name).lower()
    )
    for child in children:
        write_taxonomy(child, file_handle, level + 1, visited)

# 2) Apri il file di output e scrivi l’intero albero a partire da owl:Thing
output_file = r"C:\Users\samue\OneDrive\Desktop\OHMI_full_taxonomy.txt"
with open(output_file, "w", encoding="utf-8") as out:
    write_taxonomy(Thing, out)

print(f"Fatto! Trovi l’albero completo in:\n  {output_file}")

Fatto! Trovi l’albero completo in:
  C:\Users\samue\OneDrive\Desktop\OHMI_full_taxonomy.txt


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from owlready2 import get_ontology
import os

# 1. Path to your local copy of the FoodOn OWL
OWL_PATH = r"C:\Users\samue\OneDrive\Desktop\foodon.owl"

# 2. Load the ontology
onto = get_ontology(f"file://{os.path.abspath(OWL_PATH)}").load()

# 3. Output file
OUT_PATH = "foodon_terms.txt"

with open(OUT_PATH, "w", encoding="utf-8") as out:
    for cls in onto.classes():
        # Only process terms whose IRI contains the FOODON_ prefix
        if "FOODON_" not in cls.iri:
            continue

        # Extract the numeric ID (e.g. "00002412")
        term_id = cls.iri.rsplit("FOODON_", 1)[-1]

        # Use rdfs:label if available, otherwise fall back to the class name
        label = cls.label.first() if cls.label else cls.name

        out.write(f"{label} [FOODON_{term_id}]\n")

print(f"Wrote {len(list(onto.classes()))} terms to {OUT_PATH}")


  http://purl.obolibrary.org/obo/FOODON_00001829

  http://purl.obolibrary.org/obo/FOODON_00002511



Wrote 39434 terms to foodon_terms.txt
