In [1]:
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher, get_close_matches

def parse_mesh_descriptors(xml_path):
    """Parse the MeSH XML and return a list of descriptors."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el   = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui   = ui_el.text
        name = name_el.text
        tree_nums = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

MESH_XML = 'desc2025.xml'
descriptors = parse_mesh_descriptors(MESH_XML)
print(f"Parsed {len(descriptors)} descriptors")

Parsed 30956 descriptors


In [2]:
import xml.etree.ElementTree as ET

def parse_mesh_descriptors(xml_path):
    """
    Parse the MeSH XML and return a list of dicts:
      { 'ui': DescriptorUI,
        'name': DescriptorName,
        'tree_numbers': [treeNum1, treeNum2, ...]
      }
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui = ui_el.text
        name = name_el.text
        tree_numbers = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({
            'ui': ui,
            'name': name,
            'tree_numbers': tree_numbers
        })
    return descriptors

def get_bacteria_taxonomy(xml_path):
    """
    From the full MeSH descriptors file, extract all descriptors
    under the 'Bacteria' branch (tree number B03).
    Returns a dict: { tree_number: { 'ui': ..., 'name': ... }, ... }
    """
    descriptors = parse_mesh_descriptors(xml_path)

    bacteria_prefix = None
    for d in descriptors:
        if d['name'] == 'Bacteria':
            bacteria_prefix = d['tree_numbers'][0]
            break

    if not bacteria_prefix:
        raise RuntimeError("Couldn't find a descriptor named 'Bacteria' in the file.")

    taxonomy = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == bacteria_prefix or tn.startswith(bacteria_prefix + '.'):
                taxonomy[tn] = {
                    'ui': d['ui'],
                    'name': d['name']
                }
                break

    return taxonomy

if __name__ == '__main__':
    xml_file = 'desc2025.xml'
    bacteria_tax = get_bacteria_taxonomy(xml_file)

In [3]:
from difflib import SequenceMatcher

def find_mesh_match(input_name, name_index, n=5, cutoff=0.6):
    """
    Return matches for input_name among the MeSH bacterial names.
    Exact matches come first with score=1.0.
    Then fuzzy matches (score computed via SequenceMatcher.ratio).
    """
    key = input_name.lower()
    results = []
    if key in name_index:
        for tree_num, ui, name in name_index[key]:
            results.append((name, ui, tree_num, 1.0))
        return results

    all_names = list(name_index.keys())
    
    close = get_close_matches(key, all_names, n=n, cutoff=cutoff)
    for cname in close:
        score = SequenceMatcher(None, key, cname).ratio()
        for tree_num, ui, name in name_index[cname]:
            results.append((name, ui, tree_num, score))
    results.sort(key=lambda x: x[3], reverse=True)
    return results

def build_name_index(taxonomy):
    """
    Build a dict: lower-case name -> list of (tree#, ui, canonical name)
    """
    idx = {}
    for tree_num, info in taxonomy.items():
        key = info['name'].lower()
        idx.setdefault(key, []).append((tree_num, info['ui'], info['name']))
    return idx

name_index = build_name_index(bacteria_tax)
print(f"Indexed {len(name_index)} bacterial names")

Indexed 859 bacterial names


In [4]:
import xml.etree.ElementTree as ET

def load_mesh_definitions(xml_path: str) -> dict:
    """
    Parse the MeSH XML and return a dict mapping DescriptorUI → ScopeNote text.
    """
    defs = {}
    tree = ET.parse(xml_path)
    root = tree.getroot()
    for dr in root.findall('DescriptorRecord'):
        ui = dr.findtext('DescriptorUI')
        # ScopeNote can appear directly or under ScopeNoteList/ScopeNote
        sn = dr.findtext('ScopeNote') or dr.findtext('ScopeNoteList/ScopeNote')
        if ui and sn:
            defs[ui] = sn.strip()
    return defs

# load once at module top
MESH_XML = 'desc2025.xml'
mesh_definitions = load_mesh_definitions(MESH_XML)

def get_mesh_description(uri: str, mesh_defs: dict) -> str | None:
    """
    Given a full MeSH URI like "https://.../ui", return its ScopeNote definition if present.
    """
    ui = uri.rsplit('/', 1)[-1]
    return mesh_defs.get(ui)


In [5]:
import re, json, numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def load_taxonomy_tree(path):
    row_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = row_re.match(ln)
            if m:
                label, tid = m.groups()
                depth      = len(ln) - len(ln.lstrip())
                rows.append((label, tid, depth))
    return rows

TAX_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\bacteria_tree1.txt"
rows     = load_taxonomy_tree(TAX_FILE)

exact_ix  = defaultdict(list)
for lbl, tid, depth in rows:
    exact_ix[lbl.lower()].append((lbl, tid, depth))

labels_only  = [r[0] for r in rows]
vec          = TfidfVectorizer(stop_words="english")
mat          = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v   = vec.transform([term])
    sc  = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr: break
        lbl, tid, d = rows[i]
        out.append((lbl, tid, d, sc[i]))
    return out

abbr_re = re.compile(r"^([A-Z])\.\s+([A-Za-z_-]+)$")

def preprocess(term):
    term = term.replace('_', ' ')
    term = term.strip()
    return " ".join(lemmatizer.lemmatize(w) for w in term.split())

def genus_abbrev_lookup(term):
    m = abbr_re.match(term)
    if not m:
        return []
    initial, species = m.groups()
    species = species.lower()
    hits = []
    for lbl, tid, d in rows:
        if lbl.lower().endswith(' ' + species) and lbl[0].upper() == initial:
            hits.append((lbl, tid, d, 1.00))
    return hits

<h1>INGEST BACTERIA</h1>

In [6]:
import os
import re
import unicodedata
import json
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS

# -----------------------------------------------------------------------------
# 1. Setup paths and namespaces
# -----------------------------------------------------------------------------
path = str(Path(os.path.abspath(os.getcwd())).absolute())
json_file = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\data\train_platinum\train_platinum.json"
#json_file = os.path.join(path, "train_gold.json")

tokenized_file = os.path.join(path, "tokenized_sentences_with_entitiesv2.json")
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")
GUTBRAINMENTION = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/mention/")
GUTBRAINSENTENCE = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/sentence/")
GUTBRAINABSTRACT = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/abstract/")
GUTBRAINTITLE = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/title/")

PAPER_CLASS       = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Paper")
MENTION_CLASS     = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Mention")
PAPER_ABSTRACT    = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperAbstract")
PAPER_TITLE       = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperTitle")
PAPER_COLLECTION  = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperCollection")
PROJECT           = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Project")
SAMPLE            = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Sample")
SENTENCE          = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Sentence")

# -----------------------------------------------------------------------------
# 2. Load the JSON paper data
# -----------------------------------------------------------------------------
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)
# -----------------------------------------------------------------------------
# 3. Mapping dictionaries (keys must be in Title case)
# -----------------------------------------------------------------------------
label_mapping = {
    "Anatomical Location":   URIRef("https://w3id.org/brainteaser/ontology/schema/AnatomicalSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Animal"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/BiomedicalTechnique"),
    "Bacteria":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Chemical"),
    "Dietary Supplement":    URIRef("https://w3id.org/brainteaser/ontology/schema/DietarySupplement"),
    "DDF":                   URIRef("https://w3id.org/brainteaser/ontology/schema/DiseaseDisorderOrFinding"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Drug"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Food"),
    "Gene":                  URIRef("https://w3id.org/brainteaser/ontology/schema/Gene"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Human"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/StatisticalTechnique")
}
concept_scheme_mapping = {
    "Anatomical Location":   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/AnatomicSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Animal"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Human"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Drug"),
    "Gene":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Gene"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DietarySupplement"),
    "DDF":                   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DiseaseDisorderOrFinding"),
    "Metabolite":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Metabolite"),
    "Bacteria":               URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Bacteria"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Food"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Chemical"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/BiomedicalTechnique"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/StatisticalTechnique")
}
tokenized_mentions = {}

BACTERIA_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species")
FAMILY_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Family")
OBO_BASE = "http://purl.obolibrary.org/obo/"
MESH_BASE = "https://www.ncbi.nlm.nih.gov/mesh/"
BACTERIA_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Bacteria")

g = Graph()
g.bind("gutbrain", GUTBRAIN)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("owl", OWL)
g.bind("gutprop", GUTPROP)

In [None]:
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, get_foodon_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256
from groqutils import get_llm_definition
from umlsutils import get_umls_definition

g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))
g.add((SKOS.broaderTransitive, RDF.type, OWL.ObjectProperty))

g.add((GUTPROP.partOf, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.partOf, RDFS.label, Literal("partOf", datatype=XSD.string)))
g.add((GUTPROP.hasTitle, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasTitle, RDFS.label, Literal("hasTitle", datatype=XSD.string)))
g.add((GUTPROP.hasAbstract, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasAbstract, RDFS.label, Literal("hasAbstract", datatype=XSD.string)))

g.add((GUTPROP.containedIn, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.containedIn, RDFS.label, Literal("containedIn", datatype=XSD.string)))
g.add((GUTBRAIN.contains, RDF.type, OWL.ObjectProperty))
g.add((GUTBRAIN.contains, RDFS.label, Literal("contains", datatype=XSD.string)))
g.add((GUTPROP.composedOf, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.composedOf, RDFS.label, Literal("composedOf", datatype=XSD.string)))

g.add((GUTPROP.locatedIn, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.locatedIn, RDFS.label, Literal("locatedIn", datatype=XSD.string)))

g.add((GUTPROP.paperId, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperId, RDFS.label, Literal("paperId", datatype=XSD.string)))
g.add((GUTPROP.paperAnnotator, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperAnnotator, RDFS.label, Literal("paperAnnotator", datatype=XSD.string)))
g.add((GUTPROP.paperYear, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperYear, RDFS.label, Literal("paperYear", datatype=XSD.string)))
g.add((GUTPROP.paperJournal, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperJournal, RDFS.label, Literal("paperJournal", datatype=XSD.string)))
g.add((GUTPROP.paperAuthor, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperAuthor, RDFS.label, Literal("paperAuthor", datatype=XSD.string)))
g.add((GUTPROP.numberOfRunsFound, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.numberOfRunsFound, RDFS.label, Literal("numberOfRunsFound", datatype=XSD.string)))
g.add((GUTPROP.NCBITaxonID, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.NCBITaxonID, RDFS.label, Literal("NCBITaxonID", datatype=XSD.string)))
g.add((GUTPROP.sdRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.sdRelativeAbundance, RDFS.label, Literal("sdRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.medianRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.medianRelativeAbundance, RDFS.label, Literal("medianRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.meanRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.meanRelativeAbundance, RDFS.label, Literal("meanRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.scientificName, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.scientificName, RDFS.label, Literal("scientificName", datatype=XSD.string)))
g.add((GUTPROP.hasMentionText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasMentionText, RDFS.label, Literal("hasMentionText", datatype=XSD.string)))
g.add((GUTPROP.hasSentenceText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasSentenceText, RDFS.label, Literal("hasSentenceText", datatype=XSD.string)))
g.add((GUTPROP.hasTitleText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasTitleText, RDFS.label, Literal("hasTitleText", datatype=XSD.string)))
g.add((GUTPROP.hasAbstractText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasAbstractText, RDFS.label, Literal("hasAbstractText", datatype=XSD.string)))
g.add((GUTPROP.taggedAs, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.taggedAs, RDFS.label, Literal("taggedAs", datatype=XSD.string)))

for scheme_uri in set(concept_scheme_mapping.values()):
    keys = [k for k, v in concept_scheme_mapping.items() if v == scheme_uri]
    label_text = ", ".join(k.title() for k in keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

is_train_platinum = "train_platinum" in os.path.basename(json_file)
#is_train_gold = "train_gold" in os.path.basename(json_file)

#if is_train_gold:
 #   gold_collection_uri = URIRef(GUTBRAIN["goldCollection"])
 #   label_text = "goldCollection"
  #  g.add((gold_collection_uri, RDF.type, PAPER_COLLECTION))
   # g.add((gold_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))
    
if is_train_platinum:
    platinum_collection_uri = URIRef(GUTBRAIN["platinumCollection"])
    label_text = "platinumCollection"
    g.add((platinum_collection_uri, RDF.type, PAPER_COLLECTION))
    g.add((platinum_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

def create_uri_fragment(text):
    cleaned = re.sub(r'<[^>]*>', '', text)
    cleaned = normalize_text(cleaned)
    cleaned = re.sub(r'[^\w\u0370-\u03FF-]', '_', cleaned)
    return cleaned

def to_camel_case(s):
    s = re.sub(r'[^\w\s]', '', s)
    parts = re.split(r'\s+', s.strip())
    if not parts:
        return ""
    return parts[0].lower() + ''.join(word.title() for word in parts[1:])

def normalize_text(text):
    return unicodedata.normalize('NFC', text)

def normalize_to_ascii(s: str) -> str:
    nfkd = unicodedata.normalize('NFKD', s)
    ascii_bytes = nfkd.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def singularize(term):
    if term.endswith("ies"):
        return term[:-3] + "y"
    elif term.endswith("s") and not term.endswith("ss"):
        return term[:-1]
    return term

def choose_definition(uri_str: str, term_raw: str) -> str:
    # 1) NCIT
    raw_ncit = None
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("/",1)[-1].split("_",1)[1]
        try:
            raw_ncit = get_ncit_description(ncit_id, ncit_file)
        except KeyError:
            pass

    # 2) ChEBI
    raw_chebi = None
    if not raw_ncit:
        try:
            raw_chebi = get_chebi_description(uri_str, chebi_file)
        except KeyError:
            pass

    # 3) OMIT
    raw_omit = None
    if not raw_ncit and not raw_chebi:
        try:
            raw_omit = get_omit_description(uri_str, omit_file)
        except KeyError:
            pass

    # 4) FOODON
    raw_foodon = None
    if not raw_ncit and not raw_chebi and not raw_omit:
        try:
            raw_foodon = get_foodon_description(uri_str, foodon_file)
        except KeyError:
            pass

    # 5) MeSH
    raw_mesh = None
    if not raw_ncit and not raw_chebi and not raw_omit and not raw_foodon:
        raw_mesh = get_mesh_description(uri_str, mesh_definitions)

    # 6) LLM fallback
    if raw_ncit:
        m = re.search(r'—\s*(.*?)\s*—', raw_ncit)
        text = m.group(1).strip() if m else raw_ncit.strip()
        return f"{text} [Definition Source: NCIT]"
    elif raw_chebi:
        return f"{raw_chebi.strip()} [Definition Source: ChEBI]"
    elif raw_omit:
        return f"{raw_omit.strip()} [Definition Source: OMIT]"
    elif raw_foodon:
        return f"{raw_foodon.strip()} [Definition Source: FOODON]"
    elif raw_mesh:
        return f"{raw_mesh.strip()} [Definition Source: MeSH]"
    else:
        llm_def = get_llm_definition(term_raw)
        return f"{llm_def} [Definition Source: llama3-8b-8192]"

manual_created = {}
created = {}

manual_created = {
    "family_ruminococcaceae": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C2584567"),
    "methanobrevibacter": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0995874"),
    "rikenellaceae" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1080664"),
    "firmicutes": URIRef("https://www.ncbi.nlm.nih.gov/mesh/?term=firmicutes"),
    "toxoplasma_gondii" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0040557"),
    "microorganism": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0445623"),
    "psychobiotics": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/species/Psychobiotics"),
    "probiotics": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0525033"),
    "mucus-associated_species": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/species/Mucus-AssociatedSpecies"),
    "short-chain_fatty_acid_producing_bacteria": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/species/Short-ChainFattyAcidProducingBacteria"),
    "acute_chronic_insomnia-related_signature_bacteria": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/species/AcuteChronicInsomniaRelatedSignatureBacteria"),
    "flavonifractor": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C2959924"),
    "agathobacter_rectalis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0317478"),
    "coliform_bacteria": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0314760"),
    "bifidobacterium" : URIRef("http://purl.obolibrary.org/obo/NCBITaxon_1678"),
    "lactobacillus" : URIRef("http://purl.obolibrary.org/obo/NCBITaxon_1578"),
    "flavonifractor" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C2959924"),
    "phylum_synergistetes" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C2309303"),
    "leptospiraceae" : URIRef("https://www.ncbi.nlm.nih.gov/mesh/D016953"),
    "prevotella" : URIRef("https://www.ncbi.nlm.nih.gov/mesh/D018720"),
    "clostridium" : URIRef("https://www.ncbi.nlm.nih.gov/mesh/D003013"),
    "ruminococcus" : URIRef("http://purl.obolibrary.org/obo/NCBITaxon_1263"),
  }

created = dict(manual_created)

regex_map = [
    (r"lactobacillus_plantarum_ccfm8661","acinetobacter_plantarum"),
    (r"live_and_inactivated_lactobacillus_plantarum_ccfm8661","acinetobacter_plantarum"),
    (r"live_l__plantarum_ccfm8661","acinetobacter_plantarum"),
    (r"inactivated_l__plantarum_ccfm8661","acinetobacter_plantarum"),
    (r"\bprevotella_specie\w*\b","prevotella"),
    (r"anaerostipe","anaerostipes"),
    (r"\blactococcu\w*\b","lactococcus"),
    (r"lactobacillus_reuteri_atg-f4","limosilactobacillus_reuteri"),
    (r"lachnospiraceae_incertae_sedi","leptospiraceae"),
    (r"\bl_plantarum_ccfm8661\b","acinetobacter_plantarum"),
    (r"ruminococcaceae","family_ruminococcaceae"),
    (r"\bl__brevis_p30021\b","lactobacillus_brevis"),
    (r"\bl\b","lactobacillus"),
    (r"plantarum p-80","acinetobacter_plantarum"),
    (r"bacteroidaceae_and_porphyromonadaceae_families","bacteroidaceae"),
    (r"rikencellaceae","rikenellaceae"),
    (r"biffdobacterium","bifidobacterium"),
    (r"bifidobacterium","bifidobacterium"),
    (r"bacteroidaceae_and_porphyromonadaceae_family","bacteroidaceae"),
    (r"lactobacillus_plantarum_r1012","acinetobacter_plantarum"),
    (r"maternal_vaginal_microbes","bacteria"),
    (r"maternal_vaginal_microbiome","microbacterium"),
    (r"ruminococcaceae_incertae_sedis","family_ruminococcaceae"),
    (r"microorganisms","microorganism"),
    (r"hypnotic_psychobiotic_strain","psychobiotics"),
    (r"psychobiotic_strain","psychobiotics"),
    (r"ps150","limosilactobacillus_fermentum"),
    (r"\bl_fermentum_strain\b","limosilactobacillus_fermentum"),
    (r"gr1009","limosilactobacillus_fermentum"),
    (r"heat-killed_ps150","limosilactobacillus_fermentum"),
    (r"cancer-related_bacteria","bacteria"),
    (r"short-chain_fatty_acid__scfa_-producing_bacteria","short-chain_fatty_acid_producing_bacteria"),
    (r"clostridiuminnocuumgroup","clostridium_innocuum_group"),
    (r"gut_flavonifractor_genus","flavonifractor"),
    (r"gut_christensenellaceae_family","christensenellaceae"),
    (r"ucg009","family_ruminococcaceae"),
    (r"eubacterium_rectale","agathobacter_rectalis"),
    (r"coliform_genera","coliform_bacteria"),
    (r"bifidobacteria", "bifidobacterium"),
    (r"\bparabacteroide\w*\b","bacteroide"),
    (r"\blactobacillu","lactobacillus"),
    (r"\bbacteroide\w*\b","bacteroide"),
    (r"\bgut_bacteria\b","bacteria"),
    (r"\bbacteroidete\w*\b","bacteroidete"),
    (r"oral_bacteria\b", "bacteria"),
    (r"maternal_vaginal_microbe","microorganism"),
    (r"l__fermentum_strain","limosilactobacillus_fermentum"),
    (r"synergistete","phylum_synergistetes"),
    (r"gut_flavonifractor_genu","flavonifractor"),
    (r"\blactobacillacea\b","lactobacillaceae"),
    (r"\bstreptococcu\w*\b","streptococcus"),
    (r"\bruminococcu\w*\b","ruminococcus"),
    (r"\bec_ruminococcus_torques\b","ruminococcus"),
    (r"\bruminococcus\b","ruminococcus"),
    (r"\bcoprococcu\w*\b", "micrococcus"),
    (r"butyrate-producing_bacteria","butyrate-producing_bacterium"),
    (r"\bclostridium_sp__br31\b", "clostridium"),
    (r"\bmegamona\b","comamonas"),
    (r"\bcoproccu\w*\b","micrococcus"),
    (r"\bf4\b","bacillus_sp._f4-1"),
    (r"\bhaemophilu\w*\b","haemophilus"),
    (r"\bintestinal_bacteria\b","bacteria"),
    (r"\bactinobacteriota\b","actinobacteria"),
    (r"\bacetate-producing_bacteria\b","actinobacteria"),
    (r"\bverrucomicrobia\b","verrucomicrobia_bacterium_b-1-8")
    
]
mesh_descs   = parse_mesh_descriptors(MESH_XML)

for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
CREATOR = "Samuel Piron"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        # only a NCBITaxon‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "CHEBI" in uri_str:
        # only a CHEBI‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("CHEBI Match", datatype=XSD.string)))
    elif "GO" in uri_str:
        # only a FOODON‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("GO Match", datatype=XSD.string)))
    elif "genome" in uri_str:
        # only a KO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("KEGG Match", datatype=XSD.string)))
    elif "mesh" in uri_str:
        # only a MESH‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "PCO" in uri_str:
        # only a XCO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("PCO Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))

created["bacteria"] = URIRef(f"{OBO_BASE}NCBITaxon_2")
g.add((created["bacteria"], RDF.type, FAMILY_CLASS))
g.add((created["bacteria"], RDF.type, SKOS.Concept))
g.add((created["bacteria"], RDFS.label, Literal("Bacteria", datatype=XSD.string)))
g.add((created["bacteria"], SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))

# -----------------------------------------------------------------------------
# 5. Process each paper (each key in the JSON represents a paper)
# -----------------------------------------------------------------------------
for paper_id, paper_data in data.items():
    paper_uri = URIRef(GUTBRAIN[f"paper_{paper_id}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))
    
    if is_train_platinum:
        g.add((paper_uri, GUTPROP.partOf, platinum_collection_uri))
        g.add((platinum_collection_uri, GUTBRAIN.contains, paper_uri))

    #if is_train_gold:
    #    g.add((paper_uri, GUTPROP.partOf, gold_collection_uri))
    #    g.add((gold_collection_uri, GUTBRAIN.contains, paper_uri))
    
    # Each paper gets its own mention node
    #paper_mention = URIRef(GUTBRAIN[f"mention_{paper_id}"])
    #g.add((paper_mention, RDF.type, MENTION_CLASS))
    #g.add((paper_uri, GUTPROP.hasMention, paper_mention))
    
    metadata = paper_data.get("metadata", {})
    full_title = metadata.get("title", None)
    full_abstract = metadata.get("abstract", None)
    try:
        paper_id_val = int(paper_id)
    except ValueError:
        paper_id_val = paper_id
    paper_annotator = metadata.get("annotator", None)
    paper_year = metadata.get("year", None)
    paper_journal = metadata.get("journal", None)
    paper_author = metadata.get("author", None)
    
    g.add((paper_uri, GUTPROP.paperId, Literal(paper_id_val, datatype=XSD.integer)))
    if paper_annotator is not None:
        g.add((paper_uri, GUTPROP.paperAnnotator, Literal(paper_annotator, datatype=XSD.string)))
    if paper_year is not None:
        g.add((paper_uri, GUTPROP.paperYear, Literal(paper_year, datatype=XSD.gYear)))
    if paper_journal is not None:
        g.add((paper_uri, GUTPROP.paperJournal, Literal(paper_journal, datatype=XSD.string)))
    if paper_author is not None:
        g.add((paper_uri, GUTPROP.paperAuthor, Literal(paper_author, datatype=XSD.string)))
    
    title_texts = []
    abstract_texts = []
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        label_title = raw_label
        
        if label_title == "bacteria":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                g.add((entity_uri, SKOS.broaderTransitive, created["bacteria"]))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                g.add((entity_uri, SKOS.broaderTransitive, created["bacteria"]))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue
                
            hits = genus_abbrev_lookup(term)
            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            if hits:
                for l,t,d,s in hits: 
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (abbr)")
                    entity_uri = URIRef(f"{OBO_BASE}{t}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(l.title(), datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["bacteria"]))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "mesh" in uri_str:
                        comment = "MESH Match"
                    elif "ncbitaxon" in uri_str:
                        comment = "NCBITaxon Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    continue

            #ex = exact_ix.get(term, [])
            elif ex:
                for label_name, taxon_id, depth in ex:
                    print(f"  • {label_name:40s} ID={taxon_id:15s} depth={depth:<2d} (exact)")
                    entity_uri = URIRef(f"{OBO_BASE}{taxon_id}")
                    created[term_raw] = entity_uri
                    #label2uri[label_name.lower()] = entity_uri
                    g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(label_name.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["bacteria"]))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "mesh" in uri_str:
                        comment = "MESH Match"
                    elif "ncbitaxon" in uri_str:
                        comment = "NCBITaxon Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            #cos = top_cosine(term)
            elif cos:
                for l,t,d,s in cos[:1]:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f}")
                    name_uri = URIRef(f"{OBO_BASE}{t}")
                    created[term_raw] = name_uri
                    #label2uri[label_name.lower()] = name_uri
                    g.add((name_uri, RDF.type, BACTERIA_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(l.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["bacteria"]))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "mesh" in uri_str:
                        comment = "MESH Match"
                    elif "ncbitaxon" in uri_str:
                        comment = "NCBITaxon Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
            else: 
                matches = find_mesh_match(term, name_index)
                if matches: 
                    for name, ui, tree, score in matches[:1]: 
                        print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  scoreMESH={score:.2f}")
                        name_uri = URIRef(f"{MESH_BASE}{ui}")
                        created[term_raw] = name_uri
                        g.add((name_uri, RDF.type, BACTERIA_CLASS))
                        g.add((name_uri, RDF.type, SKOS.Concept))
                        if (name_uri, RDFS.label, None) not in g:
                            g.add((name_uri,
                            RDFS.label,
                            Literal(name.title(), datatype=XSD.string)))
                        g.add((name_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                        g.add((name_uri, SKOS.broaderTransitive, created["bacteria"]))
                        uri_str = str(name_uri).lower()

                        if "stato_" in uri_str:
                            comment = "STATO Match"
                        elif "ncbitaxon_" in uri_str:
                            comment = "NCBITaxon Match"
                        elif "obi_" in uri_str:
                            comment = "OBI Match"
                        elif "umls" in uri_str:
                            comment = "UMLS Match"
                        elif "mesh" in uri_str:
                            comment = "MESH Match"
                        elif "ncbitaxon" in uri_str:
                            comment = "NCBITaxon Match"
                        else:
                            comment = CREATOR 
                        g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                        uri_str = str(name_uri)
                        definition = choose_definition(uri_str, term_raw)
                        existing_defs = [
                            c for c in g.objects(name_uri, RDFS.comment)
                            if "[Definition Source:" in str(c)
                        ]
                        if not existing_defs:
                            g.add((name_uri,
                                RDFS.comment,
                                Literal(definition, datatype=XSD.string)))
                        mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                        g.add((mention_uri, RDF.type, MENTION_CLASS))
                        g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                        g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                        g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title, datatype=XSD.string)))
                        g.add((name_uri, GUTPROP.containedIn, mention_uri))
                        tokenized_mentions[term_raw] = mention_uri
                        continue
                else:
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title, datatype=XSD.string)))
                    g.add((created["bacteria"], GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print("no match")
                    continue
        else:
            pass

    g.remove((created["bacteria"], RDF.type, BACTERIA_CLASS))
    
    location_lower = entity.get("location", "").strip().lower()
    text_span = entity.get("text_span", "").strip()
    canonical = create_uri_fragment(text_span)
    cleaned_text_span = normalize_to_ascii(canonical)
    if location_lower == "title":
        title_texts.append(cleaned_text_span)
    elif location_lower == "abstract":
        abstract_texts.append(cleaned_text_span)
    
    if full_title is None and title_texts:
        full_title = " ".join(title_texts)
    if full_abstract is None and abstract_texts:
        full_abstract = " ".join(abstract_texts)
    
    if full_title:
        title_uri = URIRef(GUTBRAINTITLE[f"{paper_id}"])
        g.add((title_uri, RDFS.label, Literal(f"title_{paper_id}", datatype=XSD.string)))
        g.add((title_uri, RDF.type, PAPER_TITLE))
        g.add((title_uri, GUTPROP.hasTitleText, Literal(full_title, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasTitle, title_uri))
    
    if full_abstract:
        abstract_uri = URIRef(GUTBRAINABSTRACT[f"{paper_id}"])
        g.add((abstract_uri, RDFS.label, Literal(f"abstract_{paper_id}", datatype=XSD.string)))
        g.add((abstract_uri, RDF.type, PAPER_ABSTRACT))
        g.add((abstract_uri, GUTPROP.hasAbstractText, Literal(full_abstract, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasAbstract, abstract_uri))


with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAINTITLE[f"{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAINABSTRACT[f"{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue 

        if label != "bacteria":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

Query: veillonella
lookup term: veillonella
  • Veillonella                              ID=NCBITaxon_29465 depth=12 (exact)
Query: roseburia
lookup term: roseburia
  • Roseburia                                ID=NCBITaxon_841   depth=12 (exact)
Query: christensenellaceae r-7 group
lookup term: christensenellaceae_r-7_group
  • Christensenellaceae                      ID=NCBITaxon_990719 depth=10 score=0.87
Query: subdoligranulum
lookup term: subdoligranulum
  • Subdoligranulum                          ID=NCBITaxon_292632 depth=12 (exact)
Query: oscillibacter
lookup term: oscillibacter
  • Oscillibacter                            ID=NCBITaxon_459786 depth=12 (exact)
Query: ucg-005
lookup term: ucg-005
  • Ruminococcaceae bacterium UCG-005        ID=NCBITaxon_3068309 depth=14 score=0.87
actinobacteria
Query: actinobacteria
lookup term: actinobacteria
  • Actinobacteria bacterium HGW-Actinobacteria-9 ID=NCBITaxon_2013654 depth=8  score=0.87
actinobacteria
Query: actinobacteria
lookup ter

<h1>INGEST CHEMICAL</h1>

In [None]:
import re
import json
import numpy as np
import requests
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from rdflib.namespace import DCTERMS
from umlsutils import best_umls_match, search_umls, get_umls_definition
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, get_foodon_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

CHEBI_BASE = "http://purl.obolibrary.org/obo/"
CHEMICAL_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Chemical")
CHEMICAL_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Chemical")
UBERON_URI = URIRef("http://purl.obolibrary.org/obo/UBERON_0002097")

def load_ncbitaxon_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = CHEBI_BASE + term_id
            rows.append((label, uri))
    return rows
    
def load_chebi_labels(path):
    rows = []
    with open(path, encoding="utf-8") as fh:
        next(fh) 
        for ln in fh:
            uri, label = ln.rstrip("\n").split("\t", 1)
            rows.append((label, uri))
    return rows

CHEBI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\chebi_labels.txt"
NCBITAXON_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
chebi_rows = load_chebi_labels(CHEBI_LABELS_FILE)
ncbi_rows = load_ncbitaxon_labels(NCBITAXON_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in chebi_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in ncbi_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in chebi_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in ncbi_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = chebi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_ncbitaxon(term, k=5, thr=0.75):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ncbi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

greek_map = {
    'α': 'alpha',  'Α': 'alpha',
    'β': 'beta',   'Β': 'beta',
    'γ': 'gamma',  'Γ': 'gamma',
    'δ': 'delta',  'Δ': 'delta',
    'ε': 'epsilon','Ε': 'epsilon',
    'ζ': 'zeta',   'Ζ': 'zeta',
    'η': 'eta',    'Η': 'eta',
    'θ': 'theta',  'Θ': 'theta',
    'ι': 'iota',   'Ι': 'iota',
    'κ': 'kappa',  'Κ': 'kappa',
    'λ': 'lambda', 'Λ': 'lambda',
    'μ': 'mu',     'Μ': 'mu',
    'ν': 'nu',     'Ν': 'nu',
    'ξ': 'xi',     'Ξ': 'xi',
    'ο': 'omicron','Ο': 'omicron',
    'π': 'pi',     'Π': 'pi',
    'ρ': 'rho',    'Ρ': 'rho',
    'σ': 'sigma',  'Σ': 'sigma',
    'τ': 'tau',    'Τ': 'tau',
    'υ': 'upsilon','Υ': 'upsilon',
    'φ': 'phi',    'Φ': 'phi',
    'χ': 'chi',    'Χ': 'chi',
    'ψ': 'psi',    'Ψ': 'psi',
    'ω': 'omega',  'Ω': 'omega',
}

def preprocess(term):
    for greek_char, name in greek_map.items():
        if greek_char in term:
            term = term.replace(greek_char, name)
    term = term.replace('_', ' ')
    term = term.strip()
    return " ".join(lemmatizer.lemmatize(w) for w in term.split())

manual_created = {}
created = {}

manual_created = {
    "serum_hormone": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/chemical/SerumHormone"),
    "melanosome" : URIRef("http://purl.obolibrary.org/obo/GO_0042470"),
    "glutamate" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0220839"),
    "phosphoribosylanthranilate_isomerase" : URIRef("https://www.genome.jp/dbget-bin/www_bget?ko:K01817"),
    "aspartate_aminotransferase" : URIRef("https://www.genome.jp/dbget-bin/www_bget?ko:K11358"),
    "3-deoxy-7-phosphoheptulonate_synthase" : URIRef("https://www.genome.jp/dbget-bin/www_bget?ko:K01626"),
    "tryptophanase": URIRef("https://www.genome.jp/dbget-bin/www_bget?K01667"),
    "rhepo" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/chemical/Rhepo"),
    "dna_methylation" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0376452"),
    "enteric_glial_cells_network" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/chemical/EntericGlialCellsNetwork"),
    "choline_metabolite" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4329663"),
    "oxygen": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0030054"),
    "gut_iga_level" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/chemical/GutIgaLevel"),
    "me": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/chemical/Me"),
    "toll-like_receptors" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0670896"),
    "fatty_acid" : URIRef("http://purl.obolibrary.org/obo/NCIT_C492"),
    "neurotransmitter" : URIRef("http://purl.obolibrary.org/obo/NCIT_C687"),
    "lipid" : URIRef("http://purl.obolibrary.org/obo/NCIT_C616"),
    "isorhamnetin" : URIRef("http://purl.obolibrary.org/obo/CHEBI_6052"),
    "serotonin" : URIRef("http://purl.obolibrary.org/obo/CHEBI_28790"),
    "nutrient" : URIRef("http://purl.obolibrary.org/obo/CHEBI_33284"),
    "atp" : URIRef("http://purl.obolibrary.org/obo/CHEBI_15422"),
    "iron" : URIRef("http://purl.obolibrary.org/obo/NCIT_C598"),
    "lactic_acid": URIRef("http://purl.obolibrary.org/obo/CHEBI_42111"),
    "microbially-derived_neurotransmitters" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/chemical/MicrobiallyDerivedNeurotransmitters"),
}

created = dict(manual_created)

regex_map = [
    (r"simple_sugar","monosaccharide"),
    (r"\bbioactive_polyphenol\w*\b","polyphenol"),
    (r"lactic-acid","lactic_acid"),
    (r"sfa","saturated_fatty_acid"),
    (r"fat_and_sugar_content", "monosaccharide"),
    (r"il-6", "interleukin_6"),
    (r"melanosomes", "melanosome"),
    (r"stress-related_hormones", "hormone"),
    (r"inflammation-related_cytokines", "cytokine"),
    (r"neuroactive_compound","compound"),
    (r"gaba","gamma-aminobutyric_acid"),
    (r"plant-origin_and_microbially-formed_neuroactive_compounds", "compound"),
    (r"microbial_metabolites", "metabolite"),
    (r"\bscfa\w*\b","short_chain_fatty_acid"),
    (r"microbial_neuroactive_metabolites", "metabolite"),
    (r"dextran_sodium_sulphate", "dextran_sulfate_sodium"),
    (r"dss", "dextran_sulfate_sodium"),
    (r"lps", "lipopolysaccharide"),
    (r"\beya1\b","eya-1_protein,_c_elegans"),
    (r"circulating_lipocalin-2", "lipocalin-2"),
    (r"\bserotonin\w*\b","serotonin"),
    (r"xpjyf", "xingpijieyu_formula"),
    (r"gfap","glial_fibrillary_acidic_protein"),
    (r"2-dg","2-deoxy-d-glucose"),
    (r"g1p","glucose-1-phosphate"),
    (r"glucose1pmetab-pwy","glucose-1-phosphate_metabolism"),
    (r"associated_synaptic_proteins","synaptic_cast_protein,_human"),
    (r"helicobacter_pylori-related_inflammatory_mediators","mediators"),
    (r"bovine_milk_casein","bovine_milk_plasma"),
    (r"food_antigens","antigens"),
    (r"inflammatory_cytokines","cytokines"),
    (r"\bil-1\b","interleukin-1"),
    (r"\bil-2\b","interleukin-2"),
    (r"\bil-4\b","interleukin-4"),
    (r"\btce\b","trichloroethylene"),
    (r"\bgm-csf\b","csf_39300/gm-csf"),
    (r"\bifn\b","interferons"),
    (r"\bTNF\w*\b","tnf_protein,_human"),
    (r"eps","exopolysaccharide"),
    (r"c-eps","exopolysaccharide"),
    (r"blood_erythrocytes_and_lymphocytes","erythrocytes"),
    (r"k01817","phosphoribosylanthranilate_isomerase"),
    (r"k11358","aspartate_aminotransferase"),
    (r"k01626","3-deoxy-7-phosphoheptulonate_synthase"),
    (r"k01667","tryptophanase"),
    (r"\bfe\b", "iron"),
    (r"5-ht", "serotonin"),
    (r"intestinal_gut_enteric_neuropeptides", "neuropeptides"),
    (r"vagal_and_spinal_afferent_neurons", "afferent_neuron"),
    (r"dietary_microbial_metabolites", "metabolite"),
    (r"vitamins", "vitamin"),
    (r"krebs_cycle_intermediates","krebs_henseleit_cycle"),
    (r"reactive_oxygen_species", "oxygen"),
    (r"napes","n-acylphosphatidyl_ethanolamines"),
    (r"pro-inflammatory_cytokines","cytokines"),
    (r"php","porphyra_haitanensis_polysaccharide"),
    (r"high-dose_php","porphyra_haitanensis_polysaccharide"),
    (r"\blphp\b","porphyra_haitanensis_polysaccharide"),
    (r"\bhphp\b","porphyra_haitanensis_polysaccharide"),
    (r"cd36","fatty_acid_transporter_interactions"),
    (r"\bacacb\b","fatty_acid_oxidation"),
    (r"\bTriglyceride\w*\b","triglyceride"),
    (r"microbial_metabolites","metabolite"),
    (r"5-hydroxytryptamine","serotonin"),
    (r"ldl-c","ldl_cholesterol_lipoproteins"),
    (r"tcd","total_cholesterol_lipoproteins"),
    (r"tg","tg_protein,_human"),
    (r"mcp-1","monocyte_chemoattractant_protein_1"),
    (r"\bgut_microbiota-associated_epitopes\w*\b","epitopes"),
    (r"mes","epitopes"),
    (r"gja1","gap_junction_alpha-1"),
    (r"paired_box_protein_pax-3","paired_box_protein_3"),
    (r"eyes_absent_homolog_1_isoform_4","eya1_protein,_human"),
    (r"central_monoamine_neurotransmitters","monoamine_neurotransmitters"),
    (r"amino_acid_transmitters","monoamine_neurotransmitters"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))derived_neurotransmitter(?:(?=$)|(?=[^A-Za-z0-9]))","neurotransmitter"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))lipid(?:(?=$)|(?=[^A-Za-z0-9]))","lipid"),
    (r"\bcirculating_lipid\w*\b","lipid"),
    (r"\blipid\w*\b","lipid"),
    (r"\bh2s\b","hydrogen_sulfide"),
    (r"\bintestinal_metabolite\w*\b","metabolite"),
    (r"bdnf","brain-derived_neurotrophic_factor"),
    (r"toll-like_receptor_4","toll-like_receptor_4"),
    (r"toll-like_receptor_2","toll-like_receptor_2"),
    (r"gap43","growth_associated_protein_43"),
    (r"microbial_tryptophan","tryptophan"),
    (r"tlr4", "toll-like_receptor_4"),
    (r"\bmacro-_and_micro-nutrient\w*\b","nutrient"),
    (r"bioactive_polyphenols","polyphenols"),
    (r"hba1c","glycated_hemoglobin_a1c"),
    (r"zo-1","zonula_occludens-1_protein"),
    (r"tlr2","toll-like_receptor_2"),
    (r"tight_junction_proteins","tight_junction_protein_1"),
    (r"gut_microbiota-derived_metabolites","metabolites"),
    (r"sv2c","sv2c_protein,_rat"),
    (r"\bindole\w*\b","indole"),
    (r"gut_iga_levels","gut_iga_level"),
    (r"\blipopolysaccharide\w*\b","lipopolysaccharide"),
    (r"tight_junction_protein","tight_junction_protein_1"),
    (r"\bmelanin\w*\b","melanin"),
    (r"\bneurotransmitter\w*\b","neurotransmitter"),
    (r"\bskin_triglyceride\w*\b","triglyceride"),
    (r"\btc\b","cholesterol"),
    (r"\beyes_absent_homolog_1_isoform_4\b","eya-1_protein,_c_elegans"),

]
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
CREATOR = "Samuel Piron"

mesh_descs   = parse_mesh_descriptors(MESH_XML)
mesh_defs   = load_mesh_definitions(MESH_XML)

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        # only a NCBITaxon‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "CHEBI" in uri_str:
        # only a CHEBI‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("CHEBI Match", datatype=XSD.string)))
    elif "GO" in uri_str:
        # only a FOODON‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("GO Match", datatype=XSD.string)))
    elif "genome" in uri_str:
        # only a KO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("KEGG Match", datatype=XSD.string)))
    elif "mesh" in uri_str:
        # only a MESH‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "PCO" in uri_str:
        # only a XCO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("PCO Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "chemical":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            #lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    #print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, CHEMICAL_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, CHEMICAL_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            cos1 = top_cosine_ncbitaxon(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, CHEMICAL_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    
                    s = lbl.strip()
                    s = s.replace("′", "'")    # U+2032 prime → ASCII apostrophe
                    s = s.replace("’", "'")    # U+2019 right single quote → ASCII apostrophe
                    for dash in ["–", "—", "-", "−"]:   # en-dash, em-dash, non-breaking hyphen, minus
                        s = s.replace(dash, "-")

                    if re.fullmatch(r"(?i)pax3['’]", s):
                        label_str = "Pax3"

                    elif re.fullmatch(r"\(S\)\-Lactate", s):
                        label_str = "Lactate"

                    elif re.fullmatch(r"2,3\-Saturated Fatty Acid\(1\-\)", s):
                        label_str = "Saturated Fatty Acid"

                    elif re.fullmatch(r"2,4\-D Choline", s):
                        label_str = "2,4-D Choline"

                    elif s.lower().startswith("adenosine 3'-diphosphate"):
                        label_str = "Adenosine 3-Diphosphate, 5-Triphosphate"

                    elif re.fullmatch(r"5\-Hydroxytryptamine\(1\-\)", s):
                        label_str = "5-Hydroxytryptamine"
                    else:
                        label_str = lbl.title()

                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(label_str, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))

                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                    g.add((entity_uri, RDFS.comment, Literal("CHEBI Match", datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, CHEMICAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    s = lbl.strip()
                    s = s.replace("′", "'")    # U+2032 prime → ASCII apostrophe
                    s = s.replace("’", "'")    # U+2019 right single quote → ASCII apostrophe
                    for dash in ["–", "—", "-", "−"]:   # en-dash, em-dash, non-breaking hyphen, minus
                        s = s.replace(dash, "-")

                    if re.fullmatch(r"(?i)pax3['’]", s):
                        label_str = "Pax3"

                    elif re.fullmatch(r"\(R\)\-Lactic Acid", s):
                        label_str = "Lactic Acid"

                    elif re.fullmatch(r"\(S\)\-Lactate", s):
                        label_str = "Lactate"

                    elif re.fullmatch(r"2,3\-Saturated Fatty Acid\(1\-\)", s):
                        label_str = "Saturated Fatty Acid"

                    elif re.fullmatch(r"2,4\-D Choline", s):
                        label_str = "2,4-D Choline"

                    elif re.fullmatch(r"5'\-D\[Tc\]\-3'", s):
                        label_str = "5-DTc-3"

                    elif s.lower().startswith("adenosine 3'-diphosphate"):
                        label_str = "Adenosine 3-Diphosphate, 5-Triphosphate"

                    elif re.fullmatch(r"5\-Hydroxytryptamine\(1\-\)", s):
                        label_str = "5-Hydroxytryptamine"

                    elif re.match(r"Iron", s):
                        label_str = "Iron"                     
                    else:
                    
                        label_str = lbl.title()
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(label_str, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    g.add((name_uri, RDFS.comment, Literal("CHEBI Match", datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, CHEMICAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    g.add((name_uri, RDFS.comment, Literal("NCIT Match", datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                api_term = lookup_key.replace("_", " ")
                umls_hits = search_umls(api_term)
                if umls_hits:
                    cui, name, score, definition = best_umls_match(api_term, umls_hits)
                    if name in created:
                        entity_uri = created[term_raw]
                    else:
                        entity_uri = URIRef(f"https://uts.nlm.nih.gov/uts/umls/concept/{cui}")
                        created[term_raw] = entity_uri

                    g.add((entity_uri, RDF.type,      CHEMICAL_CLASS))
                    g.add((entity_uri, RDF.type,      SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(name.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                    if definition:
                        comment_str = f"{definition.strip()} [Definition Source: UMLS]"
                    else:
                        llm_def = get_llm_definition(term_raw)
                        comment_str = f"{llm_def} [Definition Source: llama3-8b-8192]"
                    g.add((entity_uri, RDFS.comment, Literal("UMLS Match", datatype=XSD.string)))
            
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type,        MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,      Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn,    mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print(f"  • UMLS CUI={cui}  Name={name!r}  sim={score:.2f}")
                    continue
                    
                else: 
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type,      MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,    Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    tokenized_mentions[term_raw] = mention_uri
                    print("no matches locally or in UMLS")
        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "chemical":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: monosaccharide
lookup term: monosaccharide
  • monosaccharide                           URI=http://purl.obolibrary.org/obo/CHEBI_35381
Query: saturated fatty acid
lookup term: saturated_fatty_acid
  • saturated fatty acid                     URI=http://purl.obolibrary.org/obo/CHEBI_26607
Query: monounsaturated fatty acid
lookup term: monounsaturated_fatty_acids
  • monounsaturated fatty acid               URI=http://purl.obolibrary.org/obo/CHEBI_25413
Query: metabolite acetate
lookup term: metabolite_acetate
  • metabolite                               URI=http://purl.obolibrary.org/obo/CHEBI_25212 score=0.81

Query: monosaccharide
lookup term: monosaccharide
  • monosaccharide                           URI=http://purl.obolibrary.org/obo/CHEBI_35381
Query: tnf protein, human
lookup term: tnf_protein,_human
  • UMLS CUI=C1448177  Name='TNF protein, human'  sim=1.00
Query: interleukin 6
lookup term: interleukin_6
  • interleukins                             URI=http://purl.obolibr

<h1>INGEST FOOD</h1>

In [8]:
import os
import re
import unicodedata
import json
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from umlsutils import search_umls, best_umls_match, get_umls_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256, get_foodon_description

FOODON_BASE = "http://purl.obolibrary.org/obo/"
NCIT_BASE = "http://purl.obolibrary.org/obo/"
FOOD_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Food")
FOOD_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Food")

TAX1_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\food_tree.txt"
TAX2_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\foodon_terms.txt"

def load_taxonomy_tree(path):
    row_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = row_re.match(ln)
            if m:
                label, tid = m.groups()
                depth = len(ln) - len(ln.lstrip())
                rows.append((label, tid, depth))
    return rows

rows1 = load_taxonomy_tree(TAX1_FILE)
rows2 = load_taxonomy_tree(TAX2_FILE)

exact1 = defaultdict(list)
for lbl, tid, depth in rows1:
    exact1[lbl.lower()].append((lbl, tid, depth))

exact2 = defaultdict(list)
for lbl, tid, depth in rows2:
    exact2[lbl.lower()].append((lbl, tid, depth))

labels1 = [lbl for lbl,_,_ in rows1]
vec1    = TfidfVectorizer(stop_words="english")
mat1    = vec1.fit_transform(labels1)

labels2 = [lbl for lbl,_,_ in rows2]
vec2    = TfidfVectorizer(stop_words="english")
mat2    = vec2.fit_transform(labels2)

def top_cosine(rows, vec, mat, labels, term, k=5, thr=0.75):
    v   = vec.transform([term])
    sc  = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, tid, depth = rows[i]
        out.append((lbl, tid, depth, sc[i]))
    return out

manual_created = {}
created = {}

manual_created = {
    "whole_grain_cereals" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4046096"),
    "thymus_vulgaris" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0697238"),
    "origanum_vulgare" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0946715"),
    "low-caloric_diet" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C2930544"),
    "egg_food_product" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0013710"),
    "oregano_spice" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0453263"),
}

created = dict(manual_created)

regex_map = [
    (r"dietary_fiber", "high-fiber_diet"),
    (r"fermented_foods", "food"),
    (r"animal_to_vegetal_food", "food"),
    (r"eggs","egg_food_product"),
    (r"vegetables","vegetable"),
    (r"\bwheat_germ\w*\b", "wheat_germ"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))oregano(?:(?=$)|(?=[^A-Za-z0-9]))", "oregano_spice"),
]
CREATOR = "Samuel Piron"

mesh_descs   = parse_mesh_descriptors(MESH_XML)

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        # only a NCBITaxon‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "CHEBI" in uri_str:
        # only a CHEBI‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("CHEBI Match", datatype=XSD.string)))
    elif "FOODON" in uri_str:
        # only a FOODON‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("FOODON Match", datatype=XSD.string)))
    elif "mesh" in uri_str:
        # only a MESH‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "PCO" in uri_str:
        # only a XCO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("PCO Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))

for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))

for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "food":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            #lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, FOOD_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, FOOD_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex1 = exact1.get(term.lower(), [])
            cos1 = top_cosine(rows1, vec1, mat1, labels1, term, k=5, thr=0.75)
            cos2 = top_cosine(rows2, vec2, mat2, labels2, term, k=5, thr=0.75)
            if ex1:
                for l,t,d in ex1:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score=1.00 (TAX1 exact)")
                    entity_uri = URIRef(f"{NCIT_BASE}{t}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, FOOD_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(l.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                    g.add((entity_uri, RDFS.comment, Literal("NCIT Match", datatype=XSD.string)))
                    raw_desc = get_ncit_description(t, ncit_file)
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos1:
                for l,t,d,s in cos1[:1]:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (TAX1 cosine)")
                    name_uri = URIRef(f"{NCIT_BASE}{t}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, FOOD_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(l.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                    g.add((name_uri, RDFS.comment, Literal("NCIT Match", datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                continue

            elif cos2:
                for l,t,d,s in cos2[:1]:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (TAX2 cosine)")
                    name_uri = URIRef(f"{FOODON_BASE}{t}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, FOOD_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(l.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                    uri_str    = str(name_uri)
                    # pick off only the ID portion
                    fragment   = uri_str.rsplit("/", 1)[-1]
                    definition = choose_definition(fragment, term_raw)

                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((
                            name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)
                        ))
                    g.add((name_uri, RDFS.comment, Literal("FOODON Match", datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "food":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_food_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

vegetable
Query: vegetable
lookup term: vegetable
  • Vegetable                                ID=NCIT_C178192    depth=4  score=1.00 (TAX1 exact)
Query: whole grain cereal
lookup term: whole_grain_cereals
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C4046096


Query: high-fiber diet
lookup term: high-fiber_diet
  • high fiber food                          ID=FOODON_03510048 depth=0  score=0.76 (TAX2 cosine)
Query: high-fiber diet
lookup term: high-fiber_diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048


Query: high-fiber diet
lookup term: high-fiber_diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048


Query: high-fiber diet
lookup term: high-fiber_diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048


Query: high-fiber diet
lookup term: high-fiber_diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048


wheat_germ
Query: wheat germ
lookup term: wheat_germ
  • w

<h1>INGEST HUMAN</h1>

In [9]:
import os
import requests
import re
import unicodedata
import json
from pathlib import Path
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from umlsutils import search_umls, best_umls_match, get_umls_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

NCBI_BASE             = "http://purl.obolibrary.org/obo/"
MESH_BASE             = "https://www.ncbi.nlm.nih.gov/mesh/"
HUMAN_CLASS           = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Human")
HUMAN_CONCEPT_SCHEME  = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Human")

GUTBRAINMENTION = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/mention/")

def load_ddf_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = NCBI_BASE + term_id
            rows.append((label, uri))
    return rows

def parse_mesh_descriptors(xml_path):
    descs = []
    tree  = ET.parse(xml_path)
    root  = tree.getroot()
    for dr in root.findall('DescriptorRecord'):
        ui   = dr.findtext('DescriptorUI')
        name = dr.findtext('DescriptorName/String')
        tns  = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        if ui and name:
            descs.append({'ui':ui,'name':name,'tree_numbers':tns})
    return descs

def build_name_index(descriptors):
    idx = defaultdict(list)
    for d in descriptors:
        if not d['tree_numbers']: continue
        tn = d['tree_numbers'][0]
        idx[d['name'].lower()].append((tn, d['ui'], d['name']))
    return idx

MESH_XML     = 'desc2025.xml'
mesh_descs   = parse_mesh_descriptors(MESH_XML)
mesh_index   = build_name_index(mesh_descs)

mesh_items   = [(tn,ui,name) for vs in mesh_index.values() for tn,ui,name in vs]
mesh_labels  = [name for (_,_,name) in mesh_items]
mesh_uids    = [ui   for (_,ui,_)   in mesh_items]

mesh_vec     = TfidfVectorizer(stop_words="english").fit(mesh_labels)
mesh_mat     = mesh_vec.transform(mesh_labels)

def load_taxonomy_tree(path):
    row_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = row_re.match(ln)
            if m:
                label, tid = m.groups()
                depth      = len(ln) - len(ln.lstrip())
                rows.append((label, tid, depth))
    return rows

TAX_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\patients_output.txt"
rows     = load_taxonomy_tree(TAX_FILE)

exact_ix  = defaultdict(list)
for lbl, tid, depth in rows:
    exact_ix[lbl.lower()].append((lbl, tid, depth))

labels_only  = [r[0] for r in rows]
vec          = TfidfVectorizer(stop_words="english")
mat          = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v   = vec.transform([term])
    sc  = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr: break
        lbl, tid, d = rows[i]
        out.append((lbl, tid, d, sc[i]))
    return out

manual_created = {}
created = {}

manual_created = {
    "patients": URIRef("https://www.ncbi.nlm.nih.gov/mesh/68010361"),
    "parkinson_disease_patients":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/ParkinsonDiseasePatients"),
    "depression_patients":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/DepressionPatients"),
    "major_depressive_disorder_patients" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/MajorDepressiveDisorderPatients"),
    "alcohol_overconsumption_group":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/AlcoholOverconsumptionGroup"),
    "non-smoking_patients":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/NonSmokingPatients"),
    "patients_with_schizophrenia":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/PatientsWithSchizophrenia"),
    "human_subjects":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/HumanSubjects"),
    "crew_members":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/CrewMembers"),
    "ibs-d_patients" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/IbsDPatients"),
}

created = dict(manual_created)

regex_map = [
    (r"bariatric_patients", "patients"),
    (r"non-smoking_healthy_controls","humans"),
    (r"controls", "humans"),
    (r"healthy_controls", "humans"),
    (r"schizophrenia_patients", "patients"),
    (r"health_controls", "humans"),
    (r"schizophrenia_cohort", "cohort"),
    (r"control_cohort", "cohort"),
    (r"stressed_adults", "adult"),
    (r"placebo_group", "people"),
    (r"probiotic group", "people"),
    (r"parkinson_s_disease__pd__patients", "parkinson_disease_patients"),
    (r"pd_patients", "parkinson_disease_patients"),
    (r"individuals","individual"),
    (r"comorbid_patients", "patients"),
    (r"mother-infant_pairs","humans"),
    (r"vaginally_delivered_infants", "infants"),
    (r"vaginally_delivered__fully_breastfed_infants", "infants"),
    (r"world_s_population", "population"),
    (r"hcs", "humans"),
    (r"child_and_adolescent_population__0-18_years_old_", "population"),
    (r"youth", "adolescent"),
    (r"asd_and_healthy_controls", "humans"),
    (r"child_and_adolescent_populations", "population"),
    (r"type_2_diabetes_patients", "patients"),
    (r"homeless_individuals", "individual"),
    (r"individual_patients", "patients"),
    (r"hospitalized_older_adults", "elderly_(population_group)"),
    (r"hospitalized_acutely_ill_older_adults", "elderly_(population_group)"),
    (r"geriatric_patients", "elderly_(population_group)"),
    (r"inpatients", "patients"),
    (r"gender-matched_controls", "humans"),
    (r"pcos_patients", "patients"),
    (r"depressed_women", "women"),
    (r"\bc_group\b", "control_groups"),
    (r"pcos-dp", "patients"),
    (r"hc__mdd__and_pcos_groups", "patients"),
    (r"typically_developing__td__children", "children"),
    (r"td_children", "children"),
    (r"asd_children", "children"),
    (r"mdd_patients", "patients"),
    (r"mdd_group", "population"),
    (r"nvp-1704_group", "population"),
    (r"edf_group", "population"),
    (r"d_and_edf_groups","p_group"),
    (r"reproductive-aged_women","women"),
    (r"pd_and_healthy_controls","parkinson_disease_patients"),
    (r"insomnia_patients","patients"),
    (r"acute_insomnia_patients","patients"),
    (r"chronic_insomnia_patients","patients"),
    (r"tibetan_buddhist_monks","monks"),
    (r"neighbouring_residents","residents"),
    (r"control_subjects","human_subjects"),
    (r"controls", "control_groups"),
    (r"participants_aged_18-40_years","participants"),
    (r"people_with_pd","parkinson_disease_patients"),
    (r"pd_cohort","cohort"),
    (r"southern_hemisphere_pd_population","parkinson_disease_patients"),
    (r"parkinson_s_disease_patients","parkinson_disease_patients"),
    (r"female_pd_patients","parkinson_disease_patients"),
    (r"occidental_patients","patients"),
    (r"non-pd_controls","control_groups"),
    (r"acute_patients","patients"),
    (r"remission_patients","patients"),
    (r"unaffected first-degree relatives","relatives"),
    (r"unaffected_relatives","relatives"),
    (r"controls","control_groups"),
    (r"vvs_children","children"),
    (r"matched_ontrols","control_groups"),
    (r"vvs_cases","humans"),
    (r"vvs_patients","patients"),
    (r"\bibs\b","irritable_bowel_syndrome"),
    (r"\bibs-d_patient\w*\b","ibs-d_patients"),
    (r"gut_microbial_composition_of_patients","patients"),
    (r"taiwanese_patients","patients"),
    (r"major_depressive_episode_patients","patients"),
    (r"asd_children","children"),
    (r"typically_developing__td__children","children"),
    (r"td_children","children"),
    (r"criteria-acute_group__patients_with_acute_schizophrenia_","patients_with_schizophrenia"),
    (r"remission_group__patients_with_schizophrenia_in_remission_","patients_with_schizophrenia"),
    (r"control_group__healthy_controls_","control_groups"),
    (r"remission_and_control_groups","control_groups"),
    (r"crew_members","crew_members"),
    (r"older_subjects","elderly_(population_group)"),
    (r"at-risk_groups","control_groups"),
    (r"healthy_persons","persons"),
    (r"generation_over_75","elderly_(population_group)"),
    (r"younger_generations","adolescent"),
    (r"younger_generation","adolescent"),
    (r"egyptian_patients","patients"),
    (r"healthy_subjects","human_subjects"),
    (r"breast_cancer_survivors","cancer_survivors"),
    (r"healthy_population","population"),
    (r"control__c__group", "control_groups"),
    (r"subjects","human_subjects"),
    (r"probiotic_group","control_groups"),
    (r"individual_patient","patients"),
    (r"acute_group","patients"),
    (r"unaffected_first-degree_relatives","relatives"),
    (r"healthy_people","people"),
    (r"residents","resident_(person)"),
    (r"resident","resident_(person)"),
    (r"\bhealthy_adult\w*\b","adult"),
    (r"mentally_healthy_women","women"),
    (r"\bgeriatric_population\w*\b","elderly_(population_group)"),
    (r"\bindividual_patient\w*\b","patients"),
    (r"\bpwp\b","parkinson_disease_patients"),

]

CREATOR = "Samuel Piron"

UMLS_BASES = "https://uts.nlm.nih.gov/uts/umls/concept/"
HEREDITARY_BASE = "https://hereditary.dei.unipd.it/ontology/gutbrain/resource/"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "CHEBI" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("CHEBI Match", datatype=XSD.string)))
    elif "FOODON" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("FOODON Match", datatype=XSD.string)))
    elif "mesh" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "PCO" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("PCO Match", datatype=XSD.string)))
    else:
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))

for term_raw, uri in manual_created.items():
    if (uri, RDFS.label, None) not in g:
        pretty = term_raw.replace("_", " ").title()
        g.add((uri,
               RDFS.label,
               Literal(pretty, datatype=XSD.string)))

for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
IBSD_URI = URIRef(
    "https://hereditary.dei.unipd.it/ontology/gutbrain/resource/human/IbsDPatients"
)
ADOLESCENT_URI = URIRef("https://www.ncbi.nlm.nih.gov/mesh/D000293")
ADULT_URI = URIRef("https://www.ncbi.nlm.nih.gov/mesh/D000328")
CHILD_URI = URIRef("https://www.ncbi.nlm.nih.gov/mesh/D002648")
INFANT_URI = URIRef("https://www.ncbi.nlm.nih.gov/mesh/D007223")
MEN_URI = URIRef("https://www.ncbi.nlm.nih.gov/mesh/D008571")
POPULATION_URI = URIRef("https://www.ncbi.nlm.nih.gov/mesh/D011153")    
    
for paper_id, paper_data in data.items():
    entities = paper_data.get("entities", [])
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        if raw_label == "human":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    if lookup_key == "residents":
                        lookup_key = "resident"
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, HUMAN_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, HUMAN_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    if entity_uri == IBSD_URI:
                        g.add((entity_uri, RDFS.label, Literal("IBS-D Patients", datatype=XSD.string)))
                    elif entity_uri == ADOLESCENT_URI:
                        g.add((entity_uri, RDFS.label, Literal("Adolescent", datatype=XSD.string)))
                    elif entity_uri == ADULT_URI:
                        g.add((entity_uri, RDFS.label, Literal("Adult", datatype=XSD.string)))
                    elif entity_uri == CHILD_URI:
                        g.add((entity_uri, RDFS.label, Literal("Child", datatype=XSD.string)))
                    elif entity_uri == INFANT_URI:
                        g.add((entity_uri, RDFS.label, Literal("Infant", datatype=XSD.string)))
                    elif entity_uri == MEN_URI:
                        g.add((entity_uri, RDFS.label, Literal("Men", datatype=XSD.string)))
                    elif entity_uri == POPULATION_URI:
                        g.add((entity_uri, RDFS.label, Literal("Population", datatype=XSD.string)))
                    else:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, HUMAN_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, HUMAN_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    if entity_uri == IBSD_URI:
                        g.add((entity_uri, RDFS.label, Literal("IBS-D Patients", datatype=XSD.string)))
                    elif entity_uri == ADOLESCENT_URI:
                        g.add((entity_uri, RDFS.label, Literal("Adolescent", datatype=XSD.string)))
                    elif entity_uri == ADULT_URI:
                        g.add((entity_uri, RDFS.label, Literal("Adult", datatype=XSD.string)))
                    elif entity_uri == CHILD_URI:
                        g.add((entity_uri, RDFS.label, Literal("Child", datatype=XSD.string)))
                    elif entity_uri == INFANT_URI:
                        g.add((entity_uri, RDFS.label, Literal("Infant", datatype=XSD.string)))
                    elif entity_uri == MEN_URI:
                        g.add((entity_uri, RDFS.label, Literal("Men", datatype=XSD.string)))
                    elif entity_uri == POPULATION_URI:
                        g.add((entity_uri, RDFS.label, Literal("Population", datatype=XSD.string)))
                    else:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            mesh_ex = mesh_index.get(term, [])
            cos = top_cosine(term)
            if mesh_ex:
                for t,u,n in mesh_ex:
                    print(f"  • {t:40s} ID={u:15s} depth={n:40s} score=1.00 (TAX1 exact)")
                    entity_uri = URIRef(f"{MESH_BASE}{u}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, HUMAN_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        if entity_uri == IBSD_URI:
                            g.add((entity_uri, RDFS.label, Literal("IBS-D Patients", datatype=XSD.string)))
                        elif entity_uri == ADOLESCENT_URI:
                            g.add((entity_uri, RDFS.label, Literal("Adolescent", datatype=XSD.string)))
                        elif entity_uri == ADULT_URI:
                            g.add((entity_uri, RDFS.label, Literal("Adult", datatype=XSD.string)))
                        elif entity_uri == CHILD_URI:
                            g.add((entity_uri, RDFS.label, Literal("Child", datatype=XSD.string)))
                        elif entity_uri == INFANT_URI:
                            g.add((entity_uri, RDFS.label, Literal("Infant", datatype=XSD.string)))
                        elif entity_uri == MEN_URI:
                            g.add((entity_uri, RDFS.label, Literal("Men", datatype=XSD.string)))
                        elif entity_uri == POPULATION_URI:
                            g.add((entity_uri, RDFS.label, Literal("Population", datatype=XSD.string)))
                        else:
                            g.add((entity_uri,
                                RDFS.label,
                                Literal(t.title(), datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    g.add((entity_uri, RDFS.comment, Literal("MeSH Match", datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, HUMAN_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for name, ui, depth, score in cos[:1]:
                    print(f"  • {name:40s} ID={ui:15s} depth={depth:<2d} score={score:.2f} (Patients cosine)")
                    name_uri = URIRef(f"{MESH_BASE}{ui}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, HUMAN_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        if name_uri == IBSD_URI:
                            g.add((name_uri, RDFS.label, Literal("IBS-D Patients", datatype=XSD.string)))
                        elif name_uri == ADOLESCENT_URI:
                            g.add((name_uri, RDFS.label, Literal("Adolescent", datatype=XSD.string)))
                        elif name_uri == ADULT_URI:
                            g.add((name_uri, RDFS.label, Literal("Adult", datatype=XSD.string)))
                        elif name_uri == CHILD_URI:
                            g.add((name_uri, RDFS.label, Literal("Child", datatype=XSD.string)))
                        elif name_uri == INFANT_URI:
                            g.add((name_uri, RDFS.label, Literal("Infant", datatype=XSD.string)))
                        elif name_uri == MEN_URI:
                            g.add((name_uri, RDFS.label, Literal("Men", datatype=XSD.string)))
                        elif name_uri == POPULATION_URI:
                            g.add((name_uri, RDFS.label, Literal("Population", datatype=XSD.string)))
                        else:
                            g.add((name_uri,
                                RDFS.label,
                                Literal(name.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, HUMAN_CONCEPT_SCHEME))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    g.add((name_uri, RDFS.comment, Literal("MeSH Match", datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    continue
            else:
                api_term = lookup_key.replace("_", " ")
                umls_hits = search_umls(api_term)
                if umls_hits:
                    cui, name, score, definition = best_umls_match(api_term, umls_hits)
                    if name in created:
                        entity_uri = created[term_raw]
                    else:
                        entity_uri = URIRef(f"https://uts.nlm.nih.gov/uts/umls/concept/{cui}")
                        created[term_raw] = entity_uri

                    g.add((entity_uri, RDF.type,      HUMAN_CLASS))
                    g.add((entity_uri, RDF.type,      SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(name.title(), datatype=XSD.string)))

                    g.add((entity_uri, SKOS.inScheme, HUMAN_CONCEPT_SCHEME))
                    g.add((entity_uri, RDFS.comment, Literal("UMLS Match", datatype=XSD.string)))
                    if definition:
                        comment_str = f"{definition.strip()} [Definition Source: UMLS]"
                    else:
                        llm_def = get_llm_definition(term_raw)
                        comment_str = f"{llm_def} [Definition Source: llama3-8b-8192]"

                    g.add((entity_uri,
                        RDFS.comment,
                        Literal(comment_str, datatype=XSD.string)))
            
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type,        MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,      Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn,    mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print(f"  • UMLS CUI={cui}  Name={name!r}  sim={score:.2f}")
                    continue
                    
                else: 
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type,      MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,    Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    tokenized_mentions[term_raw] = mention_uri
                    print("no matches locally or in UMLS")

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "human":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri
            print(cleaned_text_span)
            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_human_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

Query: patient
lookup term: patients
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68010361


Query: patient
lookup term: patients
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68010361


Query: people
lookup term: people
  • UMLS CUI=C0027361  Name='Persons'  sim=0.00
patients
Query: patient
lookup term: patients
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68010361

Query: patient
lookup term: patients
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68010361


Query: patient
lookup term: patients
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68010361


Query: patient
lookup term: patients
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68010361


humans
Query: human
lookup term: humans
  • UMLS CUI=C1300203  Name='Genus Homo'  sim=0.00
Query: patient
lookup term: patients
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68010361


Query: patient
lookup term: patients
  → Reusing existing URI: 

<h1>INGEST DRUG</h1>

In [13]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

DRUG_BASE = "http://purl.obolibrary.org/obo/"
DRUG_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Drug")
DRUG_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Drug")


def load_chebi_labels(path):
    rows = []
    with open(path, encoding="utf-8") as fh:
        next(fh) 
        for ln in fh:
            uri, label = ln.rstrip("\n").split("\t", 1)
            rows.append((label, uri))
    return rows

CHEBI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\chebi_labels.txt"
chebi_rows = load_chebi_labels(CHEBI_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in chebi_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [lbl for lbl, _ in chebi_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = chebi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

greek_map = {
    'α': 'alpha',  'Α': 'alpha',
    'β': 'beta',   'Β': 'beta',
    'γ': 'gamma',  'Γ': 'gamma',
    'δ': 'delta',  'Δ': 'delta',
    'ε': 'epsilon','Ε': 'epsilon',
    'ζ': 'zeta',   'Ζ': 'zeta',
    'η': 'eta',    'Η': 'eta',
    'θ': 'theta',  'Θ': 'theta',
    'ι': 'iota',   'Ι': 'iota',
    'κ': 'kappa',  'Κ': 'kappa',
    'λ': 'lambda', 'Λ': 'lambda',
    'μ': 'mu',     'Μ': 'mu',
    'ν': 'nu',     'Ν': 'nu',
    'ξ': 'xi',     'Ξ': 'xi',
    'ο': 'omicron','Ο': 'omicron',
    'π': 'pi',     'Π': 'pi',
    'ρ': 'rho',    'Ρ': 'rho',
    'σ': 'sigma',  'Σ': 'sigma',
    'τ': 'tau',    'Τ': 'tau',
    'υ': 'upsilon','Υ': 'upsilon',
    'φ': 'phi',    'Φ': 'phi',
    'χ': 'chi',    'Χ': 'chi',
    'ψ': 'psi',    'Ψ': 'psi',
    'ω': 'omega',  'Ω': 'omega',
}

def preprocess(term):
    for greek_char, name in greek_map.items():
        if greek_char in term:
            term = term.replace(greek_char, name)
    term = term.replace('_', ' ')
    term = term.strip()
    return term.lower()

manual_created = {}
created = {}

manual_created = {
    "antibiotic" : URIRef("http://purl.obolibrary.org/obo/NCIT_C258"),
    "antibiotic_growth_promotant" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/drug/AntibioticGrowthPromotant"),
    "vancomycin" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0042313"),
    "placebo_oral_tablet" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1249327"),
    "ct-26": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/drug/Ct26"),
    "rhepo" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/chemical/Rhepo"),
    "cuprizone": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0010460"),
    "anti-anxiety_agents": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0040616")
}

created = dict(manual_created)

regex_map = [
    (r"agp", "antibiotic_growth_promotant"),
    (r"non-absorbable_antibiotic_vancomycin","vancomycin"),
    (r"placebo_tablet", "placebo_oral_tablet"),
    (r"placebo", "placebo_oral_tablet"),
    (r"cpz", "cuprizone"),
    (r"anti-anxiety_drug","anti-anxiety_agents")
]
CREATOR = "Samuel Piron"

UMLS_BASES = "https://uts.nlm.nih.gov/uts/umls/concept/"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "CHEBI" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("CHEBI Match", datatype=XSD.string)))
    elif "FOODON" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("FOODON Match", datatype=XSD.string)))
    elif "mesh" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "PCO" in uri_str:
        g.add((uri,
               RDFS.comment,
               Literal("PCO Match", datatype=XSD.string)))
    else:
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))

for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))

for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "drug":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, DRUG_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, DRUG_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, DRUG_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, DRUG_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, DRUG_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, DRUG_CONCEPT_SCHEME))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    elif "chebi" in uri_str:
                        comment = "CHEBI Match"
                    elif "ohmi" in uri_str:
                        comment = "OHMI Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 

            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DRUG_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DRUG_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    elif "chebi" in uri_str:
                        comment = "CHEBI Match"
                    elif "ohmi" in uri_str:
                        comment = "OHMI Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "drug":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: antibiotic
lookup term: antibiotic
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C258

Query: antibiotic growth promotant
lookup term: antibiotic_growth_promotant
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/drug/AntibioticGrowthPromotant

antibiotic_growth_promotant
Query: antibiotic growth promotant
lookup term: antibiotic_growth_promotant
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/drug/AntibioticGrowthPromotant

antibiotic_growth_promotant
Query: antibiotic growth promotant
lookup term: antibiotic_growth_promotant
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/drug/AntibioticGrowthPromotant

antibiotic_growth_promotant
Query: antibiotic growth promotant
lookup term: antibiotic_growth_promotant
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/drug/AntibioticGrowthPromotant

antibiotic_growth_promotant
Query:

<h1>INGEST MICROBIOME</h1>

In [None]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

MICROBIOME_BASE = "http://purl.obolibrary.org/obo/"
MICROBIOME_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Microbiome")
MICROBIOME_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Microbiome")

def load_ohmi_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*\[([A-Za-z0-9_]+)\]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = MICROBIOME_BASE + term_id
            rows.append((label, uri))
    return rows

OHMI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\OHMI_full_taxonomy.txt"
ohmi_rows = load_ohmi_labels(OHMI_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in ohmi_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in ohmi_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ohmi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def preprocess(term):
    term = term.replace('_', ' ')
    term = term.strip()
    return term.lower()

manual_created = {}
created = {}

manual_created = {
    "poultry_gut_microbiome": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/microbiome/PoultryGutMicrobiome"),
    "intestinal_microbiome" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/microbiome/IntestinalMicrobiome"),
    "gut_microbial_ecosystem" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/microbiome/GutMicrobialEcosystem"),
    "gut_metagenome" : URIRef("http://purl.obolibrary.org/obo/NCBITaxon_749906"),
    "feces_metagenome" : URIRef("http://purl.obolibrary.org/obo/NCBITaxon_1861841"),
    "microbial_community" : URIRef("http://purl.obolibrary.org/obo/PCO_1000004"),
    "intestinal_flora" : URIRef("http://purl.obolibrary.org/obo/NCIT_C93019"),
    "microorganism" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0445623"),
    "gut_microbial": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/microbiome/GutMicrobial"),
    "pharmacomicrobiomics" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/microbiome/Pharmacomicrobiomics"),
    "colon_microbiota": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C3510360"),
    "interventions_regulating_intestinal_flora" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/microbiome/InterventionsRegulatingIntestinalFlora")
}

created = dict(manual_created)

regex_map = [
    (r"chicken_gut_microbiome", "poultry_gut_microbiome"),
    (r"fecal_microbiota","microbiota"),
    (r"human_intestinal_microbiota","intestinal_microbiome"),
    (r"intestinal_microbiota","intestinal_microbiome"),
    (r"gut_microbial_community","microbial_community"),
    (r"fecal_metagenome","feces_metagenome"),
    (r"canine_intestinal_microbiota","intestinal_microbiome"),
    (r"gastrointestinal_tract_microbiome","microbiome_in_human_gastrointestinal_system"),
    (r"irif","interventions_regulating_intestinal_flora"),
    (r"gut_microorganism","microorganism"),
    (r"microbiome_population","microbiome"),
    (r"gut_flora","intestinal_flora"),
    (r"gm","microbiome"),
    (r"cecal_microbiota","microbiota"),
    (r"intestinal_microbe","microbiome"),
    (r"inflammatory_microbes_and_gene","microbiota"),
    (r"pro-inflammatory_bacteria__genes_and_pathway","microbiome"),
    (r"anti-inflammatory_bacteria__genes_and_pathway","microbiome"),
    (r"positive_mood-related_gut_microbiota","microbiota"),
    (r"small_bowel_microbiome","microbiome"),
    (r"small_intestinal_microbiome", "intestinal_microbiome")

]
CREATOR = "Samuel Piron"

UMLS_BASES = "https://uts.nlm.nih.gov/uts/umls/concept/"
#MESH_BASE = "https://meshb.nlm.nih.gov/record/ui?ui="

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        # only a NCBITaxon‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "CHEBI" in uri_str:
        # only a CHEBI‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("CHEBI Match", datatype=XSD.string)))
    elif "FOODON" in uri_str:
        # only a FOODON‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("FOODON Match", datatype=XSD.string)))
    elif "mesh" in uri_str:
        # only a MESH‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "PCO" in uri_str:
        # only a XCO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("PCO Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
        
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "microbiome":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, MICROBIOME_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, MICROBIOME_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, MICROBIOME_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, MICROBIOME_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, MICROBIOME_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, MICROBIOME_CONCEPT_SCHEME))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    elif "chebi" in uri_str:
                        comment = "CHEBI Match"
                    elif "ohmi" in uri_str:
                        comment = "OHMI Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, MICROBIOME_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, MICROBIOME_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    elif "chebi" in uri_str:
                        comment = "CHEBI Match"
                    elif "ohmi" in uri_str:
                        comment = "OHMI Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

created["bacteria"] = URIRef("http://purl.obolibrary.org/obo/NCBITaxon_2")
g.remove((created["bacteria"], RDF.type, MICROBIOME_CLASS))
g.remove((created["bacteria"], SKOS.inScheme, MICROBIOME_CONCEPT_SCHEME))

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "microbiome":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

<h1>INGEST STATISTICAL TECHNIQUE</h1>

In [15]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

STATISTICALTECHNIQUE_BASE = "http://purl.obolibrary.org/obo/"
STATISTICALTECHNIQUE_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/StatisticalTechnique")
STATISTICALTECHNIQUE_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/StatisticalTechnique")

def load_statistical_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = STATISTICALTECHNIQUE_BASE + term_id
            rows.append((label, uri))
    return rows

STATISTICAL_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\stato_full_taxonomy.txt"
STATO_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\subtree_C19044.txt"
stat_rows = load_statistical_labels(STATISTICAL_LABELS_FILE)
stat1_rows = load_statistical_labels(STATO_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in stat_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in stat1_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in stat_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels1_only = [preprocess(lbl) for lbl, _ in stat1_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels1_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = stat_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine1(term, k=5, thr=0.75):
    v   = vec1.transform([term])
    sc  = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = stat1_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

manual_created = {}
created = {}

manual_created = {
    "random-effects_meta-analyses" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/RandomEffectsMetaAnalyses"),
    "receiver_operating_characteristic_curve_analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/ReceiverOperatingCharacteristicCurveAnalysis"),
    "chao1_index" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/Chao1Index"),
    "shannon_diversity_index" : URIRef("http://purl.obolibrary.org/obo/PCO_0000062"),
    "inverse_simpson_diversity_index" : URIRef("http://purl.obolibrary.org/obo/PCO_0000064"),
    "bray_curtis_dissimilarity" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/BrayCurtisDissimilarity"),
    "cochrane_risk_of_bias" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/CochraneRiskOfBias"),
    "mr_analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/MrAnalysis"),
    "linear_discriminant_analysis": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5940528"),
    "two-stage_cluster_random_sampling" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C2348830"),
    "inverse_variance_weighted_mr_analysis": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/InverseVarianceWeightedMrAnalysis"),
    "multivariable_mr_analysis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0026777"),
    "q_test" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0809418"),
    "sensitivity_analysis": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/SensitivityAnalysis"),
    "linear_discriminant_analysis": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5940528"),
    "logistic_regression": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0206031"),
    "multiple_regression": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0681923"),
    "mr_steiger_test": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/MrSteigerTest"),
    "mr_egger_intercept_test": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/MrEggerInterceptTest"),
}

created = dict(manual_created)
UMLS_BASE = "https://uts.nlm.nih.gov/uts/umls/concept/"


regex_map = [

    (r"rf", "random_forest"),
    (r"random-effects_meta-analyse", "random-effects_meta-analyses"),
    (r"shannon", "shannon_diversity_index"),
    (r"inverse_simpson", "inverse_simpson_diversity_index"),
    (r"random_forests__rf_", "random_forest"),
    (r"mr_analysi", "mr_analysis"),
    (r"lefse","linear_discriminant_analysis"),
    (r"lefse_analysi", "linear_discriminant_analysis"),
    (r"random_forest_classification", "random_forest"),
    (r"two-sample_mr", "two-stage_cluster_random_sampling"),
    (r"inverse_variance_weighted__ivw___mr-egger__and_weighted_median__wm__method","inverse_variance_weighted_mr_analysis"),
    (r"multivariable_mr", "multivariate_mr_analysis"),
    (r"mvmr", "multivariate_mr_analysis"),
    (r"cochran_s_q_test","q_test"),
    (r"sensitivity_analyse", "sensitivity_analysis"),
    (r"linear_discriminant_analysi","linear_discriminant_analysis"),
    (r"lda", "linear_discriminant_analysis"),
    (r"statistical_modeling_with_logistic_regression", "logistic_regression"),
    (r"random_forest_model", "random_forest"),
    (r"pearson_or_spearman_correlation_test", "pearson_correlation_test"),
    (r"\bpearson_correlation_analysi\w*\b","pearson_correlation_test"),
    (r"multiple_regression_analysi","multiple_regression"),
    (r"receiver_operating_characteristic_curve_analysi", "receiver_operating_characteristic_curve_analysis"),
]

CREATOR = "Samuel Piron"

UMLS_BASE = "https://uts.nlm.nih.gov/uts/umls/concept/"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASE):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "PCO" in uri_str:
        # only a STATO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("PCO Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
        
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "statistical technique":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                        RDFS.label,
                        Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                        RDFS.label,
                        Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            cos1 = top_cosine1(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "iao_" in uri_str:
                        comment = "IAO Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if re.match(r"(?i)pearson['’]s correlation coefficient", lbl.strip()):
                        label_str = "Pearsons Correlation Coefficient"
                    else:
                        label_str = lbl.title()

                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(label_str, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "iao_" in uri_str:
                        comment = "IAO Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if re.match(r"(?i)pearson['’]s correlation coefficient", lbl.strip()):
                        label_str = "Pearsons Correlation Coefficient"
                    else:
                        label_str = lbl.title()

                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(label_str, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "iao_" in uri_str:
                        comment = "IAO Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "statistical technique":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

random-effects_meta-analyses
Query: random-effects meta-analyses
lookup term: random-effects_meta-analyses
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/RandomEffectsMetaAnalyses


Query: wald test
lookup term: wald_test
  • Wald test                                URI=http://purl.obolibrary.org/obo/STATO_0000559
receiver_operating_characteristic_curve_analysis
Query: receiver operating characteristic curve analysis
lookup term: receiver_operating_characteristic_curve_analysis
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/statisticaltechnique/ReceiverOperatingCharacteristicCurveAnalysis


Query: random forest
lookup term: random_forest
  • random forest procedure                  URI=http://purl.obolibrary.org/obo/STATO_0000549 score=0.82

random_forest
Query: random forest
lookup term: random_forest
  • random forest procedure                  URI=http://purl.obolibrary.org/obo/STATO_000

<h1>INGEST BIOMEDICAL TECHNIQUE</h1>

In [16]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

BIOMEDICALTECHNIQUE_BASE = "http://purl.obolibrary.org/obo/"
BIOMEDICALTECHNIQUE_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/BiomedicalTechnique")
BIOMEDICALTECHNIQUE_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/BiomedicalTechnique")

def load_biomedical_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = BIOMEDICALTECHNIQUE_BASE + term_id
            rows.append((label, uri))
    return rows

BIOMEDICAL_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncbitaxon_full_taxonomy.txt"
STATO_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\stato_full_taxonomy.txt"
biom_rows = load_biomedical_labels(BIOMEDICAL_LABELS_FILE)
biom1_rows = load_biomedical_labels(STATO_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in biom_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in biom1_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in biom_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels1_only = [preprocess(lbl) for lbl, _ in biom1_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels1_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = biom_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine1(term, k=5, thr=0.75):
    v   = vec1.transform([term])
    sc  = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = biom1_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

manual_created = {}
created = {}

manual_created = {
    "dqi-i" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/Dqii"),
    "16s_rdna_pcr" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/16sRdnaPcr"),
    "childhood_behaviour_checklist" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/ChildhoodBehaviourChecklist"),
    "liquid_chromatography_mass_spectrometry" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0872318"),
    "gas_chromatography" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0008555"),
    "16s_rrna_sequencing" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/16sRrnaSequencing"),
    "picrust_analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/PicrustAnalysis"),
    "dual_hit_toxin_model" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/DualHitToxinModel"),
    "forced_swim_test" : URIRef("http://purl.obolibrary.org/obo/MMO_0000574"),
    "open_field_apparatus_method" : URIRef("http://purl.obolibrary.org/obo/MMO_0000258"),
    "sucrose_preference_test" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/SucrosePreferenceTest"),
    "multiomics_study" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/MultiomicsStudy"),
    "metagenomic_dna_and_rna_sequencing" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5848503"),
    "serum_metabolomics_profiling" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/SerumMetabolomicsProfiling"),
    "metagenomic_analysis": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5906995"),
    "hippocampal_proteomic_analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/HippocampalProteomicAnalysis"),
    "gut_permeability_assay" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/GutPermeabilityAssay"),
    "ussing_chamber_permeability_assay" : URIRef("http://www.bioassayontology.org/bao#BAO_0010084"),
    "immunoblotting":URIRef("http://purl.obolibrary.org/obo/OMIT_0015957"),
    "immunohistochemical_analyses" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/ImmunohistochemicalAnalyses"),
    "liquid_chromatography_mass_spectrometry": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0872318"),
    "body_weight_test" : URIRef("http://purl.obolibrary.org/obo/NCIT_C119794"),
    "enzyme-linked_immunosorbent_assay" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0014441"),
    "westrern_blotting" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0005863"),
    "social_stress_scale" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4050347"),
    "shotgun_metagenomic_sequencing" : URIRef("http://edamontology.org/topic_3837"),
    "fpg_technique" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/FpgTechnique"),
    "fpi_technique" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/FpiTechnique"),
    "homeostasis_model_assessment_insulin_resistance" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/HomeostasisModelAssessmentInsulinResistance"),
    "ancom-bc2" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/AncomBc2"),
    "updrs_part_iii_motor_scores" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/UpdrsPartIIIMotorScores"),
    "comparative_genomic_analysis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0796358"),
    "mitbamp_analyses": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/MitbampAnalyses"),
    "aav-shrna" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/AavShrna"),
    "biological_laboratory_methods" :URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0814046"),
    "c14_d-xylose_breath_test" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0430719"),
    "targeted_therapy" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C2985566"),
    "strain-level_meta-analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/StrainLevelMetaAnalysis"),
    "nuclear_spectroscopy" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4295577"),
    "kegg_analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/KeggAnalysis"),
    "gastrointestinal_and_microbiome_profiling" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/GastrointestinalAndMicrobiomeProfiling"),
    "splash": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/Splash"),
    "elevated_plus_maze_test": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5392082"),
    "metabolomic":URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1328813"),
    "double-blind_method" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0013072"),
    "burst_testing" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/BurstTesting"),
    "bielschowsky_head_tilt_test": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1302989"),
    "fluorescent_in_situ_hybridization": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0162789"),
    "oft":URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/Oft"),
    "spt": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/Spt"),
    "tst" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/Tst"),
    "intravenous_glucose_tolerance_test" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0021911"),
    "immunofluorescent_stain_method": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1318793"),
    "analysis":URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0936012"),
    "shotgun_sequencing": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1519305"),
    "stress_model" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/StressModel"),
    "multiplex_electrochemiluminescence_immunoassay":URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5225027"),
    "sleep_latency_test" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0430629"),
    "microbiome_profiling": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/MicrobiomeProfiling"),
    "bielschowsky_head_tilt_test": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1302989"),
    "glucose_tolerance_test" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0017741"),
    "liver_lipid_kit" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/LiverLipidKit"),
    "legend_plex_kit" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/LegendPlexKit"),
    "enzyme-linked_immunosorbent_assay" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0014441"),
    "functional_enrichment_analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/FunctionalEnrichmentAnalysis"),
    "duodenal_aspirate_analysis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0430151"),
    "bioinformatics_analysis" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/BioinformaticsAnalysis"),
    "stool_biobank" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/StoolBiobank"),
    "double-blind_method" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0013072"),
    "high_sensitivity_c-reactive_protein_measurement" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4763419")
}

created = dict(manual_created)

regex_map = [
    (r"16s_rdna_survey", "16s_rdna_pcr"),
    (r"lc-ms","liquid_chromatography_mass_spectrometry"),
    (r"gc","gas_chromatography"),
    (r"16s_rrna_gene_sequencing", "16s_rrna_sequencing"),
    (r"16_s_rna_sequencing", "16s_rrna_sequencing"),
    (r"picrust_analysi", "picrust_analysis"),
    (r"forced_swimming_test", "forced_swim_test"),
    (r"open_field_test","open_field_apparatus_method"),
    (r"body_weight", "body_weight_test"),
    (r"immunofluorescence","immunofluorescent_staining_method"),
    (r"fecal_metagenome-wide-sequencing","metagenomic_dna_and_rna_sequencing"),
    (r"qpcr","real_time_pcr"),
    (r"social_defeat_stress", "social_stress_scale"),
    (r"rsd","social_stress_scale"),
    (r"metagenomics_analysis_of_the_microbiome", "metagenomic_analysis"),
    (r"ussing_chamber","ussing_chamber_permeability_assay"),
    (r"immunohistochemical_analyse","immunohistochemical_analyses") ,
    (r"lc-m","liquid_chromatography_mass_spectrometry"),
    (r"enzyme_linked_immunosorbent_assay","enzyme-linked_immunosorbent_assay"),
    (r"fecal_metagenome-wide_sequencing","metagenomic_dna_and_rna_sequencing"),
    (r"mechanistic_analysi","analysis"),
    (r"mitbamp_analyse","mitbamp_analyses"),
    (r"shotgun_metagenomic","shotgun_metagenomic_sequencing"),
    (r"shotgun_metagenomics_sequencing","shotgun_metagenomic_sequencing"),
    (r"hippocampal_proteomic_analysi","hippocampal_proteomic_analysis"),
    (r"sms","shotgun_metagenomic_sequencing"),
    (r"fpg","fpg_technique"),
    (r"hippocampal_proteomic_analysi","hippocampal_proteomic_analysis"),
    (r"splash","splash"),
    (r"spt","spt"),
    (r"fpi","fpi_technique"),
    (r"homa-ir","homeostasis_model_assessment_insulin_resistance"),
    (r"chronic_unpredictable_mild_stress__cums__model","stress_model"),
    (r"16s_rrna_gene-sequenced_gut_microbiota_data","16s_rrna_sequencing"),
    (r"pentobarbital-induced_sleep_test","sleep_latency_test"),
    (r"comparative_genomics_analysi", "analysis"),
    (r"rna-seq_profiling","metagenomic_dna_and_rna_sequencing"),
    (r"multi-omics_analyses","multiomics_study"),
    (r"metabolomics_profiling","serum_metabolomics_profiling"),
    (r"16s_rrna", "16s_rrna_sequencing"),
    (r"untargeted_metabolomic", "serum_metabolomics_profiling"),
    (r"16s_ribosomal_rna_analysi", "16s_rrna_sequencing"),
    (r"untargeted_metabolomic_analysi", "serum_metabolomics_profiling"),
    (r"analysis_of_the_fecal_sample", "metagenomic_analysis"),
    (r"wet_laboratory_method","biological_laboratory_methods"),
    (r"13c-d-xylose_breath_test","c14_d-xylose_breath_test"),
    (r"13c-d-xylose_breath_testing","c14_d-xylose_breath_test"),
    (r"single-cell_rna-seq_analysi","single-cell_rna-seq_analysis"),
    (r"microbiota-targeted_therapy","targeted_therapy"),
    (r"strain-level_meta-analysi", "strain-level_meta-analysis"),
    (r"nuclear_mr_spectroscopy","nuclear_spectroscopy"),
    (r"kegg_analysi", "kegg_analysis"),
    (r"omics_technique", "multiomics_study"),
    (r"elevated_plus_maze","elevated_plus_maze_test"),
    (r"16s_ribosomal_rna__16s_rrna__gene_sequence-based_approach","16s_rrna_sequencing"),
    (r"untargeted_liquid_chromatography-mass_spectrometry-based_metabolic_profiling_approach","liquid_chromatography_mass_spectrometry"),
    (r"16s_rrna_gene_amplicon_sequencing","16s_rrna_sequencing"),
    (r"metatranscriptomics_analyses","metagenomic_dna_and_rna_sequencing"),
    (r"double-blind__randomized__placebo-controlled_trial","double-blind_method"),
    (r"hutt","bielschowsky_head_tilt_test"),
    (r"quantitative_fluorescence_in_situ_hybridization","fluorescent_in_situ_hybridization"),
    (r"high-sensitivity_c-reactive_protein","high_sensitivity_c-reactive_protein_measurement"),
    (r"itt","intravenous_glucose_tolerance_test"),
    (r"metatranscriptomics_analyse", "metagenomic_dna_and_rna_sequencing"),
    (r"sm", "shotgun_sequencing"),
    (r"genomic_sequencing","shotgun_sequencing"),
    (r"whole_genome_shotgun_sequencing","shotgun_sequencing"),
    (r"multiplex_immunoassay", "multiplex_electrochemiluminescence_immunoassay"),
    (r"sleep_test","sleep_latency_test"),
    (r"comparative_genomics_analysis","analysis"),
    (r"multi-omics_analyses","multiomics_study"),
    (r"single-cell_rna-seq_analysis","metagenomic_dna_and_rna_sequencing"),
    (r"multiple_integrated_omic","multiomics_study"),
    (r"head-up_tilt_test","bielschowsky_head_tilt_test"),
    (r"igtt","glucose_tolerance_test"),
    (r"elisa","enzyme-linked_immunosorbent_assay"),
    (r"multi-omics_approach","multiomics_study"),
    (r"functional_enrichment_analysi","functional_enrichment_analysis"),
    (r"duodenal_aspirate", "duodenal_aspirate_analysis"),
    (r"bioinformatics_analysi", "bioinformatics_analysis"),
    (r"single_prolonged_stress__sps__model","stress_model"),
    (r"sp","stress_model"),
    (r"sps_model","stress_model"),
    (r"taxon-function_analysi","analysis"),
    (r"randomized__double-blinded__two-arm_feasibility_study","double-blind_method"),
    (r"16s_rna_method","16s_rrna_sequencing"),
    (r"elisa_kit","enzyme-linked_immunosorbent_assay"),
    (r"community-based_metabolic_modeling","metagenomic_analysis"),
    (r"metagenomic_analysi","metagenomic_analysis"),
    (r"immunofluorescent_staining_method","immunofluorescent_stain_method"),
    (r"multi-omics_analyse", "multiomics_study"),
    (r"single-cell_rna-seq_analysis","metagenomic_dna_and_rna_sequencing"),
    (r"multi-omics_analysi", "multiomics_study"),
    (r"single-cell_rna-seq_analysis","analysis"),
    (r"\bphylogenetic_investigation_of_communities_by_reconstruction_of_unobserved_states__picrust__analysi\w*\b","picrust_analysis"),
]

CREATOR = "Samuel Piron"

UMLS_BASE = "https://uts.nlm.nih.gov/uts/umls/concept/"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASE):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "MMO" in uri_str:
        # only a STATO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MNO Match", datatype=XSD.string)))
    elif "BAO" in uri_str:
        # only a BAO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("BAO Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
        
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "biomedical technique":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    if lookup_key == "single-cell_rna-seq_analysis":
                        lookup_key = "metagenomic_dna_and_rna_sequencing"
                    elif lookup_key == "immunofluorescent_staining_method":
                        lookup_key = "immunofluorescent_stain_method"
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                g.add((entity_uri, RDFS.label, Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                g.add((entity_uri, RDFS.label, Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            cos1 = top_cosine1(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]
    
    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "biomedical technique":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

Query: dqi-i
lookup term: dqi-i
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/Dqii


16s_rdna_pcr
Query: 16s rdna pcr
lookup term: 16s_rdna_pcr
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/16sRdnaPcr

16s_rdna_pcr
Query: 16s rdna pcr
lookup term: 16s_rdna_pcr
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/biomedicaltechnique/16sRdnaPcr

metagenomic_dna_and_rna_sequencing
Query: metagenomic dna and rna sequencing
lookup term: metagenomic_dna_and_rna_sequencing
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C5848503

Query: metabolomic
lookup term: metabolomic
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C1328813

metagenomic_analysis
Query: metagenomic analysis
lookup term: metagenomic_analysis
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C5906995

16s_rrna_sequencing
Q

<h1>INGEST ANATOMICAL LOCATION</h1>

In [None]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

ANATOMICALLOCATION_BASE = "http://purl.obolibrary.org/obo/"
ANATOMICALLOCATION_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/AnatomicalSite")
ANATOMICALLOCATION_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/AnatomicSite")

def load_anatomical_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = ANATOMICALLOCATION_BASE + term_id
            rows.append((label, uri))
    return rows

ANATOMICALLOCATION_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
anat_rows = load_anatomical_labels(ANATOMICALLOCATION_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in biom_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in anat_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = anat_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

manual_created = {}
created = {}

manual_created = {
    "skin_lipid_metabolism_gene" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/anatomicalsite/SkinLipidMetabolismGene"),
    "intestinal_(intended_site)" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5702674"),
    "gut" :URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0699819"),
    "pilosebaceous_unit" : URIRef("http://purl.obolibrary.org/obo/FMA_70661"),
    "intestinal_barrier_function" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5828281"),
    "medial_prefrontal_cortex_-_human": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C3853912"),
    "brain_tissue_(substance)" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0440746"),
    "human_body": URIRef("http://purl.obolibrary.org/obo/FMA_20394"),
    "postsynaptic_density": URIRef("https://amigo.geneontology.org/amigo/term/GO:0014069"),
    "systemic_blood_circulation" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0678860"),
    "colon" : URIRef("http://purl.obolibrary.org/obo/NCIT_C12382"),
    "hippocampus" : URIRef("http://purl.obolibrary.org/obo/NCIT_C12444"),
    "skin" : URIRef("http://purl.obolibrary.org/obo/NCIT_C12470"),
    "vagus_nerve_structure" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0042276"),
    "blood" : URIRef("http://purl.obolibrary.org/obo/NCIT_C12434"),
    "nasal_cavity" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0027423"),

}

created = dict(manual_created)

regex_map = [
    (r"intestinal_site", "intestinal_(intended_site)"),
    (r"subdiaphragmatic_vagus_nerve", "vagus_nerve_structure"),
    (r"vagus_nerve", "vagus_nerve_structure"),
    (r"\bnasal_passage\w*\b","nasal_cavity"),
    (r"gi sites", "intestinal_(intended_site)"),
    (r"gastrointestinal__gi__site", "intestinal_(intended_site)"),
    (r"hippocampu", "hippocampus"),
    (r"nasal_passage", "nasal_passages"),
    (r"colonic_tracts", "colon"),
    (r"colonic_tract", "colon"),
    (r"intestinal_barrier", "intestinal_barrier_function"),
    (r"maternal_vagina","vagina"),
    (r"git","gut"),
    (r"infant_stool_and_nose","nose"),
    (r"hippocampal_synaptic_ultrastructure","hippocampal"),
    (r"urogenital_tract","genitourinary_system"),
    (r"mpfc","medial_prefrontal_cortex_-_human"),
    (r"cognitive_brain_area","medial_prefrontal_cortex_-_human"),
    (r"cuprizone-treated_mouse_brain","brain"),
    (r"gastrointestinal__gi__tract","intestinal_(intended_site)"),
    (r"rodent_gut","gut"),
    (r"gi_tract", "intestinal_(intended_site)"),
    (r"gi_site", "intestinal_(intended_site)"),
    (r"enteric_nervous_system","nervous_system"),
    (r"gut_barrier","intestinal_barrier_function"),
    (r"selected_brain_tissue","brain_tissue_(substance)"),
    (r"systemic_circulation","systemic_blood_circulation"),
    (r"brainstem","brain_stem"),
]
CREATOR = "Samuel Piron"

UMLS_BASE = "https://uts.nlm.nih.gov/uts/umls/concept/"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASE):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "MMO" in uri_str:
        # only a STATO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MNO Match", datatype=XSD.string)))
    elif "BAO" in uri_str:
        # only a BAO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("BAO Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "FMA" in uri_str:
        # only a FMA‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("FMA Match", datatype=XSD.string)))
    elif "GO" in uri_str:
        # only a GO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("GO Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
        
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "anatomical location":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, ANATOMICALLOCATION_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, ANATOMICALLOCATION_CONCEPT_SCHEME))
                g.add((entity_uri, RDFS.label, Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, ANATOMICALLOCATION_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, ANATOMICALLOCATION_CONCEPT_SCHEME))
                g.add((entity_uri, RDFS.label, Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, ANATOMICALLOCATION_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, ANATOMICALLOCATION_CONCEPT_SCHEME))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, ANATOMICALLOCATION_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, ANATOMICALLOCATION_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "anatomical location":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

intestinal_(intended_site)
Query: intestinal (intended site)
lookup term: intestinal_(intended_site)
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C5702674

intestinal_(intended_site)
Query: intestinal (intended site)
lookup term: intestinal_(intended_site)
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C5702674

intestinal_(intended_site)
Query: intestinal (intended site)
lookup term: intestinal_(intended_site)
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C5702674

hippocampus
Query: hippocampus
lookup term: hippocampus
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C12444


Query: oral cavity
lookup term: oral_cavity
  • Oral Cavity                              URI=http://purl.obolibrary.org/obo/NCIT_C12421 score=1.00

nasal_cavity
Query: nasal cavity
lookup term: nasal_cavity
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C0027423

Query: lung
lookup term: lung
  • Lung                 

<h1>INGEST GENE</h1>

In [None]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

GENE_BASE = "http://purl.obolibrary.org/obo/"
GENE_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/Gene")
GENE_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Gene")

def load_gene_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = GENE_BASE + term_id
            rows.append((label, uri))
    return rows

GENE_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
gene_rows = load_gene_labels(GENE_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in gene_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in gene_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = gene_rows[i]
        out.append((lbl, uri, sc[i]))
    return out
manual_created = {}
created = {}

manual_created = {
    "skin_lipid_metabolism_gene" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/gene/SkinLipidMetabolismGene"),
    "ppar-gamma" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0166417"),
    "srebp-1c" : URIRef("http://purl.obolibrary.org/obo/TFClass_human.obo#1.2.6.3.1.3"),
    "acaca_gene" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1412104"),
    "hormone-sensitive_lipase" : URIRef("http://purl.obolibrary.org/obo/PR_000009834"),
    "adipose_triglyceride_lipase" : URIRef("http://purl.obolibrary.org/obo/PR_000012942"),
    "tumor_necrosis_factor-alpha" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1456820"),
    "glucose1pmetab-pwy" : URIRef("https://pubchem.ncbi.nlm.nih.gov/pathway/BioCyc:ECO_GLUCOSE1PMETAB-PWY"), 
    "maltose_catabolic_process":URIRef("http://purl.obolibrary.org/obo/GO_0000025"), 
    "l-fucose-proton_symporter":URIRef("http://purl.obolibrary.org/obo/PR_000022731"),
    "urease_accessory_proteins_uree" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/gene/UreaseAccessoryProteinsUree"),
    "camkiid_inhibitor_np202":URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5690415"),
    "aromatic_aminotransferase":URIRef("https://www.kegg.jp/dbget-bin/www_bget?K00837"),
    "2-oxoglutarate_dehydrogenase_e2_component":URIRef("https://www.genome.jp/dbget-bin/www_bget?K00658"),                                     
    "tryptophan_synthase_beta_chain" : URIRef("https://www.genome.jp/dbget-bin/www_bget?K01696"),
    "dihydrolipoyl_dehydrogenase":URIRef("https://www.genome.jp/entry/K00382"),
    "acetyl-coa_c-acetyltransferase":URIRef("https://www.genome.jp/dbget-bin/www_bget?K00626"),
    "bacterial_16s_rrna_gene" :URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5380697"),
    "slc27a3_gene" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1420165"),
    "pnpla3": URIRef("http://purl.obolibrary.org/obo/OGG_3000080339"),
    "elovl6": URIRef("http://purl.obolibrary.org/obo/OMIT_0044699"),
    "5-ht(1dalpha)_receptor": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0534687"),
    "tight_junction_protein_1":URIRef("https://proconsortium.org/cgi-bin/entry_pro?id=PR_000016364"),
    "tryptophanase": URIRef("https://www.genome.jp/dbget-bin/www_bget?K01667"),
    "abhd5": URIRef("http://purl.obolibrary.org/obo/OGG_3000051099"),

}

created = dict(manual_created)

regex_map = [
    (r"ppar-", "ppar-gamma"),
    (r"\bacc\b","acaca_gene"),
    (r"hsl","hormone-sensitive_lipase"),
    (r"atgl","adipose_triglyceride_lipase"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))migraine_headache\b","migraine"),
    (r"skin_lipid_metabolism-related_gene" , "skin_lipid_metabolism_gene"),
    (r"TNFA","tumor_necrosis_factor-alpha"),
    (r"pwy-7328","glucose1pmetab-pwy"),
    (r"maltose_hydrolase", "maltose_catabolic_process"),
    (r"cog1554","maltose_catabolic_process"),
    (r"fucose_permease","l-fucose-proton_symporter"),
    (r"cog0738","l-fucose-proton_symporter"),
    (r"cog2371","urease_accessory_proteins_uree"),
    (r"camkiid","camkiid_inhibitor_np202"),
    (r"k00658","2-oxoglutarate_dehydrogenase_e2_component"),
    (r"k00837","aromatic_aminotransferase"),
    (r"k01696","tryptophan_synthase_beta_chain"),
    (r"k00382", "dihydrolipoyl_dehydrogenase"),
    (r"k00626","acetyl-coa_c-acetyltransferase"),
    (r"k01667","tryptophanase"),
    (r"k03781","catalase"),
    (r"differential_expressed_gene","tissue-specific_gene_expression"),
    (r"deg","tissue-specific_gene_expression"),
    (r"gut_microbe-related_degs","tissue-specific_gene_expression"),
    (r"ccdc173","parafibromin"),
    (r"16s_rrna_gene","bacterial_16s_rrna_gene"),
    (r"microglia_activation-related_gene","microglia"),
    (r"fatty_acid_transport_gene","slc27a3_gene"),
    (r"5-ht_receptor_htr2a" ,"5-ht(1dalpha)_receptor"),
    (r"tight_junction_protein_claudin-5","tight_junction_protein_1"),
    (r"adipogenesi","adipogenesis"),
    (r"\bfasn\b","fasn_gene"),
    (r"\bhtr1a\b","htr1a_gene"),

]

CREATOR = "Samuel Piron"

UMLS_BASE = "https://uts.nlm.nih.gov/uts/umls/concept/"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASE):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "MMO" in uri_str:
        # only a STATO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MNO Match", datatype=XSD.string)))
    elif "BAO" in uri_str:
        # only a BAO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("BAO Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "PR" in uri_str:
        # only a FMA‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("PR Match", datatype=XSD.string)))
    elif "TFClass" in uri_str:
        # only a TFClass‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("TFClass Match", datatype=XSD.string)))
    elif "GO" in uri_str:
        # only a GO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("GO Match", datatype=XSD.string)))
    elif "BioCyc" in uri_str:
        # only a BioCyc‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("BioCyc Match", datatype=XSD.string)))
    elif "kegg" in uri_str:
        # only a KEGG‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("KEGG Match", datatype=XSD.string)))
    elif "OGG" in uri_str:
        # only a OGG‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OGG Match", datatype=XSD.string)))
    elif "genome" in uri_str:
        # only a Genome‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("KEGG Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
        
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "gene":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, GENE_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, GENE_CONCEPT_SCHEME))
                g.add((entity_uri, RDFS.label, Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, GENE_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, GENE_CONCEPT_SCHEME))
                g.add((entity_uri, RDFS.label, Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, GENE_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, GENE_CONCEPT_SCHEME))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, GENE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, GENE_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "gene":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

skin_lipid_metabolism_gene
Query: skin lipid metabolism gene
lookup term: skin_lipid_metabolism_gene
  → Reusing existing URI: https://hereditary.dei.unipd.it/ontology/gutbrain/resource/gene/SkinLipidMetabolismGene

ppar-gamma
Query: ppar-gamma
lookup term: ppar-gamma
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C0166417

Query: srebp-1c
lookup term: srebp-1c
  → Reusing existing URI: http://purl.obolibrary.org/obo/TFClass_human.obo#1.2.6.3.1.3


acaca_gene
Query: acaca gene
lookup term: acaca_gene
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C1412104

fasn_gene
Query: fasn gene
lookup term: fasn_gene
  • FASN Gene                                URI=http://purl.obolibrary.org/obo/NCIT_C26564
ppar-gamma
Query: ppar-gamma
lookup term: ppar-gamma
  → Reusing existing URI: https://uts.nlm.nih.gov/uts/umls/concept/C0166417

Query: acox1
lookup term: acox1
  • ACOX1 Gene                               URI=http://purl.obolibrary.org/obo/NCIT_C191889 

<h1>INGEST DDF</h1>

In [None]:
import re
import json
import numpy as np
import requests
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from umlsutils import best_umls_match, search_umls, get_umls_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

DDF_BASE = "http://purl.obolibrary.org/obo/"
DDF_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/DiseaseDisorderOrFinding")
DDF_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DiseaseDisorderOrFinding")

def load_ddf_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = DDF_BASE + term_id
            rows.append((label, uri))
    return rows

OMIT_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\omit_full_taxonomy.txt"
DDF_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
ddf_rows = load_ddf_labels(DDF_LABELS_FILE)
omit_rows = load_ddf_labels(OMIT_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in ddf_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in omit_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in ddf_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in omit_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

manual_created = {
    "neuropsychiatric_disorders" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/NeuropsychiatricDisorders"),
    "oleic_acid-induced_acne" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/OleicAcidInducedAcne"),
    "neurodegenerative_diseases" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/NeurodegenerativeDiseases"),
    "nervous_system_disorder" : URIRef("http://purl.obolibrary.org/obo/NCIT_C26835"),
    "inflammatory_disease" : URIRef("http://purl.obolibrary.org/obo/MONDO_0021166"),
    "cancer" : URIRef("http://purl.obolibrary.org/obo/NCBITaxon_6754"),
    "neurodegeneration" : URIRef("http://purl.obolibrary.org/obo/MP_0002229"),
    "intestinal_dysbiosis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4287543"),
    "brain_diseases": URIRef("http://purl.obolibrary.org/obo/OMIT_0003283"),
    "functional_gastric_disease": URIRef("http://purl.obolibrary.org/obo/MONDO_0001318"),
    "functional_gastrointestinal_disorders" : URIRef("https://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=topic_3409"),
    "synucleinopathies" : URIRef("http://purl.obolibrary.org/obo/MONDO_0000510"),
    "hippocampal_volume_loss,_mild" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5394342"),
    "gastric_mucins" : URIRef("https://meshb.nlm.nih.gov/record/ui?ui=D005752"),
    "gastric_disease" : URIRef("http://purl.obolibrary.org/obo/MONDO_0004298"),
    "neurodegenerative_disorders" : URIRef("http://purl.obolibrary.org/obo/NCIT_C39737"),
    "headaches" : URIRef("http://purl.obolibrary.org/obo/NCIT_C34661"),
    "disorder" : URIRef("http://purl.obolibrary.org/obo/OGMS_0000045"),
    "neuronitis": URIRef("http://purl.obolibrary.org/obo/MONDO_0004466"),
    "endolysosomal_deficits" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/EndolysosomalDeficits"),
    "coeliac_disease" : URIRef("https://disease-ontology.org/?id=DOID:10608"),
    "intestinal_malabsorption" : URIRef("https://hpo.jax.org/browse/term/HP:0002024"),
    "cell_danger_response": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/CellDangerResponse"),
    "altered_microbiota" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4047662"),
    "intestinal_dysbiosis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4287543"),
    "migraine" : URIRef("http://purl.obolibrary.org/obo/NCIT_C89715"),
    "cognitive_impairment" : URIRef("http://purl.obolibrary.org/obo/NCIT_C116921"),
    "attention_deficit_hyperactivity_disorder": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1263846"),
    "amyotrophic_lateral_scleroris" : URIRef("http://purl.obolibrary.org/obo/NCIT_C34373"),
    "antipsychotic-generated_motility_effects" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/AntipsychoticGeneratedMotilityEffects"),
    "neurometabolic_alterations" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/NeurometabolicAlterations"),
    "gastrointestinal_and_mental_health_symptoms" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/GastrointestinalAndMentalHealthSymptoms"),
    "postinfection_disorders" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/PostinfectionDisorders"),
    "alterations_in_the_enteroendocrine_system" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/AlterationsInTheEnteroendocrineSystem"),
    "altered_gut_microbiota" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/AlteredGutMicrobiota"),
    "mental_disorders" : URIRef("http://purl.obolibrary.org/obo/OMIT_0002893"),
    "abnormal_blood_levels" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/AbnormalBloodLevels"),
    "alzheimers_disease" : URIRef("http://purl.obolibrary.org/obo/NCIT_C2866"),
    "amyotrophic_lateral_sclerosis" : URIRef("http://purl.obolibrary.org/obo/NCIT_C34373"),
    "intestinal_alteration" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/diseasedisorderorfinding/IntestinalAlteration"),
    "colorectal_cancer" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0009402"),
    "irritable_bowel_syndrome" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0022104"),
    "polycistic_ovary_syndrome" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0032460"),
    
}

created = dict(manual_created)

regex_map = [
    (r"\bdepressive_symptoms\b", "major_depressive_disorder"),
    (r"\bpolycystic_ovarian_syndrome\b", "polycystic_ovary_syndrome"),
    (r"\bbipolar_disorders\b", "bipolar_disorder"),
    (r"\bsars-cov2\b", "sars"),
    (r"\banxiety_depression\b", "mixed_anxiety_and_depressive_disorder"),
    (r"gastrointestinal_and_mental_health_symptoms","gastrointestinal_and_mental_health_symptoms"),
    (r"non-celiac_food_sensitivities","coeliac_disease"),
    (r"\bdementia\w*\b","dementia"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))migraine_headache\b","migraine"),
    (r"\bt1d\b","Type_1_Diabetes_Mellitus"),
    (r"gut_microbiome_changes", "altered_gut_microbiota"),
    (r"\bmental_health_issue\b","mental_health"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))gut_dysfunction","dysfunction"),
    (r"cognitive_or_motor_dysfunction", "dysfunction"),
    (r"gut_microbiota_disorder","disorder"),
    (r"age-related_neurodegeneration","neurodegeneration"),
    (r"low-grade_inflammation_and_anomalies","inflammation"),
    (r"viral_respiratory_infections","respiratory_tract_infections"),
    (r"respiratory_infections","respiratory_tract_infections"),
    (r"inflammatory_conditions","inflammation"),
    (r"chemotherapy-associated_psychological_distress","distress"),
    (r"stage_i_disease","disease"),
    (r"central_and_peripheral_inflammation","inflammation"),
    (r"neuronal_disorders","disorder"),
    (r"brain_mental-health-related_disease", "disease"),
    (r"autoimmune_central_nervous_system_disease","disease"),
    (r"ms_condition","multiple_sclerosis"),
    (r"psychiatric_clinical_symptoms","symptom"),
    (r"states_of_dysmetabolism","dysmetabolic_syndrome"),
    (r"anxiety-like_behaviors","anxiety"),
    (r"jejunal_expression_of_pro-inflammatory_markers", "elevated_inflammatory_markers"),
    (r"inflammatory-__autoimmune-__neurodegenerative-__metabolic-__mood-__behavioral-__cognitive-__autism-spectrum-__stress-_and_pain-related_disorders","disorder"),
    (r"motor_symptoms","symptom"),
    (r"systemic_disorders","disorder"),
    (r"mental_health_symptoms","symptom"),
    (r"microbiota_gut_brain_dysfunction","dysfunction"),
    (r"anxiety_depression_symptom","symptom"),
    (r"clinical_symptoms","symptom"),
    (r"pro-inflammatory_state","elevated_inflammatory_markers"),
    (r"peripheral_and_central_inflammation","inflammation"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))cardiometabolic_disorder","cardiometabolic_syndrome"),
    (r"gastrointestinal__gi__problems", "functional_gastrointestinal_disorders"),
    (r"dysbiosis_of_the_microbiota","dysbiosis"),
    (r"post-acute_coronavirus_disease__covid_-19_syndrome", "covid-19_post-intensive_care_syndrome"),
    (r"coronavirus_disease","_covid_-19"),
    (r"postinfection disorders of gut-brain interaction","postinfection_disorders"),
    (r"dgbi","postinfection_disorders"),
    (r"visceral_hypersensitivity", "hypersensitivity"),
    (r"kidney__liver__and_heart_disease","disease"),
    (r"alzheimer_and_parkinson_disease","alzheimers_disease"),
    (r"long-term_acetate_deficiency","deficiency"),
    (r"cognitive_decline","mental_deterioration"),
    (r"systemic_inflammation","inflammation"),
    (r"gastrointestinal_disorders", "functional_gastrointestinal_disorders"),
    (r"psychiatric_and_neurodegenerative_disorders", "mental_disorders"),
    (r"\bai\b","autoimmune_disease"),
    (r"ai-related_musculoskeletal_pathology","musculoskeletal_diseases"),
    (r"oleic_acid-induced_acne","oleic_acid-induced_acne"),
    (r"acne_pathogenesis", "acne"),
    (r"serum_hormone_secretion", "hormone_secretion"),
    (r"neurodegenerative__inflammatory__metabolic__and_cardiovascular_diseases", "disease"),
    (r"alzheimer_s_and_parkinson_s_diseases","alzheimers_disease"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))dysfunctions","dysfunction"),
    (r"dysregulation_of_microglia_genes","microglia_gene"),
    (r"telomere_attrition","telomere"),
    (r"neuronal_stem_cell_degradation","stem_cells"),
    (r"loss_of_chromosome_x_inactivation","x_chromosome_inactivation"),
    (r"gut_microbiome_dysbiosis","intestinal_dysbiosis"),
    (r"\bad\b","alzheimers_disease"),
    (r"parkinson_s_disease","parkinson_disease"),
    (r"\bpd\b", "parkinson_disease"),
    (r"\blbd\b", "lewy_body_dementia"),
    (r"gut_dysbiosis", "intestinal_dysbiosis"),
    (r"melanomas", "melanoma"),
    (r"cums-induced_depressive_disorder", "major_depressive_disorder"),
    (r"chronic_unpredictable_mild_stress__cums_-induced_depressive-like_symptoms", "depression"),
    (r"depressive_behaviors","depression"),
    (r"major_depressive_disorders", "major_depressive_disorder"),
    (r"human_stress", "stress"),
    (r"headaches","headaches"),
    (r"mental_health_problems","major_depressive_disorder"),
    (r"\bstress\w*\b", "stress"),
    (r"altered_gut_microbiome", "intestinal_microbiome"),
    (r"gastric_disturbances", "functional_gastric_disease"),
    (r"microbiome_alteration" , "microbiome"),
    (r"pd-like_pathology", "parkinson_disease"),
    (r"lps_paraquat-induced_weight_loss" , "weight_loss"),
    (r"inflamed_gut" , "gut"),
    (r"brain-gut_changes" , "brain-gut_axis"),
    (r"human_neurological_disorders" , "nervous_system_disorder"),
    (r"imbalance_in_the_gut_microflora" , "intestinal_microbiome"),
    (r"neurological_conditions" , "progressive_neurological_conditions"),
    (r"neurological_disorders" , "nervous_system_disorder"),
    (r"\bdd\b", "major_depressive_disorder"),
    (r"hpa_axis_dysfunction", "dysfunction"),
    (r"chronic_mild_stress", "stress"),
    (r"\bcms\b", "stress"),
    (r"anxiety-_and_depressive-like_behaviors", "major_depressive_disorder"),
    (r"cms-induced_anxiety-_and_depressive-like_behaviors", "major_depressive_disorder"),
    (r"gastrointestinal disorders", "functional_gastrointestinal_disorders"),
    (r"food_allergies", "food_allergy"),
    (r"ulcerative_histiocytic_colitis","ulcerative_colitis"),
    (r"depression_patients", "depression"),
    (r"\bibs-d\b", "irritable_bowel_syndrome"),
    (r"\bnds\b", "neurodegenerative_diseases"),
    (r"\bmsa\b", "multiple_system_atrophy"),
    (r"\bms\b", "multiple_sclerosis"),
    (r"\bnmo\b", "neuromyelitis_optica"),
    (r"alpha_synucleinopathies" , "synucleinopathies"),
    (r"hippocampal_microglia-mediated_synaptic_loss", "hippocampal_volume_loss,_mild"),
    (r"inflammatory_gut_milieu", "milieu_therapy"),
    (r"\bptsd\b", "acute_stress_disorder"),
    (r"posttraumatic_stress_disorder", "acute_stress_disorder"),
    (r"alzheimer_s_disease", "alzheimers_disease"),
    (r"vascular_system_dysfunction","dysfunction"),
    (r"intestinal_dysbiosis", "intestinal_dysbiosis"),
    (r"microbiota_dysbiosis", "dysbiosis"),
    (r"chronic_enteropathy", "enteropathy"),
    (r"amyotrophic_lateral_scleroris", "amyotrophic_lateral_scleroris"),
    (r"\bals\b", "amyotrophic_lateral_scleroris"),
    (r"dysregulation_of_gut_barrier_functions", "dysregulated_immune_function"),
    (r"transepithelial_electrical_resistance" , "electrical_resistance"),
    (r"mucin_homeostasis", "gastric_mucins"),
    (r"antimicrobial_responses","antibiotic"),
    (r"helicobacter_pylori-related_hyperhomocysteinemia", "helicobacter_pylori"),
    (r"gastric_pathologies", "gastric_disease"),
    (r"neurodegenerative_central_nervous_system_disorders", "neurodegenerative_disorders"),
    (r"ocular_alzheimer_s_disease", "alzheimers_disease"),
    (r"gastrointestinal_diseases","disease"),
    (r"autoimmune__ai__diseases","disease"),
    (r"age-associated_diseases","disease"),
    (r"age-associated_brain_diseases","disease"),
    (r"age-related_brain_diseases","disease"),
    (r"mental_diseases","disease"),
    (r"endocrine_and_metabolic_diseases","disease"),
    (r"diseases_of_the_gi_tract","disease"),
    (r"psychotic_and_affective_disorders","disorder"),
    (r"postinfection_disorders_of_gut-brain_interaction","disorder"),
    (r"age-related_diseases","disease"),
    (r"neurological_diseases","disease"),
    (r"migraine", "migraine"),
    (r"gut_dysbiosis","dysbiosis"),
    (r"cognitive_impairment","cognitive_impairment"),
    (r"gut_dysbiosis_and_inflammation","intestinal_dysbiosis"),
    (r"hyperhomocysteinemia-related_brain_cortical_thinning","hyperhomocysteinemia"),
    (r"\bbct\b","hyperhomocysteinemia"),
    (r"major_depressive_episode","major_depressive_disorder"),
    (r"\bbpd\b","major_depressive_disorder"),
    (r"mild_metabolic_disorders","disorder"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))metabolic_disturbances","metabolic_disturbance"),
    (r"schizoaffective_psychosis","psychosis"),
    (r"affective_disorder","mood_disorder"),
    (r"gastrointestinal__gi__disorders","disorder"),
    (r"autoimmune_disorders","disorder"),
    (r"non-celiac_food_sensitivities","coeliac_disease"),
    (r"co-morbid_gi_inflammation","inflammation"),
    (r"inefficient_gluten_digestion","digestion"),
    (r"polygenic_brain_disorders","disorder"),
    (r"autism_spectrum_disorder","disorder"),
    (r"\basd\b","disorder"),
    (r"\bmia\b","systemic_immune_activation"),
    (r"\badhd\b","attention_deficit_hyperactivity_disorder"),
    (r"\bt2dm\b","type_2_diabetes_mellitus"),
    (r"intestinal_microbiota_alterations","intestinal_alteration"),
    (r"depression-like_behavior","major_depressive_disorder"),
    (r"chronic_and_unpredictable_mild_stress","stress"),
    (r"depression-like_behaviors","depression"),
    (r"gastric_and_esophageal_cancer","cancer"),
    (r"parkinsonian_pathology","parkinson_disease"),
    (r"neuroinflammation","neuronitis"),
    (r"nigrostriatal_neurodegeneration","neurodegeneration"),
    (r"disease-related_malnutrition","malnutrition"),
    (r"neurological_and_psychiatric_disorders","disorder"),
    (r"brain_disorders","disorder"),
    (r"\bmdd\b","major_depressive_disorders"),
    (r"inflammatory_processes","inflammation"),
    (r"anxiety_symptoms","anxiety"),
    (r"\bpcos\b","polycystic_ovary_syndrome"),
    (r"gi symptoms","hama_-_gastrointestinal_symptoms"),
    (r"altered_microbiota","microbiota"),
    (r"colon_cancer-related_anemia","colon_carcinoma"),
    (r"\bccra\b","colon_carcinoma"),
    (r"colon_cancer","colon_carcinoma"),
    (r"ccra-induced_intestinal_flora_disorder","colon_carcinoma"),
    (r"chronic_stress-induced_anhedonia","stress"),
    (r"chronic_unpredictable_stress","stress"),
    (r"cus-induced_anhedonic_behaviors","anhedonia"),
    (r"irritable_bowel_disease","inflammatory_bowel_disease"),
    (r"\bsdv\b","vagotomy"),
    (r"abnormal_crypt_foci","aberrant_crypt_foci"),
    (r"\bcrc\b","colorectal_carcinoma"),
    (r"small_intestinal_malabsorption","intestinal_malabsorption"),
    (r"impaired_colonic_microbial_metabolism","metabolism"),
    (r"chronic_alcohol_overconsumption","alcohol"),
    (r"-synuclein_pathology", "pathology"),
    (r"\bcdr\b","cell_danger_response"),
    (r"\bptsd\b","post-traumatic_stress_disorder"),
    (r"\bcte\b","chronic_traumatic_encephalopathy"),
    (r"traumatic_brain_injury","injury"),
    (r"\btbi\b","injury"),
    (r"\balteration_of_faecal_microbiota_balance\b","altered_gut_microbiota"),
    (r"\baltered_microbiota\b","intestinal_microbiome"),
    (r"\bbd\b","bipolar_disorder"),
    (r"prader-willi_syndrome","prader-willi_syndrome"),
    (r"\bosa\b", "obstructive_sleep_apnea"),
    (r"osa_onset","obstructive_sleep_apnea"),
    (r"\bvvs\b","vasovagal_syncope"),
    (r"systolic_and_diastolic_pressure_reduction", "pressure"),
    (r"mean_pressure_drop", "pressure"),
    (r"diastolic_pressure_drop","pressure"),
    (r"gut_inflammation", "inflammation"),
    (r"\bscz\b","schizophrenia"),
    (r"anxiety-like_and_depression-like_behaviours","anxiety"),
    (r"\bdepressive\w*\b","anxiety"),
    (r"\bliver_fat\b","hepatic_steatosis"),
    (r"\bhcd\b","colesterol"),
    (r"\bdisorder\w*\b","disorder"),
    (r"first-episode_depression","depression"),
    (r"\bfcr\b","fear_of_cancer_recurrence"),
    (r"learning_and_memory_impairments","cognitive_impairment"),
    (r"migraine","migraine"),
    (r"\bparkinson_s\b","parkinson_disease"),
    (r"psychiatric_and_neurodegenerative_disorders","disorder"),
    (r"gastric_disturbances","intestinal_inflammation"),
    (r"inflamed_gut","intestinal_inflammation"),
    (r"impaired_cognition","cognitive_impairment"),
    (r"helicobacter_pylori_infection","helicobacter_pylori"),
    (r"maternal_immune_activation","systemic_immune_activation"),
    (r"cancers_of_the_esophagus_and_stomach","cancer"),
    (r"gi_symptoms","symptom"),
    (r"gastrointestinal__gi__symptoms","symptom"),
    (r"chronic_restraint_stress-induced_neurobehavioral_and_gut_barrier_deficits", "deficit"),
    (r"idiopathic_pd","parkinson_disease"),
    (r"gut_brain_axis_dysfunction","dysfunction"),
    (r"gut_microbiome_composition_alterations","intestinal_alteration"),
    (r"non-tremor_pd_subtype","parkinson_disease"),
    (r"\bgut_microbiota_alteration\w*\b","intestinal_alteration"),
    (r"dysbiosis_of_intestinal_microbiota","dysbiosis"),
    (r"influenza_infections","severe_influenza_infection"),
    (r"\bcus\b","stress"),
    (r"-synuclein_aggregation","synuclein"),
    (r"\bneurodegenerative_disease\w*\b","neurodegenerative_diseases"),
    (r"chronic__developmental__autoimmune__and_degenerative_disorders","disorder"),
    (r"brain_inflammatory_activity", "brain_inflammatory_disease"),
    (r"\bcardiovascular_disease\w*\b","cardiovascular_diseases"),
    (r"hyperlipidemia","hyperlipidemia"),
    (r"intestinal_microbial_and_metabolites_dysbiosis","dysbiosis"),
    (r"microbial_dysbiosis","dysbiosis"),
    (r"systemic_low-grade_inflammation","inflammation"),
    (r"severe_psychiatric_disorders","disorder"),
    (r"chronic_unpredictable_mild_stress","stress"),
    (r"depression-_and_anxiety-like_behavior","depression"),
    (r"depression-_and_anxiety-like_behaviors","depression"),
    (r"\bcad\b","coronary_artery_disease"),
    (r"anosmia","anosmia"),
    (r"altered_gut_motility","gut-brain_axis"),
    (r"changes_in_intestinal_permeability","intestinal"),
    (r"\bndd\b","neurodegenerative_diseases"),
    (r"\bneurological_and_mental_disorder\w*\b","mental_disorders"),
    (r"microbiome_and_specific_bacterial_changes","microbiome"),
    (r"postinfection_disorders_of_gut-brain_interaction", "disease_(or_disorder);_spleen,_postinfectional"),
    (r"\bacute_and_chronic_insomnia\b","chronic_insomnia"),
    (r"\bautoimmune_disease\w*\b","autoimmune_disease"),
    (r"\benteropathic_celiac_disease\w*\b","coeliac_disease"),
    (r"\bmental_health_disorders\b","mental_disorders"),
    (r"\bpsychiatric_and_neurodegenerative_disorder\w*\b","mental_disorders"),
    (r"\bmental_dysfunction\w*\b","mental_disorders"),
    (r"\bmental_illnesses\w*\b","mental_disorders"),
    (r"\bpsychological_dysfunction\w*\b","mental_disorders"),
    (r"\bmood_disorders\b","mood_disorder"),
]

CREATOR = "Samuel Piron"

UMLS_BASES = "https://uts.nlm.nih.gov/uts/umls/concept/"

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "MMO" in uri_str:
        # only a STATO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MNO Match", datatype=XSD.string)))
    elif "meshb" in uri_str:
        # only a MESH‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "BAO" in uri_str:
        # only a BAO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("BAO Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "PR" in uri_str:
        # only a FMA‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("PR Match", datatype=XSD.string)))
    elif "TFClass" in uri_str:
        # only a TFClass‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("TFClass Match", datatype=XSD.string)))
    elif "GO" in uri_str:
        # only a GO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("GO Match", datatype=XSD.string)))
    elif "BioCyc" in uri_str:
        # only a BioCyc‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("BioCyc Match", datatype=XSD.string)))
    elif "kegg" in uri_str:
        # only a KEGG‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("KEGG Match", datatype=XSD.string)))
    elif "OGG" in uri_str:
        # only a OGG‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OGG Match", datatype=XSD.string)))
    elif "MONDO" in uri_str:
        # only a Genome‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MONDO Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        # only a NCBITaxon‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "OGMS" in uri_str:
        # only a OGMS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OGMS Match", datatype=XSD.string)))
    elif "DOID" in uri_str:
        # only a DOID‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("DOID Match", datatype=XSD.string)))
    elif "HP" in uri_str:
        # only a HP‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("HP Match", datatype=XSD.string)))
    elif "MP" in uri_str:
        # only a MP‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MP Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
        
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))

def top_cosine(term, k=5, thr=0.85):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ddf_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_omit(term, k=5, thr=0.85):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = omit_rows[i]
        out.append((lbl, uri, sc[i]))
    return out
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "DDF":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            #lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, DDF_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, DDF_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue
            
            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            cos1 = top_cosine_omit(term)
            if ex:
                lbl, uri = ex[0]
                print(f"  • {lbl:40s} URI={uri}")
                entity_uri = URIRef(f"{uri}")
                created[term_raw] = entity_uri
                g.add((entity_uri, RDF.type, DDF_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                g.add((entity_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                uri_str = str(entity_uri).lower()

                if "stato_" in uri_str:
                    comment = "STATO Match"
                elif "ncbitaxon_" in uri_str:
                    comment = "NCBITaxon Match"
                elif "ncit_" in uri_str:
                    comment = "NCIT Match"
                elif "obi_" in uri_str:
                    comment = "OBI Match"
                elif "umls" in uri_str:
                    comment = "UMLS Match"
                elif "omit" in uri_str:
                    comment = "OMIT Match"
                else:
                    comment = CREATOR 
                g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                uri_str = str(entity_uri)
                definition = choose_definition(uri_str, term_raw)
                existing_defs = [
                    c for c in g.objects(entity_uri, RDFS.comment)
                    if "[Definition Source:" in str(c)
                ]
                if not existing_defs:
                    g.add((entity_uri,
                        RDFS.comment,
                        Literal(definition, datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                continue
                print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DDF_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DDF_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
                
            else:
                api_term = lookup_key.replace("_", " ")
                umls_hits = search_umls(api_term)
                if umls_hits:
                    cui, name, score, definition = best_umls_match(api_term, umls_hits)
                    if name in created:
                        entity_uri = created[term_raw]
                    else:
                        entity_uri = URIRef(f"https://uts.nlm.nih.gov/uts/umls/concept/{cui}")
                        created[term_raw] = entity_uri

                    g.add((entity_uri, RDF.type,      DDF_CLASS))
                    g.add((entity_uri, RDF.type,      SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                                RDFS.label,
                                Literal(name.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                    if definition:
                        comment_str = f"{definition.strip()} [Definition Source: UMLS]"
                    else:
                        llm_def = get_llm_definition(term_raw)
                        comment_str = f"{llm_def} [Definition Source: llama3-8b-8192]"
                    g.add((entity_uri, RDFS.comment, Literal("UMLS Match", datatype=XSD.string)))
            
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type,        MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,      Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn,    mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print(f"  • UMLS CUI={cui}  Name={name!r}  sim={score:.2f}")
                    continue
                    
                else: 
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type,      MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,    Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    tokenized_mentions[term_raw] = mention_uri
                    print("no matches locally or in UMLS")

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip()
        else:
            continue

        if label != "DDF":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

major_depressive_disorder
Query: major depressive disorder
lookup term: major_depressive_disorder
  • Depressive Disorder, Major               URI=http://purl.obolibrary.org/obo/OMIT_0005141 score=1.00
Query: mental deterioration
lookup term: mental_deterioration
  • UMLS CUI=C0234985  Name='Mental deterioration'  sim=1.00
mood_disorder
Query: mood disorder
lookup term: mood_disorder
  • Mood Disorder                            URI=http://purl.obolibrary.org/obo/NCIT_C92200
major_depressive_disorder
Query: major depressive disorder
lookup term: major_depressive_disorder
  → Reusing existing URI: http://purl.obolibrary.org/obo/OMIT_0005141


Query: antimicrobial resistance
lookup term: antimicrobial_resistance
  • Antimicrobial Resistance Result          URI=http://purl.obolibrary.org/obo/NCIT_C85562 score=0.91
Query: inflammation
lookup term: inflammation
  • Inflammation                             URI=http://purl.obolibrary.org/obo/NCIT_C3137
Query: psychosis
lookup term: psychosis
 

<h1>INGEST ANIMAL</h1>

In [18]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256

ANIMAL_BASE = "http://purl.obolibrary.org/obo/"
ANIMAL_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/Animal")
ANIMAL_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Animal")

def load_animal_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = ANIMAL_BASE + term_id
            rows.append((label, uri))
    return rows

ANIMAL_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncbitaxon_full_taxonomy.txt"
NCIT_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
animal_rows = load_animal_labels(ANIMAL_LABELS_FILE)
ncit_rows = load_animal_labels(NCIT_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in animal_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in ncit_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in animal_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in ncit_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

def top_cosine(term, k=5, thr=0.78):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = animal_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_ncit(term, k=5, thr=0.80):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ncit_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

manual_created = {
                "animal" : URIRef("http://purl.obolibrary.org/obo/NCIT_C14182"),
                "mouse_intestinal_tract" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/animal/MouseIntestinalTract"),
                "offspring" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0680063"),

}
created = dict(manual_created)

regex_map = [
                (r"mouse",           "mus_musculus"),
                (r"skin_lipid",     "skin"),
                (r"mice",            "mus_musculus"),
                (r"rat"     ,        "mus_musculus"),
                (r"dogs"   ,         "Canis lupus familiaris"),
                (r"children"  ,      "offspring"),
                (r"child",           "offspring"),
                (r"pig"     ,        "sus"),
                (r"bird"   ,        "aves"),
                (r"rodent"  ,       "rodentia"),
                (r"6-ohda"  ,       "mus_musculus"),
                (r"sps-susceptible_male", "mus_musculus"),
                (r"sps-susceptible_female", "mus_musculus"),
                (r"SPS-resilient_females" ,"mus_musculus"),
                (r"SPS-resilient_males", "mus_musculus"),
                (r"sps resilient_females" ,"mus_musculus"),
                (r"sps-s_males", "mus_musculus"),
                (r"female", "animal"),
                (r"gastrointestinal__gi__tract", "mouse_intestinal_tract"),
                (r"male", "animal"),
                (r"females", "animal"),
                (r"males", "animal"),
]
CREATOR = "Samuel Piron"

UMLS_BASES = "https://uts.nlm.nih.gov/uts/umls/concept/"
#MESH_BASE = "https://meshb.nlm.nih.gov/record/ui?ui="

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        # only a NCBITaxon‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
        
for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "animal":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, ANIMAL_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, ANIMAL_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            cos1 = top_cosine_ncit(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, ANIMAL_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                    g.add((entity_uri, RDFS.comment, Literal("NCBITaxon Match", datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, ANIMAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                    g.add((name_uri, RDFS.comment, Literal("NCBITaxon Match", datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, ANIMAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                    g.add((name_uri, RDFS.comment, Literal("NCIT Match", datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip()
        else:
            continue

        if label != "animal":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

aves
Query: aves
lookup term: aves
  • Aves                                     URI=http://purl.obolibrary.org/obo/NCBITaxon_8782
aves
Query: aves
lookup term: aves
  • Aves                                     URI=http://purl.obolibrary.org/obo/NCBITaxon_8782
Query: animal
lookup term: animal
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C14182

mus_musculus
Query: mus musculus
lookup term: mus_musculus
  • Mus musculus                             URI=http://purl.obolibrary.org/obo/NCBITaxon_10090
mus_musculus
Query: mus musculus
lookup term: mus_musculus
  • Mus musculus                             URI=http://purl.obolibrary.org/obo/NCBITaxon_10090
mus_musculus
Query: mus musculus
lookup term: mus_musculus
  • Mus musculus                             URI=http://purl.obolibrary.org/obo/NCBITaxon_10090
mus_musculus
Query: mus musculus
lookup term: mus_musculus
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCBITaxon_10090


mus_musculus
Query: mus musculus
look

<h1>INGEST DIETARY SUPPLEMENT</h1>

In [19]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from nltk.stem import WordNetLemmatizer
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import RDFS
from rdflib.namespace import DCTERMS
from groqutils import get_llm_definition
from funcutils import get_ncit_description, get_chebi_description, get_omit_description, NCBI_BASE, HEREDITARY_BASE, UMLS_BASES, foodon_file, ncit_file, omit_file, chebi_file, hash_term_sha256, get_foodon_description

DIETARYSUPPLEMENT_BASE = "http://purl.obolibrary.org/obo/"
DIETARYSUPPLEMENT_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/DietarySupplement")
DIETARYSUPPLEMENT_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DietarySupplement")

lemmatizer = WordNetLemmatizer()

#created = {}

# 1.1) keep only your hand-picked seeds here
manual_created = {
    "dietary_supplementation": URIRef("https://www.ncbi.nlm.nih.gov/mesh/68019587"),
    "egcg":                     URIRef("http://purl.obolibrary.org/obo/XCO_0001093"),
    "hippophae_rhamnoide":      URIRef("http://purl.obolibrary.org/obo/NCBITaxon_193516"),
    "prebiotic":                URIRef("http://purl.obolibrary.org/obo/OMIT_0026689"),
    "acacetin":                 URIRef("http://purl.obolibrary.org/obo/CHEBI_15335"),
    "gluten":                   URIRef("http://purl.obolibrary.org/obo/FOODON_03420177"),
    "lactobacillus":            URIRef("https://www.ncbi.nlm.nih.gov/mesh/D052200"),
    "triphala":                 URIRef("https://www.ncbi.nlm.nih.gov/mesh/67520904"),
    "lacticaseibacillus_rhamnosus": URIRef("http://purl.obolibrary.org/obo/NCBITaxon_47715"),
    "metabolite" :URIRef("http://purl.obolibrary.org/obo/CHEBI_25212"),
    "nvp-1704" : URIRef("http://purl.obolibrary.org/obo/CHEBI_230487"),
    "micronutrient" : URIRef("http://purl.obolibrary.org/obo/CHEBI_27027"),
 
}

created = dict(manual_created)

regex_map = [
    (r"\bprobiot(?:ic|ics|ic_supplementation)\b", "probiotic"),
    (r"feed_additive",       "feed"),
    (r"next-generation_feed_additive", "feed"),
    (r"\bmicronutrient_supplementation\b", "micronutrient"),
    (r"\bmicronutrient_treatment\b", "micronutrient"),
    (r"seabuckthorn","hippophae_rhamnoide"),
    (r"vsl__3","prebiotic"),
    (r"vsl_3","prebiotic"),
    (r"sbf","hippophae_rhamnoide"),
    (r"\bwheat_germ\w*\b","fermented_wheat_germ"),
    (r"\bfwg\w*\b","fermented_wheat_germ"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))probiotic\w*\b", "probiotic"),
    (r"postbiotic","metabolite"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))scfa\w*\b","short-chain_fatty_acid"),
    (r"\btriphala_polyphenol\w*\b","polyphenol"),
    (r"\bpolyphenol\w*\b", "polyphenol"),
    (r"prebiotic_supplementation","synbiotic_supplement"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))starch\w*\b","Starch"),
    (r"nvp-1704_treatment", "probiotic"),
    (r"danggui_buxue_decoction","medication"),
    (r"\bdbd\w*\b","medication"),
    (r"f4_consumption","lactobacillus"),
    (r"f4_supplementation","lactobacillus"),
    (r"medicinal_herb", "medication"),
    (r"\bprebiotic\w*\b","prebiotic"),
    (r"b__licheniformi","bacillus"),
    (r"plant_polysaccharide","plant"),
    (r"zhe_busong_decoction","triphala"),
    (r"\blacticaseibacillus_rhamnosus\w*\b", "lacticaseibacillus_rhamnosus"),
    (r"high-cholesterol_diet", "diet"),
    (r"\bbacillus_licheniformi\w*\b", "bacillus"),
    (r"\bb__licheniformi\w*\b", "bacillus"),
]

for term_raw, uri in manual_created.items():
    uri_str = str(uri)

    # NCIT definitions
    if uri_str.startswith(NCBI_BASE) and "NCIT_" in uri_str:
        ncit_id = uri_str.rsplit("_", 1)[-1]
        desc = get_ncit_description(ncit_id, ncit_file)
        m = re.search(r'—\s*(.*?)\s*—', desc)
        if m:
            desc = m.group(1).strip()
            comment = f"{desc} [Definition Source: NCIT]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # UMLS definitions
    elif uri_str.startswith(UMLS_BASES):
        cui = uri_str.rsplit("/", 1)[-1]
        defn = get_umls_definition(cui)
        if defn:
            comment = f"{defn.strip()} [Definition Source: UMLS]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    # MeSH definitions via your mesh_index
    elif uri_str.startswith(MESH_BASE):
        ui     = uri_str.rsplit("/",1)[-1]
        hits   = [d['name'] for d in mesh_descs if d['ui']==ui]
        if hits:
            comment = f"{hits[0]} [Definition Source: MeSH]"
        elif term_raw == "patients":
            comment = "Patients with various diseases. [Definition Source: GUTBRAIN]"
        else:
            llm_def = get_llm_definition(term_raw)
            comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    elif uri_str.startswith(HEREDITARY_BASE):
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"
    else:
        llm_def = get_llm_definition(term_raw)
        comment = f"{llm_def} [Definition Source: llama3-8b-8192]"

    g.add((uri,
           RDFS.comment,
           Literal(comment, datatype=XSD.string)))

def load_dietary_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = DIETARYSUPPLEMENT_BASE + term_id
            rows.append((label, uri))
    return rows

def load_chebi_labels(path):
    rows = []
    with open(path, encoding="utf-8") as fh:
        next(fh) 
        for ln in fh:
            uri, label = ln.rstrip("\n").split("\t", 1)
            rows.append((label, uri))
    return rows

CHEBI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\chebi_labels.txt"
PARSED_NCIT_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\ncit_full_taxonomy.txt"
DIETARYSUPPLEMENT_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
dietary_rows = load_dietary_labels(DIETARYSUPPLEMENT_LABELS_FILE)
chebi_rows = load_chebi_labels(CHEBI_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in dietary_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in chebi_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in dietary_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in chebi_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

def top_cosine(term, k=5, thr=0.80):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = dietary_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_chebi(term, k=5, thr=0.80):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = chebi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

CREATOR = "Samuel Piron"

UMLS_BASES = "https://uts.nlm.nih.gov/uts/umls/concept/"
#MESH_BASE = "https://meshb.nlm.nih.gov/record/ui?ui="

for uri in manual_created.values():
    uri_str = str(uri)

    if uri_str.startswith(UMLS_BASES):
        # only a UMLS‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("UMLS Match", datatype=XSD.string)))
    elif "NCIT" in uri_str:
        # only a NCIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCIT Match", datatype=XSD.string)))
    elif "OMIT" in uri_str:
        # only a OMIT‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("OMIT Match", datatype=XSD.string)))
    elif "NCBITaxon" in uri_str:
        # only a NCBITaxon‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("NCBITaxon Match", datatype=XSD.string)))
    elif "CHEBI" in uri_str:
        # only a CHEBI‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("CHEBI Match", datatype=XSD.string)))
    elif "FOODON" in uri_str:
        # only a FOODON‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("FOODON Match", datatype=XSD.string)))
    elif "mesh" in uri_str:
        # only a MESH‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("MESH Match", datatype=XSD.string)))
    elif "XCO" in uri_str:
        # only a XCO‐Match comment
        g.add((uri,
               RDFS.comment,
               Literal("XCO Match", datatype=XSD.string)))
    else:
        # everything else still gets a creator
        g.add((uri,
               DCTERMS.creator,
               Literal(CREATOR, datatype=XSD.string)))
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        if raw_label == "dietary supplement":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    print(lookup_key)
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            print(f"lookup term: {lookup_key}")
            
            
            if term_raw in created:

                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                g.add((entity_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            if lookup_key in created:
                print(f"  → Reusing existing URI: {created[lookup_key]}")
                entity_uri = created[lookup_key]
                g.add((entity_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                if (entity_uri, RDFS.label, None) not in g:
                    g.add((entity_uri,
                            RDFS.label,
                            Literal(term.title(), datatype=XSD.string)))
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                print()
                continue

            ex = exact_ix.get(term, [])
            cos = top_cosine(term)
            cos1 = top_cosine_chebi(term)
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    if (entity_uri, RDFS.label, None) not in g:
                        g.add((entity_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                    uri_str = str(entity_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    elif "chebi" in uri_str:
                        comment = "CHEBI Match"
                    else:
                        comment = CREATOR 
                    g.add((entity_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(entity_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(entity_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((entity_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            elif cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    elif "chebi" in uri_str:
                        comment = "CHEBI Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    if (name_uri, RDFS.label, None) not in g:
                        g.add((name_uri,
                            RDFS.label,
                            Literal(lbl.title(), datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                    uri_str = str(name_uri).lower()

                    if "stato_" in uri_str:
                        comment = "STATO Match"
                    elif "ncbitaxon_" in uri_str:
                        comment = "NCBITaxon Match"
                    elif "ncit_" in uri_str:
                        comment = "NCIT Match"
                    elif "obi_" in uri_str:
                        comment = "OBI Match"
                    elif "umls" in uri_str:
                        comment = "UMLS Match"
                    elif "omit" in uri_str:
                        comment = "OMIT Match"
                    elif "chebi" in uri_str:
                        comment = "CHEBI Match"
                    else:
                        comment = CREATOR 
                    g.add((name_uri, RDFS.comment, Literal(comment, datatype=XSD.string)))
                    uri_str = str(name_uri)
                    definition = choose_definition(uri_str, term_raw)
                    existing_defs = [
                        c for c in g.objects(name_uri, RDFS.comment)
                        if "[Definition Source:" in str(c)
                    ]
                    if not existing_defs:
                        g.add((name_uri,
                            RDFS.comment,
                            Literal(definition, datatype=XSD.string)))
                    mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(term_raw, max_length=16)])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])

    sent_uri = URIRef(GUTBRAINSENTENCE[f"{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip()
        else:
            continue

        if label != "dietary supplement":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAINMENTION[hash_term_sha256(cleaned_text_span, max_length=16)])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

probiotic
Query: probiotic
lookup term: probiotic
  • Probiotic                                URI=http://purl.obolibrary.org/obo/NCIT_C93144
probiotic
Query: probiotic
lookup term: probiotic
  • Probiotic                                URI=http://purl.obolibrary.org/obo/NCIT_C93144
Query: dietary supplementation
lookup term: dietary_supplementation
  → Reusing existing URI: https://www.ncbi.nlm.nih.gov/mesh/68019587


feed
Query: feed
lookup term: feed
  • Feed                                     URI=http://purl.obolibrary.org/obo/NCIT_C69427
feed
Query: feed
lookup term: feed
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C69427


feed
Query: feed
lookup term: feed
  • Feed                                     URI=http://purl.obolibrary.org/obo/NCIT_C69427
probiotic
Query: probiotic
lookup term: probiotic
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C93144


probiotic
Query: probiotic
lookup term: probiotic
  → Reusing existing URI: http://purl.obolibra

In [None]:
for paper_id, paper_data in data.items():
    relations = paper_data.get("relations", [])
    for rel in relations:
        # a) find the mention URI for subject
        subj_key     = normalize_to_ascii(create_uri_fragment(rel["subject_text_span"])).lower()
        mention_subj = tokenized_mentions.get(subj_key)
        if not mention_subj:
            print(f"⚠️ No mention for subject “{rel['subject_text_span']}”")
            continue

        # b) follow mention → concept
        subj_concepts = list(g.subjects(GUTPROP.containedIn, mention_subj))
        if not subj_concepts:
            print(f"⚠️ No concept contains {mention_subj}")
            continue
        subj_uri = subj_concepts[0]

        # c) same for object
        obj_key     = normalize_to_ascii(create_uri_fragment(rel["object_text_span"])).lower()
        mention_obj = tokenized_mentions.get(obj_key)
        if not mention_obj:
            print(f"⚠️ No mention for object “{rel['object_text_span']}”")
            continue

        obj_concepts = list(g.subjects(GUTPROP.containedIn, mention_obj))
        if not obj_concepts:
            print(f"⚠️ No concept contains {mention_obj}")
            continue
        obj_uri = obj_concepts[0]

        # d) build or lookup the predicate
        prop_local = to_camel_case(rel["predicate"])
        prop_uri   = URIRef(f"{GUTPROP}{prop_local}")
        if (prop_uri, RDF.type, OWL.ObjectProperty) not in g:
            g.add((prop_uri, RDF.type,    OWL.ObjectProperty))
            g.add((prop_uri, RDFS.label, Literal(rel["predicate"], datatype=XSD.string)))

        # e) finally link the two *concept* URIs
        g.add((subj_uri, prop_uri, obj_uri))

⚠️ No concept contains https://hereditary.dei.unipd.it/ontology/gutbrain/resource/mention/intestinal_microbiota_alterations
⚠️ No concept contains https://hereditary.dei.unipd.it/ontology/gutbrain/resource/mention/gut_diseases


In [None]:
output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

The RDF graph has been saved in c:\Users\samue\OneDrive\Desktop\ThesisPiron\rdf\gutbrain_entities.ttl
