In [1]:
!pip install nltk




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher, get_close_matches

def parse_mesh_descriptors(xml_path):
    """Parse the MeSH XML and return a list of descriptors."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el   = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui   = ui_el.text
        name = name_el.text
        tree_nums = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

MESH_XML = 'desc2025.xml'
descriptors = parse_mesh_descriptors(MESH_XML)
print(f"Parsed {len(descriptors)} descriptors")

Parsed 30956 descriptors


In [3]:
def get_bacteria_taxonomy(descriptors):
    """
    Find the descriptor with name 'Bacteria', 
    then collect every descriptor under that tree-number prefix.
    """
    prefix = None #bacteria is B03
    for d in descriptors:
        if d['name'].lower() == 'bacteria' and d['tree_numbers']:
            prefix = d['tree_numbers'][0]
            break
    if not prefix:
        raise RuntimeError("Could not find 'Bacteria' in descriptors")
    tax = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == prefix or tn.startswith(prefix + '.'):
                tax[tn] = {'ui':d['ui'], 'name':d['name']}
                break
    return tax

bacteria_tax = get_bacteria_taxonomy(descriptors)
#print(f"{len(bacteria_tax)} bacterial MeSH nodes")

In [4]:
def build_name_index(taxonomy):
    """
    Build a dict: lower-case name -> list of (tree#, ui, canonical name)
    """
    idx = {}
    for tree_num, info in taxonomy.items():
        key = info['name'].lower()
        idx.setdefault(key, []).append((tree_num, info['ui'], info['name']))
    return idx

name_index = build_name_index(bacteria_tax)
print(f"Indexed {len(name_index)} bacterial names")

Indexed 859 bacterial names


In [5]:
import xml.etree.ElementTree as ET

def parse_mesh_descriptors(xml_path):
    """
    Parse the MeSH XML and return a list of dicts:
      { 'ui': DescriptorUI,
        'name': DescriptorName,
        'tree_numbers': [treeNum1, treeNum2, ...]
      }
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui = ui_el.text
        name = name_el.text
        tree_numbers = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({
            'ui': ui,
            'name': name,
            'tree_numbers': tree_numbers
        })
    return descriptors

def get_bacteria_taxonomy(xml_path):
    """
    From the full MeSH descriptors file, extract all descriptors
    under the 'Bacteria' branch (tree number B03).
    Returns a dict: { tree_number: { 'ui': ..., 'name': ... }, ... }
    """
    descriptors = parse_mesh_descriptors(xml_path)

    bacteria_prefix = None
    for d in descriptors:
        if d['name'] == 'Bacteria':
            bacteria_prefix = d['tree_numbers'][0]
            break

    if not bacteria_prefix:
        raise RuntimeError("Couldn't find a descriptor named 'Bacteria' in the file.")

    taxonomy = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == bacteria_prefix or tn.startswith(bacteria_prefix + '.'):
                taxonomy[tn] = {
                    'ui': d['ui'],
                    'name': d['name']
                }
                break

    return taxonomy

if __name__ == '__main__':
    xml_file = 'desc2025.xml'
    bacteria_tax = get_bacteria_taxonomy(xml_file)

    for tree_num in sorted(bacteria_tax):
        info = bacteria_tax[tree_num]
        print(f"{tree_num:10s}  {info['name']:30s}  ({info['ui']})")

B03         Bacteria                        (D001419)
B03.026     Acidobacteria                   (D061271)
B03.054     Agricultural Inoculants         (D059827)
B03.110     Atypical Bacterial Forms        (D001295)
B03.110.422  L Forms                         (D007740)
B03.110.761  Spheroplasts                    (D013104)
B03.120     Bacteria, Aerobic               (D001420)
B03.130     Bacteria, Anaerobic             (D001421)
B03.135     Bacteria, Thermoduric           (D000072280)
B03.250     Chlorobi                        (D019414)
B03.250.140  Chlorobium                      (D041883)
B03.275     Chloroflexi                     (D041862)
B03.275.150  Chloroflexus                    (D041861)
B03.275.575  Dehalococcoides                 (D000082942)
B03.280     Cyanobacteria                   (D000458)
B03.280.100  Anabaena                        (D017033)
B03.280.100.150  Anabaena cylindrica             (D046868)
B03.280.100.900  Anabaena variabilis             (D046870)
B03.28

In [6]:
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher, get_close_matches

def parse_mesh_descriptors(xml_path):
    """Parse the MeSH XML and return a list of descriptors."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        ui_el   = dr.find('DescriptorUI')
        name_el = dr.find('DescriptorName/String')
        if ui_el is None or name_el is None:
            continue
        ui   = ui_el.text
        name = name_el.text
        tree_nums = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

MESH_XML = 'desc2025.xml' #from the folder
descriptors = parse_mesh_descriptors(MESH_XML)
print(f"Parsed {len(descriptors)} descriptors")
#print(descriptors)

Parsed 30956 descriptors


In [7]:
from difflib import SequenceMatcher

def find_mesh_match(input_name, name_index, n=5, cutoff=0.6):
    """
    Return matches for input_name among the MeSH bacterial names.
    Exact matches come first with score=1.0.
    Then fuzzy matches (score computed via SequenceMatcher.ratio).
    """
    key = input_name.lower()
    results = []
    if key in name_index:
        for tree_num, ui, name in name_index[key]:
            results.append((name, ui, tree_num, 1.0))
        return results

    all_names = list(name_index.keys())
    #print(all_names)
    
    close = get_close_matches(key, all_names, n=n, cutoff=cutoff)
    for cname in close:
        score = SequenceMatcher(None, key, cname).ratio()
        for tree_num, ui, name in name_index[cname]:
            results.append((name, ui, tree_num, score))
    results.sort(key=lambda x: x[3], reverse=True)
    return results

In [8]:
def get_bacteria_taxonomy(descriptors):
    """
    Find the descriptor with name 'Bacteria', 
    then collect every descriptor under that tree-number prefix.
    """
    prefix = None #bacteria is B03
    for d in descriptors:
        if d['name'].lower() == 'bacteria' and d['tree_numbers']:
            prefix = d['tree_numbers'][0]
            break
    if not prefix:
        raise RuntimeError("Could not find 'Bacteria' in descriptors")
    tax = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == prefix or tn.startswith(prefix + '.'):
                tax[tn] = {'ui':d['ui'], 'name':d['name']}
                break
    return tax

bacteria_tax = get_bacteria_taxonomy(descriptors)
#print(f"{len(bacteria_tax)} bacterial MeSH nodes")

In [9]:
def build_name_index(taxonomy):
    """
    Build a dict: lower-case name -> list of (tree#, ui, canonical name)
    """
    idx = {}
    for tree_num, info in taxonomy.items():
        key = info['name'].lower()
        idx.setdefault(key, []).append((tree_num, info['ui'], info['name']))
    return idx

name_index = build_name_index(bacteria_tax)
print(f"Indexed {len(name_index)} bacterial names")

Indexed 859 bacterial names


In [10]:
import re, json, numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def load_taxonomy_tree(path):
    row_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = row_re.match(ln)
            if m:
                label, tid = m.groups()
                depth      = len(ln) - len(ln.lstrip())
                rows.append((label, tid, depth))
    return rows

TAX_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\bacteria_tree1.txt"
rows     = load_taxonomy_tree(TAX_FILE)

exact_ix  = defaultdict(list)
for lbl, tid, depth in rows:
    exact_ix[lbl.lower()].append((lbl, tid, depth))

labels_only  = [r[0] for r in rows]
vec          = TfidfVectorizer(stop_words="english")
mat          = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v   = vec.transform([term])
    sc  = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr: break
        lbl, tid, d = rows[i]
        out.append((lbl, tid, d, sc[i]))
    return out

abbr_re = re.compile(r"^([A-Z])\.\s+([A-Za-z_-]+)$")

def preprocess(term):
    term = term.replace('_', ' ')
    term = term.strip()
    return " ".join(lemmatizer.lemmatize(w) for w in term.split())

def genus_abbrev_lookup(term):
    m = abbr_re.match(term)
    if not m:
        return []
    initial, species = m.groups()
    species = species.lower()
    hits = []
    for lbl, tid, d in rows:
        if lbl.lower().endswith(' ' + species) and lbl[0].upper() == initial:
            hits.append((lbl, tid, d, 1.00))
    return hits

<h1>INGEST bacteria</h1>

In [26]:
import os
import re
import unicodedata
import json
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

# -----------------------------------------------------------------------------
# 1. Setup paths and namespaces
# -----------------------------------------------------------------------------
path = str(Path(os.path.abspath(os.getcwd())).absolute())
json_file = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\data\train_platinum\train_platinum_utf8.json"
#json_file = os.path.join(path, "train_gold.json")

tokenized_file = os.path.join(path, "tokenized_sentences_with_entitiesv2.json")
save_path = os.path.join(path, "rdf")
os.makedirs(save_path, exist_ok=True)

GUTBRAIN = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/")
GUTPROP = Namespace("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/")

PAPER_CLASS       = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Paper")
MENTION_CLASS     = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Mention")
PAPER_ABSTRACT    = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperAbstract")
PAPER_TITLE       = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperTitle")
PAPER_COLLECTION  = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/PaperCollection")
PROJECT           = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Project")
SAMPLE            = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Sample")
SENTENCE          = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Sentence")

# -----------------------------------------------------------------------------
# 2. Load the JSON paper data
# -----------------------------------------------------------------------------
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)
# -----------------------------------------------------------------------------
# 3. Mapping dictionaries (keys must be in Title case)
# -----------------------------------------------------------------------------
label_mapping = {
    "Anatomical Location":   URIRef("https://w3id.org/brainteaser/ontology/schema/AnatomicalSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Animal"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/BiomedicalTechnique"),
    "Bacteria":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Chemical"),
    "Dietary Supplement":    URIRef("https://w3id.org/brainteaser/ontology/schema/DietarySupplement"),
    "DDF":                   URIRef("https://w3id.org/brainteaser/ontology/schema/DiseaseDisorderOrFinding"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Drug"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Food"),
    "Gene":                  URIRef("https://w3id.org/brainteaser/ontology/schema/Gene"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Human"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/StatisticalTechnique")
}
concept_scheme_mapping = {
    "Anatomical Location":   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/AnatomicSite"),
    "Animal":                URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Animal"),
    "Human":                 URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Human"),
    "Drug":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Drug"),
    "Gene":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Gene"),
    "Dietary Supplement":    URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DietarySupplement"),
    "DDF":                   URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DiseaseDisorderOrFinding"),
    "Metabolite":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Metabolite"),
    "Bacteria":               URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Bacteria"),
    "Food":                  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Food"),
    "Chemical":              URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Chemical"),
    "Biomedical Technique":  URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/BiomedicalTechnique"),
    "Microbiome":            URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Microbiome"),
    "Statistical Technique": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/StatisticalTechnique")
}

# -----------------------------------------------------------------------------
# 4. Initialize the RDF graph and bind namespaces
# -----------------------------------------------------------------------------
BACTERIA_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Species")
FAMILY_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Family")
OBO_BASE = "http://purl.obolibrary.org/obo/"
MESH_BASE = "https://www.ncbi.nlm.nih.gov/mesh/"
BACTERIA_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Bacteria")

g = Graph()
g.bind("gutbrain", GUTBRAIN)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("owl", OWL)
g.bind("gutprop", GUTPROP)

g.add((SKOS.inScheme, RDF.type, OWL.ObjectProperty))
g.add((SKOS.broaderTransitive, RDF.type, OWL.ObjectProperty))

g.add((GUTPROP.partOf, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.partOf, RDFS.label, Literal("partOf", datatype=XSD.string)))
g.add((GUTPROP.hasTitle, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasTitle, RDFS.label, Literal("hasTitle", datatype=XSD.string)))
g.add((GUTPROP.hasAbstract, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.hasAbstract, RDFS.label, Literal("hasAbstract", datatype=XSD.string)))

g.add((GUTPROP.containedIn, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.containedIn, RDFS.label, Literal("containedIn", datatype=XSD.string)))
g.add((GUTBRAIN.contains, RDF.type, OWL.ObjectProperty))
g.add((GUTBRAIN.contains, RDFS.label, Literal("contains", datatype=XSD.string)))
g.add((GUTPROP.composedOf, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.composedOf, RDFS.label, Literal("composedOf", datatype=XSD.string)))

g.add((GUTPROP.locatedIn, RDF.type, OWL.ObjectProperty))
g.add((GUTPROP.locatedIn, RDFS.label, Literal("locatedIn", datatype=XSD.string)))

g.add((GUTPROP.paperId, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperId, RDFS.label, Literal("paperId", datatype=XSD.string)))
g.add((GUTPROP.paperAnnotator, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperAnnotator, RDFS.label, Literal("paperAnnotator", datatype=XSD.string)))
g.add((GUTPROP.paperYear, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperYear, RDFS.label, Literal("paperYear", datatype=XSD.string)))
g.add((GUTPROP.paperJournal, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperJournal, RDFS.label, Literal("paperJournal", datatype=XSD.string)))
g.add((GUTPROP.paperAuthor, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.paperAuthor, RDFS.label, Literal("paperAuthor", datatype=XSD.string)))
g.add((GUTPROP.numberOfRunsFound, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.numberOfRunsFound, RDFS.label, Literal("numberOfRunsFound", datatype=XSD.string)))
g.add((GUTPROP.NCBITaxonID, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.NCBITaxonID, RDFS.label, Literal("NCBITaxonID", datatype=XSD.string)))
g.add((GUTPROP.sdRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.sdRelativeAbundance, RDFS.label, Literal("sdRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.medianRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.medianRelativeAbundance, RDFS.label, Literal("medianRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.meanRelativeAbundance, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.meanRelativeAbundance, RDFS.label, Literal("meanRelativeAbundance", datatype=XSD.string)))
g.add((GUTPROP.scientificName, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.scientificName, RDFS.label, Literal("scientificName", datatype=XSD.string)))
g.add((GUTPROP.hasMentionText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasMentionText, RDFS.label, Literal("hasMentionText", datatype=XSD.string)))
g.add((GUTPROP.hasSentenceText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasSentenceText, RDFS.label, Literal("hasSentenceText", datatype=XSD.string)))
g.add((GUTPROP.hasTitleText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasTitleText, RDFS.label, Literal("hasTitleText", datatype=XSD.string)))
g.add((GUTPROP.hasAbstractText, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.hasAbstractText, RDFS.label, Literal("taggedAs", datatype=XSD.string)))
g.add((GUTPROP.taggedAs, RDF.type, OWL.DatatypeProperty))
g.add((GUTPROP.taggedAs, RDFS.label, Literal("taggedAs", datatype=XSD.string)))

for scheme_uri in set(concept_scheme_mapping.values()):
    keys = [k for k, v in concept_scheme_mapping.items() if v == scheme_uri]
    label_text = ", ".join(k.title() for k in keys) + " Concept Scheme"
    g.add((scheme_uri, RDF.type, SKOS.ConceptScheme))
    g.add((scheme_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

is_train_platinum = "train_platinum" in os.path.basename(json_file)
#is_train_gold = "train_gold" in os.path.basename(json_file)

#if is_train_gold:
 #   gold_collection_uri = URIRef(GUTBRAIN["goldCollection"])
 #   label_text = "goldCollection"
  #  g.add((gold_collection_uri, RDF.type, PAPER_COLLECTION))
   # g.add((gold_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))
    
if is_train_platinum:
    platinum_collection_uri = URIRef(GUTBRAIN["platinumCollection"])
    label_text = "platinumCollection"
    g.add((platinum_collection_uri, RDF.type, PAPER_COLLECTION))
    g.add((platinum_collection_uri, RDFS.label, Literal(label_text, datatype=XSD.string)))

def create_uri_fragment(text):
    cleaned = re.sub(r'<[^>]*>', '', text)
    cleaned = normalize_text(cleaned)
    cleaned = re.sub(r'[^\w\u0370-\u03FF-]', '_', cleaned)
    return cleaned

def to_camel_case(s):
    s = re.sub(r'[^\w\s]', '', s)
    parts = re.split(r'\s+', s.strip())
    if not parts:
        return ""
    return parts[0].lower() + ''.join(word.title() for word in parts[1:])

def normalize_text(text):
    return unicodedata.normalize('NFC', text)

def normalize_to_ascii(s: str) -> str:
    nfkd = unicodedata.normalize('NFKD', s)
    ascii_bytes = nfkd.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def singularize(term):
    if term.endswith("ies"):
        return term[:-3] + "y"
    elif term.endswith("s") and not term.endswith("ss"):
        return term[:-1]
    return term

tokenized_mentions = {}
created = {}
label2uri = {}

created["Bacteria"] = URIRef(f"{OBO_BASE}NCBITaxon_2")
g.add((created["Bacteria"], RDF.type, FAMILY_CLASS))
g.add((created["Bacteria"], RDF.type, SKOS.Concept))
g.add((created["Bacteria"], RDFS.label, Literal("Bacteria", datatype=XSD.string)))
g.add((created["Bacteria"], SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))

# -----------------------------------------------------------------------------
# 5. Process each paper (each key in the JSON represents a paper)
# -----------------------------------------------------------------------------
for paper_id, paper_data in data.items():
    paper_uri = URIRef(GUTBRAIN[f"paper_{paper_id}"])
    g.add((paper_uri, RDF.type, PAPER_CLASS))
    
    if is_train_platinum:
        g.add((paper_uri, GUTPROP.partOf, platinum_collection_uri))
        g.add((platinum_collection_uri, GUTBRAIN.contains, paper_uri))

    #if is_train_gold:
    #    g.add((paper_uri, GUTPROP.partOf, gold_collection_uri))
    #    g.add((gold_collection_uri, GUTBRAIN.contains, paper_uri))
    
    # Each paper gets its own mention node
    #paper_mention = URIRef(GUTBRAIN[f"mention_{paper_id}"])
    #g.add((paper_mention, RDF.type, MENTION_CLASS))
    #g.add((paper_uri, GUTPROP.hasMention, paper_mention))
    
    metadata = paper_data.get("metadata", {})
    full_title = metadata.get("title", None)
    full_abstract = metadata.get("abstract", None)
    try:
        paper_id_val = int(paper_id)
    except ValueError:
        paper_id_val = paper_id
    paper_annotator = metadata.get("annotator", None)
    paper_year = metadata.get("year", None)
    paper_journal = metadata.get("journal", None)
    paper_author = metadata.get("author", None)
    
    g.add((paper_uri, GUTPROP.paperId, Literal(paper_id_val, datatype=XSD.integer)))
    if paper_annotator is not None:
        g.add((paper_uri, GUTPROP.paperAnnotator, Literal(paper_annotator, datatype=XSD.string)))
    if paper_year is not None:
        g.add((paper_uri, GUTPROP.paperYear, Literal(paper_year, datatype=XSD.gYear)))
    if paper_journal is not None:
        g.add((paper_uri, GUTPROP.paperJournal, Literal(paper_journal, datatype=XSD.string)))
    if paper_author is not None:
        g.add((paper_uri, GUTPROP.paperAuthor, Literal(paper_author, datatype=XSD.string)))
    
    title_texts = []
    abstract_texts = []
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        label_title = raw_label.title() if raw_label.lower() != "ddf" else "DDF"
        
        text_span = entity.get("text_span", "").strip()
        cleaned_text = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(cleaned_text)
        
        if label_title == "Bacteria":
            if cleaned_text_span.lower().endswith(" bacteria"):
                term_raw = "Bacteria"
            elif cleaned_text_span.lower().endswith(" Bifidobacterium"):
                term_raw = "Bifidobacterium"
            else:
                term_raw = cleaned_text_span
                
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue
                
            hits = genus_abbrev_lookup(term)
            if hits:
                for l,t,d,s in hits: 
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (abbr)")
                    entity_uri = URIRef(f"{OBO_BASE}{t}")
                    created[term_raw] = entity_uri
                    label2uri[label_name.lower()] = entity_uri
                    g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(label_name, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["Bacteria"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title.lower(), datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    continue

            ex = exact_ix.get(term.lower(), [])
            if ex:
                for label_name, taxon_id, depth in ex:
                    print(f"  • {label_name:40s} ID={taxon_id:15s} depth={depth:<2d} (exact)")
                    entity_uri = URIRef(f"{OBO_BASE}{taxon_id}")
                    created[term_raw] = entity_uri
                    label2uri[label_name.lower()] = entity_uri
                    g.add((entity_uri, RDF.type, BACTERIA_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(label_name, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["Bacteria"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title.lower(), datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            if not cos:
                matches = find_mesh_match(term, name_index)
                if matches: 
                    for name, ui, tree, score in matches[:1]:
                        name_key = name.lower()
                        reused = False
                        if name_key in label2uri:
                            entity_uri = label2uri[name_key]
                            print(f"  → Reusing existing URI by label: {entity_uri}\n")
                            reused = True
                            continue
                        if reused:
                            break
                            
                        print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  scoreMESH={score:.2f}")
                        name_uri = URIRef(f"{MESH_BASE}{ui}")
                        created[term_raw] = name_uri
                        label2uri[name_key] = name_uri
                        g.add((name_uri, RDF.type, BACTERIA_CLASS))
                        g.add((name_uri, RDF.type, SKOS.Concept))
                        g.add((name_uri, RDFS.label, Literal(name, datatype=XSD.string)))
                        g.add((name_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                        g.add((name_uri, SKOS.broaderTransitive, created["Bacteria"]))
                        mention_uri = URIRef(GUTBRAIN[term_raw])
                        g.add((mention_uri, RDF.type, MENTION_CLASS))
                        g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                        g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                        g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title.lower(), datatype=XSD.string)))
                        g.add((name_uri, GUTPROP.containedIn, mention_uri))
                        tokenized_mentions[term_raw] = mention_uri
                        continue
                else:
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title.lower(), datatype=XSD.string)))
                    g.add((created["Bacteria"], GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    
            else:
                for l,t,d,s in cos[:1]:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f}")
                    name_uri = URIRef(f"{OBO_BASE}{t}")
                    created[term_raw] = name_uri
                    label2uri[label_name.lower()] = name_uri
                    g.add((name_uri, RDF.type, BACTERIA_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(l, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, BACTERIA_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["Bacteria"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(label_title.lower(), datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                print()
        
        location_lower = entity.get("location", "").strip().lower()
        if location_lower == "title":
            title_texts.append(cleaned_text_span)
        elif location_lower == "abstract":
            abstract_texts.append(cleaned_text_span)

        else:
            pass
    
    if full_title is None and title_texts:
        full_title = " ".join(title_texts)
    if full_abstract is None and abstract_texts:
        full_abstract = " ".join(abstract_texts)
    
    if full_title:
        title_uri = URIRef(GUTBRAIN[f"title_{paper_id}"])
        g.add((title_uri, RDF.type, PAPER_TITLE))
        g.add((title_uri, GUTPROP.hasTitleText, Literal(full_title, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasTitle, title_uri))
    
    if full_abstract:
        abstract_uri = URIRef(GUTBRAIN[f"abstract_{paper_id}"])
        g.add((abstract_uri, RDF.type, PAPER_ABSTRACT))
        g.add((abstract_uri, GUTPROP.hasAbstractText, Literal(full_abstract, datatype=XSD.string)))
        g.add((paper_uri, GUTPROP.hasAbstract, abstract_uri))
    
    # Process relations
    #relations = paper_data.get("relations", [])
    #for relation in relations:
        #subj_text = relation.get("subject_text_span", "").strip()
        #cleaned_subj_text = re.sub(r'<[^>]*>', '', subj_text).strip()
        #subj_fragment = create_uri_fragment(cleaned_subj_text)
        #subj_uri = URIRef(GUTBRAIN[subj_fragment])
        #if not list(g.triples((subj_uri, None, None))):
           # print(f"Warning: Subject not recognized: {subj_uri}. Info: '{cleaned_subj_text}'")
        
        #obj_text = relation.get("object_text_span", "").strip()
        #cleaned_obj_text = re.sub(r'<[^>]*>', '', obj_text).strip()
        #obj_fragment = create_uri_fragment(cleaned_obj_text)
        #obj_uri = URIRef(GUTBRAIN[obj_fragment])
        #if not list(g.triples((obj_uri, None, None))):
            #print(f"Warning: Object not recognized: {obj_uri}. Info: '{cleaned_obj_text}'")
        
        #pred_text = relation.get("predicate", "").strip()
        #pred_text_clean = to_camel_case(pred_text)
        #pred_uri = URIRef(GUTPROP[pred_text_clean])
        #print(f"Predicate: '{pred_text}' -> '{pred_text_clean}'")
        #g.add((pred_uri, RDF.type, OWL.ObjectProperty))
        #g.add((pred_uri, RDFS.label, Literal(pred_text_clean, datatype=XSD.string)))
       # g.add((subj_uri, pred_uri, obj_uri))

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue 

        if label != "bacteria":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical)
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_bacteria_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: veillonella


ValueError: not enough values to unpack (expected 3, got 2)

<h1>INGEST CHEMICAL</h1>

In [12]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL

CHEBI_BASE = "http://purl.obolibrary.org/obo/"
CHEMICAL_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Chemical")
CHEMICAL_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Chemical")
UBERON_URI = URIRef("http://purl.obolibrary.org/obo/UBERON_0002097")

created["Chemical Entity"] = URIRef(f"{CHEBI_BASE}CHEBI_24431")
g.add((created["Chemical Entity"], RDF.type, CHEMICAL_CLASS))
g.add((created["Chemical Entity"], RDF.type, SKOS.Concept))
g.add((created["Chemical Entity"], RDFS.label, Literal("Chemical Entity", datatype=XSD.string)))
g.add((created["Chemical Entity"], SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))

def load_ncbitaxon_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = CHEBI_BASE + term_id
            rows.append((label, uri))
    return rows
    
def load_chebi_labels(path):
    rows = []
    with open(path, encoding="utf-8") as fh:
        next(fh) 
        for ln in fh:
            uri, label = ln.rstrip("\n").split("\t", 1)
            rows.append((label, uri))
    return rows

CHEBI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\chebi_labels.txt"
NCBITAXON_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
chebi_rows = load_chebi_labels(CHEBI_LABELS_FILE)
ncbi_rows = load_ncbitaxon_labels(NCBITAXON_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in chebi_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in ncbi_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in chebi_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in ncbi_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = chebi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_ncbitaxon(term, k=5, thr=0.75):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ncbi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

greek_map = {
    'α': 'alpha',  'Α': 'alpha',
    'β': 'beta',   'Β': 'beta',
    'γ': 'gamma',  'Γ': 'gamma',
    'δ': 'delta',  'Δ': 'delta',
    'ε': 'epsilon','Ε': 'epsilon',
    'ζ': 'zeta',   'Ζ': 'zeta',
    'η': 'eta',    'Η': 'eta',
    'θ': 'theta',  'Θ': 'theta',
    'ι': 'iota',   'Ι': 'iota',
    'κ': 'kappa',  'Κ': 'kappa',
    'λ': 'lambda', 'Λ': 'lambda',
    'μ': 'mu',     'Μ': 'mu',
    'ν': 'nu',     'Ν': 'nu',
    'ξ': 'xi',     'Ξ': 'xi',
    'ο': 'omicron','Ο': 'omicron',
    'π': 'pi',     'Π': 'pi',
    'ρ': 'rho',    'Ρ': 'rho',
    'σ': 'sigma',  'Σ': 'sigma',
    'τ': 'tau',    'Τ': 'tau',
    'υ': 'upsilon','Υ': 'upsilon',
    'φ': 'phi',    'Φ': 'phi',
    'χ': 'chi',    'Χ': 'chi',
    'ψ': 'psi',    'Ψ': 'psi',
    'ω': 'omega',  'Ω': 'omega',
}

def preprocess(term):
    for greek_char, name in greek_map.items():
        if greek_char in term:
            term = term.replace(greek_char, name)
    term = term.replace('_', ' ')
    term = term.strip()
    return " ".join(lemmatizer.lemmatize(w) for w in term.split())
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "chemical":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, CHEMICAL_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["Chemical Entity"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            cos1 = top_cosine_ncbitaxon(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, CHEMICAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["Chemical Entity"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, CHEMICAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, CHEMICAL_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["Chemical Entity"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((created["Chemical Entity"], GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                continue
        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "chemical":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_chemical_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: simple sugar
Query: sfa
Query: monounsaturated fatty acid
  • monounsaturated fatty acid               URI=http://purl.obolibrary.org/obo/CHEBI_25413
  • monounsaturated fatty acid               URI=http://purl.obolibrary.org/obo/CHEBI_25413 score=1.00

Query: metabolite acetate
  • metabolite                               URI=http://purl.obolibrary.org/obo/CHEBI_25212 score=0.81

Query: fat and sugar content
Query: tnf-a
  • TNF Gene                                 URI=http://purl.obolibrary.org/obo/NCIT_C18368 score=0.92

Query: il-6
  • Will County, IL                          URI=http://purl.obolibrary.org/obo/NCIT_C108381 score=0.83

Query: il-17
Query: serum hormone
Query: insulin
  • insulin                                  URI=http://purl.obolibrary.org/obo/CHEBI_145810
  • insulin                                  URI=http://purl.obolibrary.org/obo/CHEBI_145810 score=1.00

Query: testosterone
  • testosterone                             URI=http://purl.obolibrary.org/obo

<h1>INGEST FOOD</h1>

In [13]:
import os
import re
import unicodedata
import json
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

FOODON_BASE = "http://purl.obolibrary.org/obo/"
NCIT_BASE = "http://purl.obolibrary.org/obo/"
FOOD_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Food")
FOOD_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Food")

TAX1_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\food_tree.txt"
TAX2_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\foodon_terms.txt"

def load_taxonomy_tree(path):
    row_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = row_re.match(ln)
            if m:
                label, tid = m.groups()
                depth = len(ln) - len(ln.lstrip())
                rows.append((label, tid, depth))
    return rows

rows1 = load_taxonomy_tree(TAX1_FILE)
rows2 = load_taxonomy_tree(TAX2_FILE)

exact1 = defaultdict(list)
for lbl, tid, depth in rows1:
    exact1[lbl.lower()].append((lbl, tid, depth))

exact2 = defaultdict(list)
for lbl, tid, depth in rows2:
    exact2[lbl.lower()].append((lbl, tid, depth))

labels1 = [lbl for lbl,_,_ in rows1]
vec1    = TfidfVectorizer(stop_words="english")
mat1    = vec1.fit_transform(labels1)

labels2 = [lbl for lbl,_,_ in rows2]
vec2    = TfidfVectorizer(stop_words="english")
mat2    = vec2.fit_transform(labels2)

def top_cosine(rows, vec, mat, labels, term, k=5, thr=0.75):
    v   = vec.transform([term])
    sc  = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, tid, depth = rows[i]
        out.append((lbl, tid, depth, sc[i]))
    return out

created["Food"] = URIRef(f"{NCIT_BASE}NCIT_C62695")
g.add((created["Food"], RDF.type, FOOD_CLASS))
g.add((created["Food"], RDF.type, SKOS.Concept))
g.add((created["Food"], RDFS.label, Literal("Food", datatype=XSD.string)))
g.add((created["Food"], SKOS.inScheme, FOOD_CONCEPT_SCHEME))

for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "food":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex1 = exact1.get(term.lower(), [])
            if ex1:
                for l,t,d in ex1:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score=1.00 (TAX1 exact)")
                    entity_uri = URIRef(f"{NCIT_BASE}{t}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, FOOD_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(l, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["Food"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos1 = top_cosine(rows1, vec1, mat1, labels1, term, k=5, thr=0.75)
            if cos1:
                for l,t,d,s in cos1[:1]:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (TAX1 cosine)")
                    name_uri = URIRef(f"{NCIT_BASE}{t}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, FOOD_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(l, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["Food"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                continue

            cos2 = top_cosine(rows2, vec2, mat2, labels2, term, k=5, thr=0.75)
            if cos2:
                for l,t,d,s in cos2[:1]:
                    print(f"  • {l:40s} ID={t:15s} depth={d:<2d} score={s:.2f} (TAX2 cosine)")
                    name_uri = URIRef(f"{FOODON_BASE}{t}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, FOOD_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(l, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, FOOD_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["Food"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_food_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((created["Food"], GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "food":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_food_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: vegetable
  • Vegetable                                ID=NCIT_C178192    depth=4  score=1.00 (TAX1 exact)
  • Vegetable                                ID=NCIT_C178192    depth=4  score=1.00 (TAX1 cosine)

Query: whole grain cereal
  • Whole Grain                              ID=NCIT_C178197    depth=2  score=1.00 (TAX1 cosine)

Query: high-fiber diet
  • high fiber food                          ID=FOODON_03510048 depth=0  score=0.76 (TAX2 cosine)
Query: high-fiber diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048

Query: high-fiber diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048

Query: high-fiber diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048

Query: high-fiber diet
  → Reusing existing URI: http://purl.obolibrary.org/obo/FOODON_03510048

Query: wheat germ
  • wheat germ                               ID=FOODON_03301595 depth=0  score=1.00 (TAX2 cosine)
Query: wheat germ
  • wheat germ     

<h1>INGEST HUMAN</h1>

In [14]:
import os
import re
import unicodedata
import json
from pathlib import Path
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

# -----------------------------------------------------------------------------
# Constants / Namespaces
# -----------------------------------------------------------------------------
NCBI_BASE             = "http://purl.obolibrary.org/obo/"
MESH_BASE             = "https://www.ncbi.nlm.nih.gov/mesh/"
HUMAN_CLASS           = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Human")
HUMAN_CONCEPT_SCHEME  = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Human")

# add a “Humans” concept
#created["Humans"] = URIRef(f"{NCBI_BASE}NCBITaxon_9606")
#g.add((created["Humans"], RDF.type, HUMAN_CLASS))
#g.add((created["Humans"], RDF.type, SKOS.Concept))
#g.add((created["Humans"], RDFS.label, Literal("Humans", datatype=XSD.string)))
#g.add((created["Humans"], SKOS.inScheme, HUMAN_CONCEPT_SCHEME))

def parse_mesh_descriptors(xml_path):
    descs = []
    tree  = ET.parse(xml_path)
    root  = tree.getroot()
    for dr in root.findall('DescriptorRecord'):
        ui   = dr.findtext('DescriptorUI')
        name = dr.findtext('DescriptorName/String')
        tns  = [tn.text for tn in dr.findall('TreeNumberList/TreeNumber') if tn.text]
        if ui and name:
            descs.append({'ui':ui,'name':name,'tree_numbers':tns})
    return descs

def build_name_index(descriptors):
    idx = defaultdict(list)
    for d in descriptors:
        if not d['tree_numbers']: continue
        tn = d['tree_numbers'][0]
        idx[d['name'].lower()].append((tn, d['ui'], d['name']))
    return idx

MESH_XML     = 'desc2025.xml'
mesh_descs   = parse_mesh_descriptors(MESH_XML)
mesh_index   = build_name_index(mesh_descs)

mesh_items   = [(tn,ui,name) for vs in mesh_index.values() for tn,ui,name in vs]
mesh_labels  = [name for (_,_,name) in mesh_items]
mesh_uids    = [ui   for (_,ui,_)   in mesh_items]

mesh_vec     = TfidfVectorizer(stop_words="english").fit(mesh_labels)
mesh_mat     = mesh_vec.transform(mesh_labels)

def load_taxonomy_tree(path):
    row_re = re.compile(r"^\s*(.*?)\s+\[([^\]]+)\]\s*$")
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = row_re.match(ln)
            if m:
                label, tid = m.groups()
                depth      = len(ln) - len(ln.lstrip())
                rows.append((label, tid, depth))
    return rows

TAX_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\patients_output.txt"
rows     = load_taxonomy_tree(TAX_FILE)

exact_ix  = defaultdict(list)
for lbl, tid, depth in rows:
    exact_ix[lbl.lower()].append((lbl, tid, depth))

labels_only  = [r[0] for r in rows]
vec          = TfidfVectorizer(stop_words="english")
mat          = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v   = vec.transform([term])
    sc  = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr: break
        lbl, tid, d = rows[i]
        out.append((lbl, tid, d, sc[i]))
    return out
    
for paper_id, paper_data in data.items():
    entities = paper_data.get("entities", [])
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "human":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            mesh_ex = mesh_index.get(term, [])
            if mesh_ex:
                for t,u,n in mesh_ex:
                    print(f"  • {t:40s} ID={u:15s} depth={n:40s} score=1.00 (TAX1 exact)")
                    entity_uri = URIRef(f"{MESH_BASE}{u}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, HUMAN_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(n, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, HUMAN_CONCEPT_SCHEME))
                    #g.add((entity_uri, SKOS.broaderTransitive, created["Humans"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            if cos:
                for name, ui, depth, score in cos[:1]:
                    print(f"  • {name:40s} ID={ui:15s} depth={depth:<2d} score={score:.2f} (Patients cosine)")
                    name_uri = URIRef(f"{MESH_BASE}{ui}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, HUMAN_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(name, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, HUMAN_CONCEPT_SCHEME))
                    #g.add((name_uri, SKOS.broaderTransitive, created["Humans"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                continue
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_human_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                #g.add((created["Humans"], GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                continue

        else:
            print("Not creating human")

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "human":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri
            print(cleaned_text_span)
            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_human_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Not creating human
Query: patient
Query: patient
Not creating human
Not creating human
Query: people
Not creating human
Not creating human
Query: bariatric patient
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Query: patient
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Query: patient
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not creating human
Not crea

<h1>INGEST DRUG</h1>

In [15]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

DRUG_BASE = "http://purl.obolibrary.org/obo/"
DRUG_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Drug")
DRUG_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Drug")

created["Drug"] = URIRef(f"{CHEBI_BASE}CHEBI_23888")
g.add((created["Drug"], RDF.type, DRUG_CLASS))
g.add((created["Drug"], RDF.type, SKOS.Concept))
g.add((created["Drug"], RDFS.label, Literal("Drug", datatype=XSD.string)))
g.add((created["Drug"], SKOS.inScheme, DRUG_CONCEPT_SCHEME))


def load_chebi_labels(path):
    rows = []
    with open(path, encoding="utf-8") as fh:
        next(fh) 
        for ln in fh:
            uri, label = ln.rstrip("\n").split("\t", 1)
            rows.append((label, uri))
    return rows

CHEBI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\chebi_labels.txt"
chebi_rows = load_chebi_labels(CHEBI_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in chebi_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [lbl for lbl, _ in chebi_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = chebi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

greek_map = {
    'α': 'alpha',  'Α': 'alpha',
    'β': 'beta',   'Β': 'beta',
    'γ': 'gamma',  'Γ': 'gamma',
    'δ': 'delta',  'Δ': 'delta',
    'ε': 'epsilon','Ε': 'epsilon',
    'ζ': 'zeta',   'Ζ': 'zeta',
    'η': 'eta',    'Η': 'eta',
    'θ': 'theta',  'Θ': 'theta',
    'ι': 'iota',   'Ι': 'iota',
    'κ': 'kappa',  'Κ': 'kappa',
    'λ': 'lambda', 'Λ': 'lambda',
    'μ': 'mu',     'Μ': 'mu',
    'ν': 'nu',     'Ν': 'nu',
    'ξ': 'xi',     'Ξ': 'xi',
    'ο': 'omicron','Ο': 'omicron',
    'π': 'pi',     'Π': 'pi',
    'ρ': 'rho',    'Ρ': 'rho',
    'σ': 'sigma',  'Σ': 'sigma',
    'τ': 'tau',    'Τ': 'tau',
    'υ': 'upsilon','Υ': 'upsilon',
    'φ': 'phi',    'Φ': 'phi',
    'χ': 'chi',    'Χ': 'chi',
    'ψ': 'psi',    'Ψ': 'psi',
    'ω': 'omega',  'Ω': 'omega',
}

def preprocess(term):
    for greek_char, name in greek_map.items():
        if greek_char in term:
            term = term.replace(greek_char, name)
    term = term.replace('_', ' ')
    term = term.strip()
    return term.lower()
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "drug":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, DRUG_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, DRUG_CONCEPT_SCHEME))
                    g.add((entity_uri, SKOS.broaderTransitive, created["Drug"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DRUG_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DRUG_CONCEPT_SCHEME))
                    g.add((name_uri, SKOS.broaderTransitive, created["Drug"]))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((created["Drug"], GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "drug":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_drug_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: antibiotics
  • amino acid derivative antibiotics        URI=http://purl.obolibrary.org/obo/CHEBI_22476 score=0.78

Query: antibiotic growth promotants
Query: agps
Query: agps
Query: agps
Query: agps
Query: agp treatments
Query: agps
Query: agps
Query: agps
Query: non-absorbable antibiotic vancomycin
Query: olanzapine
  • olanzapine                               URI=http://purl.obolibrary.org/obo/CHEBI_7735
  • olanzapine                               URI=http://purl.obolibrary.org/obo/CHEBI_7735 score=1.00

Query: olanzapine
  → Reusing existing URI: http://purl.obolibrary.org/obo/CHEBI_7735

Query: olanzapine
  → Reusing existing URI: http://purl.obolibrary.org/obo/CHEBI_7735

Query: olanzapine
  → Reusing existing URI: http://purl.obolibrary.org/obo/CHEBI_7735

Query: placebo tablets
Query: placebo
Query: placebo
Query: placebo
Query: olanzapine
  → Reusing existing URI: http://purl.obolibrary.org/obo/CHEBI_7735

Query: antibiotic
  • piperidine antibiotic                    

<h1>INGEST MICROBIOME</h1>

In [16]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

MICROBIOME_BASE = "http://purl.obolibrary.org/obo/"
MICROBIOME_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/Microbiome")
MICROBIOME_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Microbiome")

def load_ohmi_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*\[([A-Za-z0-9_]+)\]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = MICROBIOME_BASE + term_id
            rows.append((label, uri))
    return rows

OHMI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\OHMI_full_taxonomy.txt"
ohmi_rows = load_ohmi_labels(OHMI_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in ohmi_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in ohmi_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ohmi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def preprocess(term):
    term = term.replace('_', ' ')
    term = term.strip()
    return term.lower()
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "microbiome":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            
            if re.search(r'\bbacter(?:ia|ium)\b', text_span.lower()):
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                continue
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, MICROBIOME_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, MICROBIOME_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, MICROBIOME_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, MICROBIOME_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "microbiome":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_microbiome_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: gut microbiota
  • microbiota                               URI=http://purl.obolibrary.org/obo/OHMI_0000463 score=0.95

Query: chicken gut microbiome
  • human gut microbiome                     URI=http://purl.obolibrary.org/obo/OHMI_0000020 score=0.91

Query: gut microbiome
  • human gut microbiome                     URI=http://purl.obolibrary.org/obo/OHMI_0000020 score=0.91

Query: poultry gut microbiome
  • human gut microbiome                     URI=http://purl.obolibrary.org/obo/OHMI_0000020 score=0.91

Query: chicken gut microbiome
  → Reusing existing URI: http://purl.obolibrary.org/obo/OHMI_0000020

Query: oral and gut microbiota
  • microbiota                               URI=http://purl.obolibrary.org/obo/OHMI_0000463 score=0.82

Query: gut and oral microbiota
  • microbiota                               URI=http://purl.obolibrary.org/obo/OHMI_0000463 score=0.82

Query: intestinal microbiome
  • intestinal cancer                        URI=http://purl.obolibrary.or

<h1>INGEST STATISTICAL TECHNIQUE</h1>

In [17]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

STATISTICALTECHNIQUE_BASE = "http://purl.obolibrary.org/obo/"
STATISTICALTECHNIQUE_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/StatisticalTechnique")
STATISTICALTECHNIQUE_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/StatisticalTechnique")

def load_statistical_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = STATISTICALTECHNIQUE_BASE + term_id
            rows.append((label, uri))
    return rows

STATISTICAL_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\stato_full_taxonomy.txt"
STATO_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\subtree_C19044.txt"
stat_rows = load_statistical_labels(STATISTICAL_LABELS_FILE)
stat1_rows = load_statistical_labels(STATO_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in stat_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in stat1_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in stat_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels1_only = [preprocess(lbl) for lbl, _ in stat1_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels1_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = stat_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine1(term, k=5, thr=0.75):
    v   = vec1.transform([term])
    sc  = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = stat1_rows[i]
        out.append((lbl, uri, sc[i]))
    return out
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "statistical technique":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            cos1 = top_cosine1(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, STATISTICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, STATISTICALTECHNIQUE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "statistical technique":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_stattechnique_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: random-effects meta-analyses
  • Meta-Analysis                            URI=http://purl.obolibrary.org/obo/NCIT_C17886 score=0.84

Query: wald test
  • Wald test                                URI=http://purl.obolibrary.org/obo/STATO_0000559
  • Wald test                                URI=http://purl.obolibrary.org/obo/STATO_0000559 score=1.00

Query: receiver operating characteristic curve analysis
no matches
Query: random forests
  • random variable                          URI=http://purl.obolibrary.org/obo/STATO_0000221 score=0.78

Query: rf
no matches
Query: boruta algorithm
  • algorithm                                URI=http://purl.obolibrary.org/obo/IAO_0000064 score=1.00

Query: rf
no matches
Query: chao1 index
no matches
Query: shannon
no matches
Query: inverse simpson
no matches
Query: bray curtis dissimilarities
no matches
Query: two sided mann whitney test
  • Mann-Whitney U-test                      URI=http://purl.obolibrary.org/obo/STATO_0000076 score=0.85

Q

<h1>INGEST BIOMEDICAL TECHNIQUE</h1>

In [18]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

BIOMEDICALTECHNIQUE_BASE = "http://purl.obolibrary.org/obo/"
BIOMEDICALTECHNIQUE_CLASS = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/BiomedicalTechnique")
BIOMEDICALTECHNIQUE_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/BiomedicalTechnique")

def load_biomedical_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = BIOMEDICALTECHNIQUE_BASE + term_id
            rows.append((label, uri))
    return rows

BIOMEDICAL_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncbitaxon_full_taxonomy.txt"
STATO_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\stato_full_taxonomy.txt"
biom_rows = load_biomedical_labels(BIOMEDICAL_LABELS_FILE)
biom1_rows = load_biomedical_labels(STATO_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in biom_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in biom1_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in biom_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels1_only = [preprocess(lbl) for lbl, _ in biom1_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels1_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = biom_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine1(term, k=5, thr=0.75):
    v   = vec1.transform([term])
    sc  = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = biom1_rows[i]
        out.append((lbl, uri, sc[i]))
    return out
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "biomedical technique":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            cos1 = top_cosine1(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, BIOMEDICALTECHNIQUE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, BIOMEDICALTECHNIQUE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]
    
    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "biomedical technique":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_biomtechnique_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: dqi-i
no matches
Query: 16s rdna surveys
  • fungal sp. 16S-7                         URI=http://purl.obolibrary.org/obo/NCBITaxon_1080634 score=0.84

Query: 16s rdna surveys
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCBITaxon_1080634

Query: metatranscriptomics analyses
no matches
Query: metabolomics
no matches
Query: community-based metabolic modeling
no matches
Query: 16s rrna amplicon sequencing
  • sequencing assay                         URI=http://purl.obolibrary.org/obo/OBI_0600047 score=0.79

Query: childhood behaviour checklist
no matches
Query: lc-ms
no matches
Query: gc
  • Bacillus sp. GC-4                        URI=http://purl.obolibrary.org/obo/NCBITaxon_996987 score=0.88

Query: 16s rrna gene sequencing
no matches
Query: 16s rrna sequencing
  • sequencing assay                         URI=http://purl.obolibrary.org/obo/OBI_0600047 score=0.79

Query: picrust analysis
no matches
Query: dual hit toxin model
no matches
Query: forced swimming test
  • 

<h1>INGEST ANATOMICAL LOCATION</h1>

In [19]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

ANATOMICALLOCATION_BASE = "http://purl.obolibrary.org/obo/"
ANATOMICALLOCATION_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/AnatomicalSite")
ANATOMICALLOCATION_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/AnatomicSite")

def load_anatomical_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = ANATOMICALLOCATION_BASE + term_id
            rows.append((label, uri))
    return rows

ANATOMICALLOCATION_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
anat_rows = load_anatomical_labels(ANATOMICALLOCATION_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in biom_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in anat_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = anat_rows[i]
        out.append((lbl, uri, sc[i]))
    return out
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "anatomical location":
            text_span = entity.get("text_span", "").strip()
            cleaned_text = create_uri_fragment(text_span)
            cleaned_text_span = normalize_to_ascii(cleaned_text)
            term_raw = cleaned_text_span.lower()
            term = preprocess(term_raw)
            
            print(f"Query: {term}")
            
            if term_raw in created:
                entity_uri = created[term_raw]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, ANATOMICALLOCATION_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, ANATOMICALLOCATION_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, ANATOMICALLOCATION_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, ANATOMICALLOCATION_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "anatomical location":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_anatomicsite_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: gastrointestinal  gi  site
no matches
Query: gi sites
no matches
Query: intestinal site
no matches
Query: hippocampus
  • Hippocampus                              URI=http://purl.obolibrary.org/obo/NCBITaxon_72046
  • Hippocampus                              URI=http://purl.obolibrary.org/obo/NCIT_C12444 score=1.00

Query: oral cavity
  • Oral Cavity                              URI=http://purl.obolibrary.org/obo/NCIT_C12421 score=1.00

Query: nasal passages
  • Nose, Nasal Passages                     URI=http://purl.obolibrary.org/obo/NCIT_C13320 score=0.85

Query: lungs
no matches
Query: gut
no matches
Query: skin
  • Skin Of The Back                         URI=http://purl.obolibrary.org/obo/NCIT_C142318 score=1.00

Query: bladder
  • Bladder                                  URI=http://purl.obolibrary.org/obo/NCIT_C12414 score=1.00

Query: vagina
  • Vagina                                   URI=http://purl.obolibrary.org/obo/NCIT_C12407 score=1.00

Query: gastrointestinal tr

<h1>INGEST GENE</h1>

In [20]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

GENE_BASE = "http://purl.obolibrary.org/obo/"
GENE_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/Gene")
GENE_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Gene")

def load_gene_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = GENE_BASE + term_id
            rows.append((label, uri))
    return rows

GENE_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
gene_rows = load_gene_labels(GENE_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in gene_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in gene_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

def top_cosine(term, k=5, thr=0.75):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = gene_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

manual_created = {
    "skin_lipid_metabolism_gene" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/Gene/skin_lipid_metabolism_gene
    "ppar-gamma" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0166417"),
    "srebp-1c" : URIRef("http://purl.obolibrary.org/obo/TFClass_human.obo#1.2.6.3.1.3"),
    "acaca_gene" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1412104"),
    "hormone-sensitive_lipase" : URIRef("http://purl.obolibrary.org/obo/PR_000009834"),
    "adipose_triglyceride_lipase" : URIRef("http://purl.obolibrary.org/obo/PR_000012942"),
    "tumor_necrosis_factor-alpha" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1456820"),
    "glucose1pmetab-pwy" : URIRef("https://pubchem.ncbi.nlm.nih.gov/pathway/BioCyc:ECO_GLUCOSE1PMETAB-PWY"), 
    "maltose_catabolic_process":URIRef("http://purl.obolibrary.org/obo/GO_0000025"), 
    "l-fucose-proton_symporter":URIRef("http://purl.obolibrary.org/obo/PR_000022731"),
    "urease_accessory_proteins_uree" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/Gene/urease_accessory_proteins_uree"),
    "camkiid_inhibitor_np202":URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5690415"),
    "aromatic_aminotransferase":URIRef("https://www.kegg.jp/dbget-bin/www_bget?K00837"),
    "2-oxoglutarate_dehydrogenase_e2_component":URIRef("https://www.genome.jp/dbget-bin/www_bget?K00658"),                                     
    "tryptophan_synthase_beta_chain" : URIRef("https://www.genome.jp/dbget-bin/www_bget?K01696"),
    "dihydrolipoyl_dehydrogenase":URIRef("https://www.genome.jp/entry/K00382"),
    "acetyl-coa_c-acetyltransferase":URIRef("https://www.genome.jp/dbget-bin/www_bget?K00626"),
    "catalase" : URIRef("https://www.genome.jp/dbget-bin/www_bget?ko:K03781"),
    "bacterial_16S_rrna_gene" :URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5380697"),
    "slc27a3_gene" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1420165"),
    "pnpla3": URIRef("http://purl.obolibrary.org/obo/OGG_3000080339"),
    "elovl6": URIRef("http://purl.obolibrary.org/obo/OMIT_0044699"),
    "5-ht(1dalpha)_receptor": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C0534687"),
    "tight_junction_protein_1":URIRef("https://proconsortium.org/cgi-bin/entry_pro?id=PR_000016364")                                                                           
}

created = dict(manual_created)

for key, uri in manual_created.items():
    g.add((uri, RDF.type,      GENE_CLASS))
    g.add((uri, RDF.type,      SKOS.Concept))
    g.add((uri, SKOS.inScheme, GENE_CONCEPT_SCHEME))
    label = key.replace("_", " ").capitalize()
    g.add((uri, RDFS.label, Literal(label, datatype=XSD.string)))


regex_map = [
    (r"\bppar-\b", "ppar-gamma"),
    (r"acc","acaca_gene"),
    (r"hsl","hormone-sensitive_lipase"),
    (r"atgl","adipose_triglyceride_lipase"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))migraine_headache\b","migraine"),
    (r"skin_lipid_metabolism-related_genes" , "skin_lipid_metabolism_gene"),
    (r"TNFA","tumor_necrosis_factor-alpha"),
    (r"pwy-7328","glucose1pmetab-pwy"),
    (r"maltose_hydrolase", "maltose_catabolic_process"),
    (r"cog1554","maltose_catabolic_process"),
    (r"fucose_permease","l-fucose-proton_symporter"),
    (r"cog0738","l-fucose-proton_symporter"),
    (r"cog2371","urease_accessory_proteins_uree"),
    (r"camkiid","camkiid_inhibitor_np202"),
    (r"k00658","2-oxoglutarate_dehydrogenase_e2_component"),
    (r"k00837","aromatic_aminotransferase"),
    (r"k01696","tryptophan_synthase_beta_chain"),
    (r"k00382", "dihydrolipoyl_dehydrogenase"),
    (r"k00626","acetyl-coa_c-acetyltransferase"),
    (r"k03781","catalase"),
    (r"differential_expressed_genes","tissue-specific_gene_expression"),
    (r"degs","tissue-specific_gene_expression"),
    (r"gut_microbe-related_degs","tissue-specific_gene_expression"),
    (r"ccdc173","parafibromin"),
    (r"16s_rrna_gene","bacterial_16s_rrna_gene"),
    (r"microglia_activation-related_genes","microglia"),
    (r"fatty_acid_transport_genes","slc27a3_gene"),
    (r"5-ht_receptor_htr2a" ,"5-ht(1dalpha)_receptor"),
    (r"tight_junction_protein_claudin-5","tight_junction_protein_1"),
]
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "gene":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            #print(lookup_key)
            
            if lookup_key in created:
                entity_uri = created[lookup_key]
                for lookup_key in manual_created:
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type,        MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,      Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, GENE_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, GENE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, GENE_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, GENE_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip().lower()
        else:
            continue

        if label != "gene":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_gene_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Query: skin lipid metabolism-related genes
no matches
Query: ppar-
  • PPAR Pathway                             URI=http://purl.obolibrary.org/obo/NCIT_C39199 score=0.87

Query: srebp-1c
no matches
Query: acc
no matches
Query: fasn
  • FASN Gene                                URI=http://purl.obolibrary.org/obo/NCIT_C26564 score=0.95

Query: ppar-
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C39199

Query: acox1
  • ACOX1 Gene                               URI=http://purl.obolibrary.org/obo/NCIT_C191889 score=0.95

Query: hsl
no matches
Query: atgl
no matches
Query: skin lipid metabolism-related genes
no matches
Query: ppar-
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C39199

Query: srebp-1c
no matches
Query: fasn
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C26564

Query: atgl
no matches
Query: trem2
  • TREM2 Gene                               URI=http://purl.obolibrary.org/obo/NCIT_C125468 score=0.94

Query: c3
  • C3 Gene          

<h1>INGEST DDF</h1>

In [27]:
import re
import json
import numpy as np
import requests
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

DDF_BASE = "http://purl.obolibrary.org/obo/"
DDF_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/DiseaseDisorderOrFinding")
DDF_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DiseaseDisorderOrFinding")

UMLS_API_KEY = os.environ.get("UMLS_API_KEY", "17876e02-ff01-461f-9ed0-2128f01fcb1b")
UMLS_BASE    = "https://uts-ws.nlm.nih.gov/rest"
UMLS_VERSION = "current"

def search_umls(term, version=UMLS_VERSION, api_key=UMLS_API_KEY):
    """
    Query UMLS /search endpoint, return up to 10 (cui, name) tuples.
    """
    url = f"{UMLS_BASE}/search/{version}"
    params = {
        "string": term,
        "apiKey": api_key,
        "pageNumber": 1,
        "pageSize": 10,
        "searchType": "exact"
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    data = resp.json()
    hits = data.get("result", {}).get("results", [])
    return [(hit["ui"], hit["name"]) for hit in hits]

def best_umls_match(term, umls_hits):
    """
    Given a list of (cui, name), compute TFIDF cosine vs. `term`
    and return the single (cui, name, score) with highest score.
    """
    # prepare corpus: [term, name1, name2, ...]
    texts = [term] + [name for _, name in umls_hits]
    vec = TfidfVectorizer(stop_words="english")
    X = vec.fit_transform(texts)
    term_vec = X[0]
    candidate_vecs = X[1:]
    scores = cosine_similarity(term_vec, candidate_vecs).ravel()
    best_idx = scores.argmax()
    cui, name = umls_hits[best_idx]
    return cui, name, scores[best_idx]

def load_ddf_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = DDF_BASE + term_id
            rows.append((label, uri))
    return rows

OMIT_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\omit_full_taxonomy.txt"
DDF_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
ddf_rows = load_ddf_labels(DDF_LABELS_FILE)
omit_rows = load_ddf_labels(OMIT_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in ddf_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in omit_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in ddf_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in omit_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

manual_created = {
    "neuropsychiatric_disorders" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/DiseaseDisorderOrFinding/neuropsychiatric_disorders"),
    "oleic_acid-induced_acne" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/DiseaseDisorderOrFinding/oleic_acid-induced_acne"),
    "inflammatory_disease" : URIRef("http://purl.obolibrary.org/obo/MONDO_0021166"),
    "cancer" : URIRef("http://purl.obolibrary.org/obo/NCBITaxon_6754"),
    "neurodegeneration" : URIRef("http://purl.obolibrary.org/obo/MONDO_0021166"),
    "intestinal_dysbiosis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4287543"),
    "brain_diseases": URIRef("http://purl.obolibrary.org/obo/OMIT_0003283"),
    "functional_gastric_disease": URIRef("http://purl.obolibrary.org/obo/MONDO_0001318"),
    "functional_gastrointestinal_disorders" : URIRef("https://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=topic_3409"),    "synucleinopathies" : URIRef("http://purl.obolibrary.org/obo/MONDO_0000510"),
    "hippocampal_volume_loss,_mild" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C5394342"),
    "gastric_mucins" : URIRef("https://meshb.nlm.nih.gov/record/ui?ui=D005752"),
    "gastric_disease" : URIRef("http://purl.obolibrary.org/obo/MONDO_0004298"),
    "neurodegenerative_disorders" : URIRef("http://purl.obolibrary.org/obo/NCIT_C39737"),
    "headaches" : URIRef("http://purl.obolibrary.org/obo/NCIT_C34661"),
    "disorder" : URIRef("http://purl.obolibrary.org/obo/OGMS_0000045"),
    "neuronitis": URIRef("http://purl.obolibrary.org/obo/MONDO_0004466"),
    "endolysosomal_deficits" : URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/DiseaseDisorderOrFinding/endolysosomal_deficits"),
    "coeliac_disease" : URIRef("https://disease-ontology.org/?id=DOID:10608"),
    "intestinal_malabsorption" : URIRef("https://hpo.jax.org/browse/term/HP:0002024"),
    "cell_danger_response": URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/resource/DiseaseDisorderOrFinding/cell_danger_response"),
    "altered_microbiota" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4047662"),
    "intestinal_dysbiosis" : URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C4287543"),
    "migraine" : URIRef("http://purl.obolibrary.org/obo/NCIT_C89715"),
    "cognitive_impairment" : URIRef("http://purl.obolibrary.org/obo/NCIT_C116921"),
    "attention_deficit_hyperactivity_disorder": URIRef("https://uts.nlm.nih.gov/uts/umls/concept/C1263846"),
    
}

created = dict(manual_created)

for key, uri in manual_created.items():
    g.add((uri, RDF.type,      DDF_CLASS))
    g.add((uri, RDF.type,      SKOS.Concept))
    g.add((uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
    label = key.replace("_", " ").capitalize()
    g.add((uri, RDFS.label, Literal(label, datatype=XSD.string)))


regex_map = [
    (r"\bdepressive_symptoms\b", "major_depressive_disorder"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))migraine_headache\b","migraine"),
    (r"\bt1d\b","Type_1_Diabetes_Mellitus"),
    (r"long-term_acetate_deficiency","deficiency"),
    (r"cognitive_decline","mental_deterioration"),
    (r"systemic_inflammation","inflammation"),
    (r"gastrointestinal_disorders", "digestive_system_disorder"),
    (r"psychiatric_and_neurodegenerative_disorders", "mental_disorders"),
    (r"\bai\b","autoimmune_disease"),
    (r"ai-related_musculoskeletal_pathology","musculoskeletal_diseases"),
    (r"oleic_acid-induced_acne","oleic_acid-induced_acne"),
    (r"acne_pathogenesis", "acne"),
    (r"serum_hormone_secretion", "hormone_secretion"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))diseases", "disease"),
    (r"neurodegenerative__inflammatory__metabolic__and_cardiovascular_diseases", "disease"),
    (r"alzheimer_s_and_parkinson_s_diseases","disease"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))dysfunctions","dysfunction"),
    (r"dysregulation_of_microglia_genes","microglia_gene"),
    (r"telomere_attrition","telomere"),
    (r"neuronal_stem_cell_degradation","stem_cells"),
    (r"loss_of_chromosome_x_inactivation","x_chromosome_inactivation"),
    (r"gut_microbiome_dysbiosis","intestinal_dysbiosis"),
    (r"\bad\b","alzheimer_s_disease"),
    (r"parkinson_s_disease","parkinson_disease"),
    (r"\bpd\b", "parkinson_disease"),
    (r"\blbd\b", "lewy_body_dementia"),
    (r"gut_dysbiosis", "intestinal_dysbiosis"),
    (r"melanomas", "melanoma"),
    (r"cums-induced_depressive_disorder", "depressive_disorder"),
    (r"chronic_unpredictable_mild_stress__cums_-induced_depressive-like_symptoms", "depression"),
    (r"depressive_behaviors","depression"),
    (r"major_depressive_disorders", "major_depressive_disorder"),
    (r"human_stress", "stress"),
    (r"headaches","headaches"),
    (r"mental_health_problems","depressive_disorder"),
    (r"\bstress\w*\b", "stress"),
    (r"altered_gut_microbiome", "intestinal_microbiome"),
    (r"gastric_disturbances", "functional_gastric_disease"),
    (r"microbiome_alteration" , "microbiome"),
    (r"pd-like_pathology", "parkinson_disease"),
    (r"lps_paraquat-induced_weight_loss" , "weight_loss"),
    (r"inflamed_gut" , "gut"),
    (r"brain-gut_changes" , "brain-gut_axis"),
    (r"human_neurological_disorders" , "neurological_disorders"),
    (r"imbalance_in_the_gut_microflora" , "intestinal_microbiome"),
    (r"neurological_conditions" , "progressive_neurological_conditions"),
    (r"neurological_disorders" , "nervous_system_disorder"),
    (r"\bdd\b", "depressive_disorder"),
    (r"hpa_axis_dysfunction", "dysfunction"),
    (r"chronic_mild_stress", "stress"),
    (r"\bcms\b", "stress"),
    (r"anxiety-_and_depressive-like_behaviors", "depressive_disorder"),
    (r"cms-induced_anxiety-_and_depressive-like_behaviors", "depressive_disorder"),
    (r"gastrointestinal disorders", "functional_gastrointestinal_disorders"),
    (r"food_allergies", "food_allergy"),
    (r"ulcerative_histiocytic_colitis","ulcerative_colitis"),
    (r"depression_patients", "depression"),
    (r"\bibs-d\b", "irritable_bowel_syndrome"),
    (r"\bnds\b", "neurodegenerative_diseases"),
    (r"\bmsa\b", "multiple_system_atrophy"),
    (r"\bms\b", "multiple_sclerosis"),
    (r"\bnmo\b", "neuromyelitis_optica"),
    (r"alpha_synucleinopathies" , "synucleinopathies"),
    (r"hippocampal_microglia-mediated_synaptic_loss", "hippocampal_volume_loss,_mild"),
    (r"inflammatory_gut_milieu", "milieu_therapy"),
    (r"\bptsd\b", "posttraumatic_stress_disorder"),
    (r"alzheimer_s_disease", "alzheimer_disease"),
    #(r"oleic_acid-induce_acne", "acne"),
    (r"vascular_system_dysfunction","dysfunction"),
    (r"intestinal_dysbiosis", "intestinal_dysbiosis"),
    (r"microbiota_dysbiosis", "dysbiosis"),
    (r"chronic_enteropathy", "enteropathy"),
    (r"amyotrophic_lateral_scleroris", "amyotrophic_lateral_scleroris"),
    (r"\bals\b", "amyotrophic_lateral_scleroris"),
    (r"dysregulation_of_gut_barrier_functions", "dysregulated_immune_function"),
    (r"transepithelial_electrical_resistance" , "electrical_resistance"),
    (r"mucin_homeostasis", "gastric_mucins"),
    (r"antimicrobial_responses","antibiotic"),
    (r"helicobacter_pylori-related_hyperhomocysteinemia", "helicobacter_pylori"),
    (r"gastric_pathologies", "gastric_disease"),
    (r"neurodegenerative_central_nervous_system_disorders", "neurodegenerative_disorders"),
    (r"ocular_alzheimer_s_disease", "alzheimer_disease"),
    (r"gastrointestinal_diseases","disease"),
    (r"migraine", "migraine"),
    (r"gut_dysbiosis","dysbiosis"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))diseases","disease"),
    (r"cognitive_impairment","cognitive_impairment"),
    (r"gut_dysbiosis_and_inflammation","intestinal_dysbiosis"),
    (r"hyperhomocysteinemia-related_brain_cortical_thinning","hyperhomocysteinemia"),
    (r"\bbct\b","hyperhomocysteinemia"),
    (r"major_depressive_episode","major_depressive_disorder"),
    (r"\bbpd\b","major_depressive_disorder"),
    (r"mild_metabolic_disorders","disorder"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))metabolic_disturbances","metabolic_disturbance"),
    (r"schizoaffective_psychosis","psychosis"),
    (r"gastrointestinal__gi__disorders","disorder"),
    (r"autoimmune_disorders","disorder"),
    (r"non-celiac_food_sensitivities","celiac_disease"),
    (r"co-morbid_gi_inflammation","inflammation"),
    (r"inefficient_gluten_digestion","digestion"),
    (r"polygenic_brain_disorders","disorder"),
    (r"autism_spectrum_disorder","disorder"),
    (r"\basd\b","disorder"),
    (r"\bmia\b","systemic_immune_activation"),
    (r"\badhd\b","attention_deficit_hyperactivity_disorder"),
    (r"\bt2dm\b","type_2_diabetes_mellitus"),
    (r"intestinal_microbiota_alterations","intestinal_microbiome"),
    (r"depression-like_behavior","major_depressive_disorder"),
    (r"chronic_and_unpredictable_mild_stress","stress"),
    (r"depression-like_behaviors","depression"),
    (r"gastric_and_esophageal_cancer","cancer"),
    (r"parkinsonian_pathology","parkinson disease"),
    (r"neuroinflammation","neuronitis"),
    (r"nigrostriatal_neurodegeneration","neurodegeneration"),
    (r"disease-related_malnutrition","malnutrition"),
    (r"neurological_and_psychiatric_disorders","disorder"),
    (r"brain_disorders","disorder"),
    (r"\bmdd\b","major_depressive_disorders"),
    (r"inflammatory_processes","inflammation"),
    (r"anxiety_symptoms","anxiety"),
    (r"\bpcos\b","polycystic_ovary_syndrome"),
    (r"gi symptoms","hama_-_gastrointestinal_symptoms"),
    (r"altered_microbiota","microbiota"),
    (r"colon_cancer-related_anemia","colon_carcinoma"),
    (r"\bccra\b","colon_carcinoma"),
    (r"colon_cancer","colon_carcinoma"),
    (r"ccra-induced_intestinal_flora_disorder","colon_carcinoma"),
    (r"chronic_stress-induced_anhedonia","stress"),
    (r"chronic_unpredictable_stress","stress"),
    (r"cus-induced_anhedonic_behaviors","anhedonia"),
    (r"irritable_bowel_disease","inflammatory_bowel_disease"),
    (r"\bsdv\b","subdiaphragmatic_vagotomy"),
    (r"abnormal_crypt_foci","aberrant_crypt_foci"),
    (r"\bcrc\b","colorectal_cancer"),
    (r"small_intestinal_malabsorption","intestinal_malabsorption"),
    (r"impaired_colonic_microbial_metabolism","metabolism"),
    (r"chronic_alcohol_overconsumption","alcohol"),
    (r"-synuclein_pathology", "pathology"),
    (r"\bcdr\b","cell_danger_response"),
    (r"\bptsd\b","post-traumatic_stress_disorder"),
    (r"\bcte\b","chronic_traumatic_encephalopathy"),
    (r"traumatic_brain_injury","injury"),
    (r"\btbi\b","injury"),
    (r"alteration_of_faecal_microbiota_balance","altered_microbiota"),
    (r"\bbd\b","bipolar_disorder"),
    (r"prader-willi_syndrome","prader-willi_syndrome"),
    (r"\bosa\b", "obstructive_sleep_apnea"),
    (r"osa_onset","obstructive_sleep_apnea"),
    (r"\bvvs\b","vasovagal_syncope"),
    (r"systolic_and_diastolic_pressure_reduction", "pressure"),
    (r"mean_pressure_drop", "pressure"),
    (r"diastolic_pressure_drop","pressure"),
    (r"\bscz\b","schizophrenia"),
    (r"anxiety-like_and_depression-like_behaviours","anxiety"),
    (r"\bdepressive\w*\b","anxiety"),
    (r"liver_fat","fat"),
    (r"\bhcd\b","colesterol"),
    (r"\bdisorder\w*\b","disorder"),
    (r"first-episode_depression","depression"),
    (r"\bfcr\b","fear_of_cancer_recurrence"),
    (r"learning_and_memory_impairments","cognitive_impairment"),
    (r"migraine","migraine"),
    (r"neuropsychiatric_disorders","disorder"),
    (r"psychiatric_and_neurodegenerative_disorders","disorder"),
    (r"gastric_disturbances","intestinal_inflammation"),
    (r"inflamed_gut","intestinal_inflammation"),
    (r"impaired_cognition","cognitive_impairment"),
    (r"helicobacter_pylori_infection","helicobacter_pylori"),
    (r"maternal_immune_activation","systemic_immune_activation"),
    (r"cancers_of_the_esophagus_and_stomach","cancer"),
    (r"gi_symptoms","symptoms"),
    (r"gastrointestinal__gi__symptoms","symptoms"),
    (r"\bcus\b","stress"),
    (r"abnormal_blood_levels","blood"),
    (r"-synuclein_aggregation","synuclein"),
    (r"chronic__developmental__autoimmune__and_degenerative_disorders","disorder"),
    (r"brain_inflammatory_activity", "brain_inflammatory_disease"),
    (r"hyperlipidemia","hyperlipidemia"),
    (r"intestinal_microbial_and_metabolites_dysbiosis","dysbiosis"),
    (r"microbial_dysbiosis","dysbiosis"),
    (r"systemic_low-grade_inflammation","inflammation"),
    (r"severe_psychiatric_disorders","disorder"),
    (r"chronic_unpredictable_mild_stress","stress"),
    (r"depression-_and_anxiety-like_behavior","depression"),
    (r"depression-_and_anxiety-like_behaviors","depression"),
    (r"\bcad\b","coronary_artery_disease"),
    (r"anosmia","anosmia"),
    (r"altered_gut_motility","gut-brain_axis"),
    (r"changes_in_intestinal_permeability","intestinal"),
    (r"\bndd\b","neurodegenerative_diseases"),
    (r"neurological_and_mental_disorders","disorder"),
    (r"microbiome_and_specific_bacterial_changes","microbiome"),
    
]

def top_cosine(term, k=5, thr=0.85):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ddf_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_omit(term, k=5, thr=0.85):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = omit_rows[i]
        out.append((lbl, uri, sc[i]))
    return out
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "DDF":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            #print(lookup_key)
            
            if lookup_key in created:
                entity_uri = created[lookup_key]
                for lookup_key in manual_created:
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type,        MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,      Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
            
            ex = exact_ix.get(term, [])
            if ex:
                lbl, uri = ex[0]
                print(f"  • {lbl:40s} URI={uri}")
                entity_uri = URIRef(f"{uri}")
                created[term_raw] = entity_uri
                g.add((entity_uri, RDF.type, DDF_CLASS))
                g.add((entity_uri, RDF.type, SKOS.Concept))
                g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                g.add((entity_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                tokenized_mentions[term_raw] = mention_uri
                continue
                print(); 
                    
            cos = top_cosine(term)
            cos1 = top_cosine_omit(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DDF_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DDF_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
                
            else:
                api_term = lookup_key.replace("_", " ")
                umls_hits = search_umls(api_term)
                if umls_hits:
                    cui, name, score = best_umls_match(api_term, umls_hits)
                    if name in created:
                        entity_uri = created[term_raw]
                    else:
                        entity_uri = URIRef(f"http://linkedlifedata.com/resource/umls/id/{cui}")
                        created[term_raw] = entity_uri

                    g.add((entity_uri, RDF.type,      DDF_CLASS))
                    g.add((entity_uri, RDF.type,      SKOS.Concept))
                    g.add((entity_uri, RDFS.label,    Literal(name, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, DDF_CONCEPT_SCHEME))
                    g.add((entity_uri, RDFS.comment, Literal("UMLS Match", datatype=XSD.string)))
            
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type,        MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,      Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn,    mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print(f"  • UMLS CUI={cui}  Name={name!r}  sim={score:.2f}")
                    continue
                    
                else: 
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type,      MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,    Literal(f"mention_ddf_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs,      Literal(raw_label, datatype=XSD.string)))
                    tokenized_mentions[term_raw] = mention_uri
                    print("no matches locally or in UMLS")

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip()
        else:
            continue

        if label != "DDF":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_ddf_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

Original span: 'depressive_symptoms'; lookup_key (after regex): 'major_depressive_disorder'
  • Depressive Disorder, Major               URI=http://purl.obolibrary.org/obo/OMIT_0005141 score=1.00
Original span: 'mental_deterioration'; lookup_key (after regex): 'mental_deterioration'
  • UMLS CUI=C0234985  Name='Mental deterioration'  sim=1.00
Original span: 'mood_disorders'; lookup_key (after regex): 'mood_disorders'
  • Mood Disorders                           URI=http://purl.obolibrary.org/obo/OMIT_0019924 score=1.00
Original span: 'depressive_symptoms'; lookup_key (after regex): 'major_depressive_disorder'
  • Depressive Disorder, Major               URI=http://purl.obolibrary.org/obo/OMIT_0005141 score=1.00
Original span: 'antimicrobial_resistance'; lookup_key (after regex): 'antimicrobial_resistance'
  • Antimicrobial Resistance Result          URI=http://purl.obolibrary.org/obo/NCIT_C85562 score=0.91
Original span: 'inflammation'; lookup_key (after regex): 'inflammation'
  • Infl

KeyboardInterrupt: 

<h1>INGEST ANIMAL</h1>

In [22]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint

ANIMAL_BASE = "http://purl.obolibrary.org/obo/"
ANIMAL_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/Animal")
ANIMAL_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/Animal")

def load_animal_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = ANIMAL_BASE + term_id
            rows.append((label, uri))
    return rows

ANIMAL_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncbitaxon_full_taxonomy.txt"
NCIT_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
animal_rows = load_animal_labels(ANIMAL_LABELS_FILE)
ncit_rows = load_animal_labels(NCIT_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in animal_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in ncit_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in animal_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in ncit_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

def top_cosine(term, k=5, thr=0.78):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = animal_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_ncit(term, k=5, thr=0.80):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = ncit_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def singularize(term):
    if term.endswith("ies"):
        return term[:-3] + "y"
    elif term.endswith("s") and not term.endswith("ss"):
        return term[:-1]
    return term

created["animal"] = URIRef("http://purl.obolibrary.org/obo/NCIT_C14182")

special_map = {
                "mouse":           "mus_musculus",
                "skin lipid":      "skin",
                "mice":            "mus_musculus",
                "rat":             "mus_musculus",
                "dogs":            "Canis lupus familiaris",
                "children":        "offspring",
                "pig":             "sus",
                "bird":            "aves",
                "rodent":          "rodentia",
                "6-ohda":          "mus_musculus",
                "sps-susceptible male": "mus_musculus",
                "sps-susceptible female": "mus_musculus",
                "SPS-resilient females": "mus_musculus",
                "SPS-resilient males": "mus_musculus",
                "sps resilient females": "mus_musculus",
                "sps-s males": "mus_musculus",
                "female": "animal",
                "male": "animal",
                "females": "animal",
                "males": "animal",
        }
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "animal":
            text_span = entity.get("text_span", "").strip()
            #cleaned_text = create_uri_fragment(text_span)
            #cleaned_text_span = normalize_to_ascii(cleaned_text)
            #term_raw = cleaned_text_span.lower()
            #term = preprocess(term_raw)
            # mappature speciali
            
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for key, val in special_map.items():
                if key in lookup_key:
                    lookup_key = val
                    break
            
            term = preprocess(lookup_key)

            #print(f"Query: {term}")
            print(f"Original span: {term_raw!r}; Lookup term: {term!r}")
            
            if lookup_key in created:
                entity_uri = created[lookup_key]
                print(f"  → Reusing existing URI: {entity_uri}\n")
                continue

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, ANIMAL_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            cos1 = top_cosine_ncit(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, ANIMAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, ANIMAL_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, ANIMAL_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip()
        else:
            continue

        if label != "animal":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_animal_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

Original span: 'birds'; Lookup term: 'aves'
  • Aves                                     URI=http://purl.obolibrary.org/obo/NCBITaxon_8782
  • Aves                                     URI=http://purl.obolibrary.org/obo/NCBITaxon_8782 score=1.00

Original span: 'bird'; Lookup term: 'aves'
  • Aves                                     URI=http://purl.obolibrary.org/obo/NCBITaxon_8782
  • Aves                                     URI=http://purl.obolibrary.org/obo/NCBITaxon_8782 score=1.00

Original span: 'animals'; Lookup term: 'animal'
  → Reusing existing URI: http://purl.obolibrary.org/obo/NCIT_C14182

Original span: 'diabetic_mice'; Lookup term: 'mus musculus'
  • Mus musculus                             URI=http://purl.obolibrary.org/obo/NCBITaxon_10090
  • Mus musculus                             URI=http://purl.obolibrary.org/obo/NCBITaxon_10090 score=1.00

Original span: 'streptozotocin__stz_-induced_t1d_mice'; Lookup term: 'mus musculus'
  • Mus musculus                           

<h1>INGEST DIETARY SUPPLEMENT</h1>

In [23]:
import re
import json
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import unicodedata
from pathlib import Path
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, SKOS, OWL
from pprint import pprint
from nltk.stem import WordNetLemmatizer

DIETARYSUPPLEMENT_BASE = "http://purl.obolibrary.org/obo/"
DIETARYSUPPLEMENT_CLASS = URIRef("https://w3id.org/brainteaser/ontology/schema/DietarySupplement")
DIETARYSUPPLEMENT_CONCEPT_SCHEME = URIRef("https://hereditary.dei.unipd.it/ontology/gutbrain/schema/conceptScheme/DietarySupplement")

lemmatizer = WordNetLemmatizer()

#created = {}

# 1.1) keep only your hand-picked seeds here
manual_created = {
    "dietary_supplementation": URIRef("https://www.ncbi.nlm.nih.gov/mesh/68019587"),
    "egcg":                     URIRef("http://purl.obolibrary.org/obo/XCO_0001093"),
    "hippophae_rhamnoide":      URIRef("http://purl.obolibrary.org/obo/NCBITaxon_193516"),
    "prebiotic":                URIRef("http://purl.obolibrary.org/obo/OMIT_0026689"),
    "acacetin":                 URIRef("http://purl.obolibrary.org/obo/CHEBI_15335"),
    "gluten":                   URIRef("http://purl.obolibrary.org/obo/FOODON_03420177"),
    "lactobacillus":            URIRef("https://www.ncbi.nlm.nih.gov/mesh/D052200"),
    "triphala":                 URIRef("https://www.ncbi.nlm.nih.gov/mesh/67520904"),
    "lacticaseibacillus_rhamnosus":
                                URIRef("http://purl.obolibrary.org/obo/NCBITaxon_47715"),
}

created = dict(manual_created)

for key, uri in manual_created.items():
    g.add((uri, RDF.type,      DIETARYSUPPLEMENT_CLASS))
    g.add((uri, RDF.type,      SKOS.Concept))
    g.add((uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
    label = key.replace("_", " ").capitalize()
    g.add((uri, RDFS.label, Literal(label, datatype=XSD.string)))


regex_map = [
    (r"\bprobiot(?:ic|ics|ic_supplementation)\b", "probiotic"),
    (r"feed_additive",       "feed"),
    (r"next-generation_feed_additive", "feed"),
    (r"seabuckthorn","hippophae_rhamnoide"),
    (r"vsl__3","prebiotic"),
    (r"vsl_3","prebiotic"),
    ("sbf","hippophae_rhamnoide"),
    ("fwg","fermented_wheat_germ"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))probiotic\w*\b", "probiotic"),
    (r"postbiotic","metabolite"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))scfa\w*\b","short-chain_fatty_acid"),
    (r"prebiotic_supplementation","synbiotic_supplement"),
    (r"(?:(?<=^)|(?<=[^A-Za-z0-9]))starch\w*\b","Starch"),
    (r"nvp-1704_treatment", "probiotic"),
    (r"danggui_buxue_decoction","medication"),
    (r"\bdbd\w*\b","medication"),
    (r"f4_consumption","lactobacillus"),
    (r"f4_supplementation","lactobacillus"),
    (r"medicinal_herb", "medication"),
    (r"\bprebiotic\w*\b","prebiotic"),
    (r"b__licheniformi","bacillus"),
    (r"plant_polysaccharide","plant"),
    (r"zhe_busong_decoction","triphala"),
    (r"\blacticaseibacillus_rhamnosus\w*\b", "lacticaseibacillus_rhamnosus"),
    (r"high-cholesterol_diet", "diet"),
    
]

def load_dietary_labels(path):
    pattern = re.compile(r'^\s*(.*?)\s*[\(\[]([A-Za-z0-9_]+)[\)\]]')
    rows = []
    with open(path, encoding="utf-8") as fh:
        for ln in fh:
            m = pattern.match(ln)
            if not m:
                continue
            label, term_id = m.group(1), m.group(2)
            uri = DIETARYSUPPLEMENT_BASE + term_id
            rows.append((label, uri))
    return rows

def load_chebi_labels(path):
    rows = []
    with open(path, encoding="utf-8") as fh:
        next(fh) 
        for ln in fh:
            uri, label = ln.rstrip("\n").split("\t", 1)
            rows.append((label, uri))
    return rows

CHEBI_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\chebi_labels.txt"
DIETARYSUPPLEMENT_LABELS_FILE = r"C:\Users\samue\OneDrive\Desktop\ThesisPiron\parsedOntologies\ncit_full_taxonomy.txt"
dietary_rows = load_dietary_labels(DIETARYSUPPLEMENT_LABELS_FILE)
chebi_rows = load_chebi_labels(CHEBI_LABELS_FILE)

exact_ix = defaultdict(list)
for lbl, uri in dietary_rows:
    exact_ix[lbl.lower()].append((lbl, uri))

exact_ix1 = defaultdict(list)
for lbl, uri in chebi_rows:
    exact_ix1[lbl.lower()].append((lbl, uri))

labels_only = [preprocess(lbl) for lbl, _ in dietary_rows]
vec = TfidfVectorizer(stop_words="english")
mat = vec.fit_transform(labels_only)

labels_only1 = [preprocess(lbl) for lbl, _ in chebi_rows]
vec1 = TfidfVectorizer(stop_words="english")
mat1 = vec1.fit_transform(labels_only1)

def top_cosine(term, k=5, thr=0.80):
    v  = vec.transform([term])
    sc = cosine_similarity(v, mat).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = dietary_rows[i]
        out.append((lbl, uri, sc[i]))
    return out

def top_cosine_chebi(term, k=5, thr=0.80):
    v  = vec1.transform([term])
    sc = cosine_similarity(v, mat1).ravel()
    idx = np.argsort(sc)[::-1]
    out = []
    for i in idx[:k]:
        if sc[i] < thr:
            break
        lbl, uri = chebi_rows[i]
        out.append((lbl, uri, sc[i]))
    return out
    
for paper_id, paper_data in data.items():
    
    entities = paper_data.get("entities", [])
    
    for i, entity in enumerate(entities):
        raw_label = entity.get("label", "").strip()
        
        text_span = entity.get("text_span", "").strip()
        
        if raw_label == "dietary supplement":
            text_span = entity.get("text_span", "").strip()
            cleaned_text_span = normalize_to_ascii(create_uri_fragment(text_span)).lower()
            term_raw = cleaned_text_span
            lookup_key = term_raw
            lookup_key = singularize(lookup_key)
            
            for pattern, replacement in regex_map:
                if re.search(pattern, lookup_key, flags=re.IGNORECASE):
                    lookup_key = replacement
                    break
                    
            term = preprocess(lookup_key)
            
            print(f"Query: {term}")
            #print(lookup_key)
            
            if lookup_key in created:
                entity_uri = created[lookup_key]
                for lookup_key in manual_created:
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type,        MENTION_CLASS))
                    g.add((mention_uri, RDFS.label,      Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri

            ex = exact_ix.get(term, [])
            if ex:
                for lbl, uri in ex:
                    print(f"  • {lbl:40s} URI={uri}")
                    entity_uri = URIRef(f"{uri}")
                    created[term_raw] = entity_uri
                    g.add((entity_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                    g.add((entity_uri, RDF.type, SKOS.Concept))
                    g.add((entity_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((entity_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((entity_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print(); 
                    
            cos = top_cosine(term)
            cos1 = top_cosine_chebi(term)
            if cos:
                for lbl, uri, score in cos[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
            elif cos1:
                for lbl, uri, score in cos1[:1]:
                    print(f"  • {lbl:40s} URI={uri:40s} score={score:.2f}")
                    name_uri = URIRef(f"{uri}")
                    created[term_raw] = name_uri
                    g.add((name_uri, RDF.type, DIETARYSUPPLEMENT_CLASS))
                    g.add((name_uri, RDF.type, SKOS.Concept))
                    g.add((name_uri, RDFS.label, Literal(lbl, datatype=XSD.string)))
                    g.add((name_uri, SKOS.inScheme, DIETARYSUPPLEMENT_CONCEPT_SCHEME))
                    mention_uri = URIRef(GUTBRAIN[term_raw])
                    g.add((mention_uri, RDF.type, MENTION_CLASS))
                    g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                    g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                    g.add((name_uri, GUTPROP.containedIn, mention_uri))
                    tokenized_mentions[term_raw] = mention_uri
                    continue
                    print()
                    
            else:
                mention_uri = URIRef(GUTBRAIN[term_raw])
                g.add((mention_uri, RDF.type, MENTION_CLASS))
                g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{term_raw}", datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.hasMentionText, Literal(term_raw, datatype=XSD.string)))
                g.add((mention_uri, GUTPROP.taggedAs, Literal(raw_label, datatype=XSD.string)))
                tokenized_mentions[term_raw] = mention_uri
                print("no matches")
                continue

        else:
            pass

with open(tokenized_file, "r", encoding="utf-8") as f_sent:
    tokenized_data = json.load(f_sent)

for entry in tokenized_data:
    pmid         = entry["pmid"]
    sent_id      = entry["sent_id"]
    sentence_txt = entry["sentence"].strip()
    entities     = entry["entities"]

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])

    sent_uri = URIRef(GUTBRAIN[f"sentence_{pmid}_{sent_id}"])
    g.add((sent_uri, RDF.type, SENTENCE))
    g.add((sent_uri, GUTPROP.hasSentenceText, Literal(sentence_txt, datatype=XSD.string)))

    if sent_id == 0:
        parent_uri = URIRef(GUTBRAIN[f"title_{pmid}"])
    else:
        parent_uri = URIRef(GUTBRAIN[f"abstract_{pmid}"])
    g.add((sent_uri,      GUTPROP.partOf,  parent_uri))
    g.add((parent_uri,    GUTPROP.composedOf, sent_uri))
        
    for ent in entities:
        if isinstance(ent, dict):
            text_span = ent.get("text_span", "").strip()
            label     = ent.get("label",    "").strip()
        else:
            continue

        if label != "dietary supplement":
            continue
        canonical = create_uri_fragment(text_span)
        cleaned_text_span = normalize_to_ascii(canonical).lower()
        
        if cleaned_text_span not in tokenized_mentions:
            mention_uri = URIRef(GUTBRAIN[cleaned_text_span])
            tokenized_mentions[cleaned_text_span] = mention_uri

            g.add((mention_uri, RDF.type, MENTION_CLASS))
            g.add((mention_uri, RDFS.label, Literal(f"mention_dietarysupplement_{cleaned_text_span}", datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.hasMentionText, Literal(text_span, datatype=XSD.string)))
            g.add((mention_uri, GUTPROP.taggedAs, Literal(label, datatype=XSD.string)))
            
            
        else:
            mention_uri = tokenized_mentions[cleaned_text_span]
            
        g.add((mention_uri, GUTPROP.locatedIn, sent_uri))

output_file = os.path.join(save_path, "gutbrain_entities.ttl")
ttl_output = g.serialize(format="turtle")
with open(output_file, "w", encoding="utf-8") as f_out:
    f_out.write(ttl_output)

print(f"The RDF graph has been saved in {output_file}")

Query: probiotic
  • Probiotic                                URI=http://purl.obolibrary.org/obo/NCIT_C93144
  • Probiotic                                URI=http://purl.obolibrary.org/obo/NCIT_C93144 score=1.00
Query: probiotic
  • Probiotic                                URI=http://purl.obolibrary.org/obo/NCIT_C93144
  • Probiotic                                URI=http://purl.obolibrary.org/obo/NCIT_C93144 score=1.00
Query: dietary supplementation
no matches
Query: feed
  • Feed                                     URI=http://purl.obolibrary.org/obo/NCIT_C69427
  • Feed                                     URI=http://purl.obolibrary.org/obo/NCIT_C69427 score=1.00
Query: feed
  • Feed                                     URI=http://purl.obolibrary.org/obo/NCIT_C69427
  • Feed                                     URI=http://purl.obolibrary.org/obo/NCIT_C69427 score=1.00
Query: feed
  • Feed                                     URI=http://purl.obolibrary.org/obo/NCIT_C69427
  • Feed        