In [4]:
import json
import nltk
from nltk.tokenize import sent_tokenize

# Download the data required by the tokenizer (if not already present)
nltk.download('punkt')

def process_json_file(input_file, output_file):
    """Split each document into sentences and record which entities each sentence contains.

    The input JSON is read as an object mapping a document id (pmid) to an entry
    holding a "metadata" dict ("title", "abstract") and an "entities" list whose
    items carry a "text_span". Title and abstract are joined with a single space,
    split into sentences with NLTK's `sent_tokenize`, and every sentence is stored
    together with the entity strings that occur in it as plain substrings.

    Args:
        input_file: path of the JSON file to read (UTF-8).
        output_file: path of the JSON file to write (UTF-8). It receives a list of
            {"pmid", "sent_id", "sentence", "entities"} records, one per sentence.

    Returns:
        None. The result is written to `output_file` and a confirmation is printed.
    """
    # Load the input JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Accumulates one record per sentence across all documents
    output_sentences = []

    # Iterate over every document (keyed by its pmid)
    for paper_id, entry in data.items():
        # `or` also covers keys that are present but null in the JSON
        metadata = entry.get("metadata") or {}
        title = metadata.get("title") or ""
        abstract = metadata.get("abstract") or ""

        # Title and abstract are tokenized together as a single text
        combined_text = title + " " + abstract
        sentences = sent_tokenize(combined_text)

        # Entity strings produced by the NER step. Filtering (non-empty strings only)
        # and de-duplication are done once per document instead of once per sentence;
        # dict.fromkeys keeps first-seen order so the output is deterministic.
        entity_texts = list(dict.fromkeys(
            ent["text_span"]
            for ent in (entry.get("entities") or [])
            if isinstance(ent.get("text_span"), str) and ent["text_span"]
        ))

        for idx, sentence in enumerate(sentences):
            # Plain substring test: an entity is attached to every sentence containing it
            found_entities = [ent_text for ent_text in entity_texts if ent_text in sentence]

            output_sentences.append({
                "pmid": paper_id,
                "sent_id": idx,
                "sentence": sentence,
                "entities": found_entities  # entities found in this sentence
            })

    # Write the sentence-level records to the output JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_sentences, f, indent=2, ensure_ascii=False)

    print(f"Le frasi processate sono state salvate in {output_file}")

# Set the input and output file paths
input_file = 'train_platinum.json'
output_file = 'tokenized_sentences_with_entities.json'

process_json_file(input_file, output_file)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Le frasi processate sono state salvate in tokenized_sentences_with_entities.json


In [1]:
import requests

def download_file(url, save_path, timeout=30):
    """Download `url` to `save_path`, streaming the body in 8 KiB chunks.

    Args:
        url: address of the file to fetch.
        save_path: local path the response body is written to (binary mode).
        timeout: seconds to wait for the connection and for each read from the
            socket before giving up. It is not a limit on the total download
            time, so large files are still fine. Without a timeout a stalled
            server would block the cell indefinitely.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server stops responding for `timeout` seconds.
    """
    # Stream the file in chunks so the whole body is never held in memory
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)

# MeSH descriptor XML (desc2025.xml); saved under the same name, relative to the working directory.
url = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.xml"
save_path = "desc2025.xml"
download_file(url, save_path)
print(f"File downloaded to {save_path}")

File downloaded to desc2025.xml


In [15]:
import xml.etree.ElementTree as ET

def parse_mesh_descriptors(xml_path):
    """
    Read the MeSH descriptor XML at `xml_path` and return one dict per
    DescriptorRecord found directly under the root element:
      { 'ui': text of DescriptorUI,
        'name': text of DescriptorName/String,
        'tree_numbers': texts of the non-empty TreeNumberList/TreeNumber elements
      }
    Records lacking a DescriptorUI or DescriptorName/String element are left out.
    """
    records = []
    for record in ET.parse(xml_path).getroot().findall('DescriptorRecord'):
        ui_node = record.find('DescriptorUI')
        name_node = record.find('DescriptorName/String')
        # Compare against None explicitly: an Element without children is falsy
        if ui_node is not None and name_node is not None:
            records.append({
                'ui': ui_node.text,
                'name': name_node.text,
                'tree_numbers': [
                    node.text
                    for node in record.findall('TreeNumberList/TreeNumber')
                    if node.text
                ],
            })
    return records

def get_bacteria_taxonomy(xml_path):
    """
    From the full MeSH descriptors file, extract the descriptors located
    under the 'Bacteria' branch.

    The branch prefix is the first tree number of the descriptor named
    exactly 'Bacteria'. A descriptor belongs to the branch when one of its
    tree numbers equals that prefix or starts with the prefix followed by '.'.

    Returns a dict: { tree_number: { 'ui': ..., 'name': ... }, ... }.
    Each descriptor contributes a single entry, keyed by the first of its
    tree numbers that lies inside the branch.

    Raises RuntimeError if no 'Bacteria' descriptor with a tree number exists.
    """
    descriptors = parse_mesh_descriptors(xml_path)

    bacteria_prefix = None
    for d in descriptors:
        # Require at least one tree number: indexing [0] on an empty list
        # would raise IndexError instead of the RuntimeError below.
        if d['name'] == 'Bacteria' and d['tree_numbers']:
            bacteria_prefix = d['tree_numbers'][0]
            break

    if not bacteria_prefix:
        raise RuntimeError("Couldn't find a descriptor named 'Bacteria' in the file.")

    # The trailing '.' prevents sibling branches that merely share leading
    # characters with the prefix from being matched.
    child_prefix = bacteria_prefix + '.'

    taxonomy = {}
    for d in descriptors:
        for tn in d['tree_numbers']:
            if tn == bacteria_prefix or tn.startswith(child_prefix):
                taxonomy[tn] = {
                    'ui': d['ui'],
                    'name': d['name']
                }
                break  # one entry per descriptor: stop at its first in-branch tree number

    return taxonomy

if __name__ == '__main__':
    xml_file = 'desc2025.xml'
    bacteria_tax = get_bacteria_taxonomy(xml_file)

    # List the bacterial subtree ordered by tree number: number, name, (UI)
    for tree_num, info in sorted(bacteria_tax.items(), key=lambda pair: pair[0]):
        print(f"{tree_num:10s}  {info['name']:30s}  ({info['ui']})")

B03         Bacteria                        (D001419)
B03.026     Acidobacteria                   (D061271)
B03.054     Agricultural Inoculants         (D059827)
B03.110     Atypical Bacterial Forms        (D001295)
B03.110.422  L Forms                         (D007740)
B03.110.761  Spheroplasts                    (D013104)
B03.120     Bacteria, Aerobic               (D001420)
B03.130     Bacteria, Anaerobic             (D001421)
B03.135     Bacteria, Thermoduric           (D000072280)
B03.250     Chlorobi                        (D019414)
B03.250.140  Chlorobium                      (D041883)
B03.275     Chloroflexi                     (D041862)
B03.275.150  Chloroflexus                    (D041861)
B03.275.575  Dehalococcoides                 (D000082942)
B03.280     Cyanobacteria                   (D000458)
B03.280.100  Anabaena                        (D017033)
B03.280.100.150  Anabaena cylindrica             (D046868)
B03.280.100.900  Anabaena variabilis             (D046870)
B03.28

In [17]:
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher, get_close_matches

def parse_mesh_descriptors(xml_path):
    """Parse the MeSH XML and return a list of descriptors.

    Each DescriptorRecord directly under the root becomes a dict
    {'ui': ..., 'name': ..., 'tree_numbers': [...]} built from the text of its
    DescriptorUI, DescriptorName/String and TreeNumberList/TreeNumber elements.
    Surrounding whitespace is trimmed from every value.

    Records whose UI or name is missing *or empty* are skipped, so every
    returned 'ui' and 'name' is a non-empty string and callers can safely call
    string methods such as `.lower()` on them.
    """
    root = ET.parse(xml_path).getroot()
    descriptors = []
    for dr in root.findall('DescriptorRecord'):
        # findtext gives None for an absent element and '' for an empty one;
        # both collapse to '' here and are rejected below.
        ui   = (dr.findtext('DescriptorUI') or '').strip()
        name = (dr.findtext('DescriptorName/String') or '').strip()
        if not ui or not name:
            continue
        tree_nums = [
            tn.text.strip()
            for tn in dr.findall('TreeNumberList/TreeNumber')
            if tn.text and tn.text.strip()
        ]
        descriptors.append({'ui':ui, 'name':name, 'tree_numbers':tree_nums})
    return descriptors

MESH_XML = 'desc2025.xml' # relative path: resolved against the current working directory
descriptors = parse_mesh_descriptors(MESH_XML)
print(f"Parsed {len(descriptors)} descriptors")
# Debug aid: uncomment to dump every parsed descriptor (very large output).
#print(descriptors)

Parsed 30956 descriptors


In [18]:
def get_bacteria_taxonomy(descriptors):
    """
    Collect the descriptors that sit under the 'Bacteria' node.

    The subtree root is the first tree number of the first descriptor whose
    name equals 'bacteria' (case-insensitively) and that has tree numbers.
    Returns {tree_number: {'ui': ..., 'name': ...}}, where each descriptor is
    filed under the first of its tree numbers that is the root or lies below it.
    Raises RuntimeError when no such root can be found.
    """
    root_number = next(
        (
            rec['tree_numbers'][0]
            for rec in descriptors
            if rec['name'].lower() == 'bacteria' and rec['tree_numbers']
        ),
        None,
    )
    if not root_number:
        raise RuntimeError("Could not find 'Bacteria' in descriptors")

    below_root = root_number + '.'
    subtree = {}
    for rec in descriptors:
        # First tree number of this descriptor that falls inside the subtree, if any
        hit = next(
            (num for num in rec['tree_numbers']
             if num == root_number or num.startswith(below_root)),
            None,
        )
        if hit is not None:
            subtree[hit] = {'ui': rec['ui'], 'name': rec['name']}
    return subtree

# Restrict the parsed descriptors to the subtree rooted at 'Bacteria'.
bacteria_tax = get_bacteria_taxonomy(descriptors)
# Debug aid: uncomment to report how many tree nodes were collected.
#print(f"{len(bacteria_tax)} bacterial MeSH nodes")

In [19]:
def build_name_index(taxonomy):
    """
    Group taxonomy entries by lower-cased descriptor name.

    `taxonomy` maps a tree number to {'ui': ..., 'name': ...}. The result maps
    each lower-cased name to the list of (tree number, ui, canonical name)
    tuples sharing it, in the iteration order of `taxonomy`.
    """
    by_name = {}
    for number, node in taxonomy.items():
        lowered = node['name'].lower()
        entry = (number, node['ui'], node['name'])
        if lowered in by_name:
            by_name[lowered].append(entry)
        else:
            by_name[lowered] = [entry]
    return by_name

# Index the bacterial subtree by lower-cased descriptor name for case-insensitive lookup.
name_index = build_name_index(bacteria_tax)
print(f"Indexed {len(name_index)} bacterial names")

Indexed 859 bacterial names


In [24]:
from difflib import SequenceMatcher

def find_mesh_match(input_name, name_index, n=5, cutoff=0.6):
    """
    Return matches for input_name among the MeSH bacterial names.

    Args:
        input_name: the term to look up; surrounding whitespace and letter
            case are ignored.
        name_index: dict mapping a lower-cased name to a list of
            (tree number, ui, canonical name) tuples.
        n: maximum number of distinct fuzzy-matched names to consider.
        cutoff: minimum similarity (0..1) a fuzzy match must reach.

    Returns:
        A list of (canonical name, ui, tree number, score) tuples.
        An exact match yields only the entries for that name, with score=1.0.
        Otherwise the fuzzy matches found by difflib.get_close_matches are
        returned, best first, each with the same similarity that was used to
        select it (so every score is >= cutoff). The list is empty when
        nothing is close enough.
    """
    key = input_name.strip().lower()
    if key in name_index:
        return [(name, ui, tree_num, 1.0) for tree_num, ui, name in name_index[key]]

    results = []
    for cname in get_close_matches(key, list(name_index), n=n, cutoff=cutoff):
        # SequenceMatcher.ratio() depends on argument order. get_close_matches
        # compares with the candidate as the first sequence and the query as the
        # second, so the same order is used here; otherwise the reported score
        # could differ from the ranking score and even drop below `cutoff`.
        score = SequenceMatcher(None, cname, key).ratio()
        for tree_num, ui, name in name_index[cname]:
            results.append((name, ui, tree_num, score))
    # Stable sort: entries sharing a score keep the order get_close_matches gave them
    results.sort(key=lambda x: x[3], reverse=True)
    return results


In [48]:
import re
import json

def strip_html_tags(text):
    """Return `text` with every '<...>' run (a '<', one or more non-'>' characters, a '>') removed."""
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

with open("train_platinum.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect the distinct, tag-free, non-empty text spans of every entity labelled "bacteria".
bacteria_terms = set()
for record in data.values():
    for ent in record.get("entities", []):
        if (ent.get("label") or "").lower() == "bacteria":
            # A missing or null text_span is treated as empty instead of raising KeyError
            raw = ent.get("text_span") or ""
            clean = strip_html_tags(raw).strip()
            if clean:
                bacteria_terms.add(clean)

# Report the best MeSH candidates (at most five lines) for each term, in alphabetical order.
for term in sorted(bacteria_terms):
    print(f"Query: {term}")
    matches = find_mesh_match(term, name_index)
    if not matches:
        print("no matches")
    else:
        for name, ui, tree, score in matches[:5]:
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score={score:.2f}")
    print()

Query: A. muciniphila
  • Acidiphilium                   UI=D041801  Tree=B03.440.400.425.100.110  score=0.62

Query: Acidaminococcus intestini
  • Acidaminococcus                UI=D045850  Tree=B03.353.250   score=0.75
  • Micrococcus luteus             UI=D016982  Tree=B03.510.024.850.500.500  score=0.60

Query: Actinobacteria
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=1.00

Query: Actinobacteriota
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=0.93
  • Acidobacteria                  UI=D061271  Tree=B03.026       score=0.83
  • Cyanobacteria                  UI=D000458  Tree=B03.280       score=0.76
  • Acinetobacter                  UI=D000150  Tree=B03.440.400.425.537.050  score=0.76
  • Acetobacter                    UI=D000091  Tree=B03.440.400.425.100.100  score=0.74

Query: Acute/chronic insomnia-related signature bacteria
no matches

Query: Agathobaculum
  • Thiobacillus                   UI=D013855  Tree=B03.440.400.450.

In [53]:
import re
import json
import numpy as np
import xml.etree.ElementTree as ET
from difflib import get_close_matches, SequenceMatcher
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# --- 1) parse MeSH descriptors and build only the 'Bacteria' subtree ---
def parse_mesh_descriptors(xml_path):
    """Return a list of {'ui', 'name', 'tree_numbers'} dicts, one per usable DescriptorRecord.

    Only DescriptorRecord elements directly under the XML root are read, and a
    record is skipped when it has no DescriptorUI or no DescriptorName/String
    element. 'tree_numbers' holds the non-empty TreeNumberList/TreeNumber texts.
    """
    def _iter_records():
        for rec in ET.parse(xml_path).getroot().findall('DescriptorRecord'):
            ui_node, name_node = rec.find('DescriptorUI'), rec.find('DescriptorName/String')
            if ui_node is None or name_node is None:
                continue
            numbers = []
            for tn_node in rec.findall('TreeNumberList/TreeNumber'):
                if tn_node.text:
                    numbers.append(tn_node.text)
            yield {'ui': ui_node.text, 'name': name_node.text, 'tree_numbers': numbers}

    return list(_iter_records())

def get_bacteria_taxonomy(descriptors):
    """Return {tree_number: {'ui', 'name'}} for the descriptors under the 'Bacteria' node.

    The root is the first tree number of the first descriptor named 'bacteria'
    (any letter case) that has tree numbers. Each descriptor is stored once,
    under the first of its tree numbers that equals the root or extends it
    with a '.'. Raises RuntimeError if no root is found.
    """
    # Locate the 'Bacteria' root
    for candidate in descriptors:
        if candidate['name'].lower() == 'bacteria' and candidate['tree_numbers']:
            root_tn = candidate['tree_numbers'][0]
            break
    else:
        root_tn = None
    if not root_tn:
        raise RuntimeError("Could not find 'Bacteria' descriptor")

    def in_subtree(tree_number):
        return tree_number == root_tn or tree_number.startswith(root_tn + '.')

    tax = {}
    for desc in descriptors:
        first_hit = next(filter(in_subtree, desc['tree_numbers']), None)
        if first_hit is not None:
            tax[first_hit] = {'ui': desc['ui'], 'name': desc['name']}
    return tax

def build_name_index(taxonomy):
    """Map each lower-cased descriptor name to its list of (tree number, ui, canonical name) tuples."""
    index = {}
    for tree_num in taxonomy:
        info = taxonomy[tree_num]
        lowered = info['name'].lower()
        bucket = index.get(lowered)
        if bucket is None:
            bucket = index[lowered] = []
        bucket.append((tree_num, info['ui'], info['name']))
    return index

# Build the lookup structures in order: all descriptors -> 'Bacteria' subtree -> lower-cased name index.
MESH_XML = 'desc2025.xml'
descriptors   = parse_mesh_descriptors(MESH_XML)
bacteria_tax  = get_bacteria_taxonomy(descriptors)
name_index    = build_name_index(bacteria_tax)

# --- 2) load bacterial terms from your JSON ---
def strip_html_tags(text):
    """Drop every '<...>' tag-like run from `text` and return what remains."""
    # Splitting on the tag pattern and re-joining the pieces removes the tags
    pieces = re.split(r'<[^>]+>', text)
    return ''.join(pieces)

with open("train_platinum.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Distinct, tag-free text spans of every entity labelled "bacteria", sorted alphabetically.
# Spans that are missing, null, or empty after cleaning are dropped, so no empty
# query is ever embedded or reported.
unique_bacteria_terms = set()
for rec in data.values():
    for ent in rec.get("entities", []):
        if (ent.get("label") or "").lower() != "bacteria":
            continue
        term = strip_html_tags(ent.get("text_span") or "").strip()
        if term:
            unique_bacteria_terms.add(term)
bacteria_terms = sorted(unique_bacteria_terms)

# --- 3) prepare MeSH names list aligned with name_index ---
# mesh_names holds the distinct canonical names; mesh_triples[i] is the FIRST
# index entry stored for mesh_names[i], so the two lists share the same positions.
mesh_names   = sorted({info[2] for infos in name_index.values() for info in infos})
mesh_triples = [name_index[name.lower()][0] for name in mesh_names]  # (tree,ui,name)

# --- 4) embed both lists ---
# convert_to_tensor=False is passed so that encode does not return a tensor.
# NOTE(review): the shape comments below are the author's; M = len(mesh_names), N = len(bacteria_terms).
model = SentenceTransformer('all-mpnet-base-v2')
mesh_embeds = model.encode(mesh_names,   convert_to_tensor=False)  # (M, D)
bact_embeds = model.encode(bacteria_terms, convert_to_tensor=False)  # (N, D)

# --- 5) compute M×N cosine-similarity ---
# Row i corresponds to mesh_names[i], column j to bacteria_terms[j].
sims = cosine_similarity(mesh_embeds, bact_embeds)  # shape=(M,N)

# --- 6) report, per bacterial term, its exact MeSH entries or the 5 most similar names ---
# Each row of sims.T is the similarity column of one bacterial term.
for bact, col in zip(bacteria_terms, sims.T):
    print(f"Query: {bact}")
    exact_hits = name_index.get(bact.lower())
    if exact_hits is not None:
        # exact (case-insensitive) name match: show every tree/UI stored for it
        for tree, ui, name in exact_hits:
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score=1.00")
    else:
        # otherwise rank the MeSH names by similarity, highest first, and keep five
        for i in np.argsort(col)[::-1][:5]:
            name = mesh_names[i]
            tree, ui, _ = mesh_triples[i]
            score = col[i]
            print(f"  • {name:30s} UI={ui:8s} Tree={tree:12s}  score={score:.2f}")
    print()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Query: A. muciniphila
  • Aeromonas hydrophila           UI=D016980  Tree=B03.440.450.019.025.380  score=0.64
  • Bilophila                      UI=D045282  Tree=B03.440.425.410.200  score=0.62
  • Aeromonadaceae                 UI=D044044  Tree=B03.440.450.019  score=0.60
  • Aerococcaceae                  UI=D056567  Tree=B03.353.750.030  score=0.59
  • Aeromonas                      UI=D000333  Tree=B03.440.450.019.025  score=0.58

Query: Acidaminococcus intestini
  • Acidaminococcus                UI=D045850  Tree=B03.353.250   score=0.91
  • Acidithiobacillus              UI=D042763  Tree=B03.440.400.425.103  score=0.78
  • Acidobacteria                  UI=D061271  Tree=B03.026       score=0.77
  • Pediococcus acidilactici       UI=D000070016 Tree=B03.353.750.450.737.500  score=0.74
  • Enterococcus faecium           UI=D016984  Tree=B03.353.750.250.250.300  score=0.70

Query: Actinobacteria
  • Actinobacteria                 UI=D039903  Tree=B03.510.024   score=1.00

Query: Acti

In [50]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-4.1.0


