In [6]:
from pathlib import Path

from bio_mcp.cache.load import load_biotools, load_galaxy_singularity
from bio_mcp.mcp.server import search_containers, describe_container

## Loading snapshots

Galaxy containers from CVMFS and tool metadata from sources such as bio.tools are both indexed by lowercased tool name, so entries from the two snapshots can be matched to each other.

In [None]:
# Directory holding the cached snapshot files; edit this one constant to run
# the notebook against data stored elsewhere.
DATA_DIR = Path("/home/ubuntu/bio-mcp/data")

# bt: bio.tools metadata snapshot (YAML); gx: Galaxy Singularity container
# snapshot (JSON). Both are looked up by tool name in the cells below.
bt = load_biotools(DATA_DIR / "scrnaseq_biotools.yaml")
gx = load_galaxy_singularity(DATA_DIR / "scrnaseq_galaxy_cvmfs.json")

In [None]:
# Fetch the Galaxy entry for one tool; snapshot keys are lowercased tool names.
g = gx.get("cellbender")
g

In [None]:
# Display the whole bio.tools snapshot loaded above.
bt

In [None]:
# List the tool names (keys) present in the Galaxy snapshot.
gx.keys()

## `search_containers()`

In [None]:
# Negative case: query for a tool that should be absent from the snapshot.
search_containers("bcftools", gx)  # Should return []

In [None]:
# Look up a single tool by name.
search_containers("seurat", gx)

In [None]:
# NOTE(review): presumably exercises approximate name matching — confirm what
# search_containers treats as a near match.
search_containers("cellranger", gx)  # near match

In [None]:
# Another single-name lookup.
search_containers("multiqc", gx)

In [None]:
# Pass a list of names to query several tools in one call.
search_containers(["cellbender", "multiqc"], gx)

## `describe_container()`

In [None]:
# Describe one tool using both snapshots. Note the argument order differs from
# search_containers: the snapshots come first and the tool name last.
describe_container(gx, bt, "cellbender")

In [None]:
# NOTE(review): this loop only prints each name; it looks like a placeholder
# (presumably meant to call describe_container per tool) — confirm or remove.
tool_names = ["cellbender"]
for tool_name in tool_names:
    print(tool_name)

## Create metadata embeddings


In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
# Import the helper by name instead of with a wildcard so it is clear where
# make_embedding_text comes from. Later cells already import load_edam_obo and
# extract_concepts_from_biotools_yaml explicitly.
# NOTE(review): assumes make_embedding_text is the only name the notebook took
# from the wildcard import — confirm before merging.
from bio_mcp.cache.embeddings import make_embedding_text

In [None]:
# Load a pretrained sentence-transformers CrossEncoder; it is used below via
# model.rank() to score each tool description against a query.
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")

In [None]:
# Natural-language questions used to probe the ranking model.
# Only example_queries[0] is actually ranked in the cells below.
example_queries = [
    "I have fastq files from a single cell sequencing experiment, what is the first step i can do?",
    "How do I qc, clustering and integration of my pre-processed scrna data?",
    "what tools are available for scrna analysis",
    "How can I remove technical artifacts (e.g. identify and remove doublets from my data)",
    "I have just generated scrna data, how do I analyse it for publication?"
]

In [None]:
# Build the ranking corpus: one embedding text per bio.tools entry, in the
# snapshot's iteration order, so corpus indices line up with the order of bt.
# Iterating over values() avoids looking every key up a second time.
meta_text = [make_embedding_text(tool_meta) for tool_meta in bt.values()]

In [24]:
# Inspect the generated corpus texts, one string per tool.
meta_text

['Tool name: CellBender\nDescription: a deep generative model for unsupervised removal of background noise from scRNA-seq datasets.\n\nCellBender is a software package for eliminating technical artifacts from high-throughput single-cell RNA sequencing (scRNA-seq) data.\n\nWelcome to CellBender’s documentation! — CellBender documentation.\n\nFree document hosting provided by Read the Docs.\n\nStephen J Fleming, John C Marioni, and Mehrtash Babadi. CellBender remove-background: a deep generative model for unsupervised removal of background noise from scRNA-seq datasets. bioRxiv 791699; doi: https://doi.org/10.1101/791699.\nEDAM topics: Gene expression, RNA, RNA-Seq\nEDAM operations: Quantification, Data retrieval, Expression analysis',
 'Tool name: cellranger',
 'Tool name: FASTQC\nDescription: This tool aims to provide a QC report which can spot problems or biases which originate either in the sequencer or in the starting library material. It can be run in one of two modes. It can eithe

In [None]:
# Rank every tool text in the corpus against the first example query.
# Each result carries a `corpus_id` (index into meta_text) and a `score`.
ranks = model.rank(example_queries[0], meta_text)

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it]

(5, 384)





In [None]:
# Print the query, then one entry per ranked document:
# score (two decimals), a tab, then the document text.
first_query = example_queries[0]
print(f"Query: {first_query}")
for hit in ranks:
    matched_text = meta_text[hit["corpus_id"]]
    print(f"{hit['score']:.2f}\t{matched_text}")

In [36]:
# Rank the same corpus against a phylogenetics question.
phylo_query = "What should I use for phylogenetics"
rank2 = model.rank(phylo_query, meta_text)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.59s/it]


In [43]:
# Show the phylogenetics ranking: score (two decimals), a tab, then the tool text.
for hit in rank2:
    doc_text = meta_text[hit["corpus_id"]]
    print(f"{hit['score']:.2f}\t{doc_text}")

0.61	Tool name: seurat
Description: Seurat is an R package designed for QC, analysis, and exploration of single-cell RNA-seq data. Seurat aims to enable users to identify and interpret sources of heterogeneity from single-cell transcriptomic measurements, and to integrate diverse types of single-cell data.
EDAM topics: RNA-Seq, Transcriptomics
0.52	Tool name: CellBender
Description: a deep generative model for unsupervised removal of background noise from scRNA-seq datasets.

CellBender is a software package for eliminating technical artifacts from high-throughput single-cell RNA sequencing (scRNA-seq) data.

Welcome to CellBender’s documentation! — CellBender documentation.

Free document hosting provided by Read the Docs.

Stephen J Fleming, John C Marioni, and Mehrtash Babadi. CellBender remove-background: a deep generative model for unsupervised removal of background noise from scRNA-seq datasets. bioRxiv 791699; doi: https://doi.org/10.1101/791699.
EDAM topics: Gene expression, RN

## Aliasing EDAM terms

In [8]:
# Load a second tool YAML (a different file from the scRNA-seq snapshot above).
# NOTE(review): absolute, machine-specific path; the same path is repeated in the
# extract_concepts_from_biotools_yaml call further down.
all_yaml = load_biotools(Path("/home/ubuntu/cache/johan.yaml"))

In [None]:
# Spot-check a single entry to see which fields a tool record carries.
all_yaml.get("samtools")

{'biocontainers': 'samtools',
 'biotools': 'samtools',
 'bunya': ['1.13-gcc-10.3.0',
  '1.13-gcc-11.3.0',
  '1.16.1-gcc-11.3.0',
  '1.18-gcc-12.3.0                  (D)'],
 'description': 'SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods.',
 'edam-inputs': [{'formats': ['CRAM', 'SAM', 'BAM'], 'term': 'BAM'}],
 'edam-operations': ['Data filtering',
  'Visualisation',
  'Data formatting',
  'Indexing',
  'Data parsing',
  'Data sorting',
  'Data editing'],
 'edam-outputs': [{'formats': ['CRAM', 'SAM', 'BAM'], 'term': 'BAM'}],
 'edam-topics': ['Mapping',
  'Sequence analysis',
  'Sequencing',
  'Rare diseases'],
 'galaxy': [{'description': 'Samtools markdup: marks duplicate alignments',
   'title': 'Samtools markdup 1.22+galaxy1',
   'url': 'https://usegalaxy.org.au/root?tool_id

In [13]:
# Flatten the per-tool YAML records into a list of dicts carrying only the id,
# description and EDAM fields, with hyphenated YAML keys renamed to underscores.
# Missing fields default to "" (description) or [] (EDAM lists).
collected_data = [
    {
        "id": tool_id,
        "description": record.get("description", ""),
        "edam_inputs": record.get("edam-inputs", []),
        "edam_outputs": record.get("edam-outputs", []),
        "edam_operations": record.get("edam-operations", []),
        "edam_topics": record.get("edam-topics", []),
    }
    for tool_id, record in all_yaml.items()
]

In [44]:
# Gather the distinct EDAM operation terms used by any tool.
x = set()
for record in collected_data:
    # `or []` guards against an operations value that is present but None/empty
    x.update(record.get("edam_operations") or [])
x

{'3D profile generation',
 'Aggregation',
 'Alignment',
 'Alternative splicing prediction',
 'Analysis',
 'Ancestral reconstruction',
 'Annotation',
 'Antimicrobial resistance prediction',
 'Backbone modelling',
 'Base-calling',
 'Bisulfite mapping',
 'Blind peptide database search',
 'Box-Whisker plot plotting',
 'Calculation',
 'Cell migration analysis',
 'Chimera detection',
 'Chromatographic alignment',
 'Classification',
 'Clustering',
 'Coding region prediction',
 'Comparison',
 'Conversion',
 'Copy number estimation',
 'Correlation',
 'Cross-assembly',
 'DMR identification',
 'DNA barcoding',
 'DNA mapping',
 'DNA transcription',
 'DNA translation',
 'Data editing',
 'Data filtering',
 'Data formatting',
 'Data handling',
 'Data parsing',
 'Data retrieval',
 'Data sorting',
 'Database search',
 'De-novo assembly',
 'Deisotoping',
 'Demultiplexing',
 'Dendrogram visualisation',
 'Deposition',
 'Design',
 'Differential binding analysis',
 'Differential gene expression analysis',
 

In [54]:
from pronto import Ontology
from bio_mcp.cache.embeddings import load_edam_obo

In [55]:
# Load the EDAM ontology terms via the project helper.
obo = load_edam_obo()

In [56]:
# Display the loaded terms; in the saved output each key is an EDAM id mapping
# to a dict with "id", "name", "def" and "namespace".
obo

{'EDAM_data:0005': {'id': 'EDAM_data:0005',
  'name': 'Resource type',
  'def': '"A type of computational resource used in bioinformatics." [http://edamontology.org]',
  'namespace': 'data'},
 'EDAM_data:0006': {'id': 'EDAM_data:0006',
  'name': 'Data',
  'def': '"Information, represented in an information artefact (data record) that is \'understandable\' by dedicated computational tools that can use the data as input or produce it as output." [http://edamontology.org]',
  'namespace': 'data'},
 'EDAM_data:0007': {'id': 'EDAM_data:0007',
  'name': 'Tool',
  'def': '"A bioinformatics package or tool, e.g. a standalone application or web service." [http://edamontology.org]',
  'namespace': 'data'},
 'EDAM_data:0581': {'id': 'EDAM_data:0581',
  'name': 'Database',
  'def': '"A digital data archive typically based around a relational model but sometimes using an object-oriented, tree or graph-based model." [http://edamontology.org]',
  'namespace': 'data'},
 'EDAM_data:0582': {'id': 'EDAM_

In [7]:
# Test concept extract
from bio_mcp.cache.embeddings import extract_concepts_from_biotools_yaml

In [11]:
# Run concept extraction over the same YAML file that all_yaml was loaded from.
concepts = extract_concepts_from_biotools_yaml(yaml_path=Path("/home/ubuntu/cache/johan.yaml"))

In [12]:
# NOTE(review): the saved output of this cell is an empty dict, so the
# extraction returned no concepts for this file — confirm that is expected.
concepts

{}