# ov.Agent Ten-Task Regression Suite (Step-by-step)

This notebook keeps things minimal: 
1. **Step 1** checks the runtime and initializes `ov.Agent`.
2. **Step 2** fetches datasets directly via Scanpy or downloadable URLs.
3. **Steps 3–13** each hold a standalone `ov.Agent(prompt, data)` block so you can trigger any task with a single cell.


## Step 1 – Environment & agent check
Import OmicVerse/Scanpy, confirm versions, and instantiate a single `ov.Agent` session that all later steps reuse.

In [None]:

import os
import sys
from pathlib import Path

import scanpy as sc
import omicverse as ov

# Report interpreter and package provenance so a regression can be tied to an exact environment.
print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")
print(f"OmicVerse version: {getattr(ov, '__version__', 'unknown')} @ {ov.__file__}")
print(f"Scanpy version: {sc.__version__}")
# The newline must be written as an escape: a plain quoted string cannot span physical lines.
print("\nSupported models:")
print(ov.list_supported_models())

# Take the first provider key present in the environment (OpenAI, then Anthropic, then Gemini).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
api_key = OPENAI_API_KEY or ANTHROPIC_API_KEY or GEMINI_API_KEY
if not api_key:
    # Warn only; the agent is still constructed below with api_key=None.
    print('⚠️  Set OPENAI_API_KEY / ANTHROPIC_API_KEY / GEMINI_API_KEY before running the agent.')

# OV_AGENT_MODEL overrides the default model id.
# NOTE(review): the default 'gpt-5' is used with whichever provider key was found — confirm
# that a non-OpenAI key is paired with a matching OV_AGENT_MODEL.
model_id = os.getenv('OV_AGENT_MODEL', 'gpt-5')
sc.settings.set_figure_params(dpi=100)
# Single agent instance reused by every later step.
agent = ov.Agent(model=model_id, api_key=api_key)
agent


## Step 2 – Download or stage datasets
The cell below loads what it can directly via `scanpy.datasets` and prints the download URL and target path for each heavier asset (Multiome, ATAC, TME, spatial, etc.). Set `AUTO_FETCH_WEB = True` if you want the cell to download the external files automatically.

In [None]:

import urllib.request

# Local staging directory for every downloaded or manually placed asset.
DATA_ROOT = Path('ov_agent_data')
DATA_ROOT.mkdir(parents=True, exist_ok=True)

# Task key -> (human-readable loader label, zero-argument scanpy loader).
SCANPY_DATASETS = dict(
    pbmc_fastq_qc=('scanpy.datasets.pbmc3k', sc.datasets.pbmc3k),
    pancreas_multi_simba=('scanpy.datasets.pbmc68k_reduced', sc.datasets.pbmc68k_reduced),
    paul15_traj=('scanpy.datasets.paul15', sc.datasets.paul15),
)


def _load_scanpy_dataset(key, label, loader):
    """Call ``loader`` and return its result, or ``None`` if anything raises.

    Loading is best-effort: a failure is reported on stdout and recorded as
    ``None`` so the remaining datasets are still attempted.
    """
    try:
        dataset = loader()
        print(f"✅ Loaded {key} via {label}")
    except Exception as exc:
        print(f"⚠️  Failed to load {key} via {label}: {exc}")
        return None
    return dataset


# Task key -> loaded object, or None when the loader failed.
DATA_CACHE = {}
for key, (label, loader) in SCANPY_DATASETS.items():
    DATA_CACHE[key] = _load_scanpy_dataset(key, label, loader)

# File name (inside DATA_ROOT) under which each externally hosted asset is staged.
_STAGED_FILENAMES = {
    'pbmc_multiome_rna': 'pbmc_multiome_rna.h5ad',
    'pbmc_multiome_atac': 'pbmc_multiome_atac.h5ad',
    'pbmc5k_atac_emb': 'pbmc5k_atac_emb.h5ad',
    'pbmc5k_rna_reference': 'pbmc_rna_reference.h5ad',
    'tme_cellphonedb': 'tme_cellphonedb.h5ad',
    'metatime_input': 'metatime_input.h5ad',
    'cefcon_input': 'cefcon_nestorowa.h5ad',
    'scdrug_input': 'scdrug_scanpyobj.h5ad',
    'visium_slice_151676': '151676_filtered_feature_bc_matrix.h5',
    'visium_slice_151507': '151507_filtered_feature_bc_matrix.h5',
}

# Asset name -> local path; the later task cells look their inputs up here.
DATA_PATHS = {
    name: DATA_ROOT / filename for name, filename in _STAGED_FILENAMES.items()
}

# Asset name -> download URL, in the order the assets are fetched or listed.
# NOTE(review): the 'pbmc5k_rna_reference' URL is a .tar.gz and the
# 'tme_cellphonedb' URL is a .zip, yet both are staged under .h5ad names —
# confirm these archives are converted before the task cells read them.
_SOURCE_URLS = {
    'pbmc_multiome_rna': 'https://figshare.com/ndownloader/files/41460054',
    'pbmc_multiome_atac': 'https://figshare.com/ndownloader/files/41460051',
    'pbmc5k_atac_emb': 'https://figshare.com/ndownloader/files/41418600',
    'pbmc5k_rna_reference': 'https://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz',
    'tme_cellphonedb': 'https://github.com/ventolab/CellphoneDB/raw/master/notebooks/data_tutorial.zip',
    'metatime_input': 'https://figshare.com/ndownloader/files/41440050',
    'cefcon_input': 'https://github.com/WPZgithub/CEFCON/raw/e74d2d248b88fb3349023d1a97d3cc8a52cc4060/notebooks/data/nestorowa16_preprocessed.h5ad',
    'scdrug_input': 'https://figshare.com/ndownloader/files/47461946',
    'visium_slice_151676': 'https://drive.google.com/uc?export=download&id=1Omte1adVFzyRDw7VloOAQYwtv_NjdWcG',
    'visium_slice_151507': 'https://drive.google.com/uc?export=download&id=1zsMZnG-tYr9ebquG6YULi6gvN00zGEGZ',
}

# One record per remote asset: its name, where to fetch it, and where to store it.
WEB_DATASETS = [
    {'name': name, 'url': url, 'path': DATA_PATHS[name]}
    for name, url in _SOURCE_URLS.items()
]

def download_file(url: str, destination: Path) -> None:
    """Download ``url`` to ``destination`` unless the file is already there.

    The payload is streamed to a sibling ``<name>.part`` file and renamed onto
    ``destination`` only after the transfer completes. An interrupted download
    therefore never leaves a truncated file that a later run would report as
    "already present", and large assets are not buffered in memory.

    Args:
        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
        destination: Final path of the file. Missing parent directories are
            created.

    Raises:
        Exception: Whatever ``urlopen`` or the file system raises; the partial
            file is removed before the error propagates.
    """
    import shutil  # local import keeps this definition self-contained

    destination.parent.mkdir(parents=True, exist_ok=True)
    if destination.exists():
        print(f"   ↪ {destination} already present")
        return
    print(f"   ↓ Downloading {url} → {destination}")
    partial = destination.with_name(destination.name + '.part')
    try:
        with urllib.request.urlopen(url) as resp, open(partial, 'wb') as out:
            # Copy in chunks instead of materialising the whole response.
            shutil.copyfileobj(resp, out)
        # Publish the file only once it is complete.
        partial.replace(destination)
    finally:
        # After a successful rename this is a no-op; after a failure it cleans up.
        if partial.exists():
            partial.unlink()
    print(f"   ✅ Saved {destination}")

# Opt-in switch: when False the cell only lists what has to be staged by hand.
AUTO_FETCH_WEB = False
if AUTO_FETCH_WEB:
    for entry in WEB_DATASETS:
        # Best-effort: one failed download must not stop the remaining ones.
        try:
            download_file(entry['url'], entry['path'])
        except Exception as exc:
            print(f"⚠️  Could not download {entry['name']}: {exc}")
else:
    for entry in WEB_DATASETS:
        print(f"ℹ️  Stage {entry['name']} manually from {entry['url']} → {entry['path']}")

# The newline must be written as an escape: a plain quoted string cannot span physical lines.
print("\nDataset cache summary:")
# Only the scanpy-loaded entries of DATA_CACHE are summarised here, not the web assets.
for key, value in DATA_CACHE.items():
    status = 'ready' if value is not None else 'missing'
    print(f" - {key}: {status}")

# Task key -> value returned by agent.run; filled in by the task cells below.
TASK_RESULTS = {}


### Step 3 – PBMC 5k/8k FASTQ → QC → cluster-stability benchmarking
Trigger the full PBMC end-to-end workflow with a single prompt.

In [None]:

# Prompt for the PBMC FASTQ -> QC -> cluster-stability task; sent to the agent verbatim.
step3_prompt = """You are ov.Agent orchestrating an end-to-end PBMC 5k/8k workflow.

Use the skill registry to load and cite:
- `single-preprocessing` for QC/HVG/scaling guidance.
- `single-clustering` for multi-head resolution sweeps.
- `data-viz-plots` plus `single-downstream-analysis` for stability diagnostics.

Requirements:
1. Start from raw 10x FASTQs, run kb-python alignment to generate `adata.h5ad` counts (show CLI commands).
2. Apply PBMC-grade QC thresholds, normalization, and HVG selection, documenting parameter choices.
3. Run Leiden, Louvain, Gaussian mixture, and LDA clustering across several resolutions, computing UMAP drift metrics that quantify stability.
4. Return ordered code blocks plus a markdown table summarizing head, resolution, drift (0–1), and the recommended resolution."""

# Input is the object cached in Step 2 under 'pbmc_fastq_qc' (None if that loader failed).
# NOTE(review): the prompt asks to start from raw FASTQs, but the object passed here is
# whatever Step 2 loaded via scanpy.datasets.pbmc3k — confirm this is intended.
pbmc_fastq_data = DATA_CACHE.get('pbmc_fastq_qc')
if pbmc_fastq_data is None:
    raise RuntimeError('Load PBMC data via scanpy.datasets in Step 2 before running this cell.')
# Run the task and keep its result under the same key as the input dataset.
TASK_RESULTS['pbmc_fastq_qc'] = agent.run(step3_prompt, pbmc_fastq_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['pbmc_fastq_qc']


### Step 4 – Pancreas multi-study merge with SIMBA embeddings
Stress-test SIMBA-based integration on pancreas donors.

In [None]:

# Prompt for the pancreas multi-study SIMBA integration task; sent to the agent verbatim.
step4_prompt = """Integrate the Baron, Segerstolpe, and Muraro pancreas scRNA-seq donors under strong batch effects.

Use the registry to load:
- `single-preprocessing` for donor-level normalization and covariate regression.
- `single-multiomics` for SIMBA-style heterogeneous graph construction.
- `single-clustering` and `data-viz-plots` for UMAP diagnostics.

Workflow expectations:
1. Highlight how to harmonize preprocessing parameters across cohorts before building the SIMBA graph.
2. Provide the concrete SIMBA commands that add nodes/edges, train embeddings, and export 2D projections.
3. Quantify integration with before/after UMAPs, kBET, and silhouette scores, commenting on endocrine vs. exocrine separation.
4. Explain how to persist the learned embeddings for downstream classifiers."""

# Input is the object cached in Step 2 under 'pancreas_multi_simba' (None if that loader failed).
# NOTE(review): Step 2 fills this key from scanpy.datasets.pbmc68k_reduced, not a pancreas
# cohort — replace the cache entry to test on real multi-donor data.
pancreas_data = DATA_CACHE.get('pancreas_multi_simba')
if pancreas_data is None:
    # Double-quoted so the single-quoted key inside the message does not end the literal.
    raise RuntimeError("Stage a pancreas multi-donor AnnData object in DATA_CACHE['pancreas_multi_simba'].")
# Run the task and keep its result under the same key as the input dataset.
TASK_RESULTS['pancreas_multi_simba'] = agent.run(step4_prompt, pancreas_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['pancreas_multi_simba']


### Step 5 – Paul15 hematopoietic trajectories with MetaTiME diagnostics
Rebuild hematopoietic trajectories plus MetaTiME cycle checks.

In [None]:

# Prompt for the Paul15 trajectory + MetaTiME diagnostics task; sent to the agent verbatim.
step5_prompt = """Reconstruct megakaryocyte vs. lymphoid trajectories on Paul15-like hematopoiesis data.

Use the registry to pull:
- `single-trajectory` for diffusion, PAGA, Palantir/VIA, and MetaTiME checkpoints.
- `single-preprocessing` for QC and normalization.
- `single-downstream-analysis` plus `data-viz-plots` for marker summaries.

Deliverables:
1. Describe preprocessing plus neighborhood graph construction before diffusion/PAGA.
2. Run diffusion maps, Palantir/VIA, and identify root and terminal states with pseudotime ordering.
3. Produce branch marker tables for at least two fates and explain MetaTiME cycle diagnostics that validate the ordering.
4. Return code snippets, saved-figure descriptions, and a markdown list of pseudotime milestones."""

# Input is the object cached in Step 2 under 'paul15_traj' (None if that loader failed).
paul15_data = DATA_CACHE.get('paul15_traj')
if paul15_data is None:
    raise RuntimeError('Paul15 data is missing—rerun Step 2 or provide your own AnnData object.')
# Run the task and keep its result under the same key as the input dataset.
TASK_RESULTS['paul15_traj'] = agent.run(step5_prompt, paul15_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['paul15_traj']


### Step 6 – PBMC Multiome 10k GLUE + MOFA factor discovery
Align RNA/ATAC embeddings and decode factors.

In [None]:

# Prompt for the PBMC Multiome GLUE + MOFA task; sent to the agent verbatim.
step6_prompt = """Perform cross-modal alignment for PBMC Multiome 10k.

Use skill registry lookups for:
- `single-multiomics` (GLUE pairing + MOFA training).
- `single-preprocessing` (modality-specific normalization).
- `data-viz-plots` (factor visualization).

Tasks:
1. Pair RNA and ATAC embeddings with GLUE and report the path to the paired metadata.
2. Train MOFA on matched matrices, labelling shared, RNA-only, and ATAC-only factors with variance explained tables.
3. Highlight at least one IFN-response and one chromatin-accessibility-specific factor, with marker genes/peaks.
4. Provide code snippets plus interpretation bullets for each factor category."""

# Both modality files must already be staged on disk (see DATA_PATHS in Step 2).
multiome_rna = DATA_PATHS['pbmc_multiome_rna']
multiome_atac = DATA_PATHS['pbmc_multiome_atac']
if not multiome_rna.exists() or not multiome_atac.exists():
    raise FileNotFoundError('Download both RNA and ATAC embeddings in Step 2 before running this cell.')
# Only the RNA file is read here; the ATAC file is handed over as a path stored in .uns.
multiome_data = sc.read(multiome_rna)
multiome_data.uns['atac_embedding_path'] = str(multiome_atac)
# Run the task and keep its result under the 'multiome_glue' key.
TASK_RESULTS['multiome_glue'] = agent.run(step6_prompt, multiome_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['multiome_glue']


### Step 7 – PBMC 5k scATAC label transfer via GLUE embeddings
Move annotations from RNA to ATAC with cross-modal KNN graphs.

In [None]:

# Prompt for the RNA -> scATAC label-transfer task; sent to the agent verbatim.
# NOTE(review): the prompt names data/analysis_lymph/*.h5ad, while this cell stages
# DATA_PATHS['pbmc5k_atac_emb'] and DATA_PATHS['pbmc5k_rna_reference'] — confirm the
# agent is expected to reconcile the two locations.
step7_prompt = """Transfer PBMC RNA annotations onto PBMC 5k scATAC cells.

Use skill registry entries:
- `single-multiomics` for GLUE-derived embeddings and cross-modal graphs.
- `single-annotation` for label transfer/validation patterns.

Instructions:
1. Load the GLUE embeddings (`data/analysis_lymph/rna-emb.h5ad` and `data/analysis_lymph/atac-emb.h5ad`).
2. Build the cross-modal KNN graph, migrate labels with confidence, and surface per-cluster agreement statistics.
3. Flag potential mismatches and explain how to visualize transferred labels on ATAC UMAPs.
4. Return python commands plus a markdown table of cluster vs. confidence."""

# Both the ATAC embedding and the RNA reference must already be staged on disk.
atac_path = DATA_PATHS['pbmc5k_atac_emb']
rna_ref_path = DATA_PATHS['pbmc5k_rna_reference']
if not atac_path.exists() or not rna_ref_path.exists():
    raise FileNotFoundError('Stage both ATAC and RNA reference embeddings before running the transfer cell.')
# Only the ATAC file is read here; the RNA reference is handed over as a path stored in .uns.
atac_data = sc.read(atac_path)
atac_data.uns['rna_reference_path'] = str(rna_ref_path)
# Run the task and keep its result under the 'pbmc5k_scatac_transfer' key.
TASK_RESULTS['pbmc5k_scatac_transfer'] = agent.run(step7_prompt, atac_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['pbmc5k_scatac_transfer']


### Step 8 – Tumor microenvironment ligand–receptor diagnostics (CellPhoneDBViz)
Compare exhausted T cells vs. M2 macrophages across treatments.

In [None]:

# Prompt for the tumor-microenvironment CellPhoneDB task; sent to the agent verbatim.
step8_prompt = """Contrast ligand–receptor usage between exhausted T cells and M2 macrophages for treated vs. untreated tumors.

Use registry calls for:
- `single-cellphone-db` (interaction formatting, execution, visualization).
- `single-downstream-analysis` and `data-viz-plots` (interpretation + figure export).

Expectations:
1. Show metadata formatting, CellPhoneDB execution commands, and result parsing.
2. Highlight top ligand–receptor pairs per condition with effect sizes/p-values.
3. Describe how to build heatmaps and chord diagrams (include filenames) via CellPhoneDBViz helpers.
4. Provide interpretation guidance on shifts between conditions."""

# The input file must already be staged on disk (see DATA_PATHS in Step 2).
# NOTE(review): the Step 2 URL for 'tme_cellphonedb' is a .zip archive saved under this
# .h5ad path — confirm it is unpacked/converted before sc.read is called on it.
tme_path = DATA_PATHS['tme_cellphonedb']
if not tme_path.exists():
    raise FileNotFoundError('Download the CellPhoneDB-ready AnnData file before running this cell.')
tme_data = sc.read(tme_path)
# Run the task and keep its result under the 'cellphonedb' key.
TASK_RESULTS['cellphonedb'] = agent.run(step8_prompt, tme_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['cellphonedb']


### Step 9 – MetaTiME-driven immune microenvironment annotation
Score immune states with MetaTiME meta-components.

In [None]:

# Prompt for the MetaTiME immune-annotation task; sent to the agent verbatim.
step9_prompt = """Annotate tumor-infiltrating immune cells with MetaTiME.

Use skill registry lookups for:
- `single-trajectory` (MetaTiME scoring + pseudotime context).
- `single-preprocessing` (optional inferCNV-based malignant removal).
- `single-downstream-analysis` (report formatting).

Deliverables:
1. Optionally filter malignant cells via infercnvpy outputs and recompute neighbors in SCVI space.
2. Run MetaTiME scoring, rank meta-components per cluster, and interpret dominant immune states.
3. Provide preprocessing + scoring code plus a markdown report mapping cluster → top meta-component with interpretation."""

# The input file must already be staged on disk (see DATA_PATHS in Step 2).
metatime_path = DATA_PATHS['metatime_input']
if not metatime_path.exists():
    raise FileNotFoundError('Provide the MetaTiME-ready AnnData file (TiME_adata_scvi.h5ad) before running this cell.')
metatime_data = sc.read(metatime_path)
# Run the task and keep its result under the 'metatime' key.
TASK_RESULTS['metatime'] = agent.run(step9_prompt, metatime_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['metatime']


### Step 10 – CEFCON driver regulator discovery
Find branch-specific regulators with the CEFCON pipeline.

In [None]:

# Prompt for the CEFCON driver-regulator task; sent to the agent verbatim.
step10_prompt = """Discover lineage-specific driver regulators with CEFCON on Nestorowa/Paul15 hematopoiesis.

Use the registry to load:
- `single-trajectory` (fate modeling context).
- `single-downstream-analysis` (regulator reporting).

Steps:
1. Document preprocessing and prior network setup, including how to load the NicheNet graph.
2. Run `ov.single.pyCEFCON`, exporting regulon tables for at least two branches (erythroid vs. granulocyte).
3. Provide tuning advice (walk length, regularization) and show how to visualize regulator activity heatmaps.
4. Summarize key regulators per branch in markdown."""

# The input file must already be staged on disk (see DATA_PATHS in Step 2).
cefcon_path = DATA_PATHS['cefcon_input']
if not cefcon_path.exists():
    raise FileNotFoundError('Download the Nestorowa/Paul15 preprocessed AnnData for CEFCON before running this cell.')
cefcon_data = sc.read(cefcon_path)
# Run the task and keep its result under the 'cefcon' key.
TASK_RESULTS['cefcon'] = agent.run(step10_prompt, cefcon_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['cefcon']


### Step 11 – Precision oncology prioritization (inferCNV + scDrug)
Rank therapies per malignant clone after inferCNV calling.

In [None]:

# Prompt for the inferCNV + scDrug prioritization task; sent to the agent verbatim.
step11_prompt = """Combine inferCNV-based malignant calling with scDrug predictions for precision oncology.

Use registry lookups for:
- `single-multiomics` (scDrug + modality handling).
- `single-downstream-analysis` (drug ranking summaries).
- `data-viz-plots` (CNV heatmap references).

Workflow:
1. Run infercnvpy to separate malignant vs. normal cells and reference the CNV heatmap artifact.
2. Feed malignant clones into scDrug, compute predicted IC50 values, and rank at least five compounds per clone.
3. Provide code for exporting the ranking table plus guidance on cross-referencing copy-number context when interpreting drug hits."""

# The input file must already be staged on disk (see DATA_PATHS in Step 2).
scdrug_path = DATA_PATHS['scdrug_input']
if not scdrug_path.exists():
    raise FileNotFoundError('Stage the scanpyobj.h5ad file before invoking the scDrug workflow.')
scdrug_data = sc.read(scdrug_path)
# Run the task and keep its result under the 'scdrug' key.
TASK_RESULTS['scdrug'] = agent.run(step11_prompt, scdrug_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['scdrug']


### Step 12 – SpaceFlow pseudo-spatiotemporal mapping (Visium 151676)
Derive domains and pSM layers from a single Visium slice.

In [None]:

# Prompt for the SpaceFlow task on Visium slice 151676; sent to the agent verbatim.
step12_prompt = """Compute SpaceFlow embeddings and pseudo-spatiotemporal maps for Visium DLPFC slice 151676.

Use registry entries for:
- `single-to-spatial-mapping` (Visium pre-processing).
- `spatial-trajectory` (SpaceFlow training + domain discovery).
- `data-viz-plots` (domain overlays on histology).

Plan:
1. Load `151676_filtered_feature_bc_matrix.h5`, normalize spots, and build spatial KNN graphs for SpaceFlow.
2. Train SpaceFlow, report domain assignments plus pseudo-spatiotemporal maps (pSM) and save the embeddings/pSM matrices.
3. Describe how to visualize embeddings/domains layered on histology with filenames for the exported figures."""

# The .h5 matrix must already be staged on disk (see DATA_PATHS in Step 2).
spaceflow_path = DATA_PATHS['visium_slice_151676']
if not spaceflow_path.exists():
    raise FileNotFoundError('Download the 151676 Visium matrix (.h5) before running SpaceFlow.')
# This is a 10x .h5 matrix, so it is read with read_10x_h5 rather than sc.read.
spaceflow_data = sc.read_10x_h5(spaceflow_path)
# Run the task and keep its result under the 'spaceflow_151676' key.
TASK_RESULTS['spaceflow_151676'] = agent.run(step12_prompt, spaceflow_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['spaceflow_151676']


### Step 13 – STAligner multi-slice alignment (151676 ↔ 151507)
Align consecutive Visium sections with triplet-loss GATs.

In [None]:

# Prompt for the STAligner two-slice alignment task; sent to the agent verbatim.
step13_prompt = """Align consecutive Visium DLPFC slices (151676 and 151507) with STAligner.

Use the skill registry to load:
- `spatial-alignment` (STAligner configuration and triplet-loss GAT training).
- `single-to-spatial-mapping` (Visium preprocessing for both slices).
- `data-viz-plots` (aligned cortical-layer visualization).

Expectations:
1. Preprocess both filtered matrices, harmonize features, and construct spot graphs before alignment.
2. Run STAligner with triplet-loss GAT, report where embeddings and aligned coordinates are saved, and describe convergence checks.
3. Summarize conserved cortical layers across slices plus any mismatches, pointing to exported alignment plots."""

# Both slice matrices must already be staged on disk (see DATA_PATHS in Step 2).
slice_a = DATA_PATHS['visium_slice_151676']
slice_b = DATA_PATHS['visium_slice_151507']
if not slice_a.exists() or not slice_b.exists():
    raise FileNotFoundError('Download both Visium slices (151676 & 151507) before running STAligner.')
# Only slice 151676 is read here; slice 151507 is handed over as a path stored in .uns.
slice_a_data = sc.read_10x_h5(slice_a)
slice_a_data.uns['staligner_peer_slice_path'] = str(slice_b)
# Run the task and keep its result under the 'staligner' key.
TASK_RESULTS['staligner'] = agent.run(step13_prompt, slice_a_data)
# Bare expression: echoes the stored result as the cell output.
TASK_RESULTS['staligner']
