# JHM 通路/毒性全流程 Notebook (Emu3-Chat)

本版本用于 Emu3-Chat 替换/对照实验。模型权重已在本地保存，请设置本地路径。

当前训练/预测脚本仍沿用既有流程（默认 backbone 为 ChemBERTa）；如需改用 Emu3-Chat，请在下方 `TRAIN_CONFIG` 单元中修改 `PATHWAY_BACKBONE` 等参数。


In [None]:
# Pin this kernel to physical GPU 1. The variables must be set before
# `import torch`; after editing them, restart the kernel for the change to apply.
import os
import sys
import subprocess

os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '1',
})
print('CUDA_VISIBLE_DEVICES=', os.environ.get('CUDA_VISIBLE_DEVICES'))

import torch

# Report the interpreter / torch build and what CUDA exposes to this process.
for _label, _value in (
    ('Python:', sys.version),
    ('Torch:', torch.__version__),
    ('CUDA available:', torch.cuda.is_available()),
    ('Visible CUDA device count:', torch.cuda.device_count()),
):
    print(_label, _value)
if torch.cuda.is_available():
    print('Visible device 0 name:', torch.cuda.get_device_name(0))
    print('Current device:', torch.cuda.current_device())


In [None]:
import os
from pathlib import Path

# Switch the working directory to the project root; the later cells use
# relative `scripts/`, `data/` and `models/` paths.
PROJECT_ROOT = Path('/media/xuchengjie/E7562A2674DB25F7/Orders/JHM')
os.chdir(PROJECT_ROOT)
print('cwd set to', Path.cwd())


In [None]:
# GPU_PIN_CELL
# Pin to physical GPU 1. This has to happen before `import torch`; restart the
# kernel after changing it.
# NOTE(review): the first cell of the notebook sets the same two variables and
# already imports torch, so this cell only re-applies them.
import os

os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '1',
})
print('CUDA_VISIBLE_DEVICES=', os.environ.get('CUDA_VISIBLE_DEVICES'))


In [None]:
# TRAIN_CONFIG
# Central configuration read by the training / prediction cells further down.
PATHWAY_EPOCHS = 20
PATHWAY_BATCH = 8
PATHWAY_LR = 2e-5
PATHWAY_MAX_LENGTH = 256

# Early stopping on validation metric (recommended for papers)
PATHWAY_EARLY_STOP = True
PATHWAY_EARLY_STOP_METRIC = 'roc_auc_macro'  # or 'ap_macro'
PATHWAY_EARLY_STOP_PATIENCE = 8
PATHWAY_EARLY_STOP_MIN_DELTA = 0.001
PATHWAY_USE_POS_WEIGHT = True

# Backbone (default: ChemBERTa baseline; for Emu3-Chat point this at your local
# model directory and set the backbone type to 'causal_lm')
PATHWAY_BACKBONE = 'DeepChem/ChemBERTa-10M-MTR'
PATHWAY_BACKBONE_TYPE = 'auto'  # auto/encoder/causal_lm
PATHWAY_MODEL_OUT = 'models/pathway_tox21.pt'

# Large-model loading options (recommended for Emu3-Chat: device_map='auto' + 4-bit + offload).
# In the pathway training cell the max-memory / offload-folder options are only
# forwarded when PATHWAY_DEVICE_MAP is set; the dtype and 4-bit / 8-bit flags are
# forwarded independently of it.
PATHWAY_DEVICE_MAP = None  # 'auto'
PATHWAY_TORCH_DTYPE = None  # 'float16' | 'bfloat16' | 'float32'
PATHWAY_MAX_GPU_MEMORY = '7GiB'
PATHWAY_MAX_CPU_MEMORY = '128GiB'
PATHWAY_OFFLOAD_FOLDER = 'data/cache/emu3_offload'
PATHWAY_LOAD_IN_4BIT = False
PATHWAY_LOAD_IN_8BIT = False

# LoRA / QLoRA defaults (quality first: Emu3-Chat + 4-bit + LoRA is suggested;
# set PATHWAY_LORA_R to 16/32 to enable LoRA; 0 leaves it off)
PATHWAY_GRADIENT_CHECKPOINTING = False
PATHWAY_LORA_R = 0
PATHWAY_LORA_ALPHA = 32
PATHWAY_LORA_DROPOUT = 0.05
PATHWAY_LORA_TARGET_MODULES = 'q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj'
PATHWAY_LORA_BIAS = 'none'
PATHWAY_GRAD_ACCUM = 1
PATHWAY_WEIGHT_DECAY = 0.0
PATHWAY_USE_PHYSCHEM = True
PATHWAY_PHYSCHEM = 'data/processed/tox21_physchem.csv'

# Recommended settings for an 11GB GPU (uncomment to use).
# This override block is deliberately placed AFTER every PATHWAY_* default above,
# so that uncommenting it actually wins; if it preceded the defaults, the later
# default assignments would silently reset LoRA rank, gradient checkpointing and
# gradient accumulation.
# PATHWAY_BACKBONE = '/home/xuchengjie/Program/Emu3'
# PATHWAY_BACKBONE_TYPE = 'causal_lm'
# PATHWAY_DEVICE_MAP = 'auto'
# PATHWAY_TORCH_DTYPE = 'float16'
# PATHWAY_LOAD_IN_4BIT = True
# PATHWAY_GRADIENT_CHECKPOINTING = True
# PATHWAY_LORA_R = 16
# PATHWAY_BATCH = 1
# PATHWAY_GRAD_ACCUM = 16
# PATHWAY_LR = 1e-4
# PATHWAY_MODEL_OUT = 'models/pathway_tox21_emu3_lora.pt'

# Affinity model: for a full run use MAX_ROWS=0 (no sub-sampling)
AFFINITY_MAX_ROWS = 0
AFFINITY_FP_RADIUS = 2
AFFINITY_FP_NBITS = 4096
AFFINITY_FP_COUNTS = True
AFFINITY_MODEL = 'etr'  # etr/rf/hgb/mlp/sgd
AFFINITY_SPLIT = 'scaffold'  # random/scaffold/protein/scaffold-protein
AFFINITY_TYPES = 'Ki,Kd'  # or 'any'
AFFINITY_KMER = 2
AFFINITY_USE_RDKIT_DESC = True
AFFINITY_USE_AAC = True
AFFINITY_USE_SEQ_LEN = True
AFFINITY_ETR_N_ESTIMATORS = 400
AFFINITY_USE_PROTEIN_STATS = True
AFFINITY_CENTER_BY_PROTEIN = False
AFFINITY_PROTEIN_HASH_DIM = 128
AFFINITY_MIN_PROTEIN_SAMPLES = 5000
AFFINITY_LIGHT_MODEL_OUT = 'models/affinity_light_scaffold.joblib'

# Deep affinity model (GPU recommended)
AFFINITY_DEEP_EPOCHS = 6
AFFINITY_DEEP_BATCH = 8
AFFINITY_DEEP_LR = 2e-5
AFFINITY_DEEP_MAX_SMI = 128
AFFINITY_DEEP_MAX_PROT = 512
AFFINITY_DEEP_MAX_ROWS = 0
AFFINITY_DEEP_SPLIT = 'scaffold'
AFFINITY_DEEP_TYPES = 'Ki,Kd'
AFFINITY_DEEP_MIN_PROTEIN_SAMPLES = 5000
AFFINITY_DEEP_MIN_BEFORE_TYPE = True
AFFINITY_DEEP_AGGREGATE = 'median'
# None means the corresponding --paffinity-min / --paffinity-max option is not passed.
AFFINITY_DEEP_PMIN = None
AFFINITY_DEEP_PMAX = None
AFFINITY_DEEP_SMILES_BACKBONE = 'seyonec/ChemBERTa-zinc-base-v1'
AFFINITY_DEEP_PROTEIN_BACKBONE = 'facebook/esm2_t6_8M_UR50D'
AFFINITY_DEEP_HIDDEN_DIM = 512
AFFINITY_DEEP_DROPOUT = 0.1
AFFINITY_DEEP_USE_INTERACTIONS = True
AFFINITY_DEEP_USE_EXTRA = True
# The deep model's extra-feature settings mirror the light model's values above.
AFFINITY_DEEP_FP_RADIUS = AFFINITY_FP_RADIUS
AFFINITY_DEEP_FP_NBITS = AFFINITY_FP_NBITS
AFFINITY_DEEP_FP_COUNTS = AFFINITY_FP_COUNTS
AFFINITY_DEEP_KMER = AFFINITY_KMER
AFFINITY_DEEP_USE_RDKIT_DESC = AFFINITY_USE_RDKIT_DESC
AFFINITY_DEEP_USE_AAC = AFFINITY_USE_AAC
AFFINITY_DEEP_USE_SEQ_LEN = AFFINITY_USE_SEQ_LEN
AFFINITY_DEEP_USE_PROTEIN_STATS = True
AFFINITY_DEEP_PROTEIN_HASH_DIM = 128
AFFINITY_DEEP_CENTER_BY_PROTEIN = False
AFFINITY_DEEP_USE_TEACHER = True
AFFINITY_DEEP_TEACHER_ETR_N = 400
AFFINITY_DEEP_TEACHER_ETR_MAX_FEAT = 'sqrt'
AFFINITY_DEEP_TEACHER_OUT = 'models/affinity_teacher.joblib'
AFFINITY_DEEP_AMP = True
AFFINITY_DEEP_OUT = 'models/affinity.pt'


## AlphaGenome API (可选)

使用前请在环境变量中设置 `ALPHA_GENOME_API_KEY`（例如写入 `.env`，然后 `load_dotenv()`）。


In [None]:
import os
from dotenv import load_dotenv
from alphagenome.models import dna_client

# Load variables from a .env file (if present), then read the API key from the
# process environment. The cell stops here when the key is missing or empty.
load_dotenv()
ALPHAGENOME_API_KEY = os.environ.get('ALPHA_GENOME_API_KEY')
assert ALPHAGENOME_API_KEY, '请先设置 ALPHA_GENOME_API_KEY'

# Create the client and request its output metadata.
ag = dna_client.create(api_key=ALPHAGENOME_API_KEY)
meta = ag.output_metadata()
print('AlphaGenome metadata loaded:', type(meta))


## 0) Emu3-Chat 本地模型路径与加载测试
请将 `EMU3_CHAT_PATH` 替换为本地 HuggingFace 目录（例如 `/path/to/BAAI/Emu3-Chat`）。


In [None]:
# Emu3-Chat load test (physical GPU 1 only; restart the kernel and run the
# GPU-pinning cell above first).
EMU3_CHAT_PATH = '/home/xuchengjie/Program/Emu3'

import gc
import torch
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, AutoModelForCausalLM

# Start from a clean slate so leftovers from earlier cells do not count against
# the memory budget below.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

assert (not torch.cuda.is_available()) or (torch.cuda.device_count() == 1), (
    f'Expected 1 visible CUDA device (physical GPU1 masked to cuda:0). Got {torch.cuda.device_count()}.'
)

config = AutoConfig.from_pretrained(EMU3_CHAT_PATH, trust_remote_code=True)

# Resolve the out-of-memory exception class in a version-tolerant way:
# `torch.OutOfMemoryError` is only a top-level name on newer PyTorch builds,
# while older builds expose `torch.cuda.OutOfMemoryError`. Referencing a missing
# attribute in the `except` clause would raise AttributeError while handling the
# real error and the 4-bit fallback would never run. RuntimeError is the last
# resort for builds that define neither name.
_OOM_ERROR = (
    getattr(torch, 'OutOfMemoryError', None)
    or getattr(torch.cuda, 'OutOfMemoryError', RuntimeError)
)

# Emu3-Chat in FP16 usually does not fit on an 11GB card: either force part of
# the weights onto the CPU or use 4-bit / 8-bit quantization.
# First attempt: FP16 + device_map='auto' + max_memory (7GiB on the GPU) so the
# remaining weights are offloaded.
try:
    max_memory = {0: '7GiB', 'cpu': '128GiB'}
    model = AutoModelForCausalLM.from_pretrained(
        EMU3_CHAT_PATH,
        dtype=torch.float16,
        device_map='auto',
        max_memory=max_memory,
        offload_folder='data/cache/emu3_offload',
        offload_state_dict=True,
        offload_buffers=True,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    print('Emu3-Chat loaded (fp16 + cpu offload)')
except _OOM_ERROR as e:
    print('FP16+offload still OOM, fallback to 4-bit quantization:', e)
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        EMU3_CHAT_PATH,
        quantization_config=bnb_config,
        device_map='auto',
        trust_remote_code=True,
    )
    print('Emu3-Chat loaded (4-bit)')

# AutoProcessor can fail when the image-processor files are missing; fall back
# to AutoTokenizer in that case. Only one of `processor` / `tokenizer` ends up
# defined, depending on which path succeeded.
try:
    processor = AutoProcessor.from_pretrained(EMU3_CHAT_PATH, trust_remote_code=True)
    print('Emu3-Chat loaded with AutoProcessor')
except Exception as e:
    print('AutoProcessor failed, fallback to AutoTokenizer:', e)
    tokenizer = AutoTokenizer.from_pretrained(EMU3_CHAT_PATH, trust_remote_code=True)
    print('Emu3-Chat loaded with AutoTokenizer')


In [None]:
# emu3_cleanup
# If you are going to train models or run large-batch inference afterwards,
# free the GPU memory once the Emu3 test is finished.
import gc
import torch

# Remove whichever of the Emu3 test objects exist in the notebook namespace;
# names that were never created are simply skipped.
for _name in ('model', 'processor', 'tokenizer'):
    globals().pop(_name, None)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print('Emu3 objects cleared')


**GPU 说明**：当前虚拟环境已安装 CPU 版 PyTorch。若需要 GPU，请在该内核里替换安装 CUDA 版 PyTorch，然后重启内核。

示例（CUDA 12.1）：
```
pip install --upgrade torch --index-url https://download.pytorch.org/whl/cu121
```
CUDA 11.8 可替换为 `cu118`。


## 0.5) .smiles 理化性质特征（用于下游分类/分析）

从 `nr-*.smiles` / `sr-*.smiles` 生成理化性质特征，并输出分布图。


In [None]:
!python scripts/featurize_smiles.py --smiles-dir . --patterns 'nr-*.smiles,sr-*.smiles' --out-dir data/processed/smiles_physchem_by_assay
!python scripts/featurize_smiles.py --input data/processed/tox21.csv --out data/processed/tox21_physchem.csv
!python scripts/featurize_smiles.py --input data/input/compounds.csv --out data/processed/compounds_physchem.csv
from pathlib import Path
import pandas as pd
from IPython.display import display, Image

display(pd.read_csv('data/processed/smiles_physchem.csv').head(10))
display(pd.read_csv('data/processed/smiles_physchem_summary.csv').head(10))

plots = [
  'data/processed/smiles_physchem_hist.png',
  'data/processed/smiles_physchem_corr.png',
  'data/processed/smiles_pca.png',
  'data/processed/smiles_tsne_density.png',
  'data/processed/smiles_tsne_scatter.png',
  'data/processed/smiles_similarity_matrix.png',
  'data/processed/smiles_similarity_clustermap.png',
]
for p in plots:
  if Path(p).exists():
    display(Image(filename=p))
  else:
    print('missing plot:', p)


## 1) 处理 Tox21 数据


In [None]:
!python scripts/prepare_tox21.py --out data/processed/tox21.csv
import pandas as pd
from IPython.display import display
display(pd.read_csv('data/processed/tox21.csv').head(10))


In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display, Image

csv_path = Path('data/processed/tox21_label_stats.csv')
png_path = Path('data/processed/tox21_label_stats.png')
rate_path = Path('data/processed/tox21_positive_rate.png')
tox21_path = Path('data/processed/tox21.csv')

if not tox21_path.exists():
    !python scripts/prepare_tox21.py --out data/processed/tox21.csv
if not csv_path.exists() or not png_path.exists() or not rate_path.exists():
    !python scripts/report_tox21_stats.py

display(pd.read_csv(csv_path))
display(Image(filename=str(png_path)))
display(Image(filename=str(rate_path)))


## 2) 生成 decoy 阴性对照 (基于 DTX 化合物池)

如需先小规模验证，可设置 `MAX_DECOY_MOLS=50000`。留空表示全量。注意：当前流程已禁用 decoy 生成（`.smiles` 文件已包含阴性/阳性样本），下方单元仅打印跳过提示。


In [None]:
# Decoy generation is disabled: the .smiles files already contain both
# negatives and positives. This cell only prints a notice.
print('Skip decoy generation')


In [None]:
import pandas as pd
from IPython.display import display

# Load the label statistics and add positive_rate = pos / (pos + neg) per row.
stats = pd.read_csv('data/processed/tox21_label_stats.csv')
_total = stats['pos'] + stats['neg']
stats['positive_rate'] = stats['pos'].div(_total)
display(stats.loc[:, ['assay', 'pos', 'neg', 'positive_rate']])


## 3) 训练通路/毒性多标签模型（支持 Emu3-Chat QLoRA）


In [None]:
import os
import subprocess
import sys

# Builds and runs the scripts/train_pathway_model.py command line from the
# PATHWAY_* settings in TRAIN_CONFIG.
# Note: ChemBERTa is the default; for Emu3-Chat QLoRA configure
# PATHWAY_BACKBONE / TYPE / 4-bit / LoRA etc. in TRAIN_CONFIG.
# The script is launched with the kernel's own interpreter (sys.executable), as
# the affinity-training and prediction cells do, so it runs in the same
# environment as the notebook instead of whatever `python` is first on PATH.
cmd = [
    sys.executable, 'scripts/train_pathway_model.py',
    '--epochs', str(PATHWAY_EPOCHS),
    '--batch', str(PATHWAY_BATCH),
    '--grad-accum', str(PATHWAY_GRAD_ACCUM),
    '--lr', str(PATHWAY_LR),
    '--weight-decay', str(PATHWAY_WEIGHT_DECAY),
    '--max-length', str(PATHWAY_MAX_LENGTH),
    '--backbone', str(PATHWAY_BACKBONE),
    '--backbone-type', str(PATHWAY_BACKBONE_TYPE),
    '--out', str(PATHWAY_MODEL_OUT),
    '--physchem', str(PATHWAY_PHYSCHEM),
]
# Honor the TRAIN_CONFIG switch instead of always enabling physchem features.
if PATHWAY_USE_PHYSCHEM:
    cmd.append('--use-physchem')

if PATHWAY_TORCH_DTYPE:
    cmd += ['--torch-dtype', str(PATHWAY_TORCH_DTYPE)]
# Memory limits and the offload folder are only passed together with a device map.
if PATHWAY_DEVICE_MAP:
    cmd += [
        '--device-map', str(PATHWAY_DEVICE_MAP),
        '--max-gpu-memory', str(PATHWAY_MAX_GPU_MEMORY),
        '--max-cpu-memory', str(PATHWAY_MAX_CPU_MEMORY),
        '--offload-folder', str(PATHWAY_OFFLOAD_FOLDER),
    ]
if PATHWAY_LOAD_IN_4BIT:
    cmd.append('--load-in-4bit')
if PATHWAY_LOAD_IN_8BIT:
    cmd.append('--load-in-8bit')

if PATHWAY_GRADIENT_CHECKPOINTING:
    cmd.append('--gradient-checkpointing')
# LoRA options are only passed when a positive rank is configured.
if PATHWAY_LORA_R and int(PATHWAY_LORA_R) > 0:
    cmd += [
        '--lora-r', str(PATHWAY_LORA_R),
        '--lora-alpha', str(PATHWAY_LORA_ALPHA),
        '--lora-dropout', str(PATHWAY_LORA_DROPOUT),
        '--lora-bias', str(PATHWAY_LORA_BIAS),
        '--lora-target-modules', str(PATHWAY_LORA_TARGET_MODULES),
    ]
if PATHWAY_USE_POS_WEIGHT:
    cmd.append('--use-pos-weight')
if PATHWAY_EARLY_STOP:
    cmd += [
        '--early-stop-metric', str(PATHWAY_EARLY_STOP_METRIC),
        '--early-stop-patience', str(PATHWAY_EARLY_STOP_PATIENCE),
        '--early-stop-min-delta', str(PATHWAY_EARLY_STOP_MIN_DELTA),
    ]

# Give the child process an explicit GPU pin (physical GPU 1), independent of
# what the kernel's environment currently holds.
env = os.environ.copy()
env['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
env['CUDA_VISIBLE_DEVICES'] = '1'
print('Running:', ' '.join(cmd))
print('CUDA_VISIBLE_DEVICES for subprocess:', env['CUDA_VISIBLE_DEVICES'])
# check=True: a non-zero exit status raises CalledProcessError and stops the cell.
subprocess.run(cmd, check=True, env=env)


In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display, Image

# Last 10 rows of the pathway training metrics table.
metrics_tail = pd.read_csv('data/processed/pathway_training_metrics.csv').tail(10)
display(metrics_tail)

# Figures to show; any that do not exist on disk are reported instead.
plots = [
    'data/processed/pathway_training_metrics.png',
    'data/processed/pathway_metrics_overview.png',
    'data/processed/pathway_pred_score_dist.png',
    'data/processed/pathway_roc_curve.png',
    'data/processed/pathway_roc_curve_macro.png',
    'data/processed/pathway_pr_curve.png',
    'data/processed/pathway_pr_curve_macro.png',
]
for p in plots:
    if not Path(p).exists():
        print('missing plot:', p)
        continue
    display(Image(filename=p))


## 4) 获取通路蛋白序列 (UniProt)


In [None]:
# Run scripts/fetch_proteins.py with its default arguments (this section's
# step: fetch pathway protein sequences from UniProt).
!python scripts/fetch_proteins.py


## 4.1) 生成 UniProt → Gene 映射（用于 AlphaGenome 合并）


In [None]:
# Run scripts/fetch_uniprot_gene_map.py with its default arguments (this
# section's step: build the UniProt -> gene mapping).
!python scripts/fetch_uniprot_gene_map.py


## 5) 亲和力训练数据准备 (BindingDB + UniProt)


In [None]:
# Affinity data preparation, run as three scripts in sequence:
# download BindingDB, prepare it with --human-only, then fetch UniProt
# sequences for data/affinity/bindingdb_filtered.csv.
!python scripts/download_bindingdb.py
!python scripts/prepare_bindingdb.py --human-only
!python scripts/fetch_uniprot_sequences.py --input data/affinity/bindingdb_filtered.csv


## 6) 训练亲和力模型


In [None]:
import sys
import subprocess
import torch

# Builds and runs the scripts/train_affinity.py command line from the
# AFFINITY_DEEP_* settings in TRAIN_CONFIG.
if not torch.cuda.is_available():
    print('WARNING: 未检测到 GPU，深度亲和力模型训练会非常慢。')

py = sys.executable

# Options that always carry a value, listed in the order they are passed.
_valued_options = (
    ('--epochs', AFFINITY_DEEP_EPOCHS),
    ('--batch', AFFINITY_DEEP_BATCH),
    ('--lr', AFFINITY_DEEP_LR),
    ('--hidden-dim', AFFINITY_DEEP_HIDDEN_DIM),
    ('--dropout', AFFINITY_DEEP_DROPOUT),
    ('--max-smi', AFFINITY_DEEP_MAX_SMI),
    ('--max-prot', AFFINITY_DEEP_MAX_PROT),
    ('--max-rows', AFFINITY_DEEP_MAX_ROWS),
    ('--split', AFFINITY_DEEP_SPLIT),
    ('--affinity-types', AFFINITY_DEEP_TYPES),
    ('--min-protein-samples', AFFINITY_DEEP_MIN_PROTEIN_SAMPLES),
    ('--aggregate', AFFINITY_DEEP_AGGREGATE),
    ('--smiles-backbone', AFFINITY_DEEP_SMILES_BACKBONE),
    ('--protein-backbone', AFFINITY_DEEP_PROTEIN_BACKBONE),
    ('--out', AFFINITY_DEEP_OUT),
)
cmd = [py, 'scripts/train_affinity.py']
for _flag, _value in _valued_options:
    cmd += [_flag, str(_value)]

if AFFINITY_DEEP_MIN_BEFORE_TYPE:
    cmd.append('--min-protein-before-type')
# The pAffinity bounds are optional; None leaves the option off the command line.
for _flag, _bound in (
    ('--paffinity-min', AFFINITY_DEEP_PMIN),
    ('--paffinity-max', AFFINITY_DEEP_PMAX),
):
    if _bound is not None:
        cmd += [_flag, str(_bound)]
if AFFINITY_DEEP_AMP:
    cmd.append('--amp')
cmd.append('--use-interactions' if AFFINITY_DEEP_USE_INTERACTIONS else '--no-interactions')

# Extra hand-crafted features: fingerprint settings plus opt-out switches.
if AFFINITY_DEEP_USE_EXTRA:
    cmd += [
        '--extra-features',
        '--fp-radius', str(AFFINITY_DEEP_FP_RADIUS),
        '--fp-nbits', str(AFFINITY_DEEP_FP_NBITS),
    ]
    if AFFINITY_DEEP_FP_COUNTS:
        cmd.append('--fp-counts')
    # These are passed as "--no-*" flags only when the feature is turned off.
    for _enabled, _off_flag in (
        (AFFINITY_DEEP_USE_RDKIT_DESC, '--no-rdkit-desc'),
        (AFFINITY_DEEP_USE_AAC, '--no-aac'),
        (AFFINITY_DEEP_USE_SEQ_LEN, '--no-seq-len'),
    ):
        if not _enabled:
            cmd.append(_off_flag)
    if AFFINITY_DEEP_KMER and AFFINITY_DEEP_KMER > 0:
        cmd += ['--kmer', str(AFFINITY_DEEP_KMER)]
    else:
        cmd.append('--no-kmer')

if AFFINITY_DEEP_USE_PROTEIN_STATS:
    cmd.append('--use-protein-stats')
if AFFINITY_DEEP_PROTEIN_HASH_DIM and AFFINITY_DEEP_PROTEIN_HASH_DIM > 0:
    cmd += ['--protein-hash-dim', str(AFFINITY_DEEP_PROTEIN_HASH_DIM)]
if AFFINITY_DEEP_CENTER_BY_PROTEIN:
    cmd.append('--center-by-protein')
if AFFINITY_DEEP_USE_TEACHER:
    cmd += [
        '--teacher-etr',
        '--teacher-etr-n-estimators', str(AFFINITY_DEEP_TEACHER_ETR_N),
        '--teacher-etr-max-features', str(AFFINITY_DEEP_TEACHER_ETR_MAX_FEAT),
        '--teacher-etr-out', str(AFFINITY_DEEP_TEACHER_OUT),
    ]

print('Running:', ' '.join(cmd))
# check=True: a non-zero exit status raises CalledProcessError and stops the cell.
subprocess.run(cmd, check=True)


In [None]:
import pandas as pd
from IPython.display import display, Image

# Affinity training metrics table followed by the diagnostic figures.
display(pd.read_csv('data/processed/affinity_training_metrics.csv'))
for _figure in (
    'data/processed/affinity_training_metrics.png',
    'data/processed/affinity_pred_scatter.png',
    'data/processed/affinity_residuals.png',
    'data/processed/affinity_error_vs_pred.png',
):
    display(Image(filename=_figure))


## 7) 输入化合物预测 + 毒性信息整合

请将待预测化合物放入 `data/input/compounds.csv`，需至少包含 `smiles` 列。


**运行预测与汇总（全靶点）**：默认会输出所有靶点（`--topk 0`），并生成 `top_targets_report.csv`。


In [None]:
import sys
import subprocess
from pathlib import Path

# Runs, in order: pathway prediction, affinity prediction (deep or light
# model), the hazard lookup, and the top-target analysis. Each step is a
# separate script started with the kernel's interpreter.
py = sys.executable
compounds_csv = 'data/input/compounds.csv'
cmds = [
    [py, 'scripts/predict_pathways.py', '--input', compounds_csv, '--physchem', 'data/processed/compounds_physchem.csv'],
]

# Prefer the deep affinity model when its checkpoint exists; otherwise use the
# light model. The existence check uses AFFINITY_DEEP_OUT -- the same path that
# is handed to the script -- so changing the configured output path cannot make
# the check and the loaded model disagree.
if Path(AFFINITY_DEEP_OUT).exists():
    cmds.append([py, 'scripts/predict_affinity.py', '--input', compounds_csv, '--model', str(AFFINITY_DEEP_OUT), '--topk', '0'])
else:
    cmds.append([py, 'scripts/predict_affinity_light.py', '--input', compounds_csv, '--model', str(AFFINITY_LIGHT_MODEL_OUT), '--topk', '0'])

cmds += [
    [py, 'scripts/ctx_hazard_lookup.py', '--input', compounds_csv],
    [
        py, 'scripts/analyze_top_targets.py',
        '--affinity', 'data/processed/affinity_predictions.csv',
        '--pathways', 'data/processed/pathway_predictions.csv',
        '--ctx', 'data/toxicity/ctx_hazard.csv',
        '--out', 'data/processed/top_targets_report.csv',
    ],
]

# check=True stops at the first failing step instead of running later steps on
# missing inputs.
for cmd in cmds:
    print('Running:', ' '.join(cmd))
    subprocess.run(cmd, check=True)


In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display

# Preview the top-targets report if the prediction cell has produced it.
report = Path('data/processed/top_targets_report.csv')
if not report.exists():
    print('Missing top_targets_report.csv; please run the prediction cell above.')
else:
    df = pd.read_csv(report)
    display(df.head(10))
    # The gene count is only printed when the report has a 'gene' column.
    if 'gene' in df.columns:
        gene_count = df['gene'].dropna().nunique()
        print('Unique genes:', gene_count)


## 8) AlphaGenome 批量预测（可选，参与毒性/靶点分析）

请在 `data/input/alphagenome_queries.csv` 填写 query_type（sequence/interval/variant）以及对应字段。
结果将写入 `data/processed/alphagenome_results.jsonl` 与 `data/processed/alphagenome_summary.csv`。


In [None]:
# 自动从 top_targets_report.csv 生成 AlphaGenome 查询（按 gene）
AG_TOPK = 200  # 生成多少个基因区间查询；可根据需求调整
AG_REQUESTED_OUTPUTS = "RNA_SEQ"  # 逗号分隔，可选：RNA_SEQ,DNASE,ATAC...
AG_ONTOLOGY_TERMS = "UBERON:0002048"  # 逗号分隔；默认肺组织，可按需替换
!python scripts/build_alphagenome_queries.py --targets data/processed/top_targets_report.csv --out data/input/alphagenome_queries.csv --topk {AG_TOPK} --requested-outputs "{AG_REQUESTED_OUTPUTS}" --ontology-terms "{AG_ONTOLOGY_TERMS}"


In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display

# 强制重跑可将此设为 True
AG_FORCE = True
summary = Path('data/processed/alphagenome_summary.csv')
results = Path('data/processed/alphagenome_results.jsonl')
if AG_FORCE:
    summary.unlink(missing_ok=True)
    results.unlink(missing_ok=True)

if not summary.exists():
    print('Running AlphaGenome batch...')
    !python scripts/alphagenome_batch.py --input data/input/alphagenome_queries.csv

if summary.exists():
    display(pd.read_csv(summary).head(20))
    print('Raw results:', results)
else:
    print('AlphaGenome summary still missing; check alphagenome_queries.csv and API key')
