## 01 Model training

In [None]:
# 01 ATAC-MAE training
# Expose the CUDA 12.2 toolkit binaries and libraries to the process launched below.
export CUDA_HOME="/usr/local/cuda-12.2"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
# Launch the ATAC-MAE trainer: --data is the input matrix, --outdir is ./model/ATAC-MAE.
python ./script/ATAC-MAE.train.py \
  --data ./data/CPM_TMM.fix.tsv \
  --outdir ./model/ATAC-MAE \
  --latent_dim 256 \
  --width 256 \
  --depth 3 \
  --epochs 200 \
  --lr 1e-4 \
  --no_mixed \
  --xla
# 02 ExprFormer training
# Expose the CUDA 12.2 toolkit binaries and libraries to the process launched below.
export CUDA_HOME="/usr/local/cuda-12.2"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
# --z_peak is read from ./model/ATAC-MAE, the --outdir of step 01, so that step
# presumably has to complete before this one is started.
python ./script/ExprFormer_train.py \
  --fasta ./data/Os.fa \
  --gff ./data/Os.gff3 \
  --expr ./data/TPM_matrix_filtered.tsv \
  --pcc ./data/pcc_cluster_with_coords.tsv \
  --z_peak ./model/ATAC-MAE/z_peak_embeddings.tsv \
  --cluster_map ./data/cluster.txt \
  --outdir ./model/ExprFormer \
  --epochs 200 \
  --batch_size 72 \
  --lr 5e-5 \
  --lambda_mag 0.02 \
  --lambda_pcc 0.001 \
  --lambda_ko 0.001 \
  --lambda_imp 0.001 \
  --d_seq 256 \
  --d_pool 128 \
  --d_head 128 \
  --threads 8 \
  --seed 42
# 03 Seq2ATAC training
# No CUDA exports are repeated for this step; it runs with whatever environment
# the shell already has at this point.
python ./script/Seq2ATAC.train.py \
  --fasta ./data/Os.fa \
  --atac_matrix ./data/CPM_TMM.fix.tsv \
  --outdir ./model/Seq2ATAC \
  --batch_size 128 \
  --epochs 200 \
  --n_workers 32

## 02 Predicting variant effects on gene expression

In [None]:
# 01 Predicting variant effects on ATAC
# Single-nucleotide variants (SNPs) were implemented by directly substituting the
# corresponding bases in the reference sequence with the mutant alleles. For
# insertion and deletion (indel) events, no additional length constraints were
# imposed; instead, sequences were edited strictly according to the mutation
# definitions, with insertions introduced by adding the inserted fragments at the
# specified positions and deletions implemented by removing the corresponding
# reference segments, thereby faithfully capturing the effects of indels on
# chromatin accessibility (ATAC).
export CUDA_HOME=/usr/local/cuda-12.2
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
export PATH=${CUDA_HOME}/bin:${PATH}
# The continuation backslash must be the very last character on each line: any
# trailing space or "# comment" after it ends the command early and the remaining
# flags are then executed as separate (failing) commands.
# NOTE(review): --encoder points at ./model/stage1_encoder.keras, which is outside
# the ./model/ATAC-MAE directory used as --outdir by the ATAC-MAE training step;
# confirm this is the intended encoder file.
python ./script/Seq2ATAC_predict.py \
  --fasta ./data/Os.fa \
  --alt_dir ./atac-mutant_genomes-new \
  --peaks ./data/CPM_TMM.fix.tsv \
  --bpnet ./model/Seq2ATAC/bpnet_realpeak_seq2atac_model.keras \
  --encoder ./model/stage1_encoder.keras \
  --outdir atac-predict-out \
  --chr Chr10 \
  --batch_size 256

# 02 Predicting variant effects on gene expression
# To ensure consistency of gene structure and analysis windows across different
# mutation types, a unified length-constraining strategy was applied to all mutant
# sequences. Single-nucleotide substitutions and length-preserving replacements
# were directly introduced by substituting the corresponding bases in the
# reference sequence. For deletions, small missing segments were padded with *N*
# (<5 bp) at the corresponding positions to prevent changes in window length. For
# insertion events, particularly larger insertions, the inserted sequence was
# first fully retained and the resulting sequence was then truncated at the window
# terminus to restore an identical length to that of the original window.
# run
export CUDA_HOME=/usr/local/cuda-12.2
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
export PATH=${CUDA_HOME}/bin:${PATH}
# WT predict
# The script path must be a single word: with a space after "./script/" Python is
# asked to run the directory itself and the script name becomes a stray argument.
# NOTE(review): this run writes to --outdir predict_gene_expresion with --prefix WT,
# while the ALT run uses --outdir gene-exp-predict with --prefix alt; confirm the
# differing directory names and prefix casing are intentional.
python ./script/ExprFormer.predict.py \
  --model ./model/ExprFormer/stage2pp_v4_2_stableval_explainSampling_model.keras \
  --fasta ./data/Os.fa \
  --gff ./data/Os.gff3 \
  --expr ./data/TPM_matrix_filtered.tsv \
  --pcc ./data/pcc_cluster_with_coords.tsv \
  --z_peak ./atac-predict-out/WT_z.tsv \
  --genes gene_list.txt \
  --prefix WT \
  --outdir predict_gene_expresion

# ALT predict
# Same predictor as the WT run, but fed the mutant genome (--fasta ALT.fa) and the
# ALT peak embeddings from ./atac-predict-out.
# The script path must be a single word: with a space after "./script/" Python is
# asked to run the directory itself and the script name becomes a stray argument.
python ./script/ExprFormer.predict.py \
  --model ./model/ExprFormer/stage2pp_v4_2_stableval_explainSampling_model.keras \
  --fasta ALT.fa \
  --gff ./data/Os.gff3 \
  --expr ./data/TPM_matrix_filtered.tsv \
  --pcc ./data/pcc_cluster_with_coords.tsv \
  --z_peak ./atac-predict-out/ALT_z.tsv \
  --genes gene_list.txt \
  --prefix alt \
  --outdir gene-exp-predict

## 03 Sequence Feature Attribution and Deep Learning Interpretability

In [None]:
# Expose the CUDA 12.2 toolkit binaries and libraries to the process launched below.
export CUDA_HOME="/usr/local/cuda-12.2"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
# Run the ExprFormer sequence-explanation script for the genes listed in
# ./data/spike.gene_list.txt, using the trained ExprFormer model and the
# ATAC-MAE peak embeddings; --outdir is spike_gene.
python ./script/ExprFormer_seq_explain.py \
  --model ./model/ExprFormer/stage2pp_v4_2_stableval_explainSampling_model.keras \
  --fasta ./data/Os.fa \
  --gff ./data/Os.gff3 \
  --expr ./data/TPM_matrix_filtered.tsv \
  --pcc ./data/pcc_cluster_with_coords.tsv \
  --z_peak ./model/ATAC-MAE/z_peak_embeddings.tsv \
  --gene_list ./data/spike.gene_list.txt \
  --tissues YP2 \
  --windows 1:5500 \
  --outdir spike_gene \
  --dump_attr_tsv
