

In this section, we demonstrate the generation of single-cell–level regulatory graphs required for running scReGAT on matched multi-omics datasets.

As an illustrative example, we employ human pancreatic tissue data, originally reported by Wang et al.


In [1]:
import os
import sys
import random
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import hypergeom
from scregat.data_process import prepare_model_input, sum_counts, plot_edge, ATACGraphDataset
import scanpy as sc
import itertools
import torch

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


In [2]:
import scregat
import os

print(scregat.data_process.__file__)
print(os.path.dirname(scregat.__file__))


/root/scReGAT/scregat/data_process.py
/root/scReGAT/scregat


In [3]:
ATAC_h5ad_file = "../../C0027_atac_celltype.h5ad"
RNA_h5ad_file = "../../C0027_rna_celltype.h5ad"


# - It is important to note that both **adata_rna** and **adata_atac** must contain a `celltype` column in their `obs`.  
# - The cell types present in **adata_atac** must also be represented in the `obs` of **adata_rna**.  
# - The `celltype` entries in **adata_rna** are allowed to include a larger number of categories.  


In [4]:
adata_atac = sc.read_h5ad(ATAC_h5ad_file)
adata_rna = sc.read_h5ad(RNA_h5ad_file)




In [5]:
adata_atac

AnnData object with n_obs × n_vars = 6156 × 89233
    obs: 'donor', 'celltype', 'celltype_rna'
    var: '0', '1', '2', '3', '4', '5'

In [6]:
adata_atac.obs['celltype_rna'] == adata_atac.obs['celltype']
adata_atac.obs

Unnamed: 0,donor,celltype,celltype_rna
C0027_AAACAGCCACACTAAT-1,C0027,Beta,Beta
C0027_AAACAGCCATCATGGC-1,C0027,Beta,Beta
C0027_AAACAGCCATGAGTTT-1,C0027,Alpha,Alpha
C0027_AAACATGCACTGGCTG-1,C0027,Beta,Beta
C0027_AAACATGCAGGACACA-1,C0027,Beta,Beta
...,...,...,...
C0027_TTTGTTGGTGCTCCGT-1,C0027,Beta,Beta
C0027_TTTGTTGGTGTGTCCC-1,C0027,Beta,Beta
C0027_TTTGTTGGTTAGTGAT-1,C0027,Acinar,Acinar
C0027_TTTGTTGGTTTCGCGC-1,C0027,Beta,Beta


In [7]:
adata_atac.var

Unnamed: 0,0,1,2,3,4,5
chr1-9816-10734,chr1:9816-10734,chr1:9816-10734,Peaks,chr1,9816,10734
chr1-14483-15389,chr1:14483-15389,chr1:14483-15389,Peaks,chr1,14483,15389
chr1-17065-17950,chr1:17065-17950,chr1:17065-17950,Peaks,chr1,17065,17950
chr1-28812-29619,chr1:28812-29619,chr1:28812-29619,Peaks,chr1,28812,29619
chr1-235307-236151,chr1:235307-236151,chr1:235307-236151,Peaks,chr1,235307,236151
...,...,...,...,...,...,...
GL000192.1-510831-511698,GL000192.1:510831-511698,GL000192.1:510831-511698,Peaks,GL000192.1,510831,511698
GL000192.1-512746-513650,GL000192.1:512746-513650,GL000192.1:512746-513650,Peaks,GL000192.1,512746,513650
GL000192.1-538992-539887,GL000192.1:538992-539887,GL000192.1:538992-539887,Peaks,GL000192.1,538992,539887
GL000192.1-540481-541352,GL000192.1:540481-541352,GL000192.1:540481-541352,Peaks,GL000192.1,540481,541352


In [8]:
adata_rna

AnnData object with n_obs × n_vars = 6156 × 60658
    obs: 'donor', 'celltype', 'celltype_rna'
    var: '0', '1', '2', '3', '4', '5'

In [9]:
adata_rna.obs

Unnamed: 0,donor,celltype,celltype_rna
C0027_AAACAGCCACACTAAT-1,C0027,Beta,Beta
C0027_AAACAGCCATCATGGC-1,C0027,Beta,Beta
C0027_AAACAGCCATGAGTTT-1,C0027,Alpha,Alpha
C0027_AAACATGCACTGGCTG-1,C0027,Beta,Beta
C0027_AAACATGCAGGACACA-1,C0027,Beta,Beta
...,...,...,...
C0027_TTTGTTGGTGCTCCGT-1,C0027,Beta,Beta
C0027_TTTGTTGGTGTGTCCC-1,C0027,Beta,Beta
C0027_TTTGTTGGTTAGTGAT-1,C0027,Acinar,Acinar
C0027_TTTGTTGGTTTCGCGC-1,C0027,Beta,Beta


In [10]:
adata_rna.var

Unnamed: 0,0,1,2,3,4,5
DDX11L1,ENSG00000223972.5_2,DDX11L1,Gene Expression,chr1,11868,12010
WASH7P,ENSG00000227232.5_2,WASH7P,Gene Expression,chr1,29569,29570
RP11-34P13.3,ENSG00000243485.5_4,RP11-34P13.3,Gene Expression,chr1,29553,30267
FAM138A,ENSG00000237613.2_2,FAM138A,Gene Expression,chr1,36080,36081
OR4G4P,ENSG00000268020.3_4,OR4G4P,Gene Expression,chr1,52472,52473
...,...,...,...,...,...,...
CH507-145C22.3,ENSG00000278878.1_4,CH507-145C22.3,Gene Expression,GL000195.1,137994,137995
CH17-351M24.1,ENSG00000263278.2_4,CH17-351M24.1,Gene Expression,GL000204.1,49838,49839
CH507-513H4.4,ENSG00000280614.1_4,CH507-513H4.4,Gene Expression,GL000220.1,154724,154725
CH507-513H4.3,ENSG00000281181.1_4,CH507-513H4.3,Gene Expression,GL000220.1,154724,154725


In [11]:
adata_atac.obs.celltype.unique()

['Beta', 'Alpha', 'Delta', 'Acinar', 'Immune', 'Stellate', 'Ductal', 'Gamma']
Categories (8, object): ['Acinar', 'Alpha', 'Beta', 'Delta', 'Ductal', 'Gamma', 'Immune', 'Stellate']

In [12]:
adata_rna.obs.celltype.unique()

['Beta', 'Alpha', 'Delta', 'Acinar', 'Immune', 'Stellate', 'Ductal', 'Gamma']
Categories (8, object): ['Acinar', 'Alpha', 'Beta', 'Delta', 'Ductal', 'Gamma', 'Immune', 'Stellate']

# Check whether the cell type order matches

In [13]:
adata_rna.obs_names == adata_atac.obs_names

array([ True,  True,  True, ...,  True,  True,  True])

In [14]:
adata_rna.var_names_make_unique()
adata_atac.var_names_make_unique()



In [15]:
adata_rna.obs['celltype'] = adata_rna.obs['celltype'].astype('object')
df_rna = sum_counts(adata_rna,by = 'celltype', marker_gene_num=300)

**finished identifying marker genes by COSG**


In [16]:
df_rna

Unnamed: 0_level_0,RYR2,DMD,AC068491.1,PDGFD,LMO4,CLCN5,LYPLAL1,C12orf79,MT-ND1,ZRANB2,...,CLTC,RASGRF2,PCLO,CPA1,DEPTOR,MYO10,TTC32,ME1,KCTD16,ETV1
celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acinar,98,239,30,322,136,73,348,1244,159,246,...,229,5,270,408,55,883,18,126,132,14
Alpha,125,10725,172,45,243,781,585,2068,1989,1449,...,2005,77,4212,12,181,8376,146,662,306,187
Beta,3302,129,1093,7,50,455,3181,4324,2058,3914,...,5102,1855,11361,14,339,191,258,3953,787,649
Delta,746,605,33,64,7,189,440,509,769,379,...,691,98,2468,1,148,662,50,355,127,221
Ductal,6,70,14,54,23,11,29,131,68,39,...,43,11,73,96,3,115,6,19,7,5
Gamma,2,62,1,10,4,9,6,41,66,18,...,37,9,82,0,20,37,6,9,69,20
Immune,8,22,47,3,12,5,20,104,54,40,...,51,2,33,0,6,28,3,23,1,12
Stellate,16,33,6,5,19,13,22,93,89,30,...,44,13,24,0,7,60,3,31,5,1


In [20]:
import scregat
import os
# This step adds tissue-specific Hi-C regulatory relationships.
# The user needs to provide a set of files. For example,
#in 
base_dir = '../data/'
os.listdir(base_dir)
# we include a file called PO_brain.txt,
# which contains brain tissue-specific Hi-C links.


['TF_Gene_tissue_Brain.csv',
 'PO.txt',
 'PO_brain.txt',
 'readme.md',
 'hg38.chrom.sizes',
 'PP.txt',
 'celltype_specific_cRE_interactions',
 'TF_Gene_tissue_cutoff1.csv',
 'all_tissue_SNP_Gene.txt',
 'genes.protein.tss.tsv',
 'trrust_rawdata.human.tsv',
 'PP_brain.txt',
 'model_init.pth']

In [21]:
adata_atac

AnnData object with n_obs × n_vars = 6156 × 89233
    obs: 'donor', 'celltype', 'celltype_rna'
    var: '0', '1', '2', '3', '4', '5'

In [22]:

# Note that regardless of whether the multi-omics data are matched at the single-cell level, 
# the regulatory graph will be constructed under the unmatched configuration. 
# Single-cell–level matched features will instead be incorporated during model training.


dataset_obj = prepare_model_input(
    # [Core Data] Single-cell ATAC-seq AnnData object.
    # Requirement: .X must be Peak-by-Cell matrix; .obs must contain cell type annotations.
    adata_atac = adata_atac,
    
    # [Output Path] Root directory for storing intermediate processed files.
    # The script will create a 'processed_files' folder here (e.g., sorted bed files).
    path_data_root = '../data/',
    
    # [File Reference] String path to the original ATAC file (used for naming/logging).
    file_atac = ATAC_h5ad_file, 
    
    # [Core Data] RNA expression matrix aggregated by cell type (Pseudo-bulk).
    # Format: Pandas DataFrame. 
    # Index (Rows): Cell type names (must match 'celltype_rna' in adata_atac.obs).
    # Columns: Gene Symbols (e.g., 'TP53').
    df_rna_celltype = df_rna,
    
    # [Prior Knowledge] Path to eQTL (expression Quantitative Trait Loci) data.
    # Used to link SNPs/non-coding regions to target genes.
    path_eqtl = '../data/all_tissue_SNP_Gene.txt',
    
    # [Prior Knowledge] Suffix for Hi-C interaction files to specify tissue context.
    # e.g., if set to "_brain", the code looks for 'PP_brain.txt' and 'PO_brain.txt'.
    # PP = Promoter-Promoter, PO = Promoter-Other (Enhancer).
    # Assuming that tissue-specific Hi-C data are not available, this option may be omitted. 
    Hi_C_file_suffix = "",  
    # [Preprocessing] Whether to convert genomic coordinates from hg19 to hg38.
    # Set True if input ATAC peaks are hg19 (requires LiftOver tool); False if already hg38.
    hg19tohg38 = True,
    liftover_path = '../liftOver',
    chain_file = '../hg19ToHg38.over.chain.gz',
    
    # [QC Filter] Peak filtering threshold.
    # Peaks must be accessible in at least 1% (0.01) of cells to be retained.
    min_percent = 0.01,
    
    # [Prior Knowledge] Whether to use an extended TF (Transcription Factor) database.
    # False: Uses TRRUST only (Curated, high-confidence).
    # True: Uses TRRUST + CHEA3/ChIP-seq aggregated data.
    use_additional_tf = True,
    
    # [Prior Knowledge] Reliability threshold for the extended TF database.
    # Only applies if use_additional_tf=True.
    # Integer indicating in how many tissues/datasets the TF-Gene link must appear to be kept.
    # 10 indicates a high-confidence, conserved regulatory relationship.
    tissue_cuttof = 10
)

only dataset_obj ...
执行 liftover 命令: "../liftOver" "../data/processed_files/peaks.sort.bed" "../hg19ToHg38.over.chain.gz" "../data/peaks_process/peaks_hg38.bed" "../data/peaks_process/unmap.bed"


Reading liftover chains
Mapping coordinates


processing Hi-C ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


processing TF ...
additional TF...
total candidate tf-gene:  28054




In [24]:
dataset_obj.list_graph[0]

Data(x=[29826, 1], edge_index=[2, 34203], y=[1], edge_tf=[359, 2], y_exp=[1634], cell='C0027_AAACAGCCACACTAAT-1')

In [25]:
file_atac_test = '../data/dataset_atac_kRG_Pancreas.pkl'
with open(file_atac_test, 'wb') as w_pkl:
    str_pkl = pickle.dumps(dataset_obj)
    w_pkl.write(str_pkl)



## Training the Model and Obtaining Regulatory Scores

You can train the model and generate regulatory scores directly from the command line.  
For example, we used the following command to produce a sample dataset and a model weight file.  

**Note:** In this example we explicitly enabled single‑cell–level matched features by using the `--use_sc_exp` flag together with the `--rna_file` argument:

```bash
python ./scReGAT/run_scregat_cli.py \
  --input_file ./dataset_atac_kRG_Pancreas.pkl \
  --output_file ./RS_score_Pancreas.h5ad \
  --use_sc_exp \
  --rna_file ./C0027_rna_celltype.h5ad \
  --save_model_path ./Pancreas.pth \
  --load_model_path ./scReGAT/data/model_init.pth \
  --gpu 2
```

---

## Argument Description

```python
def parse_args():
    parser = argparse.ArgumentParser(description="Run scReGAT Model Training and Inference")

    # --- I/O parameters ---
    parser.add_argument('--input_file', type=str, required=True, 
                        help='Path to the input ATAC pickle file (e.g., dataset_atac_core_MFG.pkl)')
    parser.add_argument('--output_file', type=str, required=True, 
                        help='Path to save the output AnnData file (e.g., result.h5ad)')

    # --- Single-cell expression integration parameters ---
    parser.add_argument('--use_sc_exp', action='store_true',
                        help='Enable integration of single-cell RNA expression data into graph node features')
    parser.add_argument('--rna_file', type=str, default=None,
                        help='Path to the RNA .h5ad file; required if --use_sc_exp is set')

    # --- Model saving and loading ---
    parser.add_argument('--save_model_path', type=str, default=None, 
                        help='Optional path to save trained model parameters')
    parser.add_argument('--load_model_path', type=str, default=None, 
                        help='Optional path to load pre-trained model parameters')

    # --- Training control ---
    parser.add_argument('--skip_train', action='store_true',
                        help='Skip the training phase and run inference directly')
    parser.add_argument('--seed', type=int, default=1233, 
                        help='Random seed (default: 1233)')
    parser.add_argument('--epochs', type=int, default=4, 
                        help='Number of training epochs (default: 4)')
    parser.add_argument('--lr', type=float, default=1e-4, 
                        help='Learning rate (default: 1e-4)')
    parser.add_argument('--batch_size', type=int, default=15, 
                        help='Training batch size (default: 15)')
    parser.add_argument('--sparse_loss_weight', type=float, default=0.1, 
                        help='Weight for sparse loss (default: 0.1)')

    # --- Testing / inference parameters ---
    parser.add_argument('--test_batch_size', type=int, default=20, 
                        help='Batch size for inference (default: 20)')
    parser.add_argument('--test_ratio', type=float, default=0.5, 
                        help='Ratio of cells to use for testing (default: 0.5)')

    # --- Hardware parameters ---
    parser.add_argument('--gpu', type=int, default=1, 
                        help='GPU ID to use; set -1 to use CPU (default: 1)')

    return parser.parse_args()
```

---

## Recommendation

We recommend users initialize training on new datasets using the model weight file:

```
./scReGAT/data/model_init.pth
```

This provides a stable starting point for training and helps ensure stability across experiments.  

---
