In [1]:
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
from scipy.io import mmread, mmwrite
import os
from os.path import join
import numpy as np
import scanpy as sc
import scipy.io as sio
import scipy.sparse as sps
import h5py
import gzip
import gc

In [5]:
def reorder(ad1, ad2):
    """Restrict two AnnData-like objects to their common cells, in one order.

    The labels present in both ``ad1.obs_names`` and ``ad2.obs_names`` are
    computed once and then used to index both inputs, so the two returned
    objects list the same cells in the same order.

    Returns
    -------
    tuple
        ``(ad1, ad2)`` as copies subset to the shared labels.
    """
    common = ad1.obs_names.intersection(ad2.obs_names)
    first_aligned, second_aligned = (adata[common].copy() for adata in (ad1, ad2))
    return first_aligned, second_aligned

def subset_ad(ad, subset_index):
    """Return a copy of ``ad`` restricted to ``subset_index``.

    ``subset_index`` is applied as a single ``ad[...]`` selection and the
    selection is copied before being returned, so ``ad`` itself is not
    rebound or modified by this function.
    """
    selection = ad[subset_index]
    return selection.copy()

def load_peak_expr(_dir):
    """Load a peak matrix stored as ``data.mtx`` + ``barcode.csv`` + ``feat.csv``.

    Parameters
    ----------
    _dir : str
        Folder containing the three files.

    Returns
    -------
    AnnData
        ``X`` is the transposed Matrix Market matrix as CSR; ``obs_names`` are
        the index of ``barcode.csv`` and ``var_names`` the ``'x'`` column of
        ``feat.csv``. (The transpose implies the file is stored as
        features x cells -- TODO confirm against the exporter.)
    """
    mtx_path = join(_dir, 'data.mtx')
    barcode_path = join(_dir, 'barcode.csv')
    feat_path = join(_dir, 'feat.csv')

    counts = sio.mmread(mtx_path)
    barcodes = pd.read_csv(barcode_path, index_col=0).index.to_list()
    feature_table = pd.read_csv(feat_path, index_col=0)
    peaks = feature_table['x'].to_list()

    adata = sc.AnnData(sps.csr_matrix(counts.T))
    adata.obs_names = barcodes
    adata.var_names = peaks
    return adata

def set_ad(ad_ref, ref_batch_key='src', tg_batch_key='batch'):
    """Build a minimal AnnData from ``ad_ref`` keeping only X, names and batch.

    Parameters
    ----------
    ad_ref : AnnData
        Source object; its ``X`` is copied, its ``obs``/``var`` columns are not.
    ref_batch_key : str
        Column of ``ad_ref.obs`` holding the batch label.
    tg_batch_key : str
        Name of the column that receives those labels in the new object.

    Returns
    -------
    AnnData
        ``obs`` index is named ``'cell_ids'`` and ``var`` index ``'gene_ids'``;
        variable names are made unique before returning.
    """
    obs_frame = pd.DataFrame(index=pd.Index(ad_ref.obs_names.to_list(), name='cell_ids'))
    var_frame = pd.DataFrame(index=pd.Index(ad_ref.var_names.to_list(), name='gene_ids'))

    ad_new = ad.AnnData(ad_ref.X.copy(), obs=obs_frame, var=var_frame)
    # Copy labels as a plain list so they are assigned by position, not by index.
    ad_new.obs[tg_batch_key] = ad_ref.obs[ref_batch_key].to_list()
    ad_new.var_names_make_unique()
    return ad_new

def load_h5(path, verbose=True, dense=True):
    """Read a feature/barcode HDF5 matrix (``matrix/...`` layout) into AnnData.

    The file is expected to hold ``matrix/{barcodes,data,indices,indptr,shape}``
    and ``matrix/features/{feature_type,id,name}``; this matches the layout of
    10x Genomics ``*_feature_bc_matrix.h5`` files.

    Parameters
    ----------
    path : str
        Path to the ``.h5`` file.
    verbose : bool, default True
        Print the keys of ``matrix`` and ``matrix/features`` (the original
        behaviour). Pass ``False`` to silence the output.
    dense : bool, default True
        ``True`` keeps the original behaviour of returning a dense float32
        ``X``. ``False`` returns a CSR matrix instead, which avoids
        materialising every (mostly empty) barcode row of a raw matrix.

    Returns
    -------
    AnnData
        Barcodes x features. ``var`` carries ``type`` and ``name``, plus
        ``interval`` when the file provides ``matrix/features/interval``.
    """
    def _decode(dataset):
        # HDF5 string datasets come back as bytes.
        return [entry.decode('utf-8') for entry in dataset[:]]

    with h5py.File(path, 'r') as f:
        matrix = f['matrix']
        features = matrix['features']
        if verbose:
            print(matrix.keys())
            print(features.keys())

        barcodes = _decode(matrix['barcodes'])
        data = matrix['data'][:]
        indices = matrix['indices'][:]
        indptr = matrix['indptr'][:]
        # Plain tuple of ints rather than a numpy array for the sparse constructor.
        shape = tuple(int(dim) for dim in matrix['shape'][:])

        feature_type = _decode(features['feature_type'])
        feature_id = _decode(features['id'])
        feature_name = _decode(features['name'])
        # 'interval' is not guaranteed to exist (e.g. files without peak features).
        feature_interval = _decode(features['interval']) if 'interval' in features else None

    # Everything needed is in memory now, so the file handle can be closed.
    # Stored matrix is features x barcodes (CSC); transpose to barcodes x features.
    X = sps.csc_matrix((data, indices, indptr), shape=shape).astype(np.float32).T
    X = X.toarray() if dense else X.tocsr()

    adata = sc.AnnData(X)
    adata.obs_names = barcodes
    adata.var_names = feature_id
    adata.var['type'] = feature_type
    adata.var['name'] = feature_name
    if feature_interval is not None:
        adata.var['interval'] = feature_interval
    return adata

In [3]:
# Root folder under which the exported datasets are written.
work_dir = '/disco_500t/xuhua/gitrepo/dance/data'
# Disabled: a single shared 'Misar/' output folder. The export loop further
# down creates one 'Misar_cv{k}/' folder per fold instead.
# tmp_path = join(work_dir, 'Misar/')
# os.makedirs(tmp_path, exist_ok=True)

In [6]:
data_dir = '/disco_500t/xuhua/data/MISAR_seq/'

# Raw feature/barcode matrices, one file per sample (all feature types together).
ad_bridge = load_h5(join(data_dir, 'E15_5-S1_raw_feature_bc_matrix.h5'))
ad_test1 = load_h5(join(data_dir, 'E13_5-S1_raw_feature_bc_matrix.h5'))
ad_test2 = load_h5(join(data_dir, 'E18_5-S1_raw_feature_bc_matrix.h5'))  # inconsistent peak name across batches

# 'BaiduDisk/section1/peak_mat.mtx' is intentionally not loaded: nothing in
# this cell uses it, so parsing it only cost time and memory.
peak_spot_name = pd.read_csv(join(data_dir, 'BaiduDisk/section1/peak_spot_names.csv')).x.values

meta = pd.read_csv(join(data_dir, 'BaiduDisk/section1/meta_data.csv'), index_col=0)

# Prefix barcodes with the sample id so they can be looked up in `meta`
# (presumably the '<sample>#<barcode>' form used by meta's index -- verify).
ad_bridge.obs_names = [f'E15_5-S1#{_}' for _ in ad_bridge.obs_names]
ad_test1.obs_names = [f'E13_5-S1#{_}' for _ in ad_test1.obs_names]
ad_test2.obs_names = [f'E18_5-S1#{_}' for _ in ad_test2.obs_names]

# split rna and peak
# RNA comes from the h5 files; peaks come from the separately exported folders.
ad15_rna = ad_bridge[:, ad_bridge.var['type'] == 'Gene Expression'].copy()
ad13_rna = ad_test1[:, ad_test1.var['type'] == 'Gene Expression'].copy()
ad18_rna = ad_test2[:, ad_test2.var['type'] == 'Gene Expression'].copy()

ad13_atac = load_peak_expr(join(data_dir, 'S1-E13-E15-18-peak_data/E13'))
ad15_atac = load_peak_expr(join(data_dir, 'S1-E13-E15-18-peak_data/E15'))
ad18_atac = load_peak_expr(join(data_dir, 'S1-E13-E15-18-peak_data/E18'))
ad13_atac.obs_names = [f'E13_5-S1#{_}' for _ in ad13_atac.obs_names]
ad15_atac.obs_names = [f'E15_5-S1#{_}' for _ in ad15_atac.obs_names]
ad18_atac.obs_names = [f'E18_5-S1#{_}' for _ in ad18_atac.obs_names]

# Keep only RNA barcodes that have a metadata row.
ad15_rna = subset_ad(ad15_rna, ad15_rna.obs_names.intersection(meta.index))
ad13_rna = subset_ad(ad13_rna, ad13_rna.obs_names.intersection(meta.index))
ad18_rna = subset_ad(ad18_rna, ad18_rna.obs_names.intersection(meta.index))

# Replace obs with the metadata rows. The ATAC objects are not filtered first,
# so every ATAC barcode must exist in `meta` or `.loc` raises KeyError.
ad15_rna.obs = meta.loc[ad15_rna.obs_names].copy()
ad15_atac.obs = meta.loc[ad15_atac.obs_names].copy()
ad13_rna.obs = meta.loc[ad13_rna.obs_names].copy()
ad13_atac.obs = meta.loc[ad13_atac.obs_names].copy()
ad18_rna.obs = meta.loc[ad18_rna.obs_names].copy()
ad18_atac.obs = meta.loc[ad18_atac.obs_names].copy()

# Row-align the two modalities of each sample.
ad15_atac = ad15_atac[ad15_rna.obs_names].copy()
ad13_atac = ad13_atac[ad13_rna.obs_names].copy()
ad18_rna  = ad18_rna[ad18_atac.obs_names].copy()  # E18 follows the ATAC order: keep the obs_names order in which the E18 ATAC cells were originally taken

# Everything downstream treats row k of RNA and row k of ATAC as the same cell.
for _rna, _atac in ((ad13_rna, ad13_atac), (ad15_rna, ad15_atac), (ad18_rna, ad18_atac)):
    assert _rna.obs_names.equals(_atac.obs_names), 'RNA/ATAC cells are not row-aligned'

del ad_bridge, ad_test1, ad_test2, _rna, _atac
gc.collect()

1823

In [7]:
# Give both modalities of each sample the same per-cell 'src' label.
# The label count is taken from the RNA object, as the two are row-aligned.
for _label, _rna, _atac in (
    ('e13', ad13_rna, ad13_atac),
    ('e15', ad15_rna, ad15_atac),
    ('e18', ad18_rna, ad18_atac),
):
    _src_column = [_label] * _rna.n_obs
    _rna.obs['src'] = _src_column
    _atac.obs['src'] = _src_column

In [9]:
# Pool the three samples per modality; feature selection is run on the pooled
# objects with 'src' as the batch key.
ad_rna_all = sc.concat([ad13_rna, ad15_rna, ad18_rna])
ad_atac_all = sc.concat([ad13_atac, ad15_atac, ad18_atac])

# Top 5000 variable genes.
sc.pp.highly_variable_genes(
    ad_rna_all,
    flavor='seurat_v3',
    n_top_genes=5000,
    batch_key='src',
)
hvg_names = ad_rna_all.var.loc[ad_rna_all.var['highly_variable']].index.to_numpy()

# Top 50000 variable peaks, selected the same way.
sc.pp.highly_variable_genes(
    ad_atac_all,
    flavor='seurat_v3',
    n_top_genes=50000,
    batch_key='src',
)
hvp_names = ad_atac_all.var.loc[ad_atac_all.var['highly_variable']].index.to_numpy()

In [10]:
# Restrict every sample to the selected genes / peaks.
ad13_rna, ad15_rna, ad18_rna = [
    adata[:, hvg_names].copy() for adata in (ad13_rna, ad15_rna, ad18_rna)
]
ad13_atac, ad15_atac, ad18_atac = [
    adata[:, hvp_names].copy() for adata in (ad13_atac, ad15_atac, ad18_atac)
]

## filter feat names
# Keep only peaks whose name starts with 'chr'; the list is derived from the
# E13 object and applied to all three.
filtered_atac_feats = [name for name in ad13_atac.var_names if name.startswith('chr')]
ad13_atac, ad15_atac, ad18_atac = [
    adata[:, filtered_atac_feats].copy() for adata in (ad13_atac, ad15_atac, ad18_atac)
]

In [11]:
# Per-sample objects in a fixed order (E13, E15, E18); position k in both
# lists refers to the same sample.
RNA_ADS = [ad13_rna, ad15_rna, ad18_rna]
ATAC_ADS = [ad13_atac, ad15_atac, ad18_atac]
mod_dict = dict(rna=RNA_ADS, atac=ATAC_ADS)
mod_sets = list(mod_dict)
n_batches = len(RNA_ADS)

In [12]:
work_dir = '/disco_500t/xuhua/gitrepo/dance/data'

# Common stem of the four files written per fold. Presumably this is the naming
# the downstream loader looks for -- keep it byte-identical.
_out_stem = 'openproblems_bmmc_multiome_phase2_rna.censor_dataset.output_'
_n_folds = len(RNA_ADS)

# Leave-one-sample-out: fold i+1 holds out sample i and trains on the others.
for i in range(_n_folds):
    tmp_path = join(work_dir, f'Misar_cv{i+1}/')
    os.makedirs(tmp_path, exist_ok=True)

    # Explicit ascending order, rather than relying on set iteration order.
    train_idx = [j for j in range(_n_folds) if j != i]
    test_idx  = [i]
    train_rna_data = sc.concat([RNA_ADS[idx] for idx in train_idx])
    test_rna_data  = sc.concat([RNA_ADS[idx] for idx in test_idx])
    train_atac_data = sc.concat([ATAC_ADS[idx] for idx in train_idx])
    test_atac_data  = sc.concat([ATAC_ADS[idx] for idx in test_idx])

    # mod1 = RNA, mod2 = ATAC; 'src' is carried over as 'batch'.
    input_train_mod1 = set_ad(train_rna_data, ref_batch_key='src', tg_batch_key='batch')
    input_test_mod1 = set_ad(test_rna_data, ref_batch_key='src', tg_batch_key='batch')
    input_train_mod2 = set_ad(train_atac_data, ref_batch_key='src', tg_batch_key='batch')
    input_test_mod2 = set_ad(test_atac_data, ref_batch_key='src', tg_batch_key='batch')

    # Same annotation and write step for all four objects.
    for _split_mod, _adata, _feature_type in (
        ('train_mod1', input_train_mod1, 'GEX'),
        ('train_mod2', input_train_mod2, 'ATAC'),
        ('test_mod1', input_test_mod1, 'GEX'),
        ('test_mod2', input_test_mod2, 'ATAC'),
    ):
        _adata.var['feature_types'] = pd.Categorical(len(_adata.var_names) * [_feature_type])
        _adata.uns = {'dataset_id': 'mouse_brain', 'organism': 'mouse'}
        _adata.layers['counts'] = _adata.X.copy()
        _adata.write_h5ad(join(tmp_path, f'{_out_stem}{_split_mod}.h5ad'), compression=None)

In [13]:
import csv
def csv_read(path, header=True, index=True):
    """Read a numeric CSV file into a 2-D float array.

    Parameters
    ----------
    path : str
        CSV file to read.
    header : bool, default True
        Skip the first line of the file.
    index : bool, default True
        Drop the first column of every row. The column is removed *before*
        conversion, so row labels may be arbitrary strings (e.g. barcodes).

    Returns
    -------
    numpy.ndarray
        One row per successfully parsed line.

    Raises
    ------
    ValueError
        If no row could be parsed (empty file or only malformed rows).

    Notes
    -----
    Rows containing a non-numeric value are reported on stdout and skipped,
    so the result can have fewer rows than the file.
    """
    rows = []
    with open(path, mode='r', newline='') as file:
        reader = csv.reader(file)
        if header:
            next(reader, None)

        for row in reader:
            values = row[1:] if index else row
            try:
                rows.append([float(item) for item in values])
            except ValueError as e:
                print(f"Error converting to float: {e}")

    if not rows:
        raise ValueError(f"no numeric rows could be read from {path!r}")
    return np.vstack(rows)

In [14]:
out_dir = '/disco_500t/xuhua/gitrepo/dance/output/RNA2ATAC'
imputed_dir = '/disco_500t/xuhua/gitrepo/BridgeNorm/figures/imputation/Misar_E13-E15-E18/babel'
# write_h5ad does not create missing parent folders.
os.makedirs(imputed_dir, exist_ok=True)

# Fold cv+1 held out sample cv in the export loop, so its predictions are
# labelled with the obs/var of ATAC_ADS[cv].
for cv in range(len(ATAC_ADS)):
    _dir = join(out_dir, f'Misar_3slices_cv{cv+1}')
    pr_X = csv_read(join(_dir, 'babel.csv'), index=True, header=True)
    print(pr_X.shape)
    # csv_read skips malformed rows, so check the size before attaching labels.
    assert pr_X.shape == ATAC_ADS[cv].shape, (
        f'cv{cv+1}: prediction shape {pr_X.shape} != held-out ATAC shape {ATAC_ADS[cv].shape}'
    )
    ad_pr = sc.AnnData(pr_X, obs=ATAC_ADS[cv].obs.copy(), var=ATAC_ADS[cv].var.copy())
    ad_pr.write_h5ad(join(imputed_dir, f'cv{cv+1}_imputedATAC.h5ad'))

(1777, 49988)
(1949, 49988)
(2129, 49988)
