In [1]:
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
from scipy.io import mmread, mmwrite
import os
from os.path import join
import numpy as np
import scanpy as sc
import scipy.io as sio
import scipy.sparse as sps
import h5py
import gzip
import gc

In [2]:
def reorder(ad1, ad2):
    """Restrict two objects to their common observation names, in one shared order.

    Parameters
    ----------
    ad1, ad2
        Objects exposing ``obs_names`` (a pandas Index) and supporting
        selection by an index of names followed by ``.copy()``.

    Returns
    -------
    tuple
        ``(ad1_subset, ad2_subset)``; both are independent copies indexed by
        the same intersection of names, so row ``k`` of one matches row ``k``
        of the other.
    """
    common = ad1.obs_names.intersection(ad2.obs_names)
    return ad1[common].copy(), ad2[common].copy()

def load_peak_expr(_dir):
    """Load a peak matrix stored as ``data.mtx`` + ``barcode.csv`` + ``feat.csv``.

    Parameters
    ----------
    _dir : str
        Directory containing the three files. Both CSV files are read with
        their first column as index and must have a column named ``x``.

    Returns
    -------
    AnnData
        CSR-backed object built from the *transpose* of ``data.mtx``; the
        ``x`` column of ``barcode.csv`` becomes ``obs_names`` and the ``x``
        column of ``feat.csv`` becomes ``var_names``.
    """
    matrix = sio.mmread(join(_dir, 'data.mtx'))
    barcodes = pd.read_csv(join(_dir, 'barcode.csv'), index_col=0)['x'].to_list()
    features = pd.read_csv(join(_dir, 'feat.csv'), index_col=0)['x'].to_list()

    # The file is transposed on load, so its columns end up as observations.
    adata = sc.AnnData(sps.csr_matrix(matrix.T))
    adata.obs_names, adata.var_names = barcodes, features
    return adata

def set_ad(ad_ref, ref_batch_key='src', tg_batch_key='batch'):
    """Build a stripped-down copy of ``ad_ref`` holding only ``X`` and a batch label.

    Parameters
    ----------
    ad_ref : AnnData
        Source object. Only ``X``, ``obs_names``, ``var_names`` and
        ``obs[ref_batch_key]`` are read; nothing else (``obsm``, ``layers``,
        ``uns``, other ``obs``/``var`` columns) is carried over.
    ref_batch_key : str, default 'src'
        Column of ``ad_ref.obs`` holding the batch label.
    tg_batch_key : str, default 'batch'
        Name of the column that receives the batch label in the result.

    Returns
    -------
    AnnData
        New object whose ``obs`` index is named ``cell_ids`` and whose ``var``
        index is named ``gene_ids``, with variable names made unique.
    """
    # Build the named indices directly instead of detouring through a
    # reshaped one-column DataFrame; the resulting index names are the same.
    obs_index = pd.Index(ad_ref.obs_names.to_list(), name='cell_ids')
    var_index = pd.Index(ad_ref.var_names.to_list(), name='gene_ids')

    ad_new = ad.AnnData(
        ad_ref.X.copy(),
        obs=pd.DataFrame(index=obs_index),
        var=pd.DataFrame(index=var_index),
    )
    # Assigned positionally (as a list) so the labels follow row order rather
    # than being aligned on index.
    ad_new.obs[tg_batch_key] = ad_ref.obs[ref_batch_key].to_list()
    ad_new.var_names_make_unique()
    return ad_new

In [3]:
data_dir = '/disco_500t/xuhua/data/real_mosaic_cases/mouse_brain_rna+atac/'

# One entry per tissue slice: (tag, RNA matrix, spatial directory, peak
# directory), every path relative to ``data_dir``.
# NOTE(review): slice s1 pairs RNA matrix GSM6204636 with spatial/peak data
# under GSM6204623 -- kept exactly as in the original cell; confirm the pairing.
_SLICE_SOURCES = [
    ('s1',
     'rna+atac/GSM6204636_MouseBrain_20um_matrix.tsv',
     'rna+atac/GSM6204623_MouseBrain_20um_spatial_rna_part',
     'rna+atac/For_Imputation_Task/GSM6204623_peak_data'),
    ('s2',
     'rna+atac/GSM6753041_MouseBrain_20um_repATAC_matrix.tsv',
     'rna+atac/GSM6753041_MouseBrain_20um_repATAC_spatial',
     'rna+atac/For_Imputation_Task/GSM6758284_peak_data'),
    ('s3',
     'rna+atac/GSM6753043_MouseBrain_20um_100barcodes_ATAC_matrix.tsv',
     'rna+atac/GSM6753043_MouseBrain_20um_100barcodes_ATAC_spatial',
     'rna+atac/For_Imputation_Task/GSM6758285_peak_data'),
]


def _load_slice(rna_tsv, spatial_dir, peak_dir):
    """Load one slice as a barcode-aligned ``(ad_rna, ad_atac)`` pair.

    Parameters
    ----------
    rna_tsv : str
        Tab-separated RNA matrix, relative to ``data_dir``. It is transposed
        on load, so its columns become observations.
    spatial_dir : str
        Directory (relative to ``data_dir``) holding
        ``tissue_positions_list.csv``.
    peak_dir : str
        Directory (relative to ``data_dir``) passed to ``load_peak_expr``.

    Returns
    -------
    tuple
        Both objects restricted by ``reorder`` to their shared barcodes, each
        with columns 2 and 3 of the positions table in ``obsm['spatial']``.
    """
    df_rna = pd.read_csv(join(data_dir, rna_tsv), sep='\t')
    # Both modalities of a slice use the same positions file, so parse it once.
    df_pos = pd.read_csv(join(data_dir, spatial_dir, 'tissue_positions_list.csv'),
                         header=None, index_col=0)

    ad_rna = sc.AnnData(df_rna.T, obsm={'spatial': df_pos.loc[df_rna.columns, [2, 3]].values})

    ad_atac = load_peak_expr(join(data_dir, peak_dir))
    ad_atac.obsm['spatial'] = df_pos.loc[ad_atac.obs_names, [2, 3]].values

    return reorder(ad_rna, ad_atac)


def _finalize_slice(adata, features, tag):
    """Subset ``adata`` to ``features``, prefix its barcodes with ``tag`` and label ``obs['src']``."""
    out = adata[:, features].copy()
    # The prefix keeps barcodes distinct once slices are concatenated.
    out.obs_names = [f'{tag}-{name}' for name in out.obs_names]
    out.obs['src'] = [tag] * out.n_obs
    return out


_slice_tags = [entry[0] for entry in _SLICE_SOURCES]
_loaded = [_load_slice(rna_tsv, spatial_dir, peak_dir)
           for _tag, rna_tsv, spatial_dir, peak_dir in _SLICE_SOURCES]
_rna_raw = [pair[0] for pair in _loaded]
_atac_raw = [pair[1] for pair in _loaded]

# Features present in all three slices, per modality.
shared_gene = _rna_raw[0].var_names.intersection(_rna_raw[1].var_names).intersection(_rna_raw[2].var_names)
shared_peak = _atac_raw[0].var_names.intersection(_atac_raw[1].var_names).intersection(_atac_raw[2].var_names)

ad1_rna, ad2_rna, ad3_rna = [
    _finalize_slice(adata, shared_gene, tag) for adata, tag in zip(_rna_raw, _slice_tags)
]
ad1_atac, ad2_atac, ad3_atac = [
    _finalize_slice(adata, shared_peak, tag) for adata, tag in zip(_atac_raw, _slice_tags)
]

In [4]:
# --- RNA: pool the three slices and flag 5000 variable genes, batch-aware on 'src'.
ad_rna_all = sc.concat([ad1_rna, ad2_rna, ad3_rna])
sc.pp.highly_variable_genes(
    ad_rna_all,
    flavor='seurat_v3',
    n_top_genes=5000,
    batch_key='src',
)
hvg_names = ad_rna_all.var_names[ad_rna_all.var['highly_variable'].to_numpy()].to_numpy()

# --- ATAC: same procedure with 50000 peaks. The pooled peak matrix is passed
# as loaded; no TF-IDF (or other) transform is applied before the selection.
ad_atac_all = sc.concat([ad1_atac, ad2_atac, ad3_atac])
sc.pp.highly_variable_genes(
    ad_atac_all,
    flavor='seurat_v3',
    n_top_genes=50000,
    batch_key='src',
)
hvp_names = ad_atac_all.var_names[ad_atac_all.var['highly_variable'].to_numpy()].to_numpy()

In [5]:
# Restrict every slice to the features flagged on the pooled data.
ad1_rna, ad2_rna, ad3_rna = [
    adata[:, hvg_names].copy() for adata in (ad1_rna, ad2_rna, ad3_rna)
]
ad1_atac, ad2_atac, ad3_atac = [
    adata[:, hvp_names].copy() for adata in (ad1_atac, ad2_atac, ad3_atac)
]

## filter feat names
# Keep only peaks whose name starts with 'chr'. The list is taken from slice 1
# and applied to all three, which at this point share the same peak set.
filtered_atac_feats = [name for name in ad1_atac.var_names if name.startswith('chr')]
ad1_atac, ad2_atac, ad3_atac = [
    adata[:, filtered_atac_feats].copy() for adata in (ad1_atac, ad2_atac, ad3_atac)
]

In [6]:
# Per-modality lists; position k in both lists refers to the same slice.
RNA_ADS = [ad1_rna, ad2_rna, ad3_rna]
ATAC_ADS = [ad1_atac, ad2_atac, ad3_atac]

# Lookup by modality name, plus the modality order and the slice count.
mod_dict = dict(rna=RNA_ADS, atac=ATAC_ADS)
mod_sets = list(mod_dict)
n_batches = len(RNA_ADS)

In [7]:
work_dir = '/disco_500t/xuhua/gitrepo/dance/data'


def _to_export_input(adata, feature_type):
    """Re-wrap ``adata`` via ``set_ad`` and attach the metadata written with each fold.

    Parameters
    ----------
    adata : AnnData
        Concatenated slices of one modality carrying an ``obs['src']`` column.
    feature_type : str
        Label stored for every feature in ``var['feature_types']``
        (``'GEX'`` or ``'ATAC'`` in this notebook).

    Returns
    -------
    AnnData
        Object with ``obs['batch']``, categorical ``var['feature_types']``,
        the ``dataset_id``/``organism`` entries in ``uns`` and a copy of ``X``
        in ``layers['counts']``.
    """
    wrapped = set_ad(adata, ref_batch_key='src', tg_batch_key='batch')
    wrapped.var['feature_types'] = pd.Categorical(len(wrapped.var_names) * [feature_type])
    wrapped.uns = {'dataset_id': 'mouse_brain', 'organism': 'mouse'}
    wrapped.layers['counts'] = wrapped.X.copy()
    return wrapped


# Leave-one-slice-out: fold i holds slice i out as test data and trains on the rest.
n_folds = len(RNA_ADS)
for i in range(n_folds):
    tmp_path = join(work_dir, f'MB_cv{i+1}/')
    os.makedirs(tmp_path, exist_ok=True)

    # An explicit ordered list (rather than a set difference) pins the order in
    # which training slices are concatenated, and thus the row order on disk.
    train_idx = [j for j in range(n_folds) if j != i]
    test_idx = [i]
    train_rna_data = sc.concat([RNA_ADS[idx] for idx in train_idx])
    test_rna_data = sc.concat([RNA_ADS[idx] for idx in test_idx])
    train_atac_data = sc.concat([ATAC_ADS[idx] for idx in train_idx])
    test_atac_data = sc.concat([ATAC_ADS[idx] for idx in test_idx])

    # mod1 = RNA, mod2 = ATAC.
    input_train_mod1 = _to_export_input(train_rna_data, 'GEX')
    input_test_mod1 = _to_export_input(test_rna_data, 'GEX')
    input_train_mod2 = _to_export_input(train_atac_data, 'ATAC')
    input_test_mod2 = _to_export_input(test_atac_data, 'ATAC')

    # File names are reproduced verbatim even though the data is mouse brain;
    # presumably the downstream loader looks files up by this prefix -- confirm
    # before renaming.
    input_train_mod1.write_h5ad(tmp_path + "openproblems_bmmc_multiome_phase2_rna.censor_dataset.output_train_mod1.h5ad", compression=None)
    input_train_mod2.write_h5ad(tmp_path + "openproblems_bmmc_multiome_phase2_rna.censor_dataset.output_train_mod2.h5ad", compression=None)
    input_test_mod1.write_h5ad(tmp_path + "openproblems_bmmc_multiome_phase2_rna.censor_dataset.output_test_mod1.h5ad", compression=None)
    input_test_mod2.write_h5ad(tmp_path + "openproblems_bmmc_multiome_phase2_rna.censor_dataset.output_test_mod2.h5ad", compression=None)

In [8]:
import csv
def csv_read(path, header=True, index=True):
    """Read a comma-separated numeric table into a 2-D float array.

    Parameters
    ----------
    path : str
        File to read.
    header : bool, default True
        Skip the first record of the file.
    index : bool, default True
        Drop the first field of every row. The field is discarded *before*
        the numeric conversion, so it may hold non-numeric row labels.

    Returns
    -------
    numpy.ndarray
        One row per parsed record; an empty ``(0, 0)`` array when no record
        could be parsed.

    Notes
    -----
    Blank records are ignored. A record with a field that cannot be parsed as
    a float is reported with ``print`` and skipped, so the result may have
    fewer rows than the file -- callers that rely on row alignment should
    check the returned shape.
    """
    res = []
    with open(path, mode='r', newline='') as file:
        reader = csv.reader(file)
        if header:
            next(reader, None)  # discard the header record, if there is one

        for row in reader:
            if not row:
                continue  # blank line: csv.reader yields an empty list
            # Strip the label column first so a textual index cannot make the
            # whole row fail the float conversion.
            fields = row[1:] if index else row
            try:
                res.append([float(item) for item in fields])
            except ValueError as e:
                print(f"Error converting to float: {e}")

    if not res:
        return np.empty((0, 0), dtype=float)
    return np.vstack(res)

In [9]:
out_dir = '/disco_500t/xuhua/gitrepo/dance/output/RNA2ATAC'
imputed_dir = '/disco_500t/xuhua/gitrepo/BridgeNorm/figures/imputation/3slices_MB_RNA+ATAC/babel'
# Create the destination up front, as is done for the fold directories above.
os.makedirs(imputed_dir, exist_ok=True)

# Wrap each fold's predicted ATAC matrix with the obs/var tables of the slice
# at the same position in ATAC_ADS.
# NOTE(review): this assumes the rows of babel.csv follow the observation order
# of ATAC_ADS[cv] -- nothing here verifies the order; confirm against the writer.
for cv in range(len(ATAC_ADS)):
    _dir = join(out_dir, f'MB_3slices_cv{cv+1}')
    pred_path = join(_dir, 'babel.csv')
    pr_X = csv_read(pred_path, index=True, header=True)
    print(pr_X.shape)

    ref = ATAC_ADS[cv]
    # csv_read skips rows it cannot parse, so fail with a message naming the
    # file when the prediction does not line up with the reference slice.
    if pr_X.shape != ref.shape:
        raise ValueError(
            f'{pred_path}: predicted matrix has shape {pr_X.shape}, '
            f'expected {ref.shape} to match ATAC_ADS[{cv}]'
        )

    ad_pr = sc.AnnData(pr_X, obs=ref.obs.copy(), var=ref.var.copy())
    ad_pr.write_h5ad(join(imputed_dir, f'cv{cv+1}_imputedATAC.h5ad'))

(2372, 49991)
(2497, 49991)
(9215, 49991)
