In [None]:
import scanpy as sc
import numpy as np
import episcanpy as epi
import anndata as ad
from scipy import sparse

neurips2021_multiome

In [None]:
# Load the joint multiome object; GEX and ATAC features are split out of it below.
multiome_h5ad_path = './neurips2021_multiome.h5ad'
multiome = sc.read(multiome_h5ad_path)

In [None]:
# Gene-expression features only. Copy so `rna` is an independent AnnData rather than a
# view of `multiome`: later cells assign to `rna.X`, which should not happen on a view.
rna = multiome[:, multiome.var['feature_types'] == 'GEX'].copy()

In [None]:
# Chromatin-accessibility features only. Copy so `atac` is an independent AnnData rather
# than a view of `multiome`: later cells add layers and overwrite `atac.X`.
atac = multiome[:, multiome.var['feature_types'] == 'ATAC'].copy()

In [None]:
# Start from the raw counts layer so the normalisation in the next cell runs on counts.
rna_counts = rna.layers['counts']
rna.X = rna_counts.copy()

In [None]:
# Scale every cell to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(rna, target_sum=1e4)
sc.pp.log1p(rna)

# Highly-variable gene selection using 'batch' as the batch key; the result is read
# back from rna.var.highly_variable further down.
hvg_options = {'n_top_genes': 4000, 'batch_key': 'batch'}
sc.pp.highly_variable_genes(rna, **hvg_options)

In [None]:
# Persist the normalised multiome RNA object (all genes).
multiome_rna_path = './data/neurips-multiome/rna.h5ad'
rna.write(multiome_rna_path)

In [None]:
# Restrict to the genes flagged in var['highly_variable'].
hvg_mask = rna.var['highly_variable']
rna_hvg = rna[:, hvg_mask]

In [None]:
# Persist the highly-variable-gene subset of the multiome RNA.
multiome_rna_hvg_path = './data/neurips-multiome/rna_hvg.h5ad'
rna_hvg.write(multiome_rna_hvg_path)

In [None]:
# Snapshot the incoming X as the 'binary' layer.
# NOTE(review): nothing here binarises X -- this assumes the input file already stores a
# binary accessibility matrix in X; confirm against the dataset.
atac.layers['binary'] = atac.X.copy()

# Total-count normalised version of that matrix (target sum 1e4), kept as 'cpm'.
sc.pp.normalize_total(atac, target_sum=1e4)
atac.layers['cpm'] = atac.X.copy()

# TF-IDF of the 'binary' matrix. The result is read back from the 'tf-idf' layer and
# copied, so the second tfidf call below (which reuses 'tf-idf') cannot alias it.
atac.X = atac.layers['binary'].copy()
epi.pp.tfidf(atac)
atac.layers['tf-idf-binary'] = atac.layers['tf-idf'].copy()

# TF-IDF of the raw counts, stored under its own key (copied for the same reason).
atac.X = atac.layers['counts'].copy()
epi.pp.tfidf(atac)
atac.layers['tf-idf-counts'] = atac.layers['tf-idf'].copy()

In [None]:
# Thresholds shared by the variability plots here and the feature selection further down.
min_score_value = 0.576
nb_feature_selected = 20000

# Same plot twice: first with log=None, then with log='log10'.
for log_scale in (None, 'log10'):
    epi.pl.variability_features(
        atac,
        log=log_scale,
        min_score=min_score_value,
        nb_features=nb_feature_selected,
    )

In [None]:
# Persist the ATAC object with all peaks, before feature selection.
multiome_atac_path = './data/neurips-multiome/atac.h5ad'
atac.write(multiome_atac_path)

In [None]:
# Variable-feature selection with the feature budget defined alongside the plots above.
# The return value is not used, so `atac` is presumably subset in place -- confirm.
epi.pp.select_var_feature(
    atac,
    nb_features=nb_feature_selected,
)

In [None]:
# Recompute both TF-IDF variants on the selected features.
# Each result is read back from the shared 'tf-idf' layer and copied, so the two stored
# layers never alias the matrix that the next tfidf call writes to.
atac.X = atac.layers['binary'].copy()
epi.pp.tfidf(atac)
atac.layers['tf-idf-binary'] = atac.layers['tf-idf'].copy()

atac.X = atac.layers['counts'].copy()
epi.pp.tfidf(atac)
atac.layers['tf-idf-counts'] = atac.layers['tf-idf'].copy()

In [None]:
# Persist the feature-selected ATAC object.
multiome_atac_hvf_path = './data/neurips-multiome/atac_hvf.h5ad'
atac.write(multiome_atac_hvf_path)

neurips2021_cite

In [None]:
# Load the joint CITE-seq object; GEX and ADT features are split out of it below.
cite_h5ad_path = './neurips2021_cite.h5ad'
cite = sc.read(cite_h5ad_path)

In [None]:
# Gene-expression features only. Copy so `rna` is an independent AnnData rather than a
# view of `cite`: later cells assign to `rna.X`, which should not happen on a view.
rna = cite[:, cite.var['feature_types'] == 'GEX'].copy()

In [None]:
# Protein (ADT) features only. Copy so `adt` is an independent AnnData rather than a
# view of `cite`: later cells assign to `adt.X`, which should not happen on a view.
adt = cite[:, cite.var['feature_types'] == 'ADT'].copy()

In [None]:
def clr_normalize_each_cell(adata, inplace=True):
    """Apply a row-wise centred log-ratio (CLR) transform to ``adata.X``.

    Every row ``x`` of ``X`` is replaced by
    ``log1p(x / exp(sum(log1p(x[x > 0])) / len(x)))``.

    Parameters
    ----------
    adata
        Object exposing a dense or SciPy-sparse ``X`` matrix (and ``copy()`` when
        ``inplace`` is False). Presumably an AnnData with cells as rows.
    inplace
        If False, the transform is applied to ``adata.copy()`` and the input is
        left untouched.

    Returns
    -------
    The transformed object (the input itself when ``inplace`` is True). Its ``X``
    is a dense array, even when the input was sparse.
    """
    import numpy as np
    # Import the submodule explicitly: a bare `import scipy` does not guarantee that
    # `scipy.sparse` is available as an attribute on every SciPy version.
    from scipy import sparse

    def seurat_clr(x):
        # Sum of log1p over the strictly positive entries, averaged over *all* entries.
        s = np.sum(np.log1p(x[x > 0]))
        exp = np.exp(s / len(x))
        return np.log1p(x / exp)

    if not inplace:
        adata = adata.copy()

    # The row-wise helper needs a dense array.
    dense = adata.X.toarray() if sparse.issparse(adata.X) else adata.X
    adata.X = np.apply_along_axis(seurat_clr, 1, dense)
    return adata

In [None]:
# Put the raw ADT counts in X so the CLR transform in the next cell runs on counts.
adt_counts = adt.layers['counts']
adt.X = adt_counts.copy()

In [None]:
# CLR-normalise the protein counts and keep the result in its own layer: the next cell
# resets X to the raw counts, which would otherwise discard the normalised values.
adt = clr_normalize_each_cell(adt)
adt.layers['clr'] = adt.X.copy()

In [None]:
# Reset X to the raw counts; this replaces the CLR values written by the previous cell,
# so the object saved below carries counts in X.
adt_raw = adt.layers['counts']
adt.X = adt_raw.copy()

In [None]:
# Persist the CITE-seq protein object.
cite_protein_path = './data/neurips-cite/protein.h5ad'
adt.write(cite_protein_path)

In [None]:
# Start from the raw counts layer so the normalisation below runs on counts.
cite_rna_counts = rna.layers['counts']
rna.X = cite_rna_counts.copy()

In [None]:
# Scale every cell to a total of 1e4.
sc.pp.normalize_total(
    rna,
    target_sum=1e4,
)

In [None]:
# log1p-transform the normalised values held in rna.X.
sc.pp.log1p(rna)

In [None]:
# Highly-variable gene selection using 'batch' as the batch key; the result is read
# back from rna.var.highly_variable further down.
cite_hvg_options = {'n_top_genes': 4000, 'batch_key': 'batch'}
sc.pp.highly_variable_genes(rna, **cite_hvg_options)

In [None]:
# Persist the normalised CITE-seq RNA object (all genes).
cite_rna_path = './data/neurips-cite/rna.h5ad'
rna.write(cite_rna_path)

In [None]:
# Restrict to the genes flagged in var['highly_variable'].
cite_hvg_mask = rna.var['highly_variable']
rna_hvg = rna[:, cite_hvg_mask]

In [None]:
# Persist the highly-variable-gene subset of the CITE-seq RNA.
cite_rna_hvg_path = './data/neurips-cite/rna_hvg.h5ad'
rna_hvg.write(cite_rna_hvg_path)

neurips2021_multiome_mapping

In [None]:
# Reload the processed multiome modalities written by the preprocessing section.
rna, atac = (
    sc.read(h5ad_path)
    for h5ad_path in (
        "./data/neurips-multiome/rna_hvg.h5ad",
        "./data/neurips-multiome/atac_hvf.h5ad",
    )
)

In [None]:
# Label every cell as fully paired and expose the raw counts as X in both modalities.
for modality_adata in (rna, atac):
    modality_adata.obs['Modality'] = 'multiome'
    modality_adata.X = modality_adata.layers['counts']

# Join the two modalities along the feature axis: transpose, concatenate, transpose back.
features_by_cells = rna.T.concatenate(atac.T, index_unique=None)
adata = features_by_cells.T

In [None]:
batch_col = "batch"
celltype_col = "cell_type"

# Fixed seed so the split written to disk is reproducible across runs.
split_rng = np.random.default_rng(0)

# observed=True: only iterate over batch / cell-type combinations that actually occur.
groups = adata.obs.groupby([batch_col, celltype_col], observed=True)

idx_1 = []  # cells relabelled 'rna'
idx_2 = []  # cells relabelled 'atac'
idx_4 = []  # remaining cells, which keep their existing 'Modality' value

for (batch, ct), df in groups:
    n = len(df)
    if n == 0:
        continue

    # 30% of the group to each single-modality split, at least one cell each.
    k1 = max(1, int(n * 0.3))
    k2 = max(1, int(n * 0.3))

    # permutation() returns a shuffled copy; shuffling df.index.values in place would
    # mutate the array backing the group's index.
    all_idx = split_rng.permutation(df.index.to_numpy())

    part1 = all_idx[:k1]
    part2 = all_idx[k1:k1+k2]
    part4 = all_idx[k1+k2:]

    idx_1.extend(part1)
    idx_2.extend(part2)
    idx_4.extend(part4)

idx_1 = np.array(idx_1)
idx_2 = np.array(idx_2)
idx_4 = np.array(idx_4)

print("idx_1 (30%) =", idx_1.shape[0])
print("idx_2 (30%) =", idx_2.shape[0])
print("idx_4 (40%) =", idx_4.shape[0])

adata.obs.loc[idx_1, 'Modality'] = 'rna'
adata.obs.loc[idx_2, 'Modality'] = 'atac'

adata.write('./data/neurips-multiome/mapping.h5ad')

neurips2021_cite_mapping

In [None]:
# Reload the processed CITE-seq modalities written by the preprocessing section.
rna, protein = (
    sc.read(h5ad_path)
    for h5ad_path in (
        "./data/neurips-cite/rna_hvg.h5ad",
        './data/neurips-cite/protein.h5ad',
    )
)

In [None]:
# Label every cell as fully paired and expose the raw counts as X in both modalities.
for modality_adata in (rna, protein):
    modality_adata.obs['Modality'] = 'cite'
    modality_adata.X = modality_adata.layers['counts']

# Join the two modalities along the feature axis: transpose, concatenate, transpose back.
features_by_cells = rna.T.concatenate(protein.T, index_unique=None)
adata = features_by_cells.T

In [None]:
batch_col = "batch"
celltype_col = "cell_type"

# Fixed seed so the split written to disk is reproducible across runs.
split_rng = np.random.default_rng(0)

# observed=True: only iterate over batch / cell-type combinations that actually occur.
groups = adata.obs.groupby([batch_col, celltype_col], observed=True)

idx_1 = []  # cells relabelled 'rna'
idx_2 = []  # cells relabelled 'adt'
idx_4 = []  # remaining cells, which keep their existing 'Modality' value

for (batch, ct), df in groups:
    n = len(df)
    if n == 0:
        continue

    # 30% of the group to each single-modality split, at least one cell each.
    k1 = max(1, int(n * 0.3))
    k2 = max(1, int(n * 0.3))

    # permutation() returns a shuffled copy; shuffling df.index.values in place would
    # mutate the array backing the group's index.
    all_idx = split_rng.permutation(df.index.to_numpy())

    part1 = all_idx[:k1]
    part2 = all_idx[k1:k1+k2]
    part4 = all_idx[k1+k2:]

    idx_1.extend(part1)
    idx_2.extend(part2)
    idx_4.extend(part4)

idx_1 = np.array(idx_1)
idx_2 = np.array(idx_2)
idx_4 = np.array(idx_4)

print("idx_1 (30%) =", idx_1.shape[0])
print("idx_2 (30%) =", idx_2.shape[0])
print("idx_4 (40%) =", idx_4.shape[0])

adata.obs.loc[idx_1, 'Modality'] = 'rna'
adata.obs.loc[idx_2, 'Modality'] = 'adt'

adata.write('./data/neurips-cite/mapping.h5ad')

multiome_and_cite

In [None]:
# Raw joint objects for both technologies.
multiome = sc.read("./neurips2021_multiome.h5ad")
cite = sc.read("./neurips2021_cite.h5ad")

# Independent copies of the gene-expression features of each dataset.
multiome_is_gex = multiome.var['feature_types'] == 'GEX'
cite_is_gex = cite.var['feature_types'] == 'GEX'
rna_multiome = multiome[:, multiome_is_gex].copy()
rna_cite = cite[:, cite_is_gex].copy()

# Second modality of each dataset, as written by the preprocessing sections.
atac = sc.read("./data/neurips-multiome/atac_hvf.h5ad")
adt = sc.read("./data/neurips-cite/protein.h5ad")

In [None]:
# Re-derive the GEX subsets from the raw objects, so re-running this cell does not
# append the batch suffix a second time.
rna_multiome = multiome[:, multiome.var['feature_types'] == 'GEX'].copy()
rna_cite = cite[:, cite.var['feature_types'] == 'GEX'].copy()

# Make batch labels unique across the two technologies.
rna_multiome.obs['batch'] = rna_multiome.obs['batch'].astype(str) + '_multiome'
rna_cite.obs['batch'] = rna_cite.obs['batch'].astype(str) + '_cite'

# Restrict both datasets to their shared genes and stack the cells.
common_genes = rna_multiome.var_names.intersection(rna_cite.var_names)
rna_multiome = rna_multiome[:, common_genes].copy()
rna_cite = rna_cite[:, common_genes].copy()
rna = ad.concat([rna_multiome, rna_cite], join='inner')

# Normalise from the raw counts, as the per-dataset sections do, so the HVG selection
# does not depend on whatever the input files happen to store in X.
rna.X = rna.layers['counts'].copy()
sc.pp.normalize_total(rna, target_sum=1e4)
sc.pp.log1p(rna)
sc.pp.highly_variable_genes(rna, n_top_genes=4000, batch_key='batch')

# Keep raw counts in X and only the highly-variable genes.
rna.X = rna.layers['counts'].copy()
rna = rna[:, rna.var.highly_variable].copy()

In [None]:
def stratified_sample(adata, frac, type_key="cell_type", batch_key="batch", random_state=0):
    """Sample a fixed fraction of cells from every (cell type, batch) group.

    Parameters
    ----------
    adata
        Object whose ``obs`` DataFrame holds the ``type_key`` and ``batch_key`` columns.
    frac
        Fraction of each group to draw; every non-empty group contributes at
        least one cell.
    type_key, batch_key
        ``obs`` columns that define the strata.
    random_state
        Seed passed to ``DataFrame.sample`` for every group.

    Returns
    -------
    list
        ``obs`` index labels of the sampled cells, concatenated group by group.
    """
    df = adata.obs[[type_key, batch_key]]
    # observed=True: do not materialise category combinations that have no cells.
    groups = df.groupby([type_key, batch_key], observed=True)

    idx = []
    for _, g in groups:
        # An empty group cannot be sampled from; skip it instead of raising.
        if g.empty:
            continue
        n = max(1, int(len(g) * frac))
        idx.extend(g.sample(n=n, random_state=random_state).index.tolist())
    return idx


# The same stratification settings are used for all three draws.
sampling_options = dict(frac=0.2, type_key="cell_type", batch_key="batch")

adt_idx = stratified_sample(adt, **sampling_options)
atac_idx = stratified_sample(atac, **sampling_options)

# Cells already drawn for ADT or ATAC are excluded before drawing the RNA subset.
used_cells = set(adt_idx).union(atac_idx)
remaining_cells = list(rna.obs_names.difference(used_cells))

rna_remaining = rna[remaining_cells, :].copy()

rna_idx = stratified_sample(rna_remaining, **sampling_options)

In [None]:

# Match the batch naming used for the concatenated RNA object.
atac.obs['batch'] = atac.obs['batch'].astype(str) + '_multiome'

# The cells in rna_cite get an all-zero ATAC block so that `atac` can be aligned
# row-for-row with `rna`.
obs_template = rna_cite.obs.copy()

n_cells = obs_template.shape[0]
n_features = atac.shape[1]
zero_shape = (n_cells, n_features)

# Zero blocks mirror the storage (sparse or dense) and the dtype of the matrix they
# extend, so the concatenation does not upcast the real data.
if sparse.issparse(atac.X):
    X_zero = sparse.csr_matrix(zero_shape, dtype=atac.X.dtype)
else:
    X_zero = np.zeros(zero_shape, dtype=atac.X.dtype)

layers_zero = {}
for layer_name, layer in atac.layers.items():
    if sparse.issparse(layer):
        layers_zero[layer_name] = sparse.csr_matrix(zero_shape, dtype=layer.dtype)
    else:
        layers_zero[layer_name] = np.zeros(zero_shape, dtype=layer.dtype)

atac_zero = ad.AnnData(
    X=X_zero,
    obs=obs_template,
    var=atac.var.copy(),
    layers=layers_zero
)

atac = ad.concat([atac, atac_zero], join='inner')

# Reorder the rows to the cell order of `rna`.
atac = atac[rna.obs_names, :].copy()

In [None]:

# Match the batch naming used for the concatenated RNA object.
adt.obs['batch'] = adt.obs['batch'].astype(str) + '_cite'

# The cells in rna_multiome get an all-zero ADT block so that `adt` can be aligned
# row-for-row with `rna`.
obs_template = rna_multiome.obs.copy()
n_cells = obs_template.shape[0]
n_features = adt.shape[1]
zero_shape = (n_cells, n_features)

# Zero blocks mirror the storage (sparse or dense) and the dtype of the matrix they
# extend, so the concatenation does not upcast the real data.
if sparse.issparse(adt.X):
    X_zero = sparse.csr_matrix(zero_shape, dtype=adt.X.dtype)
else:
    X_zero = np.zeros(zero_shape, dtype=adt.X.dtype)

layers_zero = {}
for layer_name, layer in adt.layers.items():
    if sparse.issparse(layer):
        layers_zero[layer_name] = sparse.csr_matrix(zero_shape, dtype=layer.dtype)
    else:
        layers_zero[layer_name] = np.zeros(zero_shape, dtype=layer.dtype)

adt_zero = ad.AnnData(
    X=X_zero,
    obs=obs_template,
    var=adt.var.copy(),
    layers=layers_zero
)

adt = ad.concat([adt, adt_zero], join='inner')

# Reorder the rows to the cell order of `rna`.
adt = adt[rna.obs_names, :].copy()

In [None]:
# Persist the three row-aligned modality objects.
trimodal_outputs = (
    (rna, "./data/trimodal_rna.h5ad"),
    (atac, "./data/trimodal_atac.h5ad"),
    (adt, "./data/trimodal_adt.h5ad"),
)
for modality_adata, out_path in trimodal_outputs:
    modality_adata.write(out_path)

multiome_and_cite_mapping_and_imputing

In [None]:
# Raw joint objects for both technologies.
multiome = sc.read("./neurips2021_multiome.h5ad")
cite = sc.read("./neurips2021_cite.h5ad")

# Independent copies of the gene-expression features of each dataset.
multiome_is_gex = multiome.var['feature_types'] == 'GEX'
cite_is_gex = cite.var['feature_types'] == 'GEX'
rna_multiome = multiome[:, multiome_is_gex].copy()
rna_cite = cite[:, cite_is_gex].copy()

# Second modality of each dataset, as written by the preprocessing sections.
atac = sc.read("./data/neurips-multiome/atac_hvf.h5ad")
adt = sc.read("./data/neurips-cite/protein.h5ad")

In [None]:
# Re-derive the GEX subsets from the raw objects, so re-running this cell does not
# append the batch suffix a second time.
rna_multiome = multiome[:, multiome.var['feature_types'] == 'GEX'].copy()
rna_cite = cite[:, cite.var['feature_types'] == 'GEX'].copy()

# Make batch labels unique across the two technologies.
rna_multiome.obs['batch'] = rna_multiome.obs['batch'].astype(str) + '_multiome'
rna_cite.obs['batch'] = rna_cite.obs['batch'].astype(str) + '_cite'

# Restrict both datasets to their shared genes and stack the cells.
common_genes = rna_multiome.var_names.intersection(rna_cite.var_names)
rna_multiome = rna_multiome[:, common_genes].copy()
rna_cite = rna_cite[:, common_genes].copy()
rna = ad.concat([rna_multiome, rna_cite], join='inner')

# Normalise from the raw counts, as the per-dataset sections do, so the HVG selection
# does not depend on whatever the input files happen to store in X.
rna.X = rna.layers['counts'].copy()
sc.pp.normalize_total(rna, target_sum=1e4)
sc.pp.log1p(rna)
sc.pp.highly_variable_genes(rna, n_top_genes=4000, batch_key='batch')

# Keep raw counts in X and only the highly-variable genes.
rna.X = rna.layers['counts'].copy()
rna = rna[:, rna.var.highly_variable].copy()

In [None]:
def stratified_sample(adata, frac, type_key="cell_type", batch_key="batch", random_state=0):
    """Sample a fixed fraction of cells from every (cell type, batch) group.

    Parameters
    ----------
    adata
        Object whose ``obs`` DataFrame holds the ``type_key`` and ``batch_key`` columns.
    frac
        Fraction of each group to draw; every non-empty group contributes at
        least one cell.
    type_key, batch_key
        ``obs`` columns that define the strata.
    random_state
        Seed passed to ``DataFrame.sample`` for every group.

    Returns
    -------
    list
        ``obs`` index labels of the sampled cells, concatenated group by group.
    """
    df = adata.obs[[type_key, batch_key]]
    # observed=True: do not materialise category combinations that have no cells.
    groups = df.groupby([type_key, batch_key], observed=True)

    idx = []
    for _, g in groups:
        # An empty group cannot be sampled from; skip it instead of raising.
        if g.empty:
            continue
        n = max(1, int(len(g) * frac))
        idx.extend(g.sample(n=n, random_state=random_state).index.tolist())
    return idx


# The same stratification settings are used for all three draws.
sampling_options = dict(frac=0.2, type_key="cell_type", batch_key="batch")

adt_idx = stratified_sample(adt, **sampling_options)
atac_idx = stratified_sample(atac, **sampling_options)

# Cells already drawn for ADT or ATAC are excluded before drawing the RNA subset.
used_cells = set(adt_idx).union(atac_idx)
remaining_cells = list(rna.obs_names.difference(used_cells))

rna_remaining = rna[remaining_cells, :].copy()

rna_idx = stratified_sample(rna_remaining, **sampling_options)

In [None]:
# Re-read the full-peak ATAC object from the same relative location the preprocessing
# section wrote it to, rather than a machine-specific absolute path.
atac_all = sc.read("./data/neurips-multiome/atac.h5ad")

# Extend the selected peak set with extra peaks that must be present.
extra_peaks = ["chr11-118343914-118344801"]
original_vars = atac.var.index.tolist()
# dict.fromkeys de-duplicates while keeping a stable order; list(set(...)) would give
# a different feature order from run to run.
all_selected_vars = list(dict.fromkeys(original_vars + extra_peaks))
atac = atac_all[:, all_selected_vars].copy()

# Match the batch naming used for the concatenated RNA object.
atac.obs['batch'] = atac.obs['batch'].astype(str) + '_multiome'

# The cells in rna_cite get an all-zero ATAC block so that `atac` can be aligned
# row-for-row with `rna`.
obs_template = rna_cite.obs.copy()

n_cells = obs_template.shape[0]
n_features = atac.shape[1]
zero_shape = (n_cells, n_features)

# Zero blocks mirror the storage (sparse or dense) and the dtype of the matrix they
# extend, so the concatenation does not upcast the real data.
if sparse.issparse(atac.X):
    X_zero = sparse.csr_matrix(zero_shape, dtype=atac.X.dtype)
else:
    X_zero = np.zeros(zero_shape, dtype=atac.X.dtype)

layers_zero = {}
for layer_name, layer in atac.layers.items():
    if sparse.issparse(layer):
        layers_zero[layer_name] = sparse.csr_matrix(zero_shape, dtype=layer.dtype)
    else:
        layers_zero[layer_name] = np.zeros(zero_shape, dtype=layer.dtype)

atac_zero = ad.AnnData(
    X=X_zero,
    obs=obs_template,
    var=atac.var.copy(),
    layers=layers_zero
)

atac = ad.concat([atac, atac_zero], join='inner')

# Reorder the rows to the cell order of `rna`.
atac = atac[rna.obs_names, :].copy()

In [None]:
# Match the batch naming used for the concatenated RNA object.
adt.obs['batch'] = adt.obs['batch'].astype(str) + '_cite'

# The cells in rna_multiome get an all-zero ADT block so that `adt` can be aligned
# row-for-row with `rna`.
obs_template = rna_multiome.obs.copy()
n_cells = obs_template.shape[0]
n_features = adt.shape[1]
zero_shape = (n_cells, n_features)

# Zero blocks mirror the storage (sparse or dense) and the dtype of the matrix they
# extend, so the concatenation does not upcast the real data.
if sparse.issparse(adt.X):
    X_zero = sparse.csr_matrix(zero_shape, dtype=adt.X.dtype)
else:
    X_zero = np.zeros(zero_shape, dtype=adt.X.dtype)

layers_zero = {}
for layer_name, layer in adt.layers.items():
    if sparse.issparse(layer):
        layers_zero[layer_name] = sparse.csr_matrix(zero_shape, dtype=layer.dtype)
    else:
        layers_zero[layer_name] = np.zeros(zero_shape, dtype=layer.dtype)

adt_zero = ad.AnnData(
    X=X_zero,
    obs=obs_template,
    var=adt.var.copy(),
    layers=layers_zero
)

adt = ad.concat([adt, adt_zero], join='inner')

# Reorder the rows to the cell order of `rna`.
adt = adt[rna.obs_names, :].copy()

In [None]:
# Plain strings so new label values can be assigned.
# NOTE(review): relies on a 'Modality' column already being present in rna.obs after
# the concatenation -- confirm both input datasets carry it.
rna.obs["Modality"] = rna.obs["Modality"].astype(str)

# Relabel the sampled cells; cells outside all three index lists keep their value.
modality_assignments = (
    (adt_idx, "adt"),
    (atac_idx, "atac"),
    (rna_idx, "rna"),
)
for cell_ids, modality_label in modality_assignments:
    rna.obs.loc[cell_ids, "Modality"] = modality_label

In [None]:
# Persist the three row-aligned modality objects for the mapping/imputing setup.
mapping_imputing_outputs = (
    (rna, "./data/trimodal_mappingandimputing_rna.h5ad"),
    (atac, "./data/trimodal_mappingandimputing_atac.h5ad"),
    (adt, "./data/trimodal_mappingandimputing_adt.h5ad"),
)
for modality_adata, out_path in mapping_imputing_outputs:
    modality_adata.write(out_path)