In [9]:
import pandas as pd
import anndata as ad
import scipy.sparse as sps
from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
from scipy.io import mmread, mmwrite
import os
import numpy as np
from os.path import join
from pathlib import Path, PurePath
import gzip
import scanpy as sc

In [2]:
import json
import copy
from matplotlib.image import imread
def load_spatial(path, adata, library_id='0'):
    """Attach the spatial images, scale factors and spot positions in ``path`` to ``adata``.

    ``adata`` is modified in place and nothing is returned:

    * ``adata.uns["spatial"]`` is replaced by a dict with a single
      ``library_id`` entry holding ``"images"`` (``"hires"`` and ``"lowres"``)
      and ``"scalefactors"``.
    * The columns of ``tissue_positions.csv`` are left-joined onto ``adata.obs``
      using the first column of the file as the key.
    * ``adata.obsm["spatial"]`` receives the two pixel-coordinate columns, which
      are then removed from ``adata.obs``.

    Parameters
    ----------
    path
        Directory containing ``tissue_positions.csv``, ``scalefactors_json.json``,
        ``tissue_hires_image.png`` and ``tissue_lowres_image.png``.
    adata
        Annotated data object to update.
    library_id
        Key under which this library is stored in ``adata.uns["spatial"]``.

    Raises
    ------
    OSError
        If loading either image raises any exception.
    """
    tissue_positions_file = join(path, "tissue_positions.csv")
    files = {
        "tissue_positions_file": tissue_positions_file,
        "scalefactors_json_file": join(path, "scalefactors_json.json"),
        "hires_image": join(path, "tissue_hires_image.png"),
        "lowres_image": join(path, "tissue_lowres_image.png"),
    }

    # Start from a fresh "spatial" entry; whatever was stored there is discarded.
    adata.uns["spatial"] = {library_id: {"images": {}}}
    library = adata.uns["spatial"][library_id]

    for res in ("hires", "lowres"):
        image_path = str(files[f"{res}_image"])
        try:
            library["images"][res] = imread(image_path)
        except Exception:
            raise OSError(f"Could not find '{res}_image'")

    # Scale factors are stored exactly as parsed from the JSON file.
    scalefactors_raw = Path(files["scalefactors_json_file"]).read_bytes()
    library["scalefactors"] = json.loads(scalefactors_raw)

    # No "metadata" entry (chemistry_description / software_version) is stored:
    # filling it would need an `attrs` mapping that this function does not have.

    # Spot positions: a file named "tissue_positions.csv" is read with a header
    # row, any other file name without one.
    if Path(tissue_positions_file).name == "tissue_positions.csv":
        header_row = 0
    else:
        header_row = None
    positions = pd.read_csv(
        files["tissue_positions_file"],
        header=header_row,
        index_col=0,
    )
    # Labels are assigned by position: the 4th data column is labelled "col"
    # and the 5th "row".
    # NOTE(review): presumably deliberate so that obsm["spatial"] below ends up
    # as (x, y) -- confirm against the column order of the positions file.
    positions.columns = [
        "in_tissue",
        "array_row",
        "array_col",
        "pxl_col_in_fullres",
        "pxl_row_in_fullres",
    ]

    adata.obs = adata.obs.join(positions, how="left")

    # Move the pixel coordinates from obs into obsm["spatial"].
    coord_columns = ["pxl_row_in_fullres", "pxl_col_in_fullres"]
    adata.obsm["spatial"] = adata.obs[coord_columns].to_numpy()
    adata.obs.drop(columns=coord_columns, inplace=True)

def load_data(_dir):
    """Build an AnnData object from the gzipped matrix files in ``_dir``.

    Reads ``features.tsv.gz``, ``barcodes.tsv.gz`` and ``matrix.mtx.gz``.

    Parameters
    ----------
    _dir
        Directory containing the three files.

    Returns
    -------
    AnnData whose ``X`` is the transposed matrix in CSR format, with
    ``obs_names`` taken from barcode column 0, ``var_names`` from feature
    column 1, ``var['id']`` from feature column 0 and ``var['type']`` from
    feature column 2.
    """
    def _read_table(filename):
        # Both tables are header-less, tab-separated and gzip-compressed.
        return pd.read_csv(join(_dir, filename), compression='gzip', sep='\t', header=None)

    features = _read_table('features.tsv.gz')
    barcode_table = _read_table('barcodes.tsv.gz')

    with gzip.open(join(_dir, 'matrix.mtx.gz'), 'rb') as handle:
        counts = mmread(handle)

    # Transposed so that rows are labelled by barcodes and columns by features.
    adata = sc.AnnData(sps.csr_matrix(counts.T))
    adata.obs_names = barcode_table[0].values
    adata.var_names = features[1].values
    adata.var['id'] = features[0].values
    adata.var['type'] = features[2].values
    return adata

In [18]:
def create_new_ad(ad_rna, ad_adt, batch_key, dataset_id, species):
    """Repackage an RNA / ADT pair into two minimal AnnData objects.

    Only the matrix, the observation and feature names and one batch column
    are carried over from each input; every other annotation is left behind.

    Parameters
    ----------
    ad_rna, ad_adt
        Source objects for the RNA and ADT modalities.
    batch_key
        Column of ``.obs`` copied into ``obs['batch']`` as a categorical.
    dataset_id, species
        Stored in ``.uns`` under ``"dataset_id"`` and ``"organism"``.

    Returns
    -------
    tuple
        ``(rna, adt)``. Each has a CSR ``X``, a copy of it in
        ``layers['counts']``, and ``var['feature_types']`` set to ``'GEX'``
        or ``'ADT'`` respectively.
    """
    def _annotate(target, source, feature_type):
        # Metadata shared by both modalities.
        target.var['feature_types'] = pd.Categorical([feature_type] * len(target.var_names))
        target.obs['batch'] = pd.Categorical(source.obs[batch_key].to_list())
        target.uns = {"dataset_id": dataset_id, "organism": species}
        target.layers['counts'] = target.X.copy()
        return target

    # RNA: the obs / var indexes end up named 'cell_ids' / 'gene_ids'.
    cell_table = pd.DataFrame(ad_rna.obs_names)
    cell_table.columns = ['cell_ids']
    gene_table = pd.DataFrame(ad_rna.var_names)
    gene_table.columns = ['gene_ids']
    input_train_mod1 = ad.AnnData(
        csr_matrix(ad_rna.X),
        obs=pd.DataFrame(index=cell_table['cell_ids']),
        var=pd.DataFrame(index=gene_table['gene_ids']),
    )
    _annotate(input_train_mod1, ad_rna, 'GEX')

    # ADT: the indexes are built from plain lists of names.
    input_train_mod2 = ad.AnnData(
        csr_matrix(ad_adt.X),
        obs=pd.DataFrame(index=ad_adt.obs_names.to_list()),
        var=pd.DataFrame(index=ad_adt.var_names.to_list()),
    )
    _annotate(input_train_mod2, ad_adt, 'ADT')

    return input_train_mod1, input_train_mod2

In [11]:
def _tag_slice(rna, adt, tag):
    """Write `tag` to obs['src'], prefix every barcode with '<tag>-' and make feature names unique."""
    rna.obs['src'] = adt.obs['src'] = [tag] * rna.n_obs
    rna.obs_names = [f'{tag}-{x}' for x in rna.obs_names]
    adt.obs_names = [f'{tag}-{x}' for x in adt.obs_names]
    rna.var_names_make_unique()
    adt.var_names_make_unique()


def _load_annotated_slice(root, slice_name, annotation_csv, tag):
    """Read the RNA/ADT h5ad pair of one slice, attach its 'manual' labels as obs['lab'] and tag it."""
    rna = sc.read_h5ad(join(root, f'{slice_name}/adata_RNA.h5ad'))
    adt = sc.read_h5ad(join(root, f'{slice_name}/adata_ADT.h5ad'))
    meta = pd.read_csv(join(root, f'{slice_name}/{annotation_csv}'), index_col=0)
    # Labels are looked up by the original barcodes, i.e. before prefixing.
    rna.obs['lab'] = meta.loc[rna.obs_names, 'manual'].to_list()
    adt.obs['lab'] = meta.loc[adt.obs_names, 'manual'].to_list()
    _tag_slice(rna, adt, tag)
    return rna, adt, meta


## slice s3: one combined matrix, split into modalities by feature type
data_dir = '/disco_500t/xuhua/data/spatial_multi_omics/lymp_node/LN-2024-new/outs'

ad3 = load_data(join(data_dir, 'filtered_feature_bc_matrix'))
ad3_rna = ad3[:, ad3.var['type']=='Gene Expression'].copy()
ad3_adt = ad3[:, ad3.var['type']=='Antibody Capture'].copy()
for _modality in (ad3_rna, ad3_adt):
    load_spatial(join(data_dir, 'spatial'), _modality)
_tag_slice(ad3_rna, ad3_adt, 's3')

## slices s1 (A1) and s2 (D1): separate RNA / ADT files plus a label table
data_dir = '/disco_500t/xuhua/data/spatial_multi_omics/lymp_tonsil_ramen'

ad_a1_rna, ad_a1_adt, meta1 = _load_annotated_slice(data_dir, 'lymph_A1', 'A1_LN_cloupe_Kwoh.csv', 's1')
ad_d1_rna, ad_d1_adt, meta2 = _load_annotated_slice(data_dir, 'lymph_D1', 'D1_LN_cloupe_Kwoh.csv', 's2')

## unify feature names: keep only genes / proteins present in all three slices
shared_gene = (
    ad_a1_rna.var_names
    .intersection(ad_d1_rna.var_names)
    .intersection(ad3_rna.var_names)
)
shared_prot = (
    ad_a1_adt.var_names
    .intersection(ad_d1_adt.var_names)
    .intersection(ad3_adt.var_names)
)

ad_a1_rna = ad_a1_rna[:, shared_gene].copy()
ad_d1_rna = ad_d1_rna[:, shared_gene].copy()
ad3_rna = ad3_rna[:, shared_gene].copy()

ad_a1_adt = ad_a1_adt[:, shared_prot].copy()
ad_d1_adt = ad_d1_adt[:, shared_prot].copy()
ad3_adt = ad3_adt[:, shared_prot].copy()

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [12]:
# Pool the three slices per modality. `ad_adt_all` is built here but is not
# referenced again in the cells shown in this notebook.
ad_rna_all = sc.concat([ad_a1_rna, ad_d1_rna, ad3_rna])
ad_adt_all = sc.concat([ad_a1_adt, ad_d1_adt, ad3_adt])

# Flag highly variable genes on the pooled RNA data, using obs['src'] as the batch.
sc.pp.highly_variable_genes(ad_rna_all, batch_key="src", flavor="seurat_v3", n_top_genes=5000)

# Restrict every RNA slice to the genes flagged above.
hvg_names = ad_rna_all.var.query('highly_variable').index
ad_a1_rna = ad_a1_rna[:, hvg_names].copy()
ad_d1_rna = ad_d1_rna[:, hvg_names].copy()
ad3_rna = ad3_rna[:, hvg_names].copy()

In [13]:
# Per-slice AnnData objects, one list per modality. Both lists use the same
# order (the slices tagged 's1', 's2', 's3'), so position k in RNA_ADS and
# ADT_ADS refers to the same slice. Later cells index these lists by fold.
RNA_ADS = [ad_a1_rna, ad_d1_rna, ad3_rna]
ADT_ADS = [ad_a1_adt, ad_d1_adt, ad3_adt]

In [20]:
# Leave-one-slice-out export: for fold i, slice i is the test set and the
# remaining slices (in their RNA_ADS / ADT_ADS order) form the training set.
work_dir = '/disco_500t/xuhua/gitrepo/dance/data'
# Directory name and file-name stem used for every exported file.
# NOTE(review): presumably the naming the downstream loader expects -- confirm.
dataset_name = 'openproblems_bmmc_cite_phase2_rna'
n_slices = len(RNA_ADS)

for i in range(n_slices):
    tmp_path = join(work_dir, f'Lymph_cv{i+1}/')
    os.makedirs(tmp_path + dataset_name, exist_ok=True)

    # Build the training indices with an ordered comprehension rather than a
    # set difference, so the concatenation order never depends on set iteration.
    train_idx = [idx for idx in range(n_slices) if idx != i]
    test_idx  = [i]
    train_rna_data = sc.concat([RNA_ADS[idx] for idx in train_idx])
    test_rna_data  = sc.concat([RNA_ADS[idx] for idx in test_idx])
    train_adt_data = sc.concat([ADT_ADS[idx] for idx in train_idx])
    test_adt_data  = sc.concat([ADT_ADS[idx] for idx in test_idx])

    input_train_mod1, input_train_mod2 = create_new_ad(train_rna_data, train_adt_data, 'src', 'train', 'human')
    input_test_mod1, input_test_mod2 = create_new_ad(test_rna_data, test_adt_data, 'src', 'test', 'human')

    ## Store data to the specific location
    file_prefix = tmp_path + f"{dataset_name}/{dataset_name}.censor_dataset.output_"
    input_train_mod1.write_h5ad(file_prefix + "train_mod1.h5ad")
    input_train_mod2.write_h5ad(file_prefix + "train_mod2.h5ad")
    input_test_mod1.write_h5ad(file_prefix + "test_mod1.h5ad")
    input_test_mod2.write_h5ad(file_prefix + "test_mod2.h5ad")
    

In [21]:
import csv
def csv_read(path, header=True, index=True):
    """Read a numeric CSV file into a 2-D NumPy array of floats.

    Parameters
    ----------
    path
        Path of the CSV file.
    header
        If true, the first line is treated as a header and skipped.
    index
        If true, the first field of every row is treated as a row label and
        dropped. It is dropped *before* conversion, so it may be non-numeric
        (for example a barcode).

    Returns
    -------
    numpy.ndarray
        One row per parsed data line.

    Raises
    ------
    ValueError
        From ``np.vstack`` when no row could be parsed or when the parsed
        rows have different lengths.

    Notes
    -----
    Blank lines are ignored. A row containing a value that cannot be converted
    to float is reported with ``print`` and skipped, so the result can have
    fewer rows than the file has data lines.
    """
    rows = []
    with open(path, mode='r', newline='') as file:
        reader = csv.reader(file)

        for i, row in enumerate(reader):
            if header and i == 0:
                continue
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing one).
                continue
            # Drop the label first so that only data fields are converted.
            values = row[1:] if index else row
            try:
                rows.append([float(item) for item in values])
            except ValueError as e:
                print(f"Error converting to float: {e}")
    return np.vstack(rows)

In [22]:
# Wrap the per-fold predictions stored in `babel.csv` into AnnData objects that
# reuse the obs / var of the held-out ADT slice, then save them as h5ad files.
out_dir = '/disco_500t/xuhua/gitrepo/dance/output/RNA2ADT'
imputed_dir = '/disco_500t/xuhua/gitrepo/BridgeNorm/figures/imputation/Lymph/babel'
# Make sure the destination exists, as is done for the cross-validation export.
os.makedirs(imputed_dir, exist_ok=True)

for cv in range(3):
    _dir = join(out_dir, f'Lymph_3slices_cv{cv+1}')
    pr_X = csv_read(join(_dir, 'babel.csv'), index=True, header=True)
    print(pr_X.shape)
    # csv_read skips rows it cannot parse, so fail with a clear message if the
    # predictions no longer line up with the held-out slice of this fold.
    if pr_X.shape != ADT_ADS[cv].shape:
        raise ValueError(
            f"cv{cv+1}: predictions have shape {pr_X.shape}, "
            f"expected {ADT_ADS[cv].shape}"
        )
    ad_pr = sc.AnnData(pr_X, obs=ADT_ADS[cv].obs.copy(), var=ADT_ADS[cv].var.copy())
    ad_pr.write_h5ad(join(imputed_dir, f'cv{cv+1}_imputedADT.h5ad'))

(3484, 31)
(3359, 31)
(3408, 31)
