In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sps
from os.path import join
from scipy.sparse import csr_matrix
from scipy.io import mmread
from scButterfly.train_model_cite import Model
import gzip 
from pathlib import Path, PurePath
import torch
import scvi
import torch.nn as nn

Global seed set to 0


In [2]:
def load_data(_dir):
    """Read a gzipped feature-barcode matrix folder into an AnnData object.

    The folder is expected to be laid out like a 10x ``filtered_feature_bc_matrix``
    directory: ``features.tsv.gz``, ``barcodes.tsv.gz`` and ``matrix.mtx.gz``.

    Parameters
    ----------
    _dir : str
        Path of the folder holding the three gzipped files.

    Returns
    -------
    AnnData
        The Matrix Market matrix, transposed and stored as CSR, with
        ``obs_names`` taken from column 0 of the barcode table, ``var_names``
        from column 1 of the feature table, and ``var['id']`` / ``var['type']``
        from columns 0 and 2 of the feature table.
    """
    def _read_table(filename):
        # Both tables are header-less, tab-separated and gzip-compressed.
        return pd.read_csv(join(_dir, filename), compression='gzip', sep='\t', header=None)

    features = _read_table('features.tsv.gz')
    barcodes = _read_table('barcodes.tsv.gz')

    with gzip.open(join(_dir, 'matrix.mtx.gz'), 'rb') as handle:
        counts = mmread(handle)

    # The matrix is transposed so that barcodes index rows (obs) and features
    # index columns (var), matching the name assignments below.
    adata = sc.AnnData(sps.csr_matrix(counts.T))
    adata.obs_names = barcodes[0].values
    adata.var_names = features[1].values
    for column, source in (('id', 0), ('type', 2)):
        adata.var[column] = features[source].values
    return adata

import json
import copy
from matplotlib.image import imread
def load_spatial(path, adata, library_id='0'):
    """Attach Visium-style images, scale factors and spot coordinates to ``adata``.

    Parameters
    ----------
    path : str
        Folder containing ``tissue_hires_image.png``, ``tissue_lowres_image.png``,
        ``scalefactors_json.json`` and a spot-position table. The table is read
        from ``tissue_positions.csv`` (first row treated as a header); when that
        file is absent and the header-less legacy ``tissue_positions_list.csv``
        exists, the legacy file is read instead.
    adata : AnnData
        Object to annotate. Its ``obs`` index is matched (left join) against the
        first column of the position table.
    library_id : str, default '0'
        Key under which images and scale factors are stored in
        ``adata.uns['spatial']``.

    Returns
    -------
    None
        ``adata`` is modified in place:
        ``adata.uns['spatial']`` is replaced by ``{library_id: {...}}`` holding
        ``images`` and ``scalefactors``; ``adata.obs`` gains ``in_tissue``,
        ``array_row`` and ``array_col``; ``adata.obsm['spatial']`` receives the
        two pixel-coordinate columns.

    Raises
    ------
    OSError
        If either image cannot be read (the underlying error is chained).
    """
    # Prefer the header-carrying file; fall back to the legacy header-less name
    # only when the former is missing. If neither exists the original path is
    # kept, so pandas raises the same FileNotFoundError as before.
    tissue_positions_file = join(path, "tissue_positions.csv")
    legacy_positions_file = join(path, "tissue_positions_list.csv")
    if not Path(tissue_positions_file).exists() and Path(legacy_positions_file).exists():
        tissue_positions_file = legacy_positions_file

    files = dict(
        tissue_positions_file=tissue_positions_file,
        scalefactors_json_file=join(path, "scalefactors_json.json"),
        hires_image=join(path, "tissue_hires_image.png"),
        lowres_image=join(path, "tissue_lowres_image.png"),
    )

    adata.uns["spatial"] = dict()
    adata.uns["spatial"][library_id] = dict()
    adata.uns["spatial"][library_id]["images"] = dict()
    for res in ["hires", "lowres"]:
        try:
            adata.uns["spatial"][library_id]["images"][res] = imread(
                str(files[f"{res}_image"])
            )
        except Exception as err:
            # Same exception type and message as before, but keep the root
            # cause attached so a corrupt file is distinguishable from a missing one.
            raise OSError(f"Could not find '{res}_image'") from err

    # read json scalefactors
    adata.uns["spatial"][library_id]["scalefactors"] = json.loads(
        Path(files["scalefactors_json_file"]).read_bytes()
    )

    # read coordinates
    positions = pd.read_csv(
        files["tissue_positions_file"],
        header=0 if Path(tissue_positions_file).name == "tissue_positions.csv" else None,
        index_col=0,
    )
    positions.columns = [
        "in_tissue",
        "array_row",
        "array_col",
        "pxl_col_in_fullres",
        "pxl_row_in_fullres",
    ]

    # Drop position columns left behind by an earlier call on the same object;
    # otherwise the join below fails on overlapping column names.
    stale_columns = [col for col in positions.columns if col in adata.obs.columns]
    adata.obs = adata.obs.drop(columns=stale_columns).join(positions, how="left")

    # Column 0 of obsm['spatial'] is the table's last data column and column 1
    # its second-to-last, following the labels assigned above.
    pixel_columns = ["pxl_row_in_fullres", "pxl_col_in_fullres"]
    adata.obsm["spatial"] = adata.obs[pixel_columns].to_numpy()

    # The pixel coordinates now live in obsm; keep obs free of duplicates.
    adata.obs = adata.obs.drop(columns=pixel_columns)

In [3]:
# ---- Section s3: feature-barcode matrix folder holding RNA and ADT together ----
data_dir = '/disco_500t/xuhua/data/spatial_multi_omics/lymp_node/LN-2024-new/outs'

ad3 = load_data(join(data_dir, 'filtered_feature_bc_matrix'))
# Split the combined matrix by the feature-type column filled in by load_data.
_is_gene = ad3.var['type'] == 'Gene Expression'
_is_prot = ad3.var['type'] == 'Antibody Capture'
ad3_rna = ad3[:, _is_gene].copy()
ad3_adt = ad3[:, _is_prot].copy()

for _ad in (ad3_rna, ad3_adt):
    load_spatial(join(data_dir, 'spatial'), _ad)
    _ad.obs['src'] = ['s3'] * ad3_rna.n_obs
    # Prefix barcodes with the section tag before the sections are pooled.
    _ad.obs_names = [f's3-{x}' for x in _ad.obs_names]
    _ad.var_names_make_unique()

# ---- Sections s1 / s2: pre-built h5ad pairs with manual annotations ----
data_dir = '/disco_500t/xuhua/data/spatial_multi_omics/lymp_tonsil_ramen'


def _load_labelled_section(root, section, meta_name, tag):
    """Load one section's RNA/ADT h5ad pair and annotate both objects.

    Reads ``<root>/<section>/adata_RNA.h5ad``, ``adata_ADT.h5ad`` and the
    annotation CSV ``meta_name``; copies the CSV's ``manual`` column into
    ``obs['lab']`` (looked up by the original barcodes), sets ``obs['src']``
    to ``tag``, prefixes every barcode with ``'<tag>-'`` and makes the
    variable names unique. Returns ``(rna, adt, meta)``.
    """
    rna = sc.read_h5ad(join(root, f'{section}/adata_RNA.h5ad'))
    adt = sc.read_h5ad(join(root, f'{section}/adata_ADT.h5ad'))
    meta = pd.read_csv(join(root, f'{section}/{meta_name}'), index_col=0)
    for ad in (rna, adt):
        # Labels must be looked up before the barcodes are renamed.
        ad.obs['lab'] = meta.loc[ad.obs_names, 'manual'].to_list()
        ad.obs['src'] = [tag] * rna.n_obs
        ad.obs_names = [f'{tag}-{x}' for x in ad.obs_names]
        ad.var_names_make_unique()
    return rna, adt, meta


ad_a1_rna, ad_a1_adt, meta1 = _load_labelled_section(data_dir, 'lymph_A1', 'A1_LN_cloupe_Kwoh.csv', 's1')
ad_d1_rna, ad_d1_adt, meta2 = _load_labelled_section(data_dir, 'lymph_D1', 'D1_LN_cloupe_Kwoh.csv', 's2')

## unify feature names
shared_gene = (
    ad_a1_rna.var_names
    .intersection(ad_d1_rna.var_names)
    .intersection(ad3_rna.var_names)
)
shared_prot = (
    ad_a1_adt.var_names
    .intersection(ad_d1_adt.var_names)
    .intersection(ad3_adt.var_names)
)

# Restrict every section to the features present in all three.
ad_a1_rna, ad_d1_rna, ad3_rna = [ad[:, shared_gene].copy() for ad in (ad_a1_rna, ad_d1_rna, ad3_rna)]
ad_a1_adt, ad_d1_adt, ad3_adt = [ad[:, shared_prot].copy() for ad in (ad_a1_adt, ad_d1_adt, ad3_adt)]

In [4]:
# Pool the three sections per modality, in s1, s2, s3 order.
_rna_sections = [ad_a1_rna, ad_d1_rna, ad3_rna]
_adt_sections = [ad_a1_adt, ad_d1_adt, ad3_adt]
ad_rna_all = sc.concat(_rna_sections)
ad_adt_all = sc.concat(_adt_sections)

In [5]:
# Flag the top 5000 highly variable genes on the pooled RNA data, using the
# section label in obs['src'] as the batch key.
sc.pp.highly_variable_genes(ad_rna_all, batch_key="src", flavor="seurat_v3", n_top_genes=5000)

# Look the selected gene names up once and apply the same subset to every section.
_hvg_names = ad_rna_all.var.index[ad_rna_all.var['highly_variable']]
ad_a1_rna, ad_d1_rna, ad3_rna = [
    ad[:, _hvg_names].copy() for ad in (ad_a1_rna, ad_d1_rna, ad3_rna)
]

In [6]:
# Per-section objects in a fixed s1, s2, s3 order; ADT_ADS[k] pairs with RNA_ADS[k].
RNA_ADS = [ad_a1_rna, ad_d1_rna, ad3_rna]
ADT_ADS = [ad_a1_adt, ad_d1_adt, ad3_adt]
n_batches = len(RNA_ADS)

# IDS[k] holds the row positions of section k once the sections are stacked
# in RNA_ADS order: each block starts where the previous one ended.
IDS = []
_row_offset = 0
for _ad in RNA_ADS:
    IDS.append(_row_offset + np.arange(_ad.n_obs))
    _row_offset += _ad.n_obs

In [7]:
# Folder that receives the imputed-ADT h5ad files; created on first run.
output_path = "./Lymph/" #path to results
Path(output_path).mkdir(parents=True, exist_ok=True)

In [8]:
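# Disabled sanity check: for each ADT object, print the mean per-protein Pearson
# correlation between the raw counts and their ceil(count / 1000) rescaling
# (the transformation applied in the next cell).
# from scipy.stats import pearsonr
# for ad in ADT_ADS:
#     new_X = np.ceil(ad.X.A / 1000)
#     pccs = []
#     for col in range(new_X.shape[1]):
#         x, y = ad.X.A[:, col], new_X[:, col]
#         pccs.append(pearsonr(x,y)[0])
#     print(np.mean(pccs))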

In [9]:
# Coarsen the ADT counts to ceil(count / ADT_SCALE_DIVISOR) before model fitting.
ADT_SCALE_DIVISOR = 1000  # to avoid numerical problem

for ad in ADT_ADS:
    # Re-running this cell must not divide already-rescaled values again, so
    # each object is tagged once it has been processed.
    if ad.uns.get('adt_scale_divisor') == ADT_SCALE_DIVISOR:
        continue
    # `.toarray()` replaces the sparse `.A` shortcut, which is deprecated and
    # removed in recent SciPy releases; matrices that are already dense are
    # passed through unchanged.
    _counts = ad.X.toarray() if sps.issparse(ad.X) else np.asarray(ad.X)
    ad.X = np.ceil(_counts / ADT_SCALE_DIVISOR)
    ad.uns['adt_scale_divisor'] = ADT_SCALE_DIVISOR

In [10]:
# Leave-one-section-out imputation: in fold i the proteins of section i are
# replaced by zeros, totalVI is trained on all sections, and the proteins of
# section i are predicted and written to disk.
for i in range(n_batches):
    RNA_data = sc.concat(RNA_ADS)

    # Zero-filled stand-in for the held-out section's protein matrix; the
    # training log reports such a section as a batch with missing proteins.
    held_out = ADT_ADS[i]
    ad_empty = sc.AnnData(np.zeros(held_out.shape), obs=held_out.obs.copy())
    ad_empty.var_names = held_out.var_names

    ADT_data = sc.concat(
        [ad_empty if bi == i else ADT_ADS[bi] for bi in range(n_batches)]
    )
    test_ids = list(IDS[i])
    train_ids = list(np.hstack([IDS[bi] for bi in range(n_batches) if bi != i]))

    RNA_data.obsm['protein_expression'] = ADT_data.to_df()
    # Section labels that still carry measured proteins in this fold.
    train_batches = RNA_data[train_ids].obs['src'].unique()
    print(train_batches)

    scvi.model.TOTALVI.setup_anndata(
        RNA_data,
        batch_key="src",
        protein_expression_obsm_key="protein_expression",
    )

    model = scvi.model.TOTALVI(
        RNA_data,
        latent_distribution="normal",
        n_layers_decoder=2
    )
    model.train()

    RNA_data.obsm["X_totalVI"] = model.get_latent_representation()
    RNA_data.obsm["protein_fg_prob"] = model.get_protein_foreground_probability(
        transform_batch=train_batches
    )
    # Only the protein half of the returned pair is used below.
    _, protein_means = model.get_normalized_expression(
        n_samples=25,
        transform_batch=train_batches,
        include_protein_background=True,
        sample_protein_mixing=False,
        return_mean=True,
    )

    # Keep the predictions for the held-out rows and save them for this fold.
    protein = protein_means.iloc[test_ids]
    ad_pred = sc.AnnData(protein, obs=ADT_data[test_ids].obs.copy())
    ad_pred.var_names = ADT_data.var_names
    ad_pred.write_h5ad(join(output_path, f'cv{i}_imputedADT.h5ad'))

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


['s2' 's3']
[34mINFO    [0m Using column names from columns of adata.obsm[1m[[0m[32m'protein_expression'[0m[1m][0m                                       
[34mINFO    [0m Found batches with missing protein expression                                                             
[34mINFO    [0m Computing empirical prior initialization for protein background.                                          


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 130/400:  32%|██▎    | 130/400 [03:46<07:23,  1.64s/it, loss=637, v_num=1]Epoch 00130: reducing learning rate of group 0 to 2.4000e-03.
Epoch 144/400:  36%|██▌    | 144/400 [04:09<07:23,  1.73s/it, loss=623, v_num=1]
Monitored metric elbo_validation did not improve in the last 45 records. Best score: 1273.287. Signaling Trainer to stop.
['s1' 's3']
[34mINFO    [0m Using column names from columns of adata.obsm[1m[[0m[32m'protein_expression'[0m[1m][0m                                       
[34mINFO    [0m Found batches with missing protein expression                                                             
[34mINFO    [0m Computing empirical prior initialization for protein background.                                          


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 143/400:  36%|██▌    | 143/400 [04:05<06:54,  1.61s/it, loss=621, v_num=1]Epoch 00143: reducing learning rate of group 0 to 2.4000e-03.
Epoch 157/400:  39%|██▋    | 157/400 [04:27<06:54,  1.71s/it, loss=617, v_num=1]
Monitored metric elbo_validation did not improve in the last 45 records. Best score: 1275.586. Signaling Trainer to stop.
['s1' 's2']
[34mINFO    [0m Using column names from columns of adata.obsm[1m[[0m[32m'protein_expression'[0m[1m][0m                                       
[34mINFO    [0m Found batches with missing protein expression                                                             
[34mINFO    [0m Computing empirical prior initialization for protein background.                                          


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 133/400:  33%|██▎    | 133/400 [03:46<06:56,  1.56s/it, loss=601, v_num=1]Epoch 00133: reducing learning rate of group 0 to 2.4000e-03.
Epoch 147/400:  37%|██▌    | 147/400 [04:08<07:07,  1.69s/it, loss=628, v_num=1]
Monitored metric elbo_validation did not improve in the last 45 records. Best score: 1272.048. Signaling Trainer to stop.


In [11]:
1  # NOTE(review): leftover scratch cell; it only echoes the literal 1 and nothing here depends on it

1