In [1]:
import sys, os, importlib

# --- Paths ---
# The kernel's working directory is expected to be .../DeepOMAPNet/Notebooks,
# which makes its parent directory the project root (.../DeepOMAPNet).
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]

# Put the project root at the front of sys.path (only once, so re-running this
# cell does not add duplicates) so that project-local packages can be imported.
if project_root not in sys.path:
    sys.path.insert(0, project_root)



# Dataset - 1

In [2]:
import scanpy as sc
import anndata


# Build the preprocessed AnnData objects with the project's own helper.
from scripts.data_provider.data_preprocessing import prepare_train_test_anndata

data = prepare_train_test_anndata()

# Unpack by position: 0 = RNA, 1 = RNA test, 2 = ADT, 3 = ADT test.
# NOTE(review): presumably indices 0 and 2 are the training splits -- confirm
# against the return value documented in prepare_train_test_anndata.
rna_adata, rna_test = data[0], data[1]
adt_adata, adt_test = data[2], data[3]





  from .autonotebook import tqdm as notebook_tqdm


All sample IDs in gene data: ['AML0612' 'AML3762' 'AML3133' 'AML2910' 'AML3050' 'AML2451' 'AML056'
 'AML073' 'AML055' 'AML048' 'AML052' 'AML2123' 'AML1371' 'AML4340'
 'AML4897' 'AML051' 'AML0693' 'AML3948' 'AML3730' 'AML0160' 'AML0310'
 'AML0361' 'AML038' 'AML008' 'AML043' 'AML028' 'AML006' 'AML025' 'AML003'
 'AML012' 'AML005' 'AML0048' 'AML022' 'AML0024' 'AML009' 'AML026' 'AML001'
 'AML0114' 'Control4' 'Control2' 'Control1' 'Control3' 'Control5'
 'Control0004' 'Control0058' 'Control0082' 'Control4003' 'Control0005']
AML 80% train: ['AML0024', 'AML001', 'AML3050', 'AML4340', 'AML005', 'AML006', 'AML056', 'AML025', 'AML043', 'AML051', 'AML3948', 'AML055', 'AML0693', 'AML1371', 'AML0160', 'AML048', 'AML022', 'AML0612', 'AML028', 'AML2451', 'AML2123', 'AML3762', 'AML0114', 'AML0361', 'AML3133', 'AML012', 'AML026', 'AML2910', 'AML009', 'AML008', 'AML0048']
AML 20% test: ['AML052', 'AML038', 'AML3730', 'AML0310', 'AML073', 'AML4897', 'AML003']
Control 80% train: ['Control4003', 'Control1', 

In [3]:
# `adata` is an alias of `rna_adata`, not a copy: every step below therefore
# also modifies `rna_adata` in place, and re-running this cell would apply the
# normalisation a second time.
adata = rna_adata

# Snapshot X into a "counts" layer before X is transformed.
# NOTE(review): assumes X holds raw counts at this point -- confirm against
# prepare_train_test_anndata.
adata.layers["counts"] = adata.X.copy()

# Normalise every cell to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep the log-normalised, full-gene-set state in .raw before genes are dropped.
adata.raw = adata

# Select 2000 highly variable genes from the "counts" layer, using "sample" as
# the batch key, and subset `adata` to those genes.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key="sample",
    n_top_genes=2000,
    subset=True,
)

In [4]:
import scvi
# Fix the RNG seed before the model is built so that weight initialisation,
# training and the resulting latent space are repeatable across kernel restarts
# (as far as the hardware allows).
scvi.settings.seed = 0

# Register the "counts" layer as the model input and "sample" as the batch covariate.
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="sample")

# scVI model with 2 hidden layers and a 30-dimensional latent space.
scvi_model = scvi.model.SCVI(adata, n_layers=2, n_latent=30)
scvi_model.train()  # trained with scvi-tools' default trainer settings

# Store the per-cell latent representation under a fixed obsm key.
SCVI_LATENT_KEY = "X_scVI"
adata.obsm[SCVI_LATENT_KEY] = scvi_model.get_latent_representation()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA H200') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 51/51: 100%|██████████| 51/51 [07:35<00:00, 10.42s/it, v_num=1, train_loss=621]

`Trainer.fit` stopped: `max_epochs=51` reached.


Epoch 51/51: 100%|██████████| 51/51 [07:35<00:00,  8.93s/it, v_num=1, train_loss=621]


In [5]:
# Display the AnnData summary (dimensions plus the populated obs/var/uns/obsm/layers keys).
adata

AnnData object with n_obs × n_vars = 158179 × 2000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'cell', 'UMAP_1', 'UMAP_2', 'samples', 'Broad_cell_identity', 'Cell_type_identity', 'clusters_res.2', 'CNV_pos', 'malignant', 'aml', 'ap_aml_age', 'age_group', 'sex', 'inflammation_group', 'occupancy_score', 'sample', 'unintegrated_clusters', 'seurat_clusters', 'RNA_snn_res.1', 'source', '_scvi_batch', '_scvi_labels'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'log1p', 'hvg', '_scvi_uuid', '_scvi_manager_uuid'
    obsm: 'X_integrated.cca', 'X_pca', 'X_umap', 'X_umap.unintegrated', 'X_scVI'
    layers: 'counts'

In [6]:
# Write the processed object (HVG subset, "counts" layer, scVI latent in obsm) to disk as .h5ad.
adata.write_h5ad("/projects/vanaja_lab/satya/DeepOMAPNet/GSE116256.h5ad")

# Dataset - 2 (GSE232559)

In [8]:
import os
import pandas as pd
import scanpy as sc

path = "/projects/vanaja_lab/satya/Datasets/GSE232559"

# 1. Read barcodes: one per cell, expected to match the rows of the transposed RNA matrix
rna_barcodes = pd.read_csv(os.path.join(path, "GSE232559_cell_barcodes.tsv.gz"), header=None)[0].tolist()

# 2. Read metadata for ALL cells, indexed by the first column of the file
cell_metadata = pd.read_csv(os.path.join(path, "GSE232559_cell_metadata.tsv.gz"), sep="\t", index_col=0)

# 3. Read gene names/features
rna_features_path = os.path.join(path, "GSE232559_rna_normalized_counts_features.tsv.gz")  # or .tsv/.txt
rna_features = pd.read_csv(rna_features_path, header=None)[0].tolist()

# 4. Read the matrix and transpose it so that cells (columns in the file) become rows
# NOTE(review): the file name says "normalized_counts", yet later cells copy this
# matrix into layers["counts"] and pass it to seurat_v3 HVG selection and scVI,
# both of which expect raw counts -- confirm what this file actually contains.
rna_counts = sc.read_mtx(os.path.join(path, "GSE232559_rna_normalized_counts.mtx.gz")).X
rna_counts_T = rna_counts.transpose()  # shape should be (32319, 24561)

# Row i of the matrix must describe rna_barcodes[i] and column j rna_features[j];
# fail loudly here rather than build a silently mislabelled AnnData.
assert rna_counts_T.shape == (len(rna_barcodes), len(rna_features)), (
    f"matrix shape {rna_counts_T.shape} does not match "
    f"{len(rna_barcodes)} barcodes x {len(rna_features)} features"
)

# 5. Align metadata to the matrix row order. Barcodes missing from the metadata
#    come back as all-NaN rows; those rows (and any row with a missing field,
#    the same rule as dropna(how='any')) are marked for removal.
cell_metadata_rna = cell_metadata.reindex(rna_barcodes)
keep_mask = cell_metadata_rna.notna().all(axis=1).to_numpy()
cell_metadata_rna = cell_metadata_rna.loc[keep_mask]

# Report sizes before the matrix is subset
print("rna_counts_T shape:", rna_counts_T.shape)
print("cell_metadata_rna shape:", cell_metadata_rna.shape)

# 6. Drop exactly the same cells from the matrix. Selecting with the mask (and
#    not the first N rows) keeps each expression row paired with its own
#    metadata even when the removed barcodes are scattered through the file.
N = cell_metadata_rna.shape[0]
rna_counts_T = rna_counts_T[keep_mask, :]
rna_barcodes_final = cell_metadata_rna.index.tolist()
assert rna_counts_T.shape[0] == N == len(rna_barcodes_final)

# Now build AnnData (obs names come from the metadata index, i.e. the kept barcodes)
adata = sc.AnnData(X=rna_counts_T, obs=cell_metadata_rna, var=pd.DataFrame(index=rna_features))


rna_counts_T shape: (32319, 24561)
cell_metadata_rna shape: (29925, 23)


In [9]:
# Snapshot X into a "counts" layer before X is transformed.
# NOTE(review): for this dataset X was read from a file named
# "GSE232559_rna_normalized_counts.mtx.gz"; if it is already normalised, this
# layer does not hold raw counts as seurat_v3 and scVI expect -- confirm.
adata.layers["counts"] = adata.X.copy()

# Normalise every cell to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep the log-normalised, full-gene-set state in .raw before genes are dropped.
adata.raw = adata

# Select 2000 highly variable genes from the "counts" layer, using "sample" as
# the batch key, and subset `adata` to those genes.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key="sample",
    n_top_genes=2000,
    subset=True,
)

In [10]:
import scvi
# Fix the RNG seed before the model is built so that weight initialisation,
# training and the resulting latent space are repeatable across kernel restarts
# (as far as the hardware allows).
scvi.settings.seed = 0

# Register the "counts" layer as the model input and "sample" as the batch covariate.
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="sample")

# scVI model with 2 hidden layers and a 30-dimensional latent space.
scvi_model = scvi.model.SCVI(adata, n_layers=2, n_latent=30)
scvi_model.train()  # trained with scvi-tools' default trainer settings

# Store the per-cell latent representation under a fixed obsm key.
SCVI_LATENT_KEY = "X_scVI"
adata.obsm[SCVI_LATENT_KEY] = scvi_model.get_latent_representation()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 267/267: 100%|██████████| 267/267 [21:20<00:00,  4.91s/it, v_num=1, train_loss=338]

`Trainer.fit` stopped: `max_epochs=267` reached.


Epoch 267/267: 100%|██████████| 267/267 [21:20<00:00,  4.80s/it, v_num=1, train_loss=338]


In [13]:
# Write the current `adata` (including the scVI latent stored in obsm) to an .h5ad file at this absolute path.
adata.write_h5ad("/projects/vanaja_lab/satya/DeepOMAPNet/GSE232559.h5ad")