In [1]:
import scanpy as sc
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append(str(Path.cwd().parent))
from utils.adata import *
from utils.latent import *
from utils.plot import *
from utils.evaluation import *
import json
from sklearn.neighbors import NearestNeighbors
from umap import UMAP
import torch
import torch.nn.functional as F
import anndata
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.decomposition import PCA
from src.scDiffusion import VAE, guided_diffusion
from src.scDiffusion.VAE.VAE_model import *

In [2]:
# Read the EMT dataset from disk into an AnnData object.
adata = sc.read_h5ad("../data/emt.h5ad")

In [3]:
# Derive a short-label categorical column "celltype" from "cell_type".
# Values missing from the mapping become NaN, because Series.map is given a dict.
short_labels = {"Epithelial": "Epi", "Mesenchymal": "Mes"}
adata.obs["celltype"] = (
    adata.obs["cell_type"].map(short_labels).astype("category")
)

In [4]:
# Expose the same labels under a second column name, "period".
# NOTE(review): presumably the scDiffusion scripts read their condition label
# from a column with this name — confirm against the training scripts.
adata.obs["period"] = adata.obs["celltype"]

In [5]:
# Save the relabelled object; the commented-out training commands in this
# notebook pass this same path as --data_dir.
adata.write("../data/emt_diff.h5ad")

In [40]:
# Show the AnnData summary (dimensions plus the obs/var/uns/obsm keys).
adata

AnnData object with n_obs × n_vars = 5027 × 2000
    obs: 'cell', 'total_umis', 'sample', 'TSNE.1', 'TSNE.2', 'Size_Factor', 'treatment_id', 'spatial_id', 'cell_type', 'dpt_pseudotime', 'celltype', 'period'
    var: 'id', 'gene_short_name', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'X_name', 'cell_type_colors', 'diffmap_evals', 'hvg', 'iroot', 'neighbors', 'pca', 'umap'
    obsm: 'X_diffmap', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [None]:
# !python ../src/scDiffusion/VAE/VAE_train.py \
# --data_dir '../data/emt_diff.h5ad' \
# --num_genes 2000 \
# --save_dir '../models/scdiff/emt/VAE' \
# --state_dict '../models/scdiff/annotation_model_v1/' \
# --max_steps 200000


loading pretrained model from: 
 {'encoder': '../models/scdiff/annotation_model_v1/encoder.ckpt', 'decoder': '../models/scdiff/annotation_model_v1/decoder.ckpt', 'gene_order': '../models/scdiff/annotation_model_v1/gene_order.tsv'}
step  0 loss  1.4337416887283325
step  1000 loss  0.2901614010334015
step  2000 loss  0.25145766139030457
step  3000 loss  0.2434931993484497
step  4000 loss  0.21504704654216766
step  5000 loss  0.20773711800575256
step  6000 loss  0.19371017813682556
step  7000 loss  0.1876397430896759
step  8000 loss  0.17543569207191467
step  9000 loss  0.17463421821594238
step  10000 loss  0.16368670761585236
step  11000 loss  0.15742330253124237
step  12000 loss  0.1545163244009018
step  13000 loss  0.15365418791770935
step  14000 loss  0.15154138207435608
step  15000 loss  0.1463499814271927
step  16000 loss  0.1404372900724411
step  17000 loss  0.14103105664253235
step  18000 loss  0.13104760646820068
step  19000 loss  0.13053001463413239
step  20000 loss  0.128240600

In [None]:
# !python ../src/scDiffusion/cell_train.py \
#   --data_dir '../data/emt_diff.h5ad' \
#   --vae_path '../models/scdiff/emt/VAE/model_seed=0_step=199999.pt' \
#   --model_name 'diffusion' \
#   --save_dir '../models/scdiff/emt' \
#   --batch_size 128 \
#   --lr 0.5e-5 \
#   --lr_anneal_steps 20000

Logging to ../models/scdiff/emt/diffusion/logs/diffusion
creating model and diffusion...
creating data loader...
training...
-------------------------
| grad_norm  | 0.878    |
| loss       | 1.13     |
| loss_q0    | 1.17     |
| loss_q1    | 1.12     |
| loss_q2    | 1.27     |
| loss_q3    | 1.28     |
| mse        | 1.13     |
| mse_q0     | 1.17     |
| mse_q1     | 1.12     |
| mse_q2     | 1.27     |
| mse_q3     | 1.28     |
| param_norm | 79       |
| samples    | 128      |
| step       | 0        |
-------------------------
saving model 0...
saving model 0.9999...
-------------------------
| grad_norm  | 0.643    |
| loss       | 1.02     |
| loss_q0    | 0.958    |
| loss_q1    | 1.17     |
| loss_q2    | 1.08     |
| loss_q3    | 0.973    |
| mse        | 1.02     |
| mse_q0     | 0.958    |
| mse_q1     | 1.17     |
| mse_q2     | 1.08     |
| mse_q3     | 0.973    |
| param_norm | 79       |
| samples    | 1.29e+04 |
| step       | 100      |
-------------------------
--

In [6]:
# Runtime settings for the encoding cells: matrix dimensions, batch size and
# the torch device string (GPU when one is available, otherwise CPU).
num_cells, num_genes = adata.shape
batch_size = 512
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
# Scale every cell to a total of 1e4 and apply log1p.
# NOTE: both scanpy calls modify `adata` in place, so running this cell twice
# in the same kernel normalises and log-transforms the data twice. Reload
# `adata` before re-running it.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Densify once, after normalisation. Converting the raw matrix beforehand only
# allocated a dense copy that was overwritten here without being read.
X = adata.X
if not isinstance(X, np.ndarray):
    X = X.toarray()



In [8]:
# Width of the VAE bottleneck; it is passed to the constructor as `hidden_dim`.
latent_dim = 128

# Checkpoint written by the VAE training run (seed 0, final step).
vae_checkpoint = '../models/scdiff/emt/VAE/model_seed=0_step=199999.pt'

# Rebuild the network with the notebook's gene count, then restore the weights.
model = VAE(
    decoder_activation="ReLU",
    seed=0,
    device=device,
    hidden_dim=latent_dim,
    num_genes=num_genes,
)
state = torch.load(vae_checkpoint, map_location=device)
model.load_state_dict(state)
model.to(device)
# Switch to inference mode; as the last expression this also prints the model.
model.eval()

VAE(
  (encoder): Encoder(
    (network): ModuleList(
      (0): Sequential(
        (0): Dropout(p=0.0, inplace=False)
        (1): Linear(in_features=2000, out_features=1024, bias=True)
        (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (3): PReLU(num_parameters=1)
      )
      (1-2): 2 x Sequential(
        (0): Dropout(p=0.0, inplace=False)
        (1): Linear(in_features=1024, out_features=1024, bias=True)
        (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (3): PReLU(num_parameters=1)
      )
      (3): Linear(in_features=1024, out_features=128, bias=True)
    )
  )
  (decoder): Decoder(
    (network): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=128, out_features=1024, bias=True)
        (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): PReLU(num_parameters=1)
      )
      (1-2): 2 x Sequential(
        (

In [9]:
# Encode every cell with the VAE encoder, one mini-batch at a time, and
# collect the outputs in a preallocated (num_cells, latent_dim) float32 array.
X_latent = np.zeros((num_cells, latent_dim), dtype=np.float32)

with torch.no_grad():  # no gradients are needed for inference
    for lo in range(0, num_cells, batch_size):
        hi = min(lo + batch_size, num_cells)  # the last batch may be shorter
        cells = torch.tensor(X[lo:hi], dtype=torch.float32, device=device)
        # Move the encoder output back to host memory before storing it.
        X_latent[lo:hi] = model.encoder(cells).cpu().numpy()

In [None]:
# !python ../src/scDiffusion/classifier_train.py \
#   --data_dir '../data/emt_diff.h5ad' \
#   --vae_path '../models/scdiff/emt/VAE/model_seed=0_step=199999.pt' \
#   --model_path '../models/scdiff/emt/classifier/' \
#   --num_class 2 \
#   --batch_size 64 \
#   --iterations 10000 \
#   --lr 1e-3 \
#   --weight_decay 0.0 \
#   --log_interval 100 \
#   --save_interval 1000 \
#   --noised False

Logging to /tmp/openai-2025-12-11-00-57-28-467648
creating model and diffusion...
Running in single GPU mode without DDP
creating data loader...
creating optimizer...
training classifier model...
-----------------------------
| grad_norm      | 2.79     |
| param_norm     | 49.8     |
| samples        | 64       |
| step           | 0        |
| train_acc@1    | 0.547    |
| train_acc@1_q0 | 0        |
| train_loss     | 0.696    |
| train_loss_q0  | 0.871    |
-----------------------------
-----------------------------
| grad_norm      | 1.01     |
| param_norm     | 50       |
| samples        | 6.46e+03 |
| step           | 100      |
| train_acc@1    | 0.875    |
| train_acc@1_q0 | 1        |
| train_loss     | 0.222    |
| train_loss_q0  | 0.0239   |
-----------------------------
-----------------------------
| grad_norm      | 0.797    |
| param_norm     | 50.3     |
| samples        | 1.29e+04 |
| step           | 200      |
| train_acc@1    | 0.906    |
| train_acc@1_q0 | 1    

: 

In [10]:
# Run the classifier-guided sampling script in a subprocess.
# NOTE(review): the run recorded under this cell ends in
# "TypeError: sparse array length is ambiguous", raised from
# classifier_sample.py line 188 where `start_x` is passed to torch.tensor.
# It looks like `start_x` is a scipy sparse matrix that has to be densified
# inside the script before this cell can succeed — confirm.
!python ../src/scDiffusion/classifier_sample.py 

Logging to /tmp/openai-2025-12-11-01-22-31-654349
creating model and diffusion...
loading classifier...
sampling...
Traceback (most recent call last):
  File "/mnt/gs21/scratch/islamsa3/fm-project/scripts/../src/scDiffusion/classifier_sample.py", line 337, in <module>
    main(cell_type=[0,1], inter=True, weight=[10-i,i])
  File "/mnt/gs21/scratch/islamsa3/fm-project/scripts/../src/scDiffusion/classifier_sample.py", line 188, in main
    start_x = autoencoder(torch.tensor(start_x,device=dist_util.dev()),return_latent=True).detach().cpu().numpy()
  File "/mnt/home/islamsa3/anaconda3/envs/global/lib/python3.10/site-packages/scipy/sparse/_base.py", line 425, in __len__
    raise TypeError("sparse array length is ambiguous; use getnnz()"
TypeError: sparse array length is ambiguous; use getnnz() or shape[0]
