In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
from matplotlib import rcParams
import matplotlib.pyplot as plt

# 1. Load data

In [2]:
# Input locations.
# `file_path` is used as a plain string prefix: later cells build paths with
# `file_path + folder + ...`, so the trailing '#' ends up as the first character
# of each sample directory name (e.g. '.../10XVisium_2/#UKF241_C_ST').
file_path = "../../Database/Ravi_Cancer_Cell_2022/10XVisium_2/#"

# Sample folders to process (appended directly to `file_path`).
folders = [
    "UKF241_C_ST",
]

In [3]:
# Load the spatial scale factors (JSON) of the first sample folder.
import json

scalefactors_path = file_path + folders[0] + '/outs/spatial/scalefactors_json.json'

# Read the JSON file into a plain dict
with open(scalefactors_path, 'r') as fh:
    scale_factors = json.load(fh)

# Display the parsed scale factors as the cell output
scale_factors


{'spot_diameter_fullres': 52.70366385405869,
 'tissue_hires_scalef': 0.24414062,
 'fiducial_diameter_fullres': 85.13668776424866,
 'tissue_lowres_scalef': 0.07324219}

In [4]:
# Load the per-spot table of the first sample.
# The CSV is read without a header row, so the column labels are attached explicitly.
spot_csv_path = file_path + folders[0] + '/outs/cell_counts_within_circles.csv'
spot_columns = [
    'barcode',
    'in_tissue',
    'array_row',
    'array_col',
    'pxl_row_in_fullres',
    'pxl_col_in_fullres',
    'cell_counts',
]
# set_axis raises if the file does not have exactly len(spot_columns) columns
spot_data = pd.read_csv(spot_csv_path, header=None).set_axis(spot_columns, axis=1)

In [5]:
# Preview the spot table: one row per barcode with its array position,
# full-resolution pixel coordinates and the `cell_counts` column.
spot_data

Unnamed: 0,barcode,in_tissue,array_row,array_col,pxl_row_in_fullres,pxl_col_in_fullres,cell_counts
0,GATGTCCGGATCACAT-1,1,7,37,1789,2799,3
1,GGTCACGTTAGATTCA-1,1,6,38,1718,2840,4
2,TTAAGGATACGGAGGT-1,1,7,39,1789,2881,0
3,TTCTACCTTTATGTTG-1,1,6,84,1716,4705,0
4,CCAGCTACGCCTCATA-1,1,9,35,1930,2719,0
...,...,...,...,...,...,...,...
2114,TCGTTTACGCGACCCT-1,1,70,94,6230,5116,0
2115,GACACTTCCAATTACC-1,1,70,96,6230,5197,2
2116,CTTGATGACCATCCAG-1,1,70,98,6230,5278,1
2117,ATCAAGATCCCAGGAC-1,1,72,58,6373,3657,3


In [6]:
# Load the spatial transcriptomics (stRNA-seq) count matrices:
# one AnnData per entry of `folders`, collected in `adata_list`.
adata_list = []
for folder in folders:
    h5_path = f"{file_path}{folder}/outs/filtered_feature_bc_matrix.h5"
    adata = sc.read_10x_h5(h5_path)
    # The reader warns about duplicated var names; make them unique right away.
    # (obs_names_make_unique() is not applied here.)
    adata.var_names_make_unique()
    # Record which sample folder every observation came from.
    adata.obs['sample'] = folder
    adata_list.append(adata)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [7]:
# Show the loaded AnnData objects (shape plus obs/var fields) as a sanity check.
adata_list

[AnnData object with n_obs × n_vars = 2119 × 33538
     obs: 'sample'
     var: 'gene_ids', 'feature_types', 'genome']

# 2. Normalization, transformation

In [8]:
# Re-display the loaded samples before any normalization or transformation is applied.
adata_list

[AnnData object with n_obs × n_vars = 2119 × 33538
     obs: 'sample'
     var: 'gene_ids', 'feature_types', 'genome']

In [9]:
# NOTE(review): the log1p transform is left disabled — presumably because the
# cell2location models used below are fit on raw counts; confirm before re-enabling.
# sc.pp.log1p(adata_list[0])

# 3. cell2location

In [10]:
import cell2location
from cell2location.models import RegressionModel

  from .autonotebook import tqdm as notebook_tqdm
2024-10-30 21:44:46.667880: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-30 21:44:46.675957: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-30 21:44:46.678414: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [31]:
# Single-cell reference: annotated scRNA-seq data stored as an .h5ad file
sc_ref_path = '../data/adata_annotated.h5ad'
sc_ref_data = sc.read_h5ad(sc_ref_path)

# Spatial transcriptomics data: first entry of `adata_list`
# (this binds the same AnnData object, it does not copy it)
st_data = adata_list[0]

In [33]:
# Estimate cell-type signatures from sc_ref_data: register the reference with the
# regression model, taking labels from obs['cell_type'] and expression values
# from the 'counts' layer.
# NOTE(review): no batch_key is passed although obs has a 'sample' column —
# confirm whether per-sample batch effects should be modelled.
RegressionModel.setup_anndata(
    sc_ref_data,
    layer='counts',
    labels_key='cell_type',
)


In [34]:
# Print the reference summary; the '_scvi_*' entries in obs/uns are presumably the
# bookkeeping fields added by setup_anndata — verify they are present before training.
print(sc_ref_data)

AnnData object with n_obs × n_vars = 3261 × 17282
    obs: 'sample', 'n_genes', 'total_counts', 'pct_counts_mito', 'pct_counts_ribo', 'leiden', 'tumor_cell', 'cnv_leiden', 'cnv_score', 'cell_type', '_indices', '_scvi_batch', '_scvi_labels'
    var: 'chromosome', 'start', 'end', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
    uns: 'cell_type_colors', 'cnv', 'cnv_leiden', 'cnv_leiden_colors', 'cnv_neighbors', 'dendrogram_cnv_leiden', 'dendrogram_leiden', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'rank_genes_groups', 'sample_colors', 'tumor_cell_colors', 'umap', '_scvi_uuid', '_scvi_manager_uuid'
    obsm: 'X_cnv', 'X_cnv_pca', 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'cnv_neighbors_connectivities', 'cnv_neighbors_distances', 'connectivities', 'distances'


In [35]:
# Fit the reference regression model on sc_ref_data (registered with
# RegressionModel.setup_anndata in an earlier cell).
# NOTE(review): 250 epochs is a fixed choice — inspect the training ELBO history
# to confirm the model has converged.
max_epochs_regression = 250

regression_model = RegressionModel(sc_ref_data)
regression_model.train(max_epochs=max_epochs_regression)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/jsw/jswenv/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/jsw/jswenv/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to impro

Epoch 250/250: 100%|██████████| 250/250 [00:45<00:00,  5.55it/s, v_num=1, elbo_train=2.81e+7]

`Trainer.fit` stopped: `max_epochs=250` reached.


Epoch 250/250: 100%|██████████| 250/250 [00:45<00:00,  5.45it/s, v_num=1, elbo_train=2.81e+7]
