T1D vs T2D 간 beta-cell 유전자 발현 차이

In [1]:
import anndata as ad
import scanpy as sc
import numpy as np
import pandas as pd
from scipy.sparse import issparse

Data loading

In [2]:
# Disabled alternative, kept for reference: read the atlas straight from the
# gzip-compressed file instead of the .h5ad file loaded below.
# import gzip
# with gzip.open("C:/Users/user/Desktop/GSE211799_adata_atlas.h5ad.gz", 'rb') as f:
#     adata = ad.read_h5ad(f)

# Load the raw atlas so its structure and contents can be examined.
adata = sc.read_h5ad('C:/Users/user/Desktop/T1D/GSE211799_adata_atlas.h5ad')

In [3]:
print(adata.shape)                      # (n_cells, n_genes) = (301796, 31706)
print(adata)                            # summary of the main components of adata
print(adata.obs.head())                 # per-cell metadata, e.g. study, sex, age, cell_type
print(adata.var.head())                 # per-gene information
print(adata.uns.keys())                 # unstructured entries (includes "field_descriptions", printed below)
print(adata.obsm.keys())                # per-cell multi-dimensional arrays; presumably embeddings such as PCA/UMAP -- verify against the printed keys
print(adata.X[:5, :5])                  # first 5x5 corner of the expression matrix
print(adata.uns["field_descriptions"])  # descriptions of the metadata fields

(301796, 31706)
AnnData object with n_obs × n_vars = 301796 × 31706
    obs: 'study_sample', 'study', 'file', 'reference', 'size_factors_sample', 'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'sex', 'ins_score', 'ins_high', 'gcg_score', 'gcg_high', 'sst_score', 'sst_high', 'ppy_score', 'ppy_high', 'cell_filtering', 'age', 'strain', 'tissue', 'technique', 'study_sample_design', 'cell_type', 'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet', 'design', 'size_factors_integrated', 'pre_cell_type_unified', 'pre_cell_type_original', 'study_parsed', 'cell_type_parsed', 'low_q', 'BETA-DATA_leiden_r1.5', 'BETA-DATA_leiden_r20', 'BETA-DATA_hc_gene_programs', 'BETA-DATA_hc_gene_programs_parsed', 'BETA-DATA_leiden_r1.5_parsed', 'BETA-DATA_leiden_r1.5_parsed_const', 'CXG-DATA_n_genes', 'CXG-DATA_mt_frac', 'CXG-DATA_doublet_score', 'CXG-DATA_log10_n_counts', 'CXG-DATA_age_approxDays', 'CXG-DATA_cell_subtype_immune_reannotatedIntegrated', 'CXG-DATA_cell_subtype_endothelial_r

In [4]:
# Count the cells under each integrated cell-type label to see how many
# beta cells are available for sub-setting.
cell_type_counts = adata.obs['cell_type_integrated_v2_parsed'].value_counts()
print(cell_type_counts)

cell_type_integrated_v2_parsed
beta             102143
alpha             40935
immune            31703
E non-endo.       29177
delta             24775
stellate a.       18332
endothelial       13469
ductal             8742
E endo.            7748
gamma              6999
beta+delta         5104
stellate q.        4970
alpha+delta        1901
beta+gamma         1209
delta+gamma        1069
endo. prolif.       887
lowQ                853
alpha+beta          683
schwann             617
acinar              480
Name: count, dtype: int64


In [5]:
# First collect only the observation names of the cells labelled 'beta'.
is_beta = adata.obs['cell_type_integrated_v2_parsed'] == 'beta'
beta_cell_names = adata.obs_names[is_beta]
print(f"Found {len(beta_cell_names)} beta cells")

Found 102143 beta cells


In [6]:
# Down-sample the beta cells to keep memory usage under control.
# Use at most 5000 cells; if fewer beta cells exist, take all of them.
n_sample = min(5000, len(beta_cell_names))

# Seed NumPy's global RNG so the same cells are drawn on every run, then
# pick the cell names at random without replacement.
np.random.seed(42)
sampled_cells = np.random.choice(beta_cell_names, size=n_sample, replace=False)

# Copy the selected cells out of the atlas into their own object.
beta_cells_subset = adata[sampled_cells].copy()

In [None]:
# Attach a 'condition' label to the beta-cell subset by copying the
# 'CXG-DATA_diabetes_model' column, then write the labelled subset to disk.
# NOTE(review): presumably this column distinguishes T1D from control cells -- confirm.
diabetes_model = beta_cells_subset.obs['CXG-DATA_diabetes_model']
beta_cells_subset.obs['condition'] = diabetes_model
beta_cells_subset.write("C:/Users/user/Desktop/T1D/beta_cells_condition_analysis.h5ad")

In [8]:
# [Check] Reload the saved beta-cell file and inspect its structure.
adata_beta_cond = sc.read("C:/Users/user/Desktop/T1D/beta_cells_condition_analysis.h5ad")
print(adata_beta_cond)

# List every obs column present in the reloaded object.
obs_columns = adata_beta_cond.obs.columns.tolist()
print("컬럼 목록:", obs_columns)

# Show how the sampled cells are distributed across the condition labels.
condition_counts = adata_beta_cond.obs["condition"].value_counts()
print(condition_counts)

AnnData object with n_obs × n_vars = 5000 × 31706
    obs: 'study_sample', 'study', 'file', 'reference', 'size_factors_sample', 'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'sex', 'ins_score', 'ins_high', 'gcg_score', 'gcg_high', 'sst_score', 'sst_high', 'ppy_score', 'ppy_high', 'cell_filtering', 'age', 'strain', 'tissue', 'technique', 'study_sample_design', 'cell_type', 'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet', 'design', 'size_factors_integrated', 'pre_cell_type_unified', 'pre_cell_type_original', 'study_parsed', 'cell_type_parsed', 'low_q', 'BETA-DATA_leiden_r1.5', 'BETA-DATA_leiden_r20', 'BETA-DATA_hc_gene_programs', 'BETA-DATA_hc_gene_programs_parsed', 'BETA-DATA_leiden_r1.5_parsed', 'BETA-DATA_leiden_r1.5_parsed_const', 'CXG-DATA_n_genes', 'CXG-DATA_mt_frac', 'CXG-DATA_doublet_score', 'CXG-DATA_log10_n_counts', 'CXG-DATA_age_approxDays', 'CXG-DATA_cell_subtype_immune_reannotatedIntegrated', 'CXG-DATA_cell_subtype_endothelial_reannotatedIntegrat

In [9]:
# View the first 5x5 corner of the expression matrix as a labelled DataFrame.
corner = adata.X[:5, :5]
if issparse(adata.X):
    # A sparse slice has to be densified before it can go into a DataFrame.
    corner = corner.todense()
df = pd.DataFrame(
    corner,
    index=adata.obs_names[:5],
    columns=adata.var_names[:5],
)
print(df)

EID                                            ENSMUSG00000000001  \
index                                                               
CAAGATCGTCCAGTTA-1-SRR7610301-NOD_elimination            0.745006   
GATGAAAGTTGTCGCG-1-SRR7610298-NOD_elimination            0.000000   
AGTCTTTAGGAGCGTT-1-SRR7610301-NOD_elimination            0.000000   
CTTCCTTGTACCCAGC-1-MUC13640-VSG                          0.000000   
CATCAAGAGATTACCC-1-SRR7610296-NOD_elimination            0.000000   

EID                                            ENSMUSG00000000003  \
index                                                               
CAAGATCGTCCAGTTA-1-SRR7610301-NOD_elimination                 0.0   
GATGAAAGTTGTCGCG-1-SRR7610298-NOD_elimination                 0.0   
AGTCTTTAGGAGCGTT-1-SRR7610301-NOD_elimination                 0.0   
CTTCCTTGTACCCAGC-1-MUC13640-VSG                               0.0   
CATCAAGAGATTACCC-1-SRR7610296-NOD_elimination                 0.0   

EID                             