In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix

counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
adata.X

<100x2000 sparse matrix of type '<class 'numpy.float32'>'
	with 126508 stored elements in Compressed Sparse Row format>

## AnnData
X : 基因表达稀疏矩阵，通常存储基因表达数据(行代表基因，列代表细胞)

obs : (DataFrame)每个细胞的元数据(观察特征,如细胞类型,样本来源等)

var : (DataFrame)每个基因的元数据(变量特征, 基因名称、基因类型)
uns :  (dict)非标准的、不适用于所有细胞的元数据(如聚类结果、图形坐标)
obsm : (dict)每个细胞的多元数据,通常是细胞的高级表示,如降维结果
varm : 变量的矩阵
layers : 用于存储不同处理步骤的基因表达矩阵

In [2]:
## obs 每个观测单元(通常是细胞)的数据
## var 每个变量(通常是基因)的数据
adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)]
adata.var_names = [f"Gene_{i:d}" for i in range(adata.n_vars)]
print(adata)

AnnData object with n_obs × n_vars = 100 × 2000


In [3]:
ct = np.random.choice(["B", "T", "Monocyte"], size=(adata.n_obs,))
adata.obs["cell_type"] = pd.Categorical(ct)
print(adata.obs["cell_type"])
print(adata)

Cell_0     Monocyte
Cell_1            B
Cell_2            B
Cell_3     Monocyte
Cell_4     Monocyte
             ...   
Cell_95           T
Cell_96           T
Cell_97           T
Cell_98           T
Cell_99           B
Name: cell_type, Length: 100, dtype: category
Categories (3, object): ['B', 'Monocyte', 'T']
AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'cell_type'


In [8]:
bdata = adata[adata.obs.cell_type == 'B']
adata.obsm["X_umap"] = np.random.normal(0, 1, size=(adata.n_obs, 2))
adata.varm["gene_stuff"] = np.random.normal(0, 1, size=(adata.n_vars, 5))
print(adata.obsm["X_umap"])
print(len(adata.obsm["X_umap"]))

[[-1.91035859 -1.41528693]
 [-1.90145011  1.38268938]
 [ 1.03107842  0.86526057]
 [ 2.11971259  0.02658457]
 [-1.53716831 -0.21861052]
 [ 0.25043842 -0.0361296 ]
 [-1.69858353 -0.94710147]
 [ 0.60285959 -1.20107479]
 [ 0.24133196  0.61674448]
 [ 3.08232152 -0.66991471]
 [-0.82279316  0.15277112]
 [ 0.23247839  0.47608476]
 [-1.98827971 -0.28815051]
 [ 1.26521041 -0.28550589]
 [-1.31105197 -0.63161874]
 [-0.92240566 -0.72172775]
 [ 0.08834511 -1.08743422]
 [ 0.53485967 -1.18474714]
 [-0.32958515 -0.69988001]
 [ 0.34447995 -0.81601178]
 [-1.17352124 -0.42906385]
 [-0.30856815 -1.99696389]
 [-0.91280508 -1.16570179]
 [ 0.31887295 -0.84198368]
 [-0.46333855 -1.47603566]
 [ 1.10130319  0.53368504]
 [-0.20100249 -0.24239979]
 [ 0.62851269 -1.86721283]
 [ 0.49424615  0.55817957]
 [-1.03887072  2.04904457]
 [ 1.39327297  0.86689983]
 [ 1.53413855 -0.71387278]
 [ 0.87298275  1.03324425]
 [-2.53661422  2.88177482]
 [ 0.18283795  1.49502403]
 [ 0.16297641  0.83573668]
 [ 0.11146548  0.26402618]
 

In [9]:
adata.uns["random"] = [1, 2, 3]
adata.uns

OrderedDict([('random', [1, 2, 3])])

In [10]:
adata.layers["log_transformed"] = np.log1p(adata.X)
adata

AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'cell_type'
    uns: 'random'
    obsm: 'X_umap'
    varm: 'gene_stuff'
    layers: 'log_transformed'

In [11]:
adata.to_df(layer="log_transformed")

Unnamed: 0,Gene_0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,...,Gene_1990,Gene_1991,Gene_1992,Gene_1993,Gene_1994,Gene_1995,Gene_1996,Gene_1997,Gene_1998,Gene_1999
Cell_0,0.000000,0.693147,0.693147,1.098612,0.693147,0.693147,0.693147,0.693147,1.098612,1.386294,...,0.000000,0.000000,0.000000,0.693147,1.386294,0.693147,0.693147,0.693147,0.693147,0.693147
Cell_1,1.098612,0.693147,0.000000,0.000000,0.693147,0.693147,0.693147,0.000000,1.098612,0.693147,...,0.000000,0.693147,0.693147,1.609438,1.098612,0.000000,0.693147,0.000000,0.000000,0.693147
Cell_2,0.000000,0.000000,0.693147,1.098612,1.098612,0.693147,0.000000,0.000000,1.098612,0.000000,...,1.386294,0.000000,0.000000,0.000000,0.693147,0.000000,1.098612,1.386294,0.000000,0.693147
Cell_3,1.386294,0.693147,0.000000,0.000000,0.693147,0.693147,0.000000,0.693147,1.098612,0.000000,...,0.000000,0.693147,0.693147,0.000000,0.693147,0.693147,1.386294,1.098612,0.693147,1.609438
Cell_4,1.098612,1.098612,0.000000,0.693147,0.000000,0.693147,0.000000,1.098612,0.000000,1.098612,...,0.000000,0.693147,0.000000,1.098612,0.693147,0.693147,0.693147,1.386294,0.693147,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cell_95,1.791759,0.693147,1.098612,0.000000,0.000000,0.693147,0.000000,0.693147,0.693147,0.000000,...,0.000000,0.693147,1.098612,0.693147,1.386294,0.000000,0.000000,1.098612,0.000000,1.098612
Cell_96,0.000000,0.693147,0.000000,1.609438,0.000000,0.693147,0.000000,0.000000,0.000000,1.098612,...,0.000000,0.000000,0.000000,0.693147,0.000000,1.098612,0.693147,0.000000,0.693147,0.000000
Cell_97,0.000000,0.000000,0.000000,1.098612,0.693147,1.098612,1.098612,0.693147,1.386294,0.693147,...,1.098612,1.098612,0.000000,0.693147,1.386294,0.693147,1.098612,0.000000,0.000000,0.000000
Cell_98,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,1.386294,1.098612,1.098612,0.000000,...,1.098612,0.693147,1.098612,0.000000,0.693147,0.000000,0.693147,0.693147,1.386294,0.693147


In [14]:
obs_meta = pd.DataFrame({
        'time_yr': np.random.choice([0, 2, 4, 8], adata.n_obs),
        'subject_id': np.random.choice(['subject 1', 'subject 2', 'subject 4', 'subject 8'], adata.n_obs),
        'instrument_type': np.random.choice(['type a', 'type b'], adata.n_obs),
        'site': np.random.choice(['site x', 'site y'], adata.n_obs),
    },
    index=adata.obs.index,    # these are the same IDs of observations as above!
)
adata = ad.AnnData(adata.X, obs=obs_meta, var=adata.var)
adata

AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'time_yr', 'subject_id', 'instrument_type', 'site'

In [18]:
adata_subset = adata[:3, ['Gene_1', 'Gene_2']]
adata_subset.obs['foo'] = range(3)
adata[adata.obs.time_yr.isin([2, 4])].obs.head()


  adata_subset.obs['foo'] = range(3)


Unnamed: 0,time_yr,subject_id,instrument_type,site
Cell_0,4,subject 2,type a,site x
Cell_1,4,subject 1,type b,site x
Cell_6,2,subject 8,type b,site x
Cell_8,4,subject 8,type b,site y
Cell_10,2,subject 2,type b,site x
