In [1]:
import scanpy as sc
import pandas as pd

In [2]:
# Read the AnnData object stored in the atlas file and print its summary
# (dimensions plus the names of the obs/var fields).
atlas_path = 'Liver_Cell_Atlas.h5ad'
adata = sc.read_h5ad(atlas_path)
print(adata)

AnnData object with n_obs × n_vars = 167598 × 32738
    obs: 'UMAP_1', 'UMAP_2', 'cluster', 'annot', 'sample', 'patient', 'digest', 'typeSample', 'diet'
    var: 'gene_ids', 'feature_types'


In [3]:
# Display the per-cell metadata table (adata.obs) as this cell's output.
adata.obs

Unnamed: 0_level_0,UMAP_1,UMAP_2,cluster,annot,sample,patient,digest,typeSample,diet
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAACCTGTCAGGATCT-1,0.738062,-3.097089,34,Mono+mono derived cells,CISE06,H02,exVivo,citeSeq,Lean
AAACGGGAGCACAGGT-1,0.367818,-3.251462,34,Mono+mono derived cells,CISE06,H02,exVivo,citeSeq,Lean
AAACGGGAGTGTGGCA-1,-0.728939,-5.994781,14,Mono+mono derived cells,CISE06,H02,exVivo,citeSeq,Lean
AAACGGGAGTTAGGTA-1,2.814624,-7.131484,7,Mono+mono derived cells,CISE06,H02,exVivo,citeSeq,Lean
AAACGGGCACCAGATT-1,3.420414,-6.966918,7,Mono+mono derived cells,CISE06,H02,exVivo,citeSeq,Lean
...,...,...,...,...,...,...,...,...,...
TTTACCAAGAGAGGTA-41,-3.674058,7.171934,41,Fibroblasts,CS171,H38,exVivo,scRnaSeq,Lean
TTTACCAGTATCGAAA-41,-4.356690,7.339135,41,Fibroblasts,CS171,H38,exVivo,scRnaSeq,Lean
TTTCCTCGTGAGGCAT-41,-3.759264,7.485170,41,Fibroblasts,CS171,H38,exVivo,scRnaSeq,Lean
TTTCGATTCGATAACC-41,-4.757702,6.576497,41,Fibroblasts,CS171,H38,exVivo,scRnaSeq,Lean


In [4]:
# Tally the number of cells per annotation label, most frequent first.
# The column is cast to plain strings before counting.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells                    77355
Mono+mono derived cells    29639
Circulating NK/NKT         12567
Hepatocytes                10996
Resident NK                10894
Macrophages                 7076
Neutrophils                 5340
Endothelial cells           2834
cDC2s                       2629
B cells                     2350
Fibroblasts                 1273
pDCs                        1216
cDC1s                       1180
Plasma cells                1125
Cholangiocytes               657
Basophils                    389
Mig.cDCs                      78
Name: count, dtype: int64


In [5]:
# Rename two annotation categories: 'cDC2s' becomes 'Myeloid' and
# 'Resident NK' becomes 'Nk Cells'. The renamed column is written back
# onto adata.obs, and the metadata table is printed afterwards.
rename_map = {'cDC2s': 'Myeloid', 'Resident NK': 'Nk Cells'}

renamed_annot = adata.obs['annot'].cat.rename_categories(rename_map)
adata.obs['annot'] = renamed_annot

print(adata.obs)

                       UMAP_1    UMAP_2  cluster                    annot  \
cell                                                                        
AAACCTGTCAGGATCT-1   0.738062 -3.097089       34  Mono+mono derived cells   
AAACGGGAGCACAGGT-1   0.367818 -3.251462       34  Mono+mono derived cells   
AAACGGGAGTGTGGCA-1  -0.728939 -5.994781       14  Mono+mono derived cells   
AAACGGGAGTTAGGTA-1   2.814624 -7.131484        7  Mono+mono derived cells   
AAACGGGCACCAGATT-1   3.420414 -6.966918        7  Mono+mono derived cells   
...                       ...       ...      ...                      ...   
TTTACCAAGAGAGGTA-41 -3.674058  7.171934       41              Fibroblasts   
TTTACCAGTATCGAAA-41 -4.356690  7.339135       41              Fibroblasts   
TTTCCTCGTGAGGCAT-41 -3.759264  7.485170       41              Fibroblasts   
TTTCGATTCGATAACC-41 -4.757702  6.576497       41              Fibroblasts   
TTTGGAGTCCACTGGG-41 -4.283028  7.481643       41              Fibroblasts   

In [6]:
# Recount cells per annotation label (as plain strings) to check the effect
# of the category rename.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells                    77355
Mono+mono derived cells    29639
Circulating NK/NKT         12567
Hepatocytes                10996
Nk Cells                   10894
Macrophages                 7076
Neutrophils                 5340
Endothelial cells           2834
Myeloid                     2629
B cells                     2350
Fibroblasts                 1273
pDCs                        1216
cDC1s                       1180
Plasma cells                1125
Cholangiocytes               657
Basophils                    389
Mig.cDCs                      78
Name: count, dtype: int64


In [7]:
# Fold the 'cDC1s' label into 'Myeloid' by relabelling the matching rows of
# adata.obs in place. NOTE(review): 'annot' appears to be categorical, so the
# target label presumably has to exist as a category already (it is created
# by the earlier rename_categories step) - confirm before reordering cells.
is_cdc1 = adata.obs['annot'].eq('cDC1s')
adata.obs.loc[is_cdc1, 'annot'] = 'Myeloid'

# Recount cells per label (as plain strings) to confirm the merge.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells                    77355
Mono+mono derived cells    29639
Circulating NK/NKT         12567
Hepatocytes                10996
Nk Cells                   10894
Macrophages                 7076
Neutrophils                 5340
Myeloid                     3809
Endothelial cells           2834
B cells                     2350
Fibroblasts                 1273
pDCs                        1216
Plasma cells                1125
Cholangiocytes               657
Basophils                    389
Mig.cDCs                      78
Name: count, dtype: int64


In [8]:
# Fold the 'pDCs' label into 'Myeloid' by relabelling the matching rows of
# adata.obs in place.
is_pdc = adata.obs['annot'].eq('pDCs')
adata.obs.loc[is_pdc, 'annot'] = 'Myeloid'

# Recount cells per label (as plain strings) to confirm the merge.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells                    77355
Mono+mono derived cells    29639
Circulating NK/NKT         12567
Hepatocytes                10996
Nk Cells                   10894
Macrophages                 7076
Neutrophils                 5340
Myeloid                     5025
Endothelial cells           2834
B cells                     2350
Fibroblasts                 1273
Plasma cells                1125
Cholangiocytes               657
Basophils                    389
Mig.cDCs                      78
Name: count, dtype: int64


In [9]:
# Fold the 'Mono+mono derived cells' label into 'Myeloid' by relabelling the
# matching rows of adata.obs in place.
is_mono = adata.obs['annot'].eq('Mono+mono derived cells')
adata.obs.loc[is_mono, 'annot'] = 'Myeloid'

# Recount cells per label (as plain strings) to confirm the merge.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells               77355
Myeloid               34664
Circulating NK/NKT    12567
Hepatocytes           10996
Nk Cells              10894
Macrophages            7076
Neutrophils            5340
Endothelial cells      2834
B cells                2350
Fibroblasts            1273
Plasma cells           1125
Cholangiocytes          657
Basophils               389
Mig.cDCs                 78
Name: count, dtype: int64


In [10]:
# Fold the 'Macrophages' label into 'Myeloid' by relabelling the matching
# rows of adata.obs in place.
is_macrophage = adata.obs['annot'].eq('Macrophages')
adata.obs.loc[is_macrophage, 'annot'] = 'Myeloid'

# Recount cells per label (as plain strings) to confirm the merge.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells               77355
Myeloid               41740
Circulating NK/NKT    12567
Hepatocytes           10996
Nk Cells              10894
Neutrophils            5340
Endothelial cells      2834
B cells                2350
Fibroblasts            1273
Plasma cells           1125
Cholangiocytes          657
Basophils               389
Mig.cDCs                 78
Name: count, dtype: int64


In [11]:
# Fold the 'Neutrophils' label into 'Myeloid' by relabelling the matching
# rows of adata.obs in place.
is_neutrophil = adata.obs['annot'].eq('Neutrophils')
adata.obs.loc[is_neutrophil, 'annot'] = 'Myeloid'

# Recount cells per label (as plain strings) to confirm the merge.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells               77355
Myeloid               47080
Circulating NK/NKT    12567
Hepatocytes           10996
Nk Cells              10894
Endothelial cells      2834
B cells                2350
Fibroblasts            1273
Plasma cells           1125
Cholangiocytes          657
Basophils               389
Mig.cDCs                 78
Name: count, dtype: int64


In [12]:
# Fold the 'Basophils' label into 'Myeloid' by relabelling the matching rows
# of adata.obs in place.
is_basophil = adata.obs['annot'].eq('Basophils')
adata.obs.loc[is_basophil, 'annot'] = 'Myeloid'

# Recount cells per label (as plain strings) to confirm the merge.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells               77355
Myeloid               47469
Circulating NK/NKT    12567
Hepatocytes           10996
Nk Cells              10894
Endothelial cells      2834
B cells                2350
Fibroblasts            1273
Plasma cells           1125
Cholangiocytes          657
Mig.cDCs                 78
Name: count, dtype: int64


In [13]:
# Fold the 'Circulating NK/NKT' label into 'Nk Cells' by relabelling the
# matching rows of adata.obs in place.
is_circulating_nk = adata.obs['annot'].eq('Circulating NK/NKT')
adata.obs.loc[is_circulating_nk, 'annot'] = 'Nk Cells'

# Recount cells per label (as plain strings) to confirm the merge.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells              77355
Myeloid              47469
Nk Cells             23461
Hepatocytes          10996
Endothelial cells     2834
B cells               2350
Fibroblasts           1273
Plasma cells          1125
Cholangiocytes         657
Mig.cDCs                78
Name: count, dtype: int64


In [14]:
# Keep only the cells whose annotation is NOT one of the excluded labels,
# and store a copy of that subset as adata_filtered.
categories_to_exclude = ['Plasma cells', 'Cholangiocytes', 'Mig.cDCs']
keep_mask = ~adata.obs['annot'].isin(categories_to_exclude)
adata_filtered = adata[keep_mask].copy()

In [15]:
# From here on, `adata` refers to the filtered subset.
adata = adata_filtered

# Per-label cell counts (as plain strings) after the exclusion step.
label_counts = adata.obs['annot'].astype(str).value_counts()
print(label_counts)

annot
T cells              77355
Myeloid              47469
Nk Cells             23461
Hepatocytes          10996
Endothelial cells     2834
B cells               2350
Fibroblasts           1273
Name: count, dtype: int64


In [16]:
# Save the relabelled, filtered AnnData object to an .h5ad file.
adata.write_h5ad("Validation_dataset.h5ad")