In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import scanpy.external as sce

In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.8.2 anndata==0.7.8 umap==0.5.3 numpy==1.21.6 scipy==1.8.0 pandas==1.4.2 scikit-learn==1.0.2 statsmodels==0.13.2 python-igraph==0.9.10 pynndescent==0.5.6


In [3]:
raw_file = 'write_LCA/h_LCA1-5_raw.h5ad'
qc_file = 'write_LCA/h_LCA1-5_qc.h5ad'
filtered_file = 'write_LCA/h_LCA1-5_filtered.h5ad'
pp_file = 'write_LCA/h_LCA1-5_pp.h5ad'
results_file = 'write_LCA/h_LCA1-5.h5ad'  # the file that will store the analysis results

# Import the count matrix and metadata

In [4]:
adatas=[]
## import all the tumor libraries (versus blood)
names = ['p3t1','p3t2','p3t3','p4t1','p4t2','p4t3','p5t1','p5t2','p6t1','p6t2','p7t1','p7t2']
pats = ['p3']*3+['p4']*3+['p5']*2+['p6']*2+['p7']*2
index = [0,1,2,4,5,6,8,9,10,11,13,14]
for i in range(0,len(names)):
    dir='../../data_LCA/pat_LCA/GSM3635'+str(288+index[i])+'_human_'+names[i]+'_raw_counts.tsv'
    adata = sc.read(
        dir,  # the directory with the `.mtx` file
        cache=True,# write a cache file for faster subsequent reading
        )  
    adata.obs['Barcode']=adata.obs_names
    adata.obs['Patient']=pats[i]
    adata.obs['Library']=names[i]
    ### import metadata cell type annotation
    meta = pd.read_csv('../../data_LCA/pat_LCA/GSE127465_human_cell_metadata_54773x25.tsv',sep='\t')[['Tissue','Patient','Library','Barcode','Most likely LM22 cell type','Major cell type','Minor subset']]
    meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]
    LM22_dict_h_LCA = dict(zip(list(meta['Barcode']),list(meta['Most likely LM22 cell type']) ))
    major_dict_h_LCA = dict(zip(list(meta['Barcode']),list(meta['Major cell type']) ))
    sub_dict_h_LCA = dict(zip(list(meta['Barcode']),list(meta['Minor subset']) ))
    list_lm22 = []
    list_maj = []
    list_sub = []
    barcodes = list(adata.obs_names)
    for barcode in barcodes:
        if barcode in list(meta['Barcode']):
            list_lm22.append(LM22_dict_h_LCA[barcode])
            list_maj.append(major_dict_h_LCA[barcode])
            list_sub.append(sub_dict_h_LCA[barcode])
        else:
            list_lm22.append('null')
            list_maj.append('null')
            list_sub.append('null')
    adata.obs['Most_likely_LM22_cell_type']=list_lm22
    adata.obs['Major_cell_type']=list_maj
    adata.obs['Minor_subset']=list_sub
    # unify barcode name
    adata.obs.index=[names[i]+'-'+x for x in adata.obs.index]
    #adata
    adatas.append(adata)

... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635288_human_p3t1_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635289_human_p3t2_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635290_human_p3t3_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635292_human_p4t1_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635293_human_p4t2_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635294_human_p4t3_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635296_human_p5t1_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635297_human_p5t2_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635298_human_p6t1_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635299_human_p6t2_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635301_human_p7t1_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


... reading from cache file cache/..-..-data_LCA-pat_LCA-GSM3635302_human_p7t2_raw_counts.h5ad


  meta = meta[meta['Tissue']=='tumor'][meta['Library']==names[i]]


In [5]:
adatas

[AnnData object with n_obs × n_vars = 9534 × 41861
     obs: 'Barcode', 'Patient', 'Library', 'Most_likely_LM22_cell_type', 'Major_cell_type', 'Minor_subset',
 AnnData object with n_obs × n_vars = 9993 × 41861
     obs: 'Barcode', 'Patient', 'Library', 'Most_likely_LM22_cell_type', 'Major_cell_type', 'Minor_subset',
 AnnData object with n_obs × n_vars = 11267 × 41861
     obs: 'Barcode', 'Patient', 'Library', 'Most_likely_LM22_cell_type', 'Major_cell_type', 'Minor_subset',
 AnnData object with n_obs × n_vars = 3388 × 41861
     obs: 'Barcode', 'Patient', 'Library', 'Most_likely_LM22_cell_type', 'Major_cell_type', 'Minor_subset',
 AnnData object with n_obs × n_vars = 4437 × 41861
     obs: 'Barcode', 'Patient', 'Library', 'Most_likely_LM22_cell_type', 'Major_cell_type', 'Minor_subset',
 AnnData object with n_obs × n_vars = 4082 × 41861
     obs: 'Barcode', 'Patient', 'Library', 'Most_likely_LM22_cell_type', 'Major_cell_type', 'Minor_subset',
 AnnData object with n_obs × n_vars = 7943 × 

In [7]:
adata = ad.concat(adatas, merge = "same")


In [8]:
adata.obs

Unnamed: 0,Barcode,Patient,Library,Most_likely_LM22_cell_type,Major_cell_type,Minor_subset
p3t1-bcHNMG,bcHNMG,p3,p3t1,Macrophages M0,tMoMacDC,tMac5
p3t1-bcGUOS,bcGUOS,p3,p3t1,Plasma cells,tPlasma cells,tPC1
p3t1-bcBAMM,bcBAMM,p3,p3t1,Monocytes,Patient3-specific,Pt3A_TFF1/MUC5A
p3t1-bcATNB,bcATNB,p3,p3t1,B cells memory,tB cells,tB
p3t1-bcAEDB,bcAEDB,p3,p3t1,Monocytes,Patient3-specific,Pt3E_CLDN4
...,...,...,...,...,...,...
p7t2-bcFPAE,bcFPAE,p7,p7t2,,,
p7t2-bcBMXQ,bcBMXQ,p7,p7t2,,,
p7t2-bcGQJH,bcGQJH,p7,p7t2,,,
p7t2-bcGEKE,bcGEKE,p7,p7t2,,,


In [9]:
adata.write(raw_file)

# 3/30：p3-p7所有tumor libraries，添加文献metadata的细胞类型注释

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Barcode' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Library' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Most_likely_LM22_cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Major_cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Minor_subset' as categorical
