## Notebook for the Colon Cancer Atlas anndata object preparation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 15 May 2023

#### Load required packages

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import scipy as sci
from scipy.io import mmread

In [3]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is a whole number.

    Raw count matrices hold integers only, so this is a quick check for
    whether ``X`` still contains unnormalised counts.

    The check is element-wise. Comparing per-column sums with their integer
    truncation is not sufficient, because fractional entries can add up to a
    whole number (e.g. 0.5 + 0.5) and would then be reported as raw.

    Parameters
    ----------
    adata
        Any object exposing the expression matrix as ``.X`` (dense
        array-like or SciPy sparse matrix).

    Returns
    -------
    bool
        ``True`` if all entries are integer-valued, ``False`` otherwise
        (NaN and infinite values count as non-integer).
    """
    # Local import keeps the function usable regardless of how scipy was imported.
    from scipy.sparse import issparse

    X = adata.X
    if issparse(X):
        # Implicit zeros are integers, so only the explicitly stored values matter.
        values = X.data if X.format in ("csr", "csc", "coo") else X.tocsr().data
    else:
        values = np.asarray(X)

    # NaN/inf yield NaN remainders; silence the warning and let them compare unequal to 0.
    with np.errstate(invalid="ignore"):
        remainders = np.mod(values, 1)
    return bool(np.all(remainders == 0))

#### Setup Cells

In [4]:
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline

In [5]:
# Scanpy session configuration: logging level, version banner, figure defaults.
SCANPY_VERBOSITY = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
FIGURE_DPI = 80

sc.settings.verbosity = SCANPY_VERBOSITY
# Print the versions of the main packages so the run is traceable.
sc.logging.print_header()
sc.settings.set_figure_params(facecolor='white', dpi=FIGURE_DPI)

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


#### Upload Data

In [4]:
# Load the gene, metadata and barcode tables of the SCP1162 download.
# The shared path prefix is kept in one place so it only has to be changed once.
raw_data_dir = '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/Raw_data/SCP1162'
expression_dir = f'{raw_data_dir}/expression/5fbf5a26771a5b0db8fe7a8b'

# Gene table: no header row; first column becomes the index, the remaining
# column keeps the integer label 1.
Genes = pd.read_csv(f'{expression_dir}/matrix.genes.tsv', sep='\t', index_col=0, header=None)

# Per-cell metadata, indexed by its first column.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded run emitted a warning in this cell
# (presumably pandas' DtypeWarning about mixed types - TODO confirm), and the
# obs table shown further down mixes string- and float-formatted values
# within the same column, which is what chunked inference produces.
meta = pd.read_csv(
    f'{raw_data_dir}/metadata/metatable_v3_fix_v2.tsv',
    sep='\t',
    index_col=0,
    low_memory=False,
)

# Barcode table: no header row; the single column becomes the index.
Barcodes = pd.read_csv(f'{expression_dir}/matrix.barcodes.tsv', sep='\t', index_col=0, header=None)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Read the Matrix Market expression file and hold it in CSR format.
matrix_path = '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/Raw_data/SCP1162/expression/5fbf5a26771a5b0db8fe7a8b/matrix.mtx'
matrix = sci.sparse.csr_matrix(mmread(matrix_path))

In [8]:
# Load the global tSNE / cluster table, indexed by its first column.
# low_memory=False infers column dtypes from the whole file at once; the
# recorded run emitted a warning in this cell (presumably pandas' DtypeWarning
# about mixed types from chunked parsing - TODO confirm).
global_tSNE = pd.read_csv(
    '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/Raw_data/SCP1162/cluster/crc10x_tSNE_cl_global.tsv',
    sep='\t',
    index_col=0,
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Remove the row labelled 'TYPE' from the metadata
# (presumably a second header line describing column types - TODO confirm).
# errors='ignore' keeps this cell re-runnable: once the row is gone, a second
# execution is a no-op instead of raising KeyError.
meta = meta.drop(index='TYPE', errors='ignore')

In [14]:
# Name the index of Barcodes 'cell_id' and the index of Genes 'gene_id', and
# give the remaining Genes column (integer label 1, because the file was read
# with header=None) the readable name 'gene_name'.
Barcodes.index.name = 'cell_id'
Genes = Genes.rename(columns={1: 'gene_name'})
Genes.index.name = 'gene_id'

In [16]:
# Preview the first five barcodes.
Barcodes.head(n=5)

C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC


In [15]:
# Preview the first five rows of the gene table.
Genes.head(n=5)

Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSG00000243485.5_4,RP11-34P13.3
ENSG00000237613.2_2,FAM138A
ENSG00000186092.6_4,OR4F5
ENSG00000238009.6_5,RP11-34P13.7
ENSG00000239945.1_5,RP11-34P13.8


In [17]:
# Build the AnnData object. The matrix is transposed before use so that its
# rows line up with Barcodes (obs) and its columns with Genes (var).
transposed_matrix = matrix.T
adata = an.AnnData(X=transposed_matrix, obs=Barcodes, var=Genes)

  adata = an.AnnData(X=np.transpose(matrix), obs=Barcodes, var=Genes)


In [18]:
# Check whether adata.X looks like raw counts.
# NOTE(review): the recorded run returned False; the var table shown below ends
# with non-gene entries (e.g. 'Epi_42_ELFs_EpiN11'), which may be the source of
# non-integer values - verify.
counts_look_raw = X_is_raw(adata)
counts_look_raw

False

In [19]:
# Inspect the obs table right after construction (only the barcode index at this point).
adata.obs

C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC
...
C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC
C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG
C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC
C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT
C173_T_0_0_0_c1_v3_id-TTTGTTGTCGTTCCCA


In [20]:
# Inspect the var table.
# NOTE(review): the recorded output ends with entries such as
# 'Epi_42_ELFs_EpiN11', so var appears to contain more than genes - verify.
adata.var

Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSG00000243485.5_4,RP11-34P13.3
ENSG00000237613.2_2,FAM138A
ENSG00000186092.6_4,OR4F5
ENSG00000238009.6_5,RP11-34P13.7
ENSG00000239945.1_5,RP11-34P13.8
...,...
Epi_39_SEPP1_SLC26A3_EpiN8,"pEpi39 (SEPP1, SLC26A3)"
Epi_40_CKB_EpiN16,"pEpi40 (CKB, AOC1, CEACAM6)"
Epi_41_GSN_MUC2_EpiN17,"pEpi41 (GSN, MUC2, MUC4)"
Epi_42_ELFs_EpiN11,pEpi42 (elongation factors)


In [21]:
# Attach the metadata columns to obs, matching rows on the index.
# A left join keeps every row of adata.obs.
obs_with_meta = adata.obs.join(meta, how='left')
adata.obs = obs_with_meta

In [22]:
# Inspect obs after the metadata join.
adata.obs

Unnamed: 0_level_0,biosample_id,donor_id,SpecimenType,TissueSource,ProcessingMethod,PatientTypeID,sex,Site,Grade,TumorStage,...,qc_emptyDropPval,qc_mitoFraction,species,species__ontology_label,disease,disease__ontology_label,organ,organ__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,9.99900009999e-05,0.0762605181209832,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,9.99900009999e-05,0.352076124567474,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,9.99900009999e-05,0.112033277605286,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,9.99900009999e-05,0.108513779527559,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,9.99900009999e-05,0.119262997524281,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,0.0001,0.108428,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing
C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,0.0001,0.260756,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing
C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,0.0001,0.437101,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing
C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,0.0001,0.161164,NCBITaxon_9606,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing


In [24]:
# Coerce all obs/var labels and all annotation values to strings.
# NOTE(review): presumably done so the object can be written to .h5ad without
# mixed-type column problems - confirm.
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)
adata.obs_names = adata.obs_names.astype(str)
# Keep the var index name as a string when one is set; leave it unset otherwise.
adata.var.index.name = str(adata.var.index.name) if adata.var.index.name is not None else None
adata.var_names = adata.var_names.astype(str)
# Caution: this converts every obs column to str, including the numeric qc_*
# metrics, and turns missing values into the literal text 'nan'.
adata.obs = adata.obs.astype(str)
adata.var = adata.var.astype(str)

In [9]:
# Remove the row labelled 'TYPE' from the tSNE / cluster table
# (presumably a second header line describing column types - TODO confirm).
# errors='ignore' keeps this cell re-runnable: once the row is gone, a second
# execution is a no-op instead of raising KeyError.
global_tSNE = global_tSNE.drop(index='TYPE', errors='ignore')

In [10]:
# Drop the 'X' and 'Y' columns from global_tSNE; only the remaining columns
# are joined onto adata.obs later.
# errors='ignore' keeps this cell re-runnable after the columns are gone.
global_tSNE = global_tSNE.drop(columns=['X', 'Y'], errors='ignore')

In [12]:
# Name the index 'cell_id'. The index itself was already set when the file was
# read with index_col=0; this line only changes its name.
global_tSNE.index.name = 'cell_id'

In [16]:
# Attach the remaining global_tSNE columns (the cluster annotations) to obs,
# matching rows on the index. A left join keeps every row of adata.obs.
obs_with_clusters = adata.obs.join(global_tSNE, how='left')
adata.obs = obs_with_clusters

In [51]:
# Persist the assembled AnnData object as an .h5ad file.
output_h5ad = '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/anndata/Colon_cancer_atlas_anndata.h5ad'
adata.write(output_h5ad)

In [28]:
# Remove the name `matrix` from the kernel namespace.
# Not idempotent: running this cell a second time raises NameError.
del matrix

In [26]:
# List the names of all obs annotation columns.
list(adata.obs.columns)

['biosample_id',
 'donor_id',
 'SpecimenType',
 'TissueSource',
 'ProcessingMethod',
 'PatientTypeID',
 'sex',
 'Site',
 'Grade',
 'TumorStage',
 'LymphNodeStatus',
 'MMRStatusTumor',
 'MMRMLH1Tumor',
 'qc_geneCount',
 'qc_logMappedReads',
 'qc_meanReadsPerUmi',
 'qc_totalReads',
 'qc_logUmiCount',
 'qc_bcSwapFraction',
 'qc_geneSatFraction',
 'qc_seqDupEst',
 'qc_umiSatFraction',
 'qc_emptyDropPval',
 'qc_mitoFraction',
 'species',
 'species__ontology_label',
 'disease',
 'disease__ontology_label',
 'organ',
 'organ__ontology_label',
 'library_preparation_protocol',
 'library_preparation_protocol__ontology_label']

In [27]:
# Number of obs rows per SpecimenType value.
specimen_type_counts = adata.obs['SpecimenType'].value_counts()
specimen_type_counts

T    258359
N    112864
Name: SpecimenType, dtype: int64

In [29]:
# Number of obs rows per TissueSource value.
tissue_source_counts = adata.obs['TissueSource'].value_counts()
tissue_source_counts

MGH     245813
DFCI    125410
Name: TissueSource, dtype: int64

In [30]:
# Number of obs rows per ProcessingMethod value.
processing_method_counts = adata.obs['ProcessingMethod'].value_counts()
processing_method_counts

unsorted                231737
CD45+                   122766
mixUnsortCD45            10670
Live                      4942
CD45+ / CD3- / CD19-      1108
Name: ProcessingMethod, dtype: int64

In [32]:
# Number of obs rows per Site value.
site_counts = adata.obs['Site'].value_counts()
site_counts

right    271306
left      99917
Name: Site, dtype: int64

In [33]:
# Number of obs rows per Grade value.
# NOTE(review): the recorded output lists the value '10' (4524 rows) next to
# 'low' and 'high', and MMRMLH1Tumor shows 'high' for the same number of rows -
# possibly shifted metadata columns for one sample; verify against the source table.
grade_counts = adata.obs['Grade'].value_counts()
grade_counts

low     307807
high     58892
10        4524
Name: Grade, dtype: int64

In [34]:
# Number of obs rows per TumorStage value.
tumor_stage_counts = adata.obs['TumorStage'].value_counts()
tumor_stage_counts

notT4    232676
T4       138547
Name: TumorStage, dtype: int64

In [35]:
# Number of obs rows per LymphNodeStatus value.
lymph_node_status_counts = adata.obs['LymphNodeStatus'].value_counts()
lymph_node_status_counts

N-    201608
N+    169615
Name: LymphNodeStatus, dtype: int64

In [36]:
# Number of obs rows per MMRStatusTumor value.
mmr_status_counts = adata.obs['MMRStatusTumor'].value_counts()
mmr_status_counts

MMRp      139781
MMRd      118578
Normal    112864
Name: MMRStatusTumor, dtype: int64

In [37]:
# Number of obs rows per MMRMLH1Tumor value.
# NOTE(review): the recorded output contains the value 'high' (4524 rows),
# which does not match the other categories and equals the number of rows with
# Grade == '10' - possibly shifted metadata columns; verify against the source table.
mmr_mlh1_counts = adata.obs['MMRMLH1Tumor'].value_counts()
mmr_mlh1_counts

MMRp               139781
Normal             111317
MMRd_MLH1Meth       82390
MMRd_MLH1NoMeth     33211
high                 4524
Name: MMRMLH1Tumor, dtype: int64

In [38]:
# Number of obs rows per species identifier.
species_counts = adata.obs['species'].value_counts()
species_counts

NCBITaxon_9606    371223
Name: species, dtype: int64

In [39]:
# Number of obs rows per species ontology label.
species_label_counts = adata.obs['species__ontology_label'].value_counts()
species_label_counts

Homo sapiens    371223
Name: species__ontology_label, dtype: int64

In [40]:
# Number of obs rows per disease identifier.
disease_counts = adata.obs['disease'].value_counts()
disease_counts

MONDO_0002271    258359
PATO_0000461     112864
Name: disease, dtype: int64

In [41]:
# Number of obs rows per disease ontology label.
disease_label_counts = adata.obs['disease__ontology_label'].value_counts()
disease_label_counts

colon adenocarcinoma    258359
normal                  112864
Name: disease__ontology_label, dtype: int64

In [42]:
# Number of obs rows per organ ontology label.
organ_label_counts = adata.obs['organ__ontology_label'].value_counts()
organ_label_counts

colon    371223
Name: organ__ontology_label, dtype: int64

In [43]:
# Number of obs rows per library preparation protocol label.
library_prep_label_counts = adata.obs['library_preparation_protocol__ontology_label'].value_counts()
library_prep_label_counts

10X 3' v2 sequencing    267712
10x 3' v3 sequencing    103511
Name: library_preparation_protocol__ontology_label, dtype: int64

In [6]:
# Reload the AnnData object from the .h5ad file written earlier in this notebook.
input_data = '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/anndata/Colon_cancer_atlas_anndata.h5ad'
adata = sc.read_h5ad(filename=input_data)

In [7]:
# Inspect obs of the reloaded object; the recorded output includes the cluster
# columns ClusterFull, ClusterMidway and ClusterTop.
adata.obs

Unnamed: 0_level_0,biosample_id,donor_id,SpecimenType,TissueSource,ProcessingMethod,PatientTypeID,sex,Site,Grade,TumorStage,...,species__ontology_label,disease,disease__ontology_label,organ,organ__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,ClusterFull,ClusterMidway,ClusterTop
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing,Tumor cE01 (Stem/TA-like),EpiT,Epi
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing,Tumor cE01 (Stem/TA-like),EpiT,Epi
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing,Tumor cE03 (Stem/TA-like prolif),EpiT,Epi
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing,Tumor cE01 (Stem/TA-like),EpiT,Epi
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009899,10X 3' v2 sequencing,Tumor cE01 (Stem/TA-like),EpiT,Epi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing,cM02 (Macrophage-like),Macro,Myeloid
C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing,Tumor cE03 (Stem/TA-like prolif),EpiT,Epi
C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing,Tumor cE03 (Stem/TA-like prolif),EpiT,Epi
C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,Homo sapiens,MONDO_0002271,colon adenocarcinoma,UBERON_0001155,colon,EFO_0009922,10x 3' v3 sequencing,cTNI08 (CD4+ Treg),TCD4,TNKILC
