## Notebook for transferring annotations from the Gut Cell Atlas to colorectal cancer samples from Khaliq 2022 with scNym for coarse-grained labels.
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 12 October 2022

#### Load required packages

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import scnym

#### Setup Cells

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Scanpy logging level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Print the versions of the core packages so the run is traceable.
sc.logging.print_header()
# Default figure appearance for all scanpy plots in this notebook.
sc.settings.set_figure_params(facecolor='white', dpi=120)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.22.4 scipy==1.8.1 pandas==1.4.3 scikit-learn==1.1.1 statsmodels==0.13.2 pynndescent==0.5.7


#### Data Upload

In [4]:
# Paths for the tumor (Khaliq 2022 colorectal cancer) data.
# The project directory is defined once so it only has to be changed in one
# place when the notebook is run on another machine.
CRC_DATA_DIR = '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project'
input_CRC_file = f'{CRC_DATA_DIR}/khaliq_2022_anndata_raw.h5ad'  # raw AnnData file that is read as input
output_CRC_file = f'{CRC_DATA_DIR}/khaliq_2022_anndata_processed.h5ad'  # the file that will store the analysis results

In [5]:
# Load the tumor AnnData object from the .h5ad file given by input_CRC_file.
khaliq_ad = sc.read_h5ad(input_CRC_file)
# Display the expression matrix as a quick check of what was loaded.
khaliq_ad.X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [4]:
# Paths for the normal control (Gut Cell Atlas) data.
# The project directory is defined once so it only has to be changed in one
# place when the notebook is run on another machine.
CONTROL_DATA_DIR = '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project'
input_control_file = f'{CONTROL_DATA_DIR}/Full_obj_raw_counts_nosoupx_v2.h5ad'  # raw AnnData file that is read as input
output_control_file = f'{CONTROL_DATA_DIR}/Full_obj_raw_counts_nosoupx_v2_processed.h5ad'  # the file that will store the analysis results

In [5]:
# Load the normal control AnnData object from the .h5ad file given by input_control_file.
control_ad = sc.read_h5ad(input_control_file)
# Display the expression matrix as a quick check of what was loaded.
control_ad.X

<428469x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 760344941 stored elements in Compressed Sparse Row format>

In [6]:
# Inspect the per-cell metadata table of the control dataset.
control_ad.obs

Unnamed: 0,Sample name,Diagnosis,Age,sample name,Region code,Fraction,Gender,Region,10X,batch,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,doublet_scores,predicted_doublets,category,Age_group,Integrated_05
AAACCTGAGAACAACT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,968,968,1912.793579,46.228199,2.416790,0.016442,False,B cells,Pediatric,DZ GC cell
AAACCTGAGCGATATA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,693,693,2152.089355,0.122579,0.005696,0.027389,False,B cells,Pediatric,Cycling B cell
AAACCTGAGGAGTTTA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,720,720,1686.883423,80.195816,4.754082,0.014817,False,T cells,Pediatric,gdT
AAACCTGAGTATCTCG-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,746,746,2135.183838,104.506760,4.894509,0.039416,False,B cells,Pediatric,Memory B
AAACCTGAGTGACTCT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,1227,1227,3103.050293,95.846077,3.088770,0.025763,False,B cells,Pediatric,DZ GC cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAACACGT-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,1821,1821,4838.415039,144.966095,2.996148,0.027510,False,Mesenchymal,Adult,Stromal 1 (CCL11+)
TTTGGTTTCCAAATGC-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,1814,1814,5337.013672,85.408028,1.600296,0.014349,False,Mesenchymal,Adult,Stromal 1 (CCL11+)
TTTGGTTTCGGCGGTT-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,6418,6418,48142.085938,7198.451172,14.952512,0.072750,False,Epithelial,Adult,Colonocyte
TTTGTCAGTGAAAGAG-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,2578,2578,10848.547852,600.362671,5.534037,0.091348,False,Epithelial,Adult,Stem cells


In [16]:
# Number of cells per (Diagnosis, Sample name) combination.
# .size() counts rows per group directly instead of calling len() on every
# group through .apply(); observed=True restricts the result to combinations
# that actually occur when the grouping columns are categorical.
control_ad.obs.groupby(["Diagnosis", "Sample name"], observed=True).size()

Diagnosis                Sample name
Healthy adult            A26 (386C)     10120
                         A30 (398B)      4667
                         A32 (411C)     21940
                         A33 (414C)     37472
                         A34 (417C)     27502
                         A38 (432C)      8236
                         A39 (440C)     14430
fetal                    BRC2026        11785
                         BRC2029         7752
                         BRC2043         8227
                         BRC2046         7570
                         BRC2049         5664
                         BRC2119         7507
                         BRC2121        14716
                         BRC2133        23607
                         BRC2134        17193
                         BRC2258        34177
                         BRC2259        11026
                         F66            16267
                         F67             8379
                         F72            278

In [7]:
# Remove cells diagnosed with Pediatric Crohn Disease.
# Slicing an AnnData object returns a view; .copy() makes control_ad_clean an
# independent object so that later assignments (e.g. adding a column to .var)
# do not trigger an implicit copy and the associated warning.
control_ad_clean = control_ad[~control_ad.obs['Diagnosis'].isin(['Pediatric Crohn Disease']), :].copy()

In [8]:
# Number of cells per (Diagnosis, Sample name) combination after filtering.
# .size() counts rows per group directly instead of calling len() on every
# group through .apply(); observed=True restricts the result to combinations
# that actually occur when the grouping columns are categorical.
control_ad_clean.obs.groupby(["Diagnosis", "Sample name"], observed=True).size()

Diagnosis          Sample name
Healthy adult      A26 (386C)     10120
                   A30 (398B)      4667
                   A32 (411C)     21940
                   A33 (414C)     37472
                   A34 (417C)     27502
                   A38 (432C)      8236
                   A39 (440C)     14430
fetal              BRC2026        11785
                   BRC2029         7752
                   BRC2043         8227
                   BRC2046         7570
                   BRC2049         5664
                   BRC2119         7507
                   BRC2121        14716
                   BRC2133        23607
                   BRC2134        17193
                   BRC2258        34177
                   BRC2259        11026
                   F66            16267
                   F67             8379
                   F72            27859
                   F73            31569
                   F78            14046
Pediatric healthy  T024            2354
         

In [37]:
# Inspect the per-cell metadata table of the filtered control dataset.
control_ad_clean.obs

Unnamed: 0,Sample name,Diagnosis,Age,sample name,Region code,Fraction,Gender,Region,10X,batch,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,doublet_scores,predicted_doublets,category,Age_group,Integrated_05
AAACCTGAGAACAACT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,968,968,1912.793579,46.228199,2.416790,0.016442,False,B cells,Pediatric,DZ GC cell
AAACCTGAGCGATATA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,693,693,2152.089355,0.122579,0.005696,0.027389,False,B cells,Pediatric,Cycling B cell
AAACCTGAGGAGTTTA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,720,720,1686.883423,80.195816,4.754082,0.014817,False,T cells,Pediatric,gdT
AAACCTGAGTATCTCG-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,746,746,2135.183838,104.506760,4.894509,0.039416,False,B cells,Pediatric,Memory B
AAACCTGAGTGACTCT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,1227,1227,3103.050293,95.846077,3.088770,0.025763,False,B cells,Pediatric,DZ GC cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAACACGT-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,1821,1821,4838.415039,144.966095,2.996148,0.027510,False,Mesenchymal,Adult,Stromal 1 (CCL11+)
TTTGGTTTCCAAATGC-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,1814,1814,5337.013672,85.408028,1.600296,0.014349,False,Mesenchymal,Adult,Stromal 1 (CCL11+)
TTTGGTTTCGGCGGTT-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,6418,6418,48142.085938,7198.451172,14.952512,0.072750,False,Epithelial,Adult,Colonocyte
TTTGTCAGTGAAAGAG-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,2578,2578,10848.547852,600.362671,5.534037,0.091348,False,Epithelial,Adult,Stem cells


In [36]:
# Total counts summed over all cells.
# Uses the vectorised pandas sum instead of Python's builtin sum(), which
# iterates the column element by element. The column is cast to float64 first
# so the accumulation is done in double precision.
control_ad_clean.obs["total_counts"].astype("float64").sum()

2497058282.8599243

In [39]:
# Mean number of detected genes per cell.
# .mean() divides by the actual number of cells in the object, replacing the
# hard-coded cell count (401305) that would silently go stale if the
# filtering upstream changes.
control_ad_clean.obs["n_genes_by_counts"].mean()

1802.3177333947995

In [41]:
# Mean total counts (reads) per cell.
# .mean() divides by the actual number of cells in the object, replacing the
# hard-coded cell count (401305) that would silently go stale if the
# filtering upstream changes. Cast to float64 so the mean is accumulated in
# double precision.
control_ad_clean.obs["total_counts"].astype("float64").mean()

6222.345305590322

In [43]:
# Mean percentage of mitochondrial counts per cell.
# NOTE(review): this import is kept only because later cells may rely on the
# name `mean`; it would be better placed in the import cell at the top.
from statistics import mean
# Vectorised pandas mean instead of statistics.mean(), which iterates the
# column element by element. pandas skips NaN values by default.
control_ad_clean.obs["pct_counts_mt"].astype("float64").mean()

9.625710284865626

In [9]:
# Add the percentage of ribosomal gene counts to the per-cell QC metrics.
# Genes whose names start with "RPS" or "RPL" are flagged as 'ribo' in .var.
ribosomal_prefixes = ("RPS", "RPL")
control_ad_clean.var['ribo'] = control_ad_clean.var_names.str.startswith(ribosomal_prefixes)
# Compute the QC metrics in place, using the 'ribo' flag as a QC variable.
# NOTE(review): the table displayed afterwards shows new total_counts_ribo and
# pct_counts_ribo columns, and total_counts values that differ from the ones
# shown before this cell - confirm that overwriting total_counts is intended.
sc.pp.calculate_qc_metrics(
    control_ad_clean,
    qc_vars=['ribo'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

  control_ad_clean.var['ribo'] = control_ad_clean.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'


In [12]:
# Inspect the per-cell metadata table after the ribosomal QC metrics were added.
control_ad_clean.obs

Unnamed: 0,Sample name,Diagnosis,Age,sample name,Region code,Fraction,Gender,Region,10X,batch,...,total_counts,total_counts_mt,pct_counts_mt,doublet_scores,predicted_doublets,category,Age_group,Integrated_05,total_counts_ribo,pct_counts_ribo
AAACCTGAGAACAACT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,1994.0,46.228199,2.416790,0.016442,False,B cells,Pediatric,DZ GC cell,491.0,24.623873
AAACCTGAGCGATATA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,2231.0,0.122579,0.005696,0.027389,False,B cells,Pediatric,Cycling B cell,1048.0,46.974449
AAACCTGAGGAGTTTA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,1759.0,80.195816,4.754082,0.014817,False,T cells,Pediatric,gdT,652.0,37.066517
AAACCTGAGTATCTCG-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,2243.0,104.506760,4.894509,0.039416,False,B cells,Pediatric,Memory B,996.0,44.404812
AAACCTGAGTGACTCT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,3253.0,95.846077,3.088770,0.025763,False,B cells,Pediatric,DZ GC cell,1121.0,34.460495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAACACGT-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,4943.0,144.966095,2.996148,0.027510,False,Mesenchymal,Adult,Stromal 1 (CCL11+),1170.0,23.669836
TTTGGTTTCCAAATGC-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,5446.0,85.408028,1.600296,0.014349,False,Mesenchymal,Adult,Stromal 1 (CCL11+),1454.0,26.698494
TTTGGTTTCGGCGGTT-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,49366.0,7198.451172,14.952512,0.072750,False,Epithelial,Adult,Colonocyte,12117.0,24.545233
TTTGTCAGTGAAAGAG-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,11052.0,600.362671,5.534037,0.091348,False,Epithelial,Adult,Stem cells,4846.0,43.847267


In [14]:
# Mean percentage of ribosomal counts per cell.
# NOTE(review): this import is kept only because later cells may rely on the
# name `mean`; it would be better placed in the import cell at the top.
from statistics import mean
# Vectorised pandas mean instead of statistics.mean(), which iterates the
# column element by element. pandas skips NaN values by default.
control_ad_clean.obs["pct_counts_ribo"].astype("float64").mean()

23.591741837701004