## Notebook for the Joanito, 2022 QC 
**Developed by:** Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich**  
**Date:** 16th June 2023

#### Load required packages

In [3]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

#### Setup Cells

In [1]:
%matplotlib inline

In [4]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


In [5]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

#### Upload Data

In [37]:
input = '/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_tumor_cells.h5ad'
adata = sc.read_h5ad(input)

In [38]:
X_is_raw(adata)

True

In [39]:
adata.obs_keys

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 170596 × 33287
    obs: 'nFeature_RNA', 'percent.mt', 'Sample_ID', 'Donor_ID', 'sample.origin', 'dataset_x', 'Cell_Type', 'dataset_y', 'Sex', 'Tumor Stage', 'MSS/MSI', 'Location', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'cell_ID', 'iCMS', 'msi', 'batch', 'Age_group', 'Study_name', 'Diagnosis', 'n_genes_by_counts', 'total_counts', 'Library_Preparation_Protocol', 'doublet_scores', 'predicted_doublets'
    var: 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'>

### Fill the table

In [42]:
# Calculate number of donors
len(adata.obs.Donor_ID.unique())

62

In [43]:
# Calculate number of samples
len(adata.obs.Sample_ID.unique())

97

In [44]:
# Number of cells
adata.n_obs

170596

In [45]:
#Total Counts
sum(adata.obs.total_counts)

1620208962.0

In [46]:
# Mean cells per sample
adata.obs.groupby('Sample_ID').size().mean()

1758.721649484536

In [47]:
# calculate mean reads per cell 
sum(adata.obs.total_counts)/len(adata.obs)

9497.344380876457

In [48]:
#Mean Genes per Cell
sum(adata.obs.n_genes_by_counts)/len(adata.obs)

2280.8492520340455

In [53]:
# Rename column percent.mt to percent_mt
adata.obs.rename(columns={'percent.mt':'percent_mt'}, inplace=True)

In [54]:
#Mean percentage of mitochondrial counts 
sum(adata.obs.percent_mt)/len(adata.obs)

6.4581119913776694

In [26]:
adata.obs['Location'].value_counts()

Sigmoid colon              50661
Ascending colon            40430
Caecum                     20662
Rectum                     19524
Upper rectum                7737
Rectosigmoid                6688
Low rectum                  6229
Hepatic Flexure             4136
Distal Sigmoid colon        3773
Transverse colon            2896
Descending colon            2816
Mid-rectum                  2554
Distal Ascending colon      1527
Distal Descending colon      963
Name: Location, dtype: int64

In [27]:
sum(adata.obs['predicted_doublets'])

656

In [None]:
sc.set_figure_params(dpi=300)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'percent_mt'],
             jitter=0.4, multi_panel=True)

In [None]:
adata = adata[adata.obs.n_genes_by_counts < 5000, :]
adata = adata[adata.obs.n_genes_by_counts > 200, :]
adata = adata[adata.obs.total_counts < 50000, :]

sc.set_figure_params(dpi=300)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mito'],
             jitter=0.4, multi_panel=True)