## Notebook for the Kong, 2023 QC 
**Developed by:** Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**  
**Date:** 5th July 2023  

#### Load required packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
import scrublet 

#### Setup Cells

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Scanpy logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.settings.verbosity = 3
# Print the versions of the core packages so the run is traceable
sc.logging.print_header()
# Default figure appearance for the plots in this notebook
sc.settings.set_figure_params(facecolor='white', dpi=80)

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


In [5]:
def X_is_raw(adata):
    """Return True when every entry of ``adata.X`` is integer-valued (raw counts).

    The check is element-wise. Comparing column sums against their integer
    cast is not sufficient: a column such as ``[0.5, 0.5]`` sums to ``1.0``
    and would be reported as raw even though its entries are fractional.

    Parameters
    ----------
    adata
        Object exposing an ``X`` attribute that is either a dense array-like
        or a SciPy sparse matrix.

    Returns
    -------
    bool
        True if all (stored) values have no fractional part, False otherwise.
        NaN or infinite values yield False.
    """
    # Local import keeps the helper self-contained; SciPy is not imported at the top of the notebook.
    from scipy import sparse

    X = adata.X
    # For sparse input only the explicitly stored values need checking:
    # implicit zeros are integers by definition.
    values = X.tocsr().data if sparse.issparse(X) else np.asarray(X)
    return bool(np.all(np.mod(values, 1) == 0))

#### Upload Data

In [6]:
# Location of the input .h5ad file.
# Named `input_path` rather than `input` so the builtin `input()` is not shadowed
# for the rest of the kernel session.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/Kong_scVI_scANVI.h5ad'
adata = sc.read_h5ad(input_path)

In [7]:
# Replace `adata` with the AnnData built from its stored `.raw` matrix.
# The guard makes the cell safe to re-run: once `adata` has been replaced, the new
# object is expected to carry no `.raw`, and an unguarded second run would fail
# with an AttributeError on `None.to_adata()`.
if adata.raw is not None:
    adata = adata.raw.to_adata()

In [8]:
# Confirm that the matrix now in `adata.X` passes the raw-count check defined above.
X_is_raw(adata)

True

### Fill the table

In [9]:
# Number of distinct donors (a missing value, if present, counts as one entry)
adata.obs['Donor_ID'].nunique(dropna=False)

25

In [10]:
# Number of distinct samples (a missing value, if present, counts as one entry)
adata.obs['Sample_ID'].nunique(dropna=False)

75

In [11]:
# Number of cells = number of observations (first axis of the AnnData object)
adata.shape[0]

181806

In [12]:
# Store the per-cell totals in `n_counts` and `n_genes` as integers.
# NOTE(review): `astype(int)` truncates any fractional part silently and raises on
# missing values — confirm both columns hold whole numbers.
adata.obs['n_counts'] = adata.obs['n_counts'].astype(int)
adata.obs['n_genes'] = adata.obs['n_genes'].astype(int)

# Total counts over all cells. `Series.sum()` is vectorised, whereas the builtin
# `sum()` iterates over the column element by element in Python.
adata.obs['n_counts'].sum()

721415359

In [13]:
# Count the cells belonging to each sample, then average those counts
cells_per_sample = adata.obs.groupby('Sample_ID').size()
cells_per_sample.mean()

2424.08

In [15]:
# Mean counts per cell (the column holds `n_counts`, not sequencing reads).
# `Series.mean()` replaces the hand-rolled builtin `sum(...) / len(...)`;
# note that it skips missing values by default.
adata.obs['n_counts'].mean()

3968.0503338723693

In [16]:
# Mean genes per cell, taken from the `n_genes_by_counts` column.
# `Series.mean()` replaces the hand-rolled builtin `sum(...) / len(...)`;
# note that it skips missing values by default.
adata.obs['n_genes_by_counts'].mean()

1010.0540411207551

In [17]:
# Mean percentage of mitochondrial counts per cell.
# `Series.mean()` replaces the hand-rolled builtin `sum(...) / len(...)`: it is
# vectorised, so the last printed digits may differ from a sequential float sum,
# and it skips missing values by default.
adata.obs['pct_counts_mito'].mean()

7.780488314788929

In [18]:
# Mean percentage of ribosomal counts per cell.
# `Series.mean()` replaces the hand-rolled builtin `sum(...) / len(...)`: it is
# vectorised, so the last printed digits may differ from a sequential float sum,
# and it skips missing values by default.
adata.obs['pct_counts_ribo'].mean()

23.270713829365054

In [19]:
# Number of cells flagged in the `predicted_doublets` column.
# `Series.sum()` is vectorised, whereas the builtin `sum()` iterates in Python.
# NOTE(review): assumes the column is boolean/numeric — confirm its dtype.
adata.obs['predicted_doublets'].sum()

232

In [None]:
# Per-cell QC metrics shown as violin plots, one panel per metric
qc_metrics = ['n_genes', 'n_counts', 'pct_counts_mito', 'pct_counts_ribo']

sc.set_figure_params(dpi=300)
sc.pl.violin(adata, keys=qc_metrics, jitter=0.4, multi_panel=True)

In [None]:
# Keep cells with 200 < n_genes < 5000 and n_counts < 50000.
# The three conditions are combined into one boolean mask; it selects the same
# cells as applying the filters one after another.
qc_pass = (
    (adata.obs['n_genes'] < 5000)
    & (adata.obs['n_genes'] > 200)
    & (adata.obs['n_counts'] < 50000)
)
adata = adata[qc_pass, :]

# Same violin plots as before, now on the filtered cells
qc_metrics = ['n_genes', 'n_counts', 'pct_counts_mito', 'pct_counts_ribo']
sc.set_figure_params(dpi=300)
sc.pl.violin(adata, keys=qc_metrics, jitter=0.4, multi_panel=True)