## Notebook for the Khaliq 2022 data quality check.
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 12 October 2022

#### Load required packages

In [1]:
from statistics import mean

import anndata as an
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet

#### Setup Cells

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Logging verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
# Print the header with the package versions in use.
sc.logging.print_header()
# Default figure parameters: 80 dpi, white background.
sc.settings.set_figure_params(
    dpi=80,
    facecolor='white',
)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.8.1 pandas==1.4.2 scikit-learn==1.1.1 statsmodels==0.13.2 python-igraph==0.9.11 louvain==0.7.1 pynndescent==0.5.7


#### Data Quality Check and Preprocessing

In [None]:
# Paths of the AnnData files used by this notebook.
# NOTE(review): these are absolute, machine-specific paths; adjust them before running elsewhere.
input_CRC_file = '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/khaliq_2022_anndata_raw.h5ad'  # the raw input file that is read with sc.read_h5ad
output_CRC_file = '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/khaliq_2022_anndata_processed.h5ad'  # the file that will store the analysis results

In [None]:
# Read the raw AnnData object from the input .h5ad file.
khaliq_ad = sc.read_h5ad(input_CRC_file)
# Show the expression matrix as the cell output.
khaliq_ad.X

In [None]:
# Inspect the per-cell annotation table (obs) of the loaded object.
khaliq_ad.obs

In [None]:
# Per-cell annotation table (obs).
# NOTE(review): identical to the preceding cell; likely redundant.
khaliq_ad.obs

In [None]:
# Plot the 20 genes that yield the highest fraction of counts
# in each single cell, across all cells.
sc.pl.highest_expr_genes(
    khaliq_ad,
    n_top=20,
)

In [None]:
# Compute quality control metrics and store them on the object (inplace=True).
sc.pp.calculate_qc_metrics(
    khaliq_ad,
    expr_type='counts',
    var_type='genes',
    percent_top=None,
    inplace=True,
)

In [None]:
# Check the output: obs should now contain the QC metric columns added above.
khaliq_ad.obs

In [None]:
# Annotate mitochondrial genes (names starting with 'MT-') in var['mt'],
# then recompute the QC metrics with 'mt' as an additional QC variable.
khaliq_ad.var['mt'] = khaliq_ad.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(
    khaliq_ad,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Annotate ribosomal genes (names starting with 'RPS' or 'RPL') in var['ribo'],
# then recompute the QC metrics with 'ribo' as an additional QC variable.
khaliq_ad.var['ribo'] = khaliq_ad.var_names.str.startswith(("RPS","RPL"))
sc.pp.calculate_qc_metrics(
    khaliq_ad,
    qc_vars=['ribo'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Violin plots of the computed quality measures for all cells:
#   - n_genes_by_counts: number of genes expressed in the count matrix
#   - total_counts:      total counts per cell
#   - pct_counts_mt:     percentage of counts in mitochondrial genes
#   - pct_counts_ribo:   percentage of counts in ribosomal genes
sc.pl.violin(
    khaliq_ad,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Drop normal samples and keep only tumor cells.
# Slicing an AnnData returns a view of the parent object; .copy() turns the
# subset into an independent AnnData so that the later in-place edits
# (new var/obs columns, neighbors, UMAP) act on a real object rather than
# on a view of khaliq_ad.
khaliq_ad_tumor = khaliq_ad[khaliq_ad.obs.Condition == "Tumor", :].copy()

In [None]:
# Violin plots of the same quality measures, restricted to tumor cells:
#   - n_genes_by_counts: number of genes expressed in the count matrix
#   - total_counts:      total counts per cell
#   - pct_counts_mt:     percentage of counts in mitochondrial genes
#   - pct_counts_ribo:   percentage of counts in ribosomal genes
sc.pl.violin(
    khaliq_ad_tumor,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Quality metrics per sample: violin plots of the QC covariates grouped by 'samples'.
sc.pl.violin(
    khaliq_ad_tumor,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo'],
    jitter=0.4,
    groupby='samples',
    rotation=45,
)

In [None]:
# Quality metrics per tumor location: violin plots of the QC covariates grouped by 'Location'.
sc.pl.violin(
    khaliq_ad_tumor,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo'],
    jitter=0.4,
    groupby='Location',
    rotation=45,
)

In [None]:
# Quality metrics per MSI status: violin plots of the QC covariates grouped by 'MSI_Status'.
sc.pl.violin(
    khaliq_ad_tumor,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo'],
    jitter=0.4,
    groupby='MSI_Status',
    rotation=45,
)

In [None]:
# Inspect the per-cell annotation table of the tumor-only object.
khaliq_ad_tumor.obs

In [None]:
# Identify the top 2000 highly variable genes ('seurat_v3' flavor) ahead of the UMAP.
sc.pp.highly_variable_genes(
    khaliq_ad_tumor,
    flavor='seurat_v3',
    n_top_genes=2000,
)

In [None]:
# Compute the neighborhood graph needed for UMAP (50 neighbors, 50 PCs).
# NOTE(review): no explicit PCA call on khaliq_ad_tumor is visible before this
# cell - confirm how the 50 PCs requested here are obtained.
sc.pp.neighbors(
    khaliq_ad_tumor,
    n_neighbors=50,
    n_pcs=50,
)

In [None]:
# Compute the UMAP embedding for the tumor cells (plotted with the QC parameters in the next cell).
sc.tl.umap(khaliq_ad_tumor)

In [None]:
# Plot the UMAP coloured by the QC parameters and by the sample-level annotations.
sc.pl.umap(
    khaliq_ad_tumor,
    color_map="magma",
    color=[
        'n_genes_by_counts',
        'total_counts',
        'pct_counts_mt',
        'pct_counts_ribo',
        "Location",
        "MSI_Status",
        "samples",
    ],
)

In [None]:
# Mean percentage of ribosomal counts across the tumor cells.
# `mean` comes from the standard-library `statistics` module, imported with
# the other packages in the import cell at the top of the notebook.
mean(khaliq_ad_tumor.obs.pct_counts_ribo)

In [None]:
# Mean percentage of mitochondrial counts across the tumor cells.
mean(khaliq_ad_tumor.obs['pct_counts_mt'])

In [None]:
# Total counts summed over all tumor cells.
sum(khaliq_ad_tumor.obs['total_counts'])

In [None]:
# Mean genes per cell.
# Divide by the actual number of cells in the object (n_obs) instead of a
# hard-coded cell count, so the value stays correct if the subset changes.
sum(khaliq_ad_tumor.obs.n_genes_by_counts) / khaliq_ad_tumor.n_obs

In [None]:
# Mean reads (total counts) per cell.
# Divide by the actual number of cells in the object (n_obs) instead of a
# hard-coded cell count, so the value stays correct if the subset changes.
sum(khaliq_ad_tumor.obs.total_counts) / khaliq_ad_tumor.n_obs

### Doublets identification with scrublet

In [None]:
%matplotlib inline
# Create the Scrublet object from the expression matrix of the tumor cells.
scrub = scrublet.Scrublet(khaliq_ad_tumor.X)

In [None]:
%matplotlib inline
khaliq_ad_tumor.obs['doublet_scores'], khaliq_ad_tumor.obs['predicted_doublets'] = scrub.scrub_doublets()
scrub.plot_histogram()

In [None]:
%matplotlib inline
# Sum of the 'predicted_doublets' column, i.e. the number of cells flagged as doublets
# (assuming the column holds booleans - TODO confirm).
sum(khaliq_ad_tumor.obs['predicted_doublets'])

In [None]:
# Add a string version of the predicted-doublet flag; it is used below as the
# grouping key for the violin plot and as a UMAP colour.
# NOTE(review): the values are only cast to str (e.g. 'True'/'False' for a boolean
# column); they are not relabelled to singlet/doublet.
khaliq_ad_tumor.obs['doublet_info'] = khaliq_ad_tumor.obs["predicted_doublets"].astype(str)

In [None]:
# Inspect the per-cell annotations, now including the doublet columns.
khaliq_ad_tumor.obs

In [None]:
%matplotlib inline
#check if our predicted doublets also have more detected genes in general
sc.pl.violin(khaliq_ad_tumor, 'n_genes_by_counts',
             jitter=0.4, groupby = 'doublet_info', rotation=45)

In [None]:
# First step towards the PCA/UMAP used to check the doublet predictions:
# identify the top 2000 highly variable genes ('seurat_v3' flavor).
sc.pp.highly_variable_genes(
    khaliq_ad_tumor,
    flavor='seurat_v3',
    n_top_genes=2000,
)

In [None]:
# Restrict the tumor object to its highly variable genes.
# Slicing returns a view of khaliq_ad_tumor; .copy() makes the subset an
# independent AnnData because the following cells modify it in place
# (regress_out, scale, pca, neighbors, umap).
khaliq_ad_tumor_scrublet = khaliq_ad_tumor[:, khaliq_ad_tumor.var.highly_variable].copy()

In [None]:
# Regress out total counts and the mitochondrial percentage from the HVG subset.
# NOTE(review): no normalization / log-transform of this object is visible
# before this cell - confirm that regressing on the current X is intended.
sc.pp.regress_out(
    khaliq_ad_tumor_scrublet,
    ['total_counts', 'pct_counts_mt'],
)

In [None]:
# Scale the data, passing max_value=10.
sc.pp.scale(
    khaliq_ad_tumor_scrublet,
    max_value=10,
)

In [None]:
# Principal component analysis of the HVG subset using the 'arpack' solver.
sc.tl.pca(
    khaliq_ad_tumor_scrublet,
    svd_solver='arpack',
)

In [None]:
# Neighborhood graph on the HVG subset (50 neighbors, 50 PCs).
sc.pp.neighbors(
    khaliq_ad_tumor_scrublet,
    n_neighbors=50,
    n_pcs=50,
)

In [None]:
# Compute the UMAP embedding of the HVG subset.
sc.tl.umap(khaliq_ad_tumor_scrublet)

In [None]:
%matplotlib inline
sc.pl.umap(khaliq_ad_tumor_scrublet, color_map = "magma", color=['doublet_scores','doublet_info','samples'])

#### Calculate cell cycle scores

In [None]:
# UMAP of the tumor cells coloured by the 'EXO1' gene.
sc.pl.umap(
    khaliq_ad_tumor,
    color_map="magma",
    color=['EXO1'],
)

In [None]:
# Show the gene names (var_names) of the tumor object.
khaliq_ad_tumor.var_names

Load the cell cycle genes defined in Tirosh et al., 2015. It is a list of 97 genes, represented by their gene symbols. The list here is for humans; for another organism, a list of orthologues should be compiled. There are major differences in the way Scanpy and Seurat manage data; in particular, we need to filter out cell cycle genes that are not present in our dataset to avoid errors.

In [None]:
# Shell step: download the cell cycle gene list with curl, but only if the
# target file does not exist yet.
!if [ ! -f /Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/cell_cycle_genes.txt ]; then curl -o /Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/cell_cycle_genes.txt https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt; fi

In [None]:
# Read the cell cycle gene symbols, one per line, stripping surrounding whitespace.
# The context manager closes the file handle once the list is built
# (a bare open() inside the comprehension leaves it open).
with open('/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/cell_cycle_genes.txt') as gene_file:
    cell_cycle_genes = [line.strip() for line in gene_file]
# Number of genes read from the file.
print(len(cell_cycle_genes))

Here we define two lists: the genes associated with the S phase and the genes associated with the G2M phase.

In [None]:
# Split the gene list into two: the first 43 entries are used as the S-phase
# genes, all remaining entries as the G2M-phase genes.
n_s_genes = 43
s_genes, g2m_genes = cell_cycle_genes[:n_s_genes], cell_cycle_genes[n_s_genes:]

In [None]:
# Keep only the cell cycle genes that are present in the dataset.
# The S and G2M lists are filtered too: they are the lists handed to
# sc.tl.score_genes_cell_cycle further down, whereas the combined list is
# only used here to report how many genes remain.
s_genes = [gene for gene in s_genes if gene in khaliq_ad_tumor.var_names]
g2m_genes = [gene for gene in g2m_genes if gene in khaliq_ad_tumor.var_names]
cell_cycle_genes = [gene for gene in cell_cycle_genes if gene in khaliq_ad_tumor.var_names]
print(len(cell_cycle_genes))

In [None]:
# Create a basic AnnData used only for the cell cycle score calculation.
# X, var and obs are copied so that the in-place normalisation and log1p
# below cannot alter the data held by khaliq_ad_tumor through shared
# underlying objects.
adata_log = an.AnnData(
    X=khaliq_ad_tumor.X.copy(),
    var=khaliq_ad_tumor.var.copy(),
    obs=khaliq_ad_tumor.obs.copy(),
)
# Normalise each cell to a total of 1e6, excluding highly expressed genes
# from the size-factor computation, then log-transform.
sc.pp.normalize_total(adata_log, target_sum=1e6, exclude_highly_expressed=True)
sc.pp.log1p(adata_log)

In [None]:
# Score every cell for the S-phase and G2M-phase gene lists.
sc.tl.score_genes_cell_cycle(
    adata_log,
    s_genes=s_genes,
    g2m_genes=g2m_genes,
)
# Violin plots of both scores, grouped by sample.
sc.pl.violin(
    adata_log,
    ['S_score', 'G2M_score'],
    jitter=0.4,
    groupby='samples',
    rotation=45,
)

In [None]:
# Copy both cell cycle scores from the scoring object back onto the tumor object.
for score_key in ('S_score', 'G2M_score'):
    khaliq_ad_tumor.obs[score_key] = adata_log.obs[score_key]
# Show the summary of the annotated tumor object.
khaliq_ad_tumor