## Notebook for the Wang, 2020 QC analysis
**Developed by:** Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**  
**4th July 2023**  

#### Load required packages

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
import scrublet 

#### Setup Cells

In [None]:
%matplotlib inline

In [None]:
# Configure scanpy logging and the default figure style used by the plots in this notebook.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()  # log the versions of the loaded packages for reproducibility
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is a whole number.

    Raw count matrices contain only integer-valued entries, so this is used
    as a quick check that ``X`` has not been normalised or log-transformed.

    The individual entries are inspected rather than the per-gene column
    sums: fractional values can add up to a whole number (0.5 + 0.5), which
    would make a sum-based check report a normalised matrix as raw.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``adata.X`` (dense array or
        scipy sparse matrix).

    Returns
    -------
    bool
        True if all entries are integer-valued (an empty matrix counts as
        raw), False otherwise. NaN and infinite entries yield False.
    """
    # Local import: scipy is not imported at the top of this notebook.
    from scipy.sparse import issparse

    X = adata.X
    if issparse(X):
        # Implicit zeros are integers by definition, so only the explicitly
        # stored entries need to be examined.
        values = X.data if X.format in ("csr", "csc", "coo") else X.tocoo().data
    else:
        values = np.asarray(X)
    return bool(np.all(np.mod(values, 1) == 0))

#### Upload Data

In [None]:
# Path of the raw AnnData file. Named `input_path` so that the builtin
# `input()` is not shadowed for the rest of the notebook session.
input_path = '/Users/anna.maguza/Desktop/Data/Gut_project/Wang/Wang_anndata_raw/Wang_2022_raw_anndata.h5ad'
adata = sc.read_h5ad(input_path)
# Show whether the loaded matrix holds raw (integer) counts.
X_is_raw(adata)

In [None]:
# Rename the 'CellType' annotation column to 'Cell_Type' in place; later cells read adata.obs['Cell_Type'].
adata.obs.rename(columns = {'CellType': 'Cell_Type'}, inplace = True)

### Generate QC values

In [None]:
# Compute the basic per-cell / per-gene QC metrics (no gene subsets yet) and
# store them on adata; later cells read adata.obs['total_counts'] and
# adata.obs['n_genes_by_counts'].
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=(),
    expr_type='counts',
    var_type='genes',
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Flag ribosomal genes (names starting with "RPS" or "RPL") in adata.var['ribo']
# and recompute the QC metrics so the per-cell percentage of ribosomal counts
# becomes available (read later as adata.obs['pct_counts_ribo']).
adata.var['ribo'] = adata.var_names.str.startswith(("RPS", "RPL"))
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['ribo'],
    inplace=True,
    log1p=False,
    percent_top=None,
)

In [None]:
# Flag mitochondrial genes (names starting with "MT-") in adata.var['mito']
# and recompute the QC metrics so the per-cell percentage of mitochondrial
# counts becomes available (read later as adata.obs['pct_counts_mito']).
adata.var['mito'] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mito'],
    inplace=True,
    log1p=False,
    percent_top=None,
)

In [None]:
# Display the per-cell annotation table to inspect the QC columns computed above.
adata.obs

### Fill the table

In [None]:
# Number of distinct donors (a missing Donor_ID, if present, counts as one value)
adata.obs['Donor_ID'].nunique(dropna=False)

In [None]:
# Number of distinct samples (a missing Sample_ID, if present, counts as one value)
adata.obs['Sample_ID'].nunique(dropna=False)

In [None]:
# Number of cells, i.e. the observation dimension of the AnnData object
adata.shape[0]

In [None]:
# Total counts summed over all cells.
# Uses the vectorised Series.sum() instead of the builtin sum(), which walks
# the column element by element in Python. The column is cast to float64 so
# the accumulation keeps double precision, and skipna=False keeps a NaN
# visible in the result instead of silently dropping it.
adata.obs['total_counts'].astype('float64').sum(skipna=False)

In [None]:
# Mean number of cells per sample.
# observed=True restricts the groups to samples that actually occur: if
# Sample_ID is categorical, unused categories would otherwise contribute
# zero-sized groups (depending on the pandas version) and dilute the mean.
adata.obs.groupby('Sample_ID', observed=True).size().mean()

In [None]:
# Mean total counts per cell.
# Vectorised Series.mean() replaces the Python-level sum(...) / len(...);
# float64 accumulation and skipna=False keep the result equivalent
# (a NaN in the column still yields NaN).
adata.obs['total_counts'].astype('float64').mean(skipna=False)

In [None]:
# Mean number of detected genes per cell.
# Vectorised Series.mean() replaces the Python-level sum(...) / len(...);
# float64 accumulation and skipna=False keep the result equivalent.
adata.obs['n_genes_by_counts'].astype('float64').mean(skipna=False)

In [None]:
# Mean percentage of mitochondrial counts per cell.
# Vectorised Series.mean() replaces the Python-level sum(...) / len(...).
# skipna=False is deliberate: a NaN percentage still surfaces as NaN, as in
# the sum-based version, instead of being dropped silently.
adata.obs['pct_counts_mito'].astype('float64').mean(skipna=False)

In [None]:
# Mean percentage of ribosomal counts per cell.
# Vectorised Series.mean() replaces the Python-level sum(...) / len(...).
# skipna=False is deliberate: a NaN percentage still surfaces as NaN, as in
# the sum-based version, instead of being dropped silently.
adata.obs['pct_counts_ribo'].astype('float64').mean(skipna=False)

In [None]:
# Violin plots of the four per-cell QC metrics before filtering, one panel per metric.
sc.set_figure_params(dpi=300)
sc.pl.violin(
    adata,
    keys=['n_genes_by_counts', 'total_counts', 'pct_counts_mito', 'pct_counts_ribo'],
    multi_panel=True,
    jitter=0.4,
)

In [None]:
# Keep only cells that pass every QC threshold:
#   200 < detected genes < 5000  and  total counts < 50000.
# The conditions are combined into one boolean mask so the object is sliced
# once, and the result is materialised with .copy(): slicing an AnnData
# returns a view, and later cells add new .obs columns, which on a view
# triggers an implicit copy with a warning.
passes_qc = (
    (adata.obs['n_genes_by_counts'] > 200)
    & (adata.obs['n_genes_by_counts'] < 5000)
    & (adata.obs['total_counts'] < 50000)
).to_numpy()
adata = adata[passes_qc, :].copy()

# Re-plot the QC metrics to inspect the distributions after filtering.
sc.set_figure_params(dpi=300)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mito', 'pct_counts_ribo'],
             jitter=0.4, multi_panel=True)

In [None]:
# Annotate donor sex: donors listed in `female_donors` are labelled 'Female',
# every other donor 'Male'.
female_donors = ['Wang_Donor_2']

adata.obs['Sex'] = np.where(adata.obs['Donor_ID'].isin(female_donors), 'Female', 'Male')

In [None]:
# Display the number of cells per annotated cell type.
adata.obs['Cell_Type'].value_counts()

### Identify doublets

In [None]:
# Initialise Scrublet on the current expression matrix.
# NOTE(review): all samples are pooled into a single Scrublet run here; the
# Scrublet documentation advises running each sample separately — confirm
# that pooling is intended.
scrub = scrublet.Scrublet(adata.X)

In [None]:
# Run Scrublet, store its two outputs as per-cell annotations, then plot the
# histogram of doublet scores.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
adata.obs['doublet_scores'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()

In [None]:
# Number of cells flagged as doublets
adata.obs['predicted_doublets'].sum()

In [None]:
# Store the doublet prediction as strings in a separate column.
# NOTE(review): the values are only converted with astype(str) ('True' / 'False');
# they are not relabelled to 'singlet' / 'doublet'.
adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)

In [None]:
# Save the filtered object, including the QC metrics and doublet annotations added above, to disk.
adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/Wang_with_QC_raw.h5ad')