## Notebook for QC of the Smillie et al., 2019 dataset
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 16th June 2023

#### Load required packages

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
import scrublet 

#### Setup Cells

In [None]:
%matplotlib inline

In [None]:
# Use the most verbose of the listed logging levels so scanpy also prints hints.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# NOTE(review): presumably prints the versions of the main packages for reproducibility - confirm.
sc.logging.print_header()
# Notebook-wide figure defaults: 80 dpi on a white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
def X_is_raw(adata):
    """Return True when ``adata.X`` holds only integer-valued entries (raw counts).

    Every stored value is inspected individually. Comparing per-gene sums
    instead would wrongly accept normalised data whose fractional values
    happen to add up to whole numbers within a gene.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X``: a NumPy array, or a
        sparse matrix that keeps its explicit values in a ``.data`` array.

    Returns
    -------
    bool
        True if all values are whole numbers (an empty matrix counts as raw);
        False otherwise, including when NaN or infinite values are present.
    """
    matrix = adata.X
    if isinstance(matrix, np.ndarray):
        values = matrix
    elif isinstance(getattr(matrix, "data", None), np.ndarray):
        # Sparse formats keep their explicit entries in ``.data``; the implicit
        # zeros are integers already, so only stored entries need checking.
        values = matrix.data
    else:
        values = np.asarray(matrix)

    # Boolean and integer dtypes cannot hold fractional values.
    if values.dtype.kind in "biu":
        return True

    # ``x % 1`` is 0 for whole numbers and NaN for NaN/inf, so those fail the test.
    with np.errstate(invalid="ignore"):
        return bool(np.all(np.mod(values, 1) == 0))

#### Upload Data

In [None]:
# Location of the raw Smillie ulcerative colitis AnnData file.
# Named `input_path` so the builtin `input()` is not shadowed for the rest of the notebook.
input_path = '/Users/anna.maguza/Desktop/Data/Gut_project/Smillie/Anndata_raw/Smillie_ulcerative_colitis_anndata.h5ad'
adata = sc.read_h5ad(input_path)

In [None]:
# Check whether the loaded matrix passes the raw-count test defined in X_is_raw.
X_is_raw(adata)

In [None]:
# Display the per-cell annotation table.
adata.obs

In [None]:
# Number of cells per value of the `Diagnosis` column.
adata.obs['Diagnosis'].value_counts()

In [None]:
# Keep the original fine-grained labels in `Cell_States`; `CellType` is coarsened in the next cell.
adata.obs['Cell_States'] = adata.obs['CellType']

In [None]:
# Collapse the fine-grained annotations in `CellType` into broad lineage labels.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `adata.obs['CellType']`: that chained in-place
# form acts on a temporary Series, which pandas warns about and which leaves
# `adata.obs` unchanged once copy-on-write is active.
cell_type_to_lineage = {
    'Enterocytes': 'Epithelial',
    'Best4+ Enterocytes': 'Epithelial',
    'Enterocyte Progenitors': 'Epithelial',
    'Immature Enterocytes 1': 'Epithelial',
    'Immature Enterocytes 2': 'Epithelial',
    'Immature Goblet': 'Epithelial',
    'M cells': 'Epithelial',
    'Secretory TA': 'Epithelial',
    'TA 1': 'Epithelial',
    'TA 2': 'Epithelial',
    'Cycling TA': 'Epithelial',
    # NOTE(review): the WNT2B+/WNT5B+/RSPO3+ clusters look like fibroblast subsets
    # rather than epithelial cells - confirm the target label.
    'WNT2B+ Fos-lo 1': 'Epithelial',
    'WNT2B+ Fos-hi': 'Epithelial',
    'WNT5B+ 2': 'Epithelial',
    'RSPO3+': 'Epithelial',
    'Enteroendocrine': 'Epithelial',
    'Tuft': 'Epithelial',
    'Goblet': 'Epithelial',
    'Stem': 'Epithelial',
    'WNT2B+ Fos-lo 2': 'Epithelial',
    'WNT5B+ 1': 'Epithelial',

    'Myofibroblasts': 'Mesenchymal',
    'Pericytes': 'Mesenchymal',
    'Inflammatory Fibroblasts': 'Mesenchymal',

    'Cycling B': 'B cells',
    'GC': 'B cells',
    'Follicular': 'B cells',

    'CD4+ Activated Fos-hi': 'T cells',
    'CD4+ Activated Fos-lo': 'T cells',
    'CD4+ Memory': 'T cells',
    'CD4+ PD1+': 'T cells',
    'CD8+ IELs': 'T cells',
    'CD8+ IL17+': 'T cells',
    'CD8+ LP': 'T cells',
    'Tregs': 'T cells',
    'Cycling T': 'T cells',

    'Plasma': 'Plasma cells',

    'Post-capillary Venules': 'Endothelial',
    'Microvascular': 'Endothelial',

    'Glia': 'Neuronal',

    # NOTE(review): NKs are usually considered lymphoid - confirm 'Myeloid' is intended.
    'NKs': 'Myeloid',
    'Inflammatory Monocytes': 'Myeloid',
    'Cycling Monocytes': 'Myeloid',
    'Macrophages': 'Myeloid',
    'DC1': 'Myeloid',
    'CD69- Mast': 'Myeloid',
    'CD69+ Mast': 'Myeloid',
    'DC2': 'Myeloid',

    # NOTE(review): neither 'MT-hi' nor 'ILCs' reads as a red blood cell label - confirm.
    'MT-hi': 'Red blood cells',
    'ILCs': 'Red blood cells',
}

fine_cell_types = adata.obs['CellType']
cell_type_is_categorical = isinstance(fine_cell_types.dtype, pd.CategoricalDtype)
# Replace on plain Python objects so several fine labels can share one coarse
# label; labels that are not in the mapping are kept unchanged.
coarse_cell_types = fine_cell_types.astype(object).replace(cell_type_to_lineage)
if cell_type_is_categorical:
    # Restore the categorical dtype the column had before the replacement.
    coarse_cell_types = coarse_cell_types.astype('category')
adata.obs['CellType'] = coarse_cell_types


### Generate QC values

In [None]:
# Flag ribosomal protein genes (symbols starting with RPS or RPL) in `var`,
# then let scanpy compute the QC metrics for that gene set in place.
adata.var['ribo'] = adata.var_names.str.startswith(("RPS", "RPL"))
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['ribo'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Flag mitochondrial genes (symbols starting with "MT-") in `var`,
# then let scanpy compute the QC metrics for that gene set in place.
adata.var['mito'] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

### Fill the table

In [None]:
# Number of distinct donors (a missing ID, if present, counts as one value)
adata.obs['Donor_ID'].nunique(dropna=False)

In [None]:
# Number of distinct samples (a missing ID, if present, counts as one value)
adata.obs['Sample_ID'].nunique(dropna=False)

In [None]:
# Number of cells, i.e. the first dimension of the AnnData object
adata.shape[0]

In [None]:
# Store the existing per-cell `n_counts` and `n_genes` columns as integers.
adata.obs['n_counts'] = adata.obs['n_counts'].astype(int)
adata.obs['n_genes'] = adata.obs['n_genes'].astype(int)

# Total counts over all cells. Summed by pandas in one vectorised call rather
# than element by element in Python; `int()` keeps the displayed value a plain int.
int(adata.obs['n_counts'].sum())

In [None]:
# Mean cells per sample.
# `observed=True` restricts the groups to Sample_ID values that actually occur:
# for a categorical column, grouping can otherwise also emit unused categories
# with size 0, which would pull the mean down.
adata.obs.groupby('Sample_ID', observed=True).size().mean()

In [None]:
# Mean total counts per cell, computed with the vectorised pandas mean
# instead of a Python-level sum over every cell.
float(adata.obs['total_counts'].mean())

In [None]:
# Mean number of detected genes per cell, computed with the vectorised pandas
# mean instead of a Python-level sum over every cell.
float(adata.obs['n_genes_by_counts'].mean())

In [None]:
# Mean percentage of mitochondrial counts per cell.
# The pandas mean skips NaN entries by default, so a cell with an undefined
# percentage does not turn the whole summary into NaN.
float(adata.obs['pct_counts_mito'].mean())

In [None]:
# Mean percentage of ribosomal counts per cell.
# The pandas mean skips NaN entries by default, so a cell with an undefined
# percentage does not turn the whole summary into NaN.
float(adata.obs['pct_counts_ribo'].mean())

In [None]:
# Harmonise the labels in `adata.obs['Location']` to one shared vocabulary.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `adata.obs['Location']`: that chained in-place
# form acts on a temporary Series, which pandas warns about and which leaves
# `adata.obs` unchanged once copy-on-write is active.
location_labels = {
    'SmallInt': 'Small Intestine',
    'Small Bowel': 'Small Intestine',
    'LargeInt': 'Large Intestine',
    'Colon': 'Large Intestine',
    'REC': 'Rectum',
    'Epi': 'Epithelium',
    'LP': 'Lamina Propria',
}

original_locations = adata.obs['Location']
location_is_categorical = isinstance(original_locations.dtype, pd.CategoricalDtype)
# Replace on plain Python objects so several source labels can share one target
# label; labels that are not in the mapping are kept unchanged.
harmonised_locations = original_locations.astype(object).replace(location_labels)
if location_is_categorical:
    # Restore the categorical dtype the column had before the replacement.
    harmonised_locations = harmonised_locations.astype('category')
adata.obs['Location'] = harmonised_locations

In [None]:
# Number of cells per value of the `Location` column.
adata.obs['Location'].value_counts()

In [None]:
# Violin plots of the four QC metrics before filtering, rendered at 300 dpi.
sc.set_figure_params(dpi=300)
sc.pl.violin(
    adata,
    ['n_genes', 'n_counts', 'pct_counts_mito', 'pct_counts_ribo'],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Keep cells with more than 200 and fewer than 5000 genes and fewer than 50000 counts.
# The three thresholds are combined into one mask and the subset is copied, so
# `adata` is a standalone object rather than a view stacked on further views;
# later cells add columns to `adata.obs`, which should not happen on a view.
keep_cell = (
    (adata.obs['n_genes'] > 200)
    & (adata.obs['n_genes'] < 5000)
    & (adata.obs['n_counts'] < 50000)
)
adata = adata[keep_cell, :].copy()

# Same QC violins as before, now on the filtered cells.
sc.set_figure_params(dpi=300)
sc.pl.violin(adata, ['n_genes', 'n_counts', 'pct_counts_mito', 'pct_counts_ribo'],
             jitter=0.4, multi_panel=True)

### Identify doublets

In [None]:
# Build a Scrublet object on the expression matrix of the filtered cells.
# NOTE(review): all samples are passed in together; doublet detection is often
# run per sample - confirm this is intended.
scrub = scrublet.Scrublet(adata.X)

In [None]:
# Run Scrublet and store its two outputs as per-cell columns, in the order
# returned: the scores first, then the predicted doublet calls.
doublet_scores, predicted_doublets = scrub.scrub_doublets()
adata.obs['doublet_scores'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets
# Plot Scrublet's histogram.
scrub.plot_histogram()

In [None]:
# Number of cells predicted to be doublets. Summed by pandas in one vectorised
# call rather than element by element in Python; `int()` keeps the displayed
# value a plain int.
int(adata.obs['predicted_doublets'].sum())

In [None]:
# Add a string version of the doublet call: `astype(str)` stores the text of each
# value (e.g. 'True'/'False' for booleans), it does not relabel them as singlet/doublet.
# NOTE(review): presumably kept as text so it can be used as a grouping label - confirm.
adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)

In [None]:
# Save the filtered, doublet-annotated object as an .h5ad file.
adata.write_h5ad(
    '/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/Smillie_with_QC_raw.h5ad'
)