## Notebook for Gut Cell Atlas Quality Check 

**Prepared by:** Anna Maguza  
**Date:** 4th of July 2023  

#### Load required packages

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
import scrublet as scr

#### Set up the notebook environment

In [None]:
# IPython magic: render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [None]:
# Scanpy logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
# Print the header with package version information for the record.
sc.logging.print_header()
# Default figure appearance used by the plots in this notebook.
sc.settings.set_figure_params(facecolor='white', dpi=80)

In [None]:
def X_is_raw(adata):
    """Heuristically check whether ``adata.X`` holds raw (integer) counts.

    The check sums ``adata.X`` along axis 0 (one total per column) and tests
    whether every total is unchanged by truncation to ``int``. This is a cheap
    heuristic, not a proof: non-integer entries whose column total happens to
    be a whole number are not detected.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``adata.X``; ``X`` must
        support ``.sum(axis=0)`` returning something with ``.astype``.

    Returns
    -------
    bool
        ``True`` if all column sums are whole numbers, ``False`` otherwise.
    """
    # Reduce the (potentially very large) matrix once and reuse the result,
    # rather than summing it separately for each side of the comparison.
    column_totals = adata.X.sum(axis=0)
    return np.array_equal(column_totals.astype(int), column_totals)

#### Upload Data

In [None]:
# Load the raw Gut Cell Atlas AnnData object from disk.
raw_h5ad_path = '/Users/anna.maguza/Desktop/GCA_social_network/data/raw_anndata/GCA/GCA_raw_anndata.h5ad'
GCA_adata = sc.read(raw_h5ad_path)
# Sanity check shown as the cell output: True when the column sums of X are whole numbers.
X_is_raw(GCA_adata)

#### Modify obs

In [None]:
# Tag every cell with the study label for this dataset (same value for all rows of obs).
GCA_adata.obs['Study_name'] = 'Elmentaite, 2021'

In [None]:
# Exclude cells diagnosed as 'Pediatric Crohn Disease' and cells sampled from lymph nodes.
excluded_cells = (
    GCA_adata.obs['Diagnosis'].isin(['Pediatric Crohn Disease'])
    | GCA_adata.obs['Region'].isin(['lymph node'])
)
# Subsetting an AnnData returns a view. The following cells modify .var and .obs
# in place, so materialise the subset explicitly instead of relying on anndata's
# implicit copy-on-modify (which emits a warning).
GCA_adata = GCA_adata[~excluded_cells, :].copy()

In [None]:
# Add the percentage of ribosomal counts per cell.
# Genes whose names start with one of these prefixes are flagged in var['ribo'].
ribo_prefixes = ("RPS", "RPL")
GCA_adata.var['ribo'] = GCA_adata.var_names.str.startswith(ribo_prefixes)
# Compute QC metrics in place; cells below read total_counts, n_genes_by_counts
# and pct_counts_ribo from obs.
sc.pp.calculate_qc_metrics(
    GCA_adata,
    qc_vars=['ribo'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Harmonise obs column names.
GCA_adata.obs.rename(
    columns={'10X': 'Library_Preparation_Protocol', 'Gender': 'Sex'},
    inplace=True,
)

# Harmonise value labels. Replacements are applied one at a time, in this order.
label_updates = (
    ('Diagnosis', 'fetal', 'Fetal Healthy'),
    ('Diagnosis', 'Healthy', 'Healthy adult'),
    ('Sex', 'M', 'Male'),
    ('Sex', 'F', 'Female'),
)
for column, old_label, new_label in label_updates:
    GCA_adata.obs[column] = GCA_adata.obs[column].replace(old_label, new_label)

In [None]:
# Rename the remaining obs columns to the harmonised names in one call.
# Note the two source columns that differ only by the capital 'S':
# 'Sample name' becomes Donor_ID, 'sample name' becomes Sample_ID.
harmonised_columns = {
    'Sample name': 'Donor_ID',
    'sample name': 'Sample_ID',
    'Integrated_05': 'Cell_State',
    'Region': 'Location',
    'category': 'Cell_Type',
}
GCA_adata.obs.rename(columns=harmonised_columns, inplace=True)

In [None]:
# Give every cell a Sample_ID: keep the existing one where present, otherwise
# build "<Donor_ID>_<Age>_<Region code>_<Fraction>".
#
# The component columns are cast to plain strings first (and stay strings
# afterwards) so they can be concatenated whatever dtype they were stored with.
id_parts = ['Donor_ID', 'Age', 'Region code', 'Fraction']
for part in id_parts:
    GCA_adata.obs[part] = GCA_adata.obs[part].astype('str')

fallback_id = (
    GCA_adata.obs['Donor_ID'] + '_' + GCA_adata.obs['Age'] + '_' +
    GCA_adata.obs['Region code'] + '_' + GCA_adata.obs['Fraction']
)

# A Sample_ID counts as missing when it is a real NaN or the literal string
# "nan" (any letter case). Both cases are handled in one vectorised pass, which
# replaces the row-wise apply (slow, and it raised on real NaN values because
# floats have no .lower()) and the redundant second fill that followed it.
# Casting to object lets the column accept IDs that are not existing categories.
sample_id = GCA_adata.obs['Sample_ID'].astype(object)
is_missing = sample_id.isna() | (sample_id.astype('str').str.lower() == 'nan')
GCA_adata.obs['Sample_ID'] = sample_id.where(~is_missing, fallback_id)

In [None]:
# Map the labels found in obs['Location'] onto the harmonised names.
location_labels = {
    'SmallInt': 'Small Intestine',
    'Small Bowel': 'Small Intestine',
    'LargeInt': 'Large Intestine',
    'Colon': 'Large Intestine',
    'REC': 'Rectum',
    'Epi': 'Epithelium',
    'LP': 'Lamina Propria',
}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# obs['Location'] is chained assignment: it acts on an intermediate object and
# does not reliably update obs (it is a no-op under pandas copy-on-write).
GCA_adata.obs['Location'] = GCA_adata.obs['Location'].replace(location_labels)

In [None]:
# Expand the chemistry labels to full protocol names. Cells whose protocol is
# the literal string "nan" are labelled as 10x 3' v1.
protocol_labels = {
    "3'": "10x 3' v1",
    "5'": "10x 5' v1",
    "nan": "10x 3' v1",
}
# Assign the result back to the column. replace(..., inplace=True) on a column
# selection is chained assignment and does not reliably update obs (it is a
# no-op under pandas copy-on-write).
GCA_adata.obs['Library_Preparation_Protocol'] = (
    GCA_adata.obs['Library_Preparation_Protocol'].replace(protocol_labels)
)

### Fill the table

In [None]:
# Number of distinct donors (a missing value, if any, counts as one entry).
GCA_adata.obs['Donor_ID'].nunique(dropna=False)

In [None]:
# Number of distinct samples (a missing value, if any, counts as one entry).
GCA_adata.obs['Sample_ID'].nunique(dropna=False)

In [None]:
# Number of distinct samples among cells with Diagnosis == 'Healthy adult'.
is_adult = GCA_adata.obs['Diagnosis'].isin(['Healthy adult'])
GCA_adata.obs.loc[is_adult, 'Sample_ID'].nunique(dropna=False)

In [None]:
# Number of distinct samples among cells with Diagnosis == 'Fetal Healthy'.
is_fetal = GCA_adata.obs['Diagnosis'].isin(['Fetal Healthy'])
GCA_adata.obs.loc[is_fetal, 'Sample_ID'].nunique(dropna=False)

In [None]:
# Number of distinct samples among cells with Diagnosis == 'Pediatric healthy'.
is_pediatric = GCA_adata.obs['Diagnosis'].isin(['Pediatric healthy'])
GCA_adata.obs.loc[is_pediatric, 'Sample_ID'].nunique(dropna=False)

In [None]:
# Total counts over all cells.
# Use the vectorised pandas reduction on a float64 copy: Python's built-in sum()
# adds the values one by one in the column's own dtype, which is slow and loses
# precision if the column is stored as float32. NaN values, if any, are skipped.
GCA_adata.obs['total_counts'].astype('float64').sum()

In [None]:
# Mean number of cells per sample: count the cells for each Sample_ID, then average.
cells_per_sample = GCA_adata.obs['Sample_ID'].value_counts()
cells_per_sample.mean()

In [None]:
# Mean total counts per cell.
# Vectorised float64 mean instead of built-in sum()/len(): faster, and it avoids
# accumulating in a narrower dtype such as float32. NaN values, if any, are skipped.
GCA_adata.obs['total_counts'].astype('float64').mean()

In [None]:
# Mean number of detected genes per cell.
# Vectorised float64 mean instead of built-in sum()/len(): faster, and it cannot
# overflow if the column is stored as a narrow integer dtype.
GCA_adata.obs['n_genes_by_counts'].astype('float64').mean()

In [None]:
# Mean percentage of mitochondrial counts per cell.
# NOTE(review): pct_counts_mt is not computed in this notebook - presumably it
# is already present in the loaded file; confirm.
# Vectorised float64 mean instead of built-in sum()/len(): faster, and it avoids
# float32 accumulation error. NaN values, if any, are skipped.
GCA_adata.obs['pct_counts_mt'].astype('float64').mean()

In [None]:
# Mean percentage of ribosomal counts per cell.
# Vectorised float64 mean instead of built-in sum()/len(): faster, and it avoids
# float32 accumulation error. NaN values, if any, are skipped.
GCA_adata.obs['pct_counts_ribo'].astype('float64').mean()

In [None]:
# Number of cells for each value of predicted_doublets (the count for True is the number of predicted doublets).
# NOTE(review): this column is not created in this notebook - presumably it is already present in the loaded file; confirm.
GCA_adata.obs['predicted_doublets'].value_counts()

In [None]:
# Number of cells per Cell_Type label.
GCA_adata.obs['Cell_Type'].value_counts()

In [None]:
# Number of cells per Location label.
GCA_adata.obs['Location'].value_counts()

In [None]:
# Number of cells per library preparation protocol.
GCA_adata.obs['Library_Preparation_Protocol'].value_counts()

#### Create Violin Plots

In [None]:
# Violin plots of the per-cell quality measures:
#   - n_genes_by_counts: number of genes expressed in the count matrix
#   - total_counts: total counts per cell
#   - pct_counts_mt: percentage of counts in mitochondrial genes
#   - pct_counts_ribo: percentage of counts in ribosomal genes
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo']
sc.set_figure_params(dpi=300)
sc.pl.violin(GCA_adata, qc_metrics, jitter=0.4, multi_panel=True)

In [None]:
# Filter cells: keep those with more than 200 and fewer than 5000 detected genes
# and fewer than 50000 total counts.
cell_obs = GCA_adata.obs
keep_cells = (
    (cell_obs.n_genes_by_counts < 5000)
    & (cell_obs.n_genes_by_counts > 200)
    & (cell_obs.total_counts < 50000)
)
GCA_adata = GCA_adata[keep_cells, :]

# Re-draw the QC violin plots on the filtered cells.
sc.set_figure_params(dpi=300)
sc.pl.violin(
    GCA_adata,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Save the filtered object as an .h5ad file.
filtered_h5ad_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/GCA_filtered_raw.h5ad'
GCA_adata.write(filtered_h5ad_path)