In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scienceplots
from sklearn.neighbors import NearestNeighbors
from typing import Iterable

# Specific modules
import scanpy as sc
import scanpy.external as sce
import anndata as ad
# from harmonypy import lisi

# Setting some parameters
# Blanket filter: silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Put the project's `bin` directory (resolved from the project root by `here`) on the
# import path so the custom helper module below can be imported.
sys.path.insert(1, str(here('bin')))

# Import custom functions
# NOTE: this import relies on the sys.path entry added just above; keep the order.
from customPythonFunctions import cumulative_explained_variance, balanced_sample, generateID2SymbolDF

print("Main directory path: {}".format(here()))

# Optional plotting style, currently disabled.
#plt.style.use(['science','nature','no-latex'])
# Resolution (dots per inch) used when figures are written to disk.
dpi_fig_save = 300
sc.set_figure_params(dpi=100, dpi_save=dpi_fig_save, vector_friendly=True)

# When True, the plotting cells below write their figures to disk.
overwriteFigures = True
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
overwriteData = True

Main directory path: /scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas-R1


**Loading the restored object**

In [None]:
# Location of the AnnData object to load, resolved relative to the project root.
adata_path = here(
    "01_data_processing/results/05_INFLAMMATION_main_HVGsubset_scVI_UMAP_clinical_allGenes.h5ad"
)
adata = sc.read_h5ad(adata_path)
adata  # last expression: show the object summary as the cell output

# Inspecting quality control metrics

After running the PCA and UMAP dimensionality reductions, it is important to assess some quality control metrics to identify potential batch effects, as well as low-quality cells that were not removed in the first, permissive quality control step.

However, when working with such a large number of projects, libraries, patients, technologies, sequencing facilities and technicians, there will be a clear batch effect in the data. The idea is to identify the main source of the batch effect, so that we can correct it by integrating the data with the gold-standard methods of the field.

For this reason, we will need to load and merge the **patient metadata** that we stored in an external file, with the current `adata` object.

## Before integration

In [None]:
### To reduce the computational time, we plot only a 10% subsample of the cells

In [None]:
# Draw a reproducible subsample of the cell metadata (fraction 0.1, fixed seed).
# NOTE(review): presumably `balanced_sample` samples within each `sampleID` group —
# confirm against customPythonFunctions.balanced_sample.
sampled_obs = balanced_sample(
    adata.obs,
    cols=['sampleID'],
    n=None,
    frac=0.1,
    shuffle=True,
    random_state=42,
)
# Keep only the identifiers of the sampled cells; they are used to subset `adata`.
SScells = sampled_obs['cellID']

In [None]:
# Subset to the sampled cells and take an independent copy, so the full object
# can be deleted afterwards without affecting the subsample.
adataSS = adata[SScells].copy()

In [None]:
# Drop the reference to the full object; only the subsample `adataSS` is used from here on.
del adata

### Technical confounding factors

In [None]:
# UMAP of the unintegrated embedding, coloured by technical covariates and QC metrics.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap',
    color=[
        'studyID', 'chemistry', 'technology',
        'total_counts', 'n_genes_by_counts',
        'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'pct_counts_plt',
        'doublet_score',
    ],
    wspace=0.4,
    ncols=4,
    return_fig=True,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_technicalFactors_unintegrated.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

### Clinical covariates

In [None]:
# UMAP of the unintegrated embedding, coloured by clinical covariates.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap',
    color=[
        'sex', 'binned_age', 'disease',
        "ethnicity", "smokingStatus", "therapyResponse",
    ],
    wspace=0.4,
    ncols=3,
    return_fig=True,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_clinicalFactors_unintegrated.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

### Cell cycling score

In [None]:
# UMAP of the unintegrated embedding, coloured by cell-cycle scores and assigned phase.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap',
    color=['S_score', 'G2M_score', 'phase'],
    ncols=3,
    wspace=0.4,
    return_fig=True,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_cellcyclingScore_unintegrated.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

### Expression of main gene markers

In [None]:
# Canonical marker genes (gene symbols), grouped by the population they identify.
geneList = [
    'PTPRC',  # immune cells
    'CD3D', 'CD4', 'FOXP3', 'SELL', 'IRF7', 'KLRG1', 'CD8A', 'NKG7',  # T cells
    'KLRD1',  # NK cells
    'CD79A', 'MS4A1', 'JCHAIN',  # B + Plasma cells
    'LYZ', 'CD14', 'FCGR3A', 'PLAC8',  # Monocytes + (Macro)
    'CLEC9A', 'FTL',
    'CLEC10A', 'GZMB',  # DCs
    'S100A9',  # Neutros?
    'CD34',  # HSCs
    'PPBP',  # Platelets
]
# Table with `gene_id` and `symbol` columns for the requested symbols.
# NOTE(review): presumably `gene_id` holds the identifiers used as `adataSS.var` names —
# confirm against customPythonFunctions.generateID2SymbolDF.
ID2symDF = generateID2SymbolDF(varDF=adataSS.var, symbolList=geneList, behaviour='all')

# Colour by gene ID, but title each panel with the human-readable symbol.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap',
    color=ID2symDF.gene_id.tolist(),
    title=ID2symDF.symbol.tolist(),
    wspace=0.4,
    sort_order=True,
    ncols=4,
    return_fig=True,
    use_raw=False,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_immuneMarkergenes_unintegrated.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

As anticipated, cells do not cluster based on their transcriptomic profile, because of the batch effect associated with their library of origin / patient (dataset source). For this reason, the datasets are integrated taking into account the main variable carrying the batch effect.

## After integration

After performing data integration with scVI, it is important to evaluate the integration step to ensure that the batch effects have been removed and the biological variance has been preserved. Here, we visually inspect the same quality control metrics again.

### Technical confounding factors

In [None]:
# UMAP of the scVI-integrated embedding, coloured by technical covariates and QC metrics.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap_scVI',
    color=[
        'studyID', 'chemistry', 'technology',
        'total_counts', 'n_genes_by_counts',
        'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'pct_counts_plt',
        'doublet_score',
    ],
    wspace=0.4,
    ncols=4,
    return_fig=True,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_technicalFactors_scVI.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

### Clinical covariates

In [None]:
# UMAP of the scVI-integrated embedding, coloured by clinical covariates.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap_scVI',
    color=[
        'sex', 'binned_age', 'disease',
        "ethnicity", "smokingStatus", "therapyResponse",
    ],
    wspace=0.4,
    ncols=3,
    return_fig=True,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_clinicalFactors_scVI.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

### Cell cycling score

In [None]:
# UMAP of the scVI-integrated embedding, coloured by cell-cycle scores and assigned phase.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap_scVI',
    color=['S_score', 'G2M_score', 'phase'],
    ncols=3,
    wspace=0.4,
    return_fig=True,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_cellcyclingScore_scVI.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

### Expression of main gene markers

In [None]:
# Canonical marker genes (gene symbols), grouped by the population they identify.
geneList = [
    'PTPRC',  # immune cells
    'CD3D', 'CD4', 'FOXP3', 'SELL', 'IRF7', 'KLRG1', 'CD8A', 'NKG7',  # T cells
    'KLRD1',  # NK cells
    'CD79A', 'MS4A1', 'JCHAIN',  # B + Plasma cells
    'LYZ', 'CD14', 'FCGR3A', 'PLAC8',  # Monocytes + (Macro)
    'CLEC9A', 'FTL',
    'CLEC10A', 'GZMB',  # DCs
    'S100A9',  # Neutros?
    'CD34',  # HSCs
    'PPBP',  # Platelets
]
# Table with `gene_id` and `symbol` columns for the requested symbols.
# NOTE(review): presumably `gene_id` holds the identifiers used as `adataSS.var` names —
# confirm against customPythonFunctions.generateID2SymbolDF.
ID2symDF = generateID2SymbolDF(varDF=adataSS.var, symbolList=geneList, behaviour='all')

# Colour by gene ID, but title each panel with the human-readable symbol.
fig = sc.pl.embedding(
    adataSS,
    basis='X_umap_scVI',
    color=ID2symDF.gene_id.tolist(),
    title=ID2symDF.symbol.tolist(),
    wspace=0.4,
    sort_order=True,
    ncols=4,
    return_fig=True,
    use_raw=False,
)

if overwriteFigures:
    # Save through the returned figure handle rather than pyplot's implicit
    # "current figure", so the file always contains exactly the figure built above.
    fig.savefig(
        here('01_data_processing/figures/06_UMAP_immuneMarkergenes_scVI.pdf'),
        bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save,
    )

Now, after data integration, cells cluster based on their transcriptomic profile rather than by their library of origin / patient (dataset source). Therefore, data integration worked successfully and allowed us to remove the batch effects associated with unwanted sources of variance. At this point, we can move on to the clustering and cell annotation steps.

In [None]:
# Report the session's package versions and environment details for reproducibility.
session_info.show()