# 2.0-QC CellRanger mapping

**Inés Sentís**

Date of execution 

In [None]:
# Show the current date so the rendered notebook records when it was run.
Sys.Date()

## Introduction


The objective of this notebook is to perform a basic quality control of the mapping performed with [CellRanger](https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/using/multi)

This QC report is originally from [@PaulaNietoG](https://github.com/PaulaNietoG/CSF/commits?author=PaulaNietoG)

## Libraries

In [None]:
suppressMessages(suppressWarnings({
library(tidyverse)
library(gt)
library(ggpubr)
library(ggrepel)
library(here)
library(glue)
library(magrittr)
library(RColorBrewer)}))

## Parameters

In [None]:
# Load the path definitions of the subproject under analysis. The other
# subprojects are kept commented out so they can be switched in by hand.
source(here::here("SCGRES_124_125/sc_analysis/misc/paths.R"))
# source(here::here("SCGRES_119_120/sc_analysis/misc/paths.R"))
# source(here::here("SCGRES_105_106/sc_analysis/misc/paths.R"))
# source(here::here("SCGRES_99_100/sc_analysis/misc/paths.R"))
# source(here::here("SCGRES_83_84/sc_analysis/misc/paths.R"))

# Additional project script.
source(here::here("utils/bin.R"))

# Create the two QC output folders if they do not exist yet.
# NOTE(review): `qc`, `plt_dir` and `robj_dir` are presumably defined by the
# scripts sourced above -- confirm.
dir.create(
  path = here::here(glue::glue("{qc}/{plt_dir}")),
  showWarnings = FALSE,
  recursive = TRUE
)

dir.create(
  path = here::here(glue::glue("{qc}/{robj_dir}")),
  showWarnings = FALSE,
  recursive = TRUE
)

## Functions

In [None]:
# Convert a gt table into HTML tags so that Jupyter renders it inline
# (workaround taken from
# https://stackoverflow.com/questions/75033756/gt-table-in-jupyter).
#
# gt_table: a gt table object, e.g. the result of `gt()`.
# Returns whatever gt's internal `as.tags.gt_tbl()` method produces for it.
gts <- function(gt_table) gt:::as.tags.gt_tbl(gt_table)

## Load

In [None]:
# Subproject identifier stamped on every metrics row loaded below.
subproject <- "SCGRES_124_125"

In [None]:
# Libraries metadata table stored next to the cellranger output.
metadata <- glue("{cellranger}/metadata.csv") %>%
  here() %>%
  read.csv()

In [None]:
# Read the per-sample cellranger metrics of every library folder found under
# `{cellranger}/jobs` and stack them into a single long data frame.
#
# Folders whose name contains "test" are skipped before any file is read.
# Each table is tagged with the subproject and with the library folder name
# (stored in the `GemID` column). The name of every library read is printed.
cellranger_metrics <- list.dirs(
  path = here(glue("{cellranger}/jobs")),
  full.names = FALSE,
  recursive = FALSE
) %>%
  purrr::discard(function(lib) grepl("test", lib)) %>%
  purrr::map(function(lib) {
    print(lib)
    # `dec` takes exactly one character, so pass "." explicitly instead of a
    # two-element vector.
    cr <- read.csv(
      here(glue("{cellranger}/jobs/{lib}/{lib}/outs/per_sample_outs/{lib}/metrics_summary.csv")),
      dec = ".",
      sep = ","
    )
    cr$Subproject <- subproject
    cr$GemID <- lib
    cr
  }) %>%
  # merge all datasets
  bind_rows()

In [None]:
# Strip "%" signs and "," characters from the metric values, then convert
# the column to numeric.
cellranger_metrics[["Metric.Value"]] <- cellranger_metrics[["Metric.Value"]] %>%
  str_remove_all("%|,") %>%
  as.numeric()

In [None]:
# Preview the first rows of the combined metrics table.
head(cellranger_metrics)

### GEx

In [None]:
# Wide table of the "Gene Expression" metrics: one row per
# Subproject/GemID pair and one column per metric name. Exact duplicate rows
# are dropped before reshaping.
cellranger_metrics_gex_df <- pivot_wider(
  unique(
    cellranger_metrics[
      cellranger_metrics$Library.Type == "Gene Expression",
      c("Metric.Name", "Metric.Value", "Subproject", "GemID")
    ]
  ),
  id_cols = c("Subproject", "GemID"),
  names_from = "Metric.Name",
  values_from = "Metric.Value"
)

In [None]:
# Replace the spaces in the metric names with underscores so the columns can
# be referred to without backticks.
names(cellranger_metrics_gex_df) <- cellranger_metrics_gex_df %>%
  names() %>%
  str_replace_all(pattern = " ", replacement = "_")

In [None]:
# Summary table of GEX metrics, one row per library.
cellranger_metrics_gex_df %>%
  # Columns are picked by position (1 to 3 and 5 to 9).
  select(c(1:3, 5:9)) %>%
  # Turn the underscores back into spaces for display.
  rename_with(~ str_replace_all(.x, "_", " ")) %>%
  gt() %>%
  # Express the read count in millions.
  fmt_number(
    columns = "Number of reads",
    scale_by = 1 / 1E6,
    pattern = "{x}M"
  ) %>%
  tab_header(
    title = md("**GEX QC metrics**"),
    subtitle = "cellranger v 7.0.0"
  ) %>%
  # Column headers: bold text above a thick bottom border.
  tab_style(
    style = list(
      cell_borders(sides = "bottom", weight = px(3)),
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels(columns = everything())
  ) %>%
  # Title: bold and larger.
  tab_style(
    style = list(cell_text(weight = "bold", size = 24)),
    locations = cells_title(groups = "title")
  ) %>%
  gts()

#### Mapping QC

In [None]:
# Bar plots of the GEX mapping metrics, one panel per metric and one bar per
# library, arranged in a two-column grid.
options(repr.plot.width = 10, repr.plot.height = 16)

# Columns of `cellranger_metrics_gex_df` to plot.
qc_map_vars <- c("Confidently_mapped_to_genome",
                 "Confidently_mapped_to_intergenic_regions",
                 "Confidently_mapped_to_intronic_regions",
                 "Confidently_mapped_to_exonic_regions",
                 "Confidently_mapped_antisense",
                 "Confidently_mapped_to_transcriptome")

gg_qc_map <- purrr::map(qc_map_vars, function(var) {
  print(var)
  # `.data[[var]]` looks the column up by its string name; it replaces the
  # deprecated `aes_string()`.
  ggplot(
    cellranger_metrics_gex_df,
    aes(x = GemID, y = .data[[var]], fill = GemID)
  ) +
    geom_col() +
    # facet_grid(~subproject, scales = "free_x", space = "free", switch = "x") +
    theme_bw() +
    ylim(0, 100) +
    labs(x = "Libraries (GEM IDs)",
         y = str_c(str_replace_all(var, "_", " "), " (%)")) +
    theme(axis.title = element_text(size = 14),
          axis.text = element_text(size = 10),
          axis.text.x = element_text(hjust = 1, angle = 90, size = 14),
          axis.text.y = element_text(size = 14),
          strip.placement = "outside",
          strip.background = element_rect(colour = NA),
          legend.position = "none") +
    # Single fill scale: a brewer scale added before this one would simply be
    # replaced by it, so it is not added at all.
    scale_fill_manual(values = as.vector(pals::polychrome()))
})

patchwork::wrap_plots(gg_qc_map, ncol = 2)

#### Sequencing saturation depth

In [None]:
# Four GEX sequencing-depth panels arranged in a two-column grid:
#   1. library size (number of reads, in millions) per library
#   2. sequencing saturation vs. mean reads per cell
#   3. median genes per cell vs. mean reads per cell
#   4. total genes detected vs. number of reads
options(repr.plot.width = 10, repr.plot.height = 16)

# Theme and colour values shared by all four panels.
seq_qc_theme <- theme_bw() +
  theme(axis.title = element_text(size = 18),
        axis.text = element_text(size = 10),
        axis.text.x = element_text(hjust = 1, angle = 90, size = 14),
        axis.text.y = element_text(size = 18),
        strip.placement = "outside",
        strip.background = element_rect(colour = NA),
        legend.position = "none")
seq_qc_palette <- as.vector(pals::polychrome())

gg_lib_size <- cellranger_metrics_gex_df %>%
  mutate(Number_of_Reads_mil = Number_of_reads / 1000000) %>%
  ggplot(aes(x = GemID, y = Number_of_Reads_mil, fill = GemID)) +
  geom_col() +
  labs(x = "Libraries (GEM IDs)", y = "Library size (in millions)") +
  seq_qc_theme +
  scale_fill_manual(values = seq_qc_palette)

gg_qc_seq_sat <- cellranger_metrics_gex_df %>%
  # Saturation is divided by 100 so that it fits the 0-1 axis set below.
  mutate(Sequencing_Saturation_perc = Sequencing_saturation / 100,
         Mean_Reads_per_Cell_tho = Mean_reads_per_cell / 1000) %>%
  ggplot(aes(x = Mean_Reads_per_Cell_tho,
             y = Sequencing_Saturation_perc, color = GemID)) +
  geom_point() +
  ylim(0, 1) +
  geom_text_repel(aes(label = GemID), size = 4) +
  labs(x = "Mean Reads per Cell\n(in thousands)", y = "Sequencing Saturation") +
  seq_qc_theme +
  scale_color_manual(values = seq_qc_palette)

gg_qc_seq_depth_cell <- cellranger_metrics_gex_df %>%
  mutate(Mean_Reads_per_Cell_tho = Mean_reads_per_cell / 1000) %>%
  ggplot(aes(x = Mean_Reads_per_Cell_tho,
             y = Median_genes_per_cell, color = GemID)) +
  geom_point() +
  geom_text_repel(aes(label = GemID), size = 4) +
  # The plotted column is the median, so the axis label says "Median".
  labs(x = "Mean Reads per Cell\n(in thousands)", y = "Median Detected Genes per Cell") +
  seq_qc_theme +
  scale_color_manual(values = seq_qc_palette)

gg_qc_seq_depth <- cellranger_metrics_gex_df %>%
  mutate(Number_of_Reads_mil = Number_of_reads / 1000000) %>%
  ggplot(aes(x = Number_of_Reads_mil,
             y = Total_genes_detected, color = GemID)) +
  geom_point() +
  geom_text_repel(aes(label = GemID), size = 4) +
  labs(x = "Number of Reads\n(in millions)", y = "Total Genes Detected") +
  seq_qc_theme +
  scale_color_manual(values = seq_qc_palette)

patchwork::wrap_plots(list(gg_lib_size, gg_qc_seq_sat,
                           gg_qc_seq_depth_cell, gg_qc_seq_depth),
                      ncol = 2)

### VDJ-T QC

In [None]:
# Wide table of the "VDJ T" metrics: one row per Subproject/GemID pair and
# one column per metric name. Exact duplicate rows are dropped before
# reshaping.
cellranger_metrics_vdjT_df <- pivot_wider(
  unique(
    cellranger_metrics[
      cellranger_metrics$Library.Type == "VDJ T",
      c("Metric.Name", "Metric.Value", "Subproject", "GemID")
    ]
  ),
  id_cols = c("Subproject", "GemID"),
  names_from = "Metric.Name",
  values_from = "Metric.Value"
)

In [None]:
# Make the metric names easier to use as column names: spaces and both
# parentheses each become an underscore.
names(cellranger_metrics_vdjT_df) <- cellranger_metrics_vdjT_df %>%
  names() %>%
  str_replace_all(pattern = " ", replacement = "_") %>%
  str_replace_all(pattern = "\\(", replacement = "_") %>%
  str_replace_all(pattern = "\\)", replacement = "_")

In [None]:
# Columns shown in the VDJ-T QC summary table, in display order.
col_names <- c(
  "Subproject",
  "GemID",
  "Number_of_reads",
  "Estimated_number_of_cells",
  "Fraction_reads_in_cells",
  "Mean_reads_per_cell",
  "Reads_mapped_to_any_V_D_J_gene",
  "Cells_with_productive_V-J_spanning_pair"
)

In [None]:
# Summary table of VDJ-T metrics, one row per library, restricted to the
# columns listed in `col_names`.
cellranger_metrics_vdjT_df %>%
  select(all_of(col_names)) %>%
  # Turn the underscores back into spaces for display.
  rename_with(~ str_replace_all(.x, "_", " ")) %>%
  gt() %>%
  # Express the total read count in millions, as the GEX table does. The
  # scaling was previously applied to "Mean reads per cell".
  # NOTE(review): confirm that "Number of reads" is the intended column.
  fmt_number(
    columns = "Number of reads",
    scale_by = 1 / 1E6,
    pattern = "{x}M"
  ) %>%
  tab_header(
    title = md("**VDJ-T QC metrics**"),
    subtitle = "cellranger v 7.0.0"
  ) %>%
  # Column headers: bold text above a thick bottom border.
  tab_style(
    locations = cells_column_labels(columns = everything()),
    style = list(
      cell_borders(sides = "bottom", weight = px(3)),
      cell_text(weight = "bold")
    )
  ) %>%
  # Title: bold and larger.
  tab_style(
    locations = cells_title(groups = "title"),
    style = list(
      cell_text(weight = "bold", size = 24)
    )
  ) %>%
  gts()

#### Mapping QC

In [None]:
# Bar plots of the VDJ-T mapping metrics, one panel per metric and one bar
# per library, laid out in a single row.
options(repr.plot.width = 15, repr.plot.height = 8)

# Columns of `cellranger_metrics_vdjT_df` to plot.
qc_map_vars <- c("Reads_mapped_to_any_V_D_J_gene",
                 "Reads_mapped_to_TRA",
                 "Reads_mapped_to_TRB")

gg_qc_map <- purrr::map(qc_map_vars, function(var) {
  # `.data[[var]]` looks the column up by its string name; it replaces the
  # deprecated `aes_string()`.
  ggplot(
    cellranger_metrics_vdjT_df,
    aes(x = GemID, y = .data[[var]], fill = GemID)
  ) +
    geom_col() +
    theme_bw() +
    ylim(0, 100) +
    labs(x = "Libraries (GEM IDs)",
         y = str_c(str_replace_all(var, "_", " "), " (%)")) +
    theme(axis.title = element_text(size = 18),
          axis.text = element_text(size = 10),
          axis.text.x = element_text(hjust = 1, angle = 90, size = 14),
          axis.text.y = element_text(size = 18),
          strip.placement = "outside",
          strip.background = element_rect(colour = NA),
          legend.position = "none") +
    scale_fill_manual(values = as.vector(pals::polychrome()))
})
patchwork::wrap_plots(gg_qc_map, nrow = 1)

#### V(D)J Expression

Here, we will assess the median number of UMIs assigned to a TRA/TRB contig per cell. Low values for either of the two parameters can indicate cells with extremely low TRA/TRB expression or poor cell quality, among others.

In [None]:
# Table of the median TRA and TRB UMI counts per cell for every library.
cellranger_metrics_vdjT_df %>%
  select(all_of(c("GemID",
                  "Median_TRA_UMIs_per_Cell",
                  "Median_TRB_UMIs_per_Cell"))) %>%
  gt() %>%
  tab_header(
    title = md("**VDJ-T expression**"),
    subtitle = "cellranger v 7.0.0"
  ) %>%
  # Column headers: bold text above a thick bottom border.
  tab_style(
    style = list(
      cell_borders(sides = "bottom", weight = px(3)),
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels(columns = everything())
  ) %>%
  # Title: bold and larger.
  tab_style(
    style = list(cell_text(weight = "bold", size = 24)),
    locations = cells_title(groups = "title")
  ) %>%
  gts()

#### V(D)J Annotation

Now, we will check the V(D)J annotation for the studied samples. To better interpret the results, we will follow the metric definitions given in the cellranger web summary file. We will assess the fraction of cell-associated barcodes with at least:
  
  * Cells With TRA/TRB Contig: one TRA/TRB contig annotated as a full or partial V(D)J gene.

* Cells With CDR3-annotated TRA/TRB Contig: one TRA/TRB contig where a CDR3 was detected.

* Cells With Productive TRA/TRB Contig: one contig that spans the 5' end of the V region to the 3' end of the J region for TRA/TRB, has a start codon in the expected part of the V sequence, has an in-frame CDR3, and has no stop codons in the aligned V-J region.

* Cells With Productive V-J Spanning Pair: one productive contig for each chain of the receptor pair. We will also report the corresponding number of cells with a productive V-J spanning pair.

For all the previous parameters, low values can indicate poor cell quality, low yield from the RT reaction, or poor specificity of the V(D)J enrichment. Moreover, we will also check:
  
  * Paired Clonotype Diversity: effective diversity of the paired clonotypes. It is computed as the Inverse Simpson Index of the clonotype frequencies. A value of 1 indicates a minimally diverse sample - only one distinct clonotype was detected, whereas a value equal to the estimated number of cells indicates a maximally diverse sample.

In [None]:
# Columns shown in the V(D)J annotation table, in selection order.
col_names <- c(
  "GemID",
  "Estimated_number_of_cells",
  "Cells_with_productive_V-J_spanning_pair",
  "Number_of_cells_with_productive_V-J_spanning_pair",
  "Paired_clonotype_diversity",
  "Cells_with_productive_TRB_contig",
  "Cells_with_productive_TRA_contig"
)

In [None]:
# V(D)J annotation table, one row per library, restricted to the columns in
# `col_names`. The productive TRA/TRB contig columns and the productive V-J
# spanning pair columns are each grouped under their own spanner.
cellranger_metrics_vdjT_df %>%
  select(all_of(col_names)) %>%
  gt() %>%
  tab_header(
    title = md("**V(D)J annotation**"),
    # Aligned with the version shown by the other tables in this notebook.
    # NOTE(review): confirm the cellranger version actually used.
    subtitle = "cellranger v 7.0.0"
  ) %>%
  cols_label(
    GemID = md("**GEM ID**"),
    Estimated_number_of_cells = md("**Estimated Number of Recovered Cells**"),
    `Number_of_cells_with_productive_V-J_spanning_pair` = md("Cells"),
    `Cells_with_productive_V-J_spanning_pair` = md("Fraction"),
    Paired_clonotype_diversity = md("**Paired Clonotype Diversity**"),
    Cells_with_productive_TRA_contig = md("TRA"),
    Cells_with_productive_TRB_contig = md("TRB")
  ) %>%
  # Columns are given as character vectors instead of the deprecated `vars()`.
  tab_spanner(
    label = md("**Productive contig**"),
    columns = c("Cells_with_productive_TRA_contig",
                "Cells_with_productive_TRB_contig")
  ) %>%
  tab_spanner(
    label = md("**Productive V-J Spanning Pair**"),
    columns = c("Cells_with_productive_V-J_spanning_pair",
                "Number_of_cells_with_productive_V-J_spanning_pair")
  ) %>%
  gts()

## Data overview

In [None]:
# Print loaded data
# Each label is followed by the object it describes: the libraries metadata,
# then the wide GEX metrics table, then the wide VDJ-T metrics table.
print("Libraries metadata")
metadata
print("GEX QC summary table")
cellranger_metrics_gex_df
print("VDJ-T QC summary table")
cellranger_metrics_vdjT_df

# Session Information

In [None]:
# Report the R version, platform and attached/loaded packages for
# reproducibility.
sessionInfo()