In [1]:
suppressPackageStartupMessages({
    library(Seurat)
    library(SeuratDisk)
    library(scCustomize) 
    library(SCP)
    library(ggplot2)
    library(dplyr)
    library(knitr)
    library(readr) 
    library(ggsci)
    library(scater)
    library(DoubletFinder)
    library(Trex)    
    library(SCpubr)
    library(biomaRt)
    library(data.table)
    library(genekitr)
    library(Azimuth)
    library(UCell)

    load("data/cycle.rda")
    source('helper_functions.R')
    
})


# Raise the size limit for globals exported to futures to 128 GiB (in bytes).
options(future.globals.maxSize = 128 * 1024^3)

# `plan()` is namespace-qualified so this cell does not rely on some other
# package having attached `future`.
future::plan(strategy = "multicore", workers = 1)
future::plan()  # echo the active plan in the cell output

# Fix the RNG seed so stochastic steps are reproducible between runs.
set.seed(123)

#### 1. Load CellBender-filtered count matrices

In [2]:
# Sample identifiers; each is used as a directory name when building the
# CellBender and Cell Ranger input paths.
samples <- c("P4IP", "P6IP", "P8IP_04", "P3IP")

In [3]:
# Report how many samples are listed in `samples`.
print(paste0("Number of samples: ", length(samples)))

[1] "Number of samples: 4"


In [4]:
# Set the Seurat assay-version option to "v3".
# NOTE(review): presumably this makes objects created below use the v3 Assay
# class rather than the newer default -- confirm against the installed
# SeuratObject version.
options(Seurat.object.assay.version = "v3")

In [5]:
# Build one QC-filtered Seurat object per sample from the CellBender output
# and merge them into `prepost`.
#
# For a given sample id the helper:
#   * reads the filtered CellBender matrix for that sample,
#   * appends "_<sample id>" to every barcode so that barcodes remain unique
#     once the samples are merged,
#   * records the sample id in the `sample_id` metadata column,
#   * stores `percent.mt`, computed from features matching "^MT-",
#   * keeps cells with nFeature_RNA > 500, 1000 < nCount_RNA < 50000 and
#     percent.mt < 10.
# It returns the filtered Seurat object.
load_cellbender_sample <- function(sample_id) {
    h5_path <- paste0(
        "../ProcessedData/cellbender/", sample_id,
        "/cellbender_output_FPR_0.1_filtered.h5"
    )
    # Fail early and name the missing path instead of surfacing a reader error.
    if (!file.exists(h5_path)) {
        stop("CellBender output not found: ", h5_path, call. = FALSE)
    }

    counts <- Read_CellBender_h5_Mat(h5_path)
    colnames(counts) <- paste0(colnames(counts), "_", sample_id)

    obj <- CreateSeuratObject(CreateAssayObject(counts))
    obj@meta.data$sample_id <- sample_id
    obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")

    # Thresholds are kept as literals inside the subset expression.
    subset(
        obj,
        subset = nFeature_RNA > 500 &
            nCount_RNA > 1000 &
            nCount_RNA < 50000 &
            percent.mt < 10
    )
}

# Named list of per-sample objects, keyed by sample id.
sample.objs <- setNames(lapply(samples, load_cellbender_sample), samples)

# Merge the first object with the list of all remaining ones.
prepost <- merge(sample.objs[[1]], y = sample.objs[-1])

In [6]:
# Number of cells (columns) in the merged CellBender object.
print(paste0("Number of cells called by cellbender: ", ncol(prepost)))

[1] "Number of cells called by cellbender: 40132"


#### 2. Load Cell Ranger filtered count matrices

In [7]:
# Build one QC-filtered Seurat object per sample from the Cell Ranger
# filtered matrices and merge them into `prepost.cellranger`.
#
# For a given sample id the helper:
#   * locates the sample's filtered feature-barcode matrix with a glob,
#   * appends "_<sample id>" to every barcode so that barcodes remain unique
#     once the samples are merged,
#   * records the sample id in the `sample_id` metadata column,
#   * stores `percent.mt`, computed from features matching "^MT-",
#   * keeps cells with nFeature_RNA > 500, 1000 < nCount_RNA < 50000 and
#     percent.mt < 10.
# It returns the filtered Seurat object.
load_cellranger_sample <- function(sample_id) {
    h5_matches <- Sys.glob(paste0(
        "../RawData/cellranger_out/", sample_id,
        "/per_sample_outs/*/count/sample_filtered_feature_bc_matrix.h5"
    ))
    # An empty glob would otherwise fail below with "subscript out of bounds".
    if (length(h5_matches) == 0) {
        stop(
            "No Cell Ranger filtered matrix found for sample '", sample_id, "'",
            call. = FALSE
        )
    }

    # When the glob matches several files only the first one is read.
    # NOTE(review): the next line assumes Read10X_h5() returns a single matrix
    # for these files (not a list of matrices) -- confirm for other inputs.
    counts <- Read10X_h5(h5_matches[[1]])
    colnames(counts) <- paste0(colnames(counts), "_", sample_id)

    obj <- CreateSeuratObject(CreateAssayObject(counts))
    obj@meta.data$sample_id <- sample_id
    obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")

    # Thresholds are kept as literals inside the subset expression.
    subset(
        obj,
        subset = nFeature_RNA > 500 &
            nCount_RNA > 1000 &
            nCount_RNA < 50000 &
            percent.mt < 10
    )
}

# Named list of per-sample objects, keyed by sample id.
sample.objs <- setNames(lapply(samples, load_cellranger_sample), samples)

# Merge the first object with the list of all remaining ones.
prepost.cellranger <- merge(sample.objs[[1]], y = sample.objs[-1])


In [8]:
# Number of cells (columns) in the merged Cell Ranger object.
print(paste0("Number of cells called by cellranger: ", ncol(prepost.cellranger)))

[1] "Number of cells called by cellranger: 41065"


In [9]:
# Restrict `prepost` to barcodes that also occur in `prepost.cellranger`,
# i.e. cells retained by both the CellBender and the Cell Ranger objects.

# Keep the barcode as an explicit metadata column.
prepost@meta.data$barcodes <- colnames(prepost)

# Flag each cell "Yes"/"No" depending on whether its barcode is shared.
prepost@meta.data$in.cellranger <- ifelse(
    colnames(prepost) %in% colnames(prepost.cellranger),
    "Yes",
    "No"
)
prepost <- subset(prepost, subset = in.cellranger == "Yes")

In [10]:
# Number of cells left after keeping only barcodes shared by both objects.
print(paste0("Number of cells called by both cellranger AND cellbender: ", ncol(prepost)))

[1] "Number of cells called by both cellranger AND cellbender: 39234"


In [11]:
# Remove the Cell Ranger object and the per-sample list, then run the garbage
# collector; its memory summary is printed as the cell output.
rm(prepost.cellranger, sample.objs)
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,16429425,877.5,27240640,1454.9,27240640,1454.9
Vcells,948712027,7238.1,2605699239,19880.0,2536164277,19349.4


In [18]:
# Relabel sample "P8IP_04" as "P8IP" in the `sample_id` metadata column.
prepost$sample_id[prepost$sample_id == "P8IP_04"] <- "P8IP"

In [19]:
# Persist the filtered, merged object to disk.
saveRDS(object = prepost, file = "../ProcessedData/seurat/filtered_ip4.rds")

#### 3. QC Stats 

In [13]:
# Reload the filtered, merged object saved by the previous section.
prepost <- readRDS(file = "../ProcessedData/seurat/filtered_ip4.rds")

In [14]:
# Per-cell QC metrics, stored as metadata columns on `prepost`.
# Each percent.* column is computed by PercentageFeatureSet() from the
# features whose names match the given regular expression.
prepost[["percent.TCR"]] <- PercentageFeatureSet(prepost, pattern = "^TR[ABDG][VJC]")
prepost[["percent.ribo"]] <- PercentageFeatureSet(prepost, pattern = "^RP[SL]")
prepost[["percent.mt"]] <- PercentageFeatureSet(prepost, pattern = "^MT-")
# "HB" followed by any character other than "P". Parentheses are literal
# inside a character class, so "[^P]" is all that is needed to exclude "P".
prepost[["percent.hb"]] <- PercentageFeatureSet(prepost, pattern = "^HB[^P]")
# Ratio of log10(detected features) to log10(UMI count) per cell.
prepost[["log10GenesPerUMI"]] <- log10(prepost$nFeature_RNA) / log10(prepost$nCount_RNA)


In [20]:
# Per-sample QC summary: number of cells plus the medians of UMI count,
# detected features and mitochondrial percentage (rounded to two decimals).
summary_stats <- summarize(
    group_by(prepost@meta.data, sample_id),
    num_cells = n(),
    median_nUMI = median(nCount_RNA),
    median_nGene = median(nFeature_RNA),
    median_percent_mt = round(median(percent.mt), digits = 2)
)

# Render the summary as a markdown table in the cell output.
kable(summary_stats, format = "markdown")



|sample_id | num_cells| median_nUMI| median_nGene| median_percent_mt|
|:---------|---------:|-----------:|------------:|-----------------:|
|P3IP      |      6992|     27320.5|       6066.0|              5.05|
|P4IP      |      5910|     27179.5|       5678.0|              3.81|
|P6IP      |      4114|     24327.0|       5319.0|              3.96|
|P8IP      |     22218|      6859.0|       2653.5|              3.74|

In [21]:
# Use the sample id as the active identity of every cell.
Idents(prepost) <- "sample_id"

# Split each sample id into the patient prefix ("P" followed by digits) and
# whatever follows it; the remainder is stored in `Day`.
prepost@meta.data$Patient <- sub("(P\\d+).*$", "\\1", prepost@meta.data$sample_id)
prepost@meta.data$Day <- sub("^P\\d+", "", prepost@meta.data$sample_id)

# Sample id -> batch label lookup.
batch_mapping <- c(
    "P8IP" = "Batch8", "P4IP" = "Batch9", "P6IP" = "Batch9", "P3IP" = "Batch10"
)

# A sample id missing from the lookup would otherwise become a silent NA batch.
unmapped <- setdiff(
    unique(as.character(prepost@meta.data$sample_id)),
    names(batch_mapping)
)
if (length(unmapped) > 0) {
    stop(
        "No batch assigned for sample_id: ", paste(unmapped, collapse = ", "),
        call. = FALSE
    )
}

# unname(): indexing a named vector returns values named by sample id; those
# names must not be carried into the per-cell metadata column.
prepost@meta.data$Batch <- unname(
    batch_mapping[as.character(prepost@meta.data$sample_id)]
)

In [22]:
# Overwrite the saved object so it includes the QC metrics and sample metadata.
saveRDS(object = prepost, file = "../ProcessedData/seurat/filtered_ip4.rds")

In [None]:
# Export the object as an h5Seurat file, then convert that file to h5ad.
# Existing output files are overwritten. `TRUE` is spelled out because `T` is
# an ordinary variable that can be reassigned.
SaveH5Seurat(prepost, filename = "../ProcessedData/seurat/filtered_ip4.h5Seurat", overwrite = TRUE)
Convert("../ProcessedData/seurat/filtered_ip4.h5Seurat", dest = "h5ad", overwrite = TRUE)