In [None]:
suppressPackageStartupMessages({
    library(Seurat)
    library(SeuratDisk)
    library(scCustomize) 
    library(SCP)
    library(ggplot2)
    library(dplyr)
    library(knitr)
    library(readr) 
    library(ggsci)
    library(scater)
    library(DoubletFinder)
    library(Trex)    
    library(SCpubr)
    library(biomaRt)
    library(data.table)
    library(genekitr)
    library(Azimuth)
    library(UCell)

    load("data/cycle.rda")
    source('helper_functions.R')
    
})


# Raise the size limit for globals exported to future workers to
# 128 * 1024^3 (128 GiB when interpreted as bytes).
options(future.globals.maxSize = 128 * 1024^3)

# Register a single-worker "multicore" strategy, then echo the active plan.
# NOTE(review): `plan` is presumably future::plan, attached indirectly by one
# of the packages or scripts loaded above -- confirm.
plan(strategy = "multicore", workers = 1)
plan()

# Fix the RNG seed so that stochastic steps are reproducible.
set.seed(123)

#### 1. Load CellBender-filtered counts

In [2]:
# Sample identifiers, one row per patient (P1-P8). Every patient has the
# D0, D7 and D21 time points; P5 additionally has a second D0 run.
samples <- c(
  "P1D0", "P1D7", "P1D21",
  "P2D0", "P2D7", "P2D21",
  "P3D0", "P3D7", "P3D21",
  "P4D0", "P4D7", "P4D21",
  "P5D0", "P5D0_run2", "P5D7", "P5D21",
  "P6D0", "P6D7", "P6D21",
  "P7D0", "P7D7", "P7D21",
  "P8D0", "P8D7", "P8D21"
)

In [None]:
# Report how many sample identifiers are listed.
print(
  paste0("Number of samples: ", length(x = samples))
)

In [4]:
# Set the Seurat assay-version option to "v3" before any objects are created.
# NOTE(review): presumably this makes the objects built below use v3-style
# assays under Seurat v5 -- confirm against the installed Seurat version.
options(Seurat.object.assay.version = "v3")

In [5]:
# Build one QC-filtered Seurat object per sample from the CellBender
# filtered matrices, then merge them all into `prepost`.
sample.objs <- lapply(samples, function(sample_id) {
  h5_file <- paste0("../ProcessedData/cellbender/", sample_id, "/cellbender_output_FPR_0.1_filtered.h5")
  counts <- Read_CellBender_h5_Mat(h5_file)

  # Suffix each barcode with the sample id so cell names stay distinct
  # across samples once the objects are merged.
  colnames(counts) <- paste0(colnames(counts), "_", sample_id)

  obj <- CreateSeuratObject(CreateAssayObject(counts))
  obj@meta.data$sample_id <- sample_id
  obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")

  # Per-cell QC: more than 500 genes, between 1000 and 50000 UMIs
  # (exclusive), and less than 10% of counts from "^MT-" features.
  subset(
    obj,
    subset = nFeature_RNA > 500 &
      nCount_RNA > 1000 &
      nCount_RNA < 50000 &
      percent.mt < 10
  )
})
names(sample.objs) <- samples

# First object is the merge anchor; all remaining objects are passed as `y`.
prepost <- merge(sample.objs[[1]], y = unlist(sample.objs[-c(1)]))

In [None]:
# Report the number of cells (columns) in the merged CellBender object.
print(
  paste0("Number of cells called by cellbender: ", ncol(prepost))
)

#### 2. Load Cell Ranger counts

In [7]:
# Build one QC-filtered Seurat object per sample from the Cell Ranger
# per-sample filtered matrices, then merge them into `prepost.cellranger`.
# The QC thresholds are the same as those applied to the CellBender objects.
sample.objs <- list()
for (sample.id in samples) {
  h5_glob <- paste0(
    "../RawData/cellranger_out/", sample.id,
    "/per_sample_outs/*/count/sample_filtered_feature_bc_matrix.h5"
  )
  sample.path <- Sys.glob(h5_glob)

  # Fail with a clear message instead of an opaque "subscript out of bounds"
  # when the wildcard matches nothing, and make it visible when more than one
  # per-sample directory matches (the first match is used, as before).
  if (length(sample.path) == 0) {
    stop(
      "No Cell Ranger matrix found for sample '", sample.id,
      "' (pattern: ", h5_glob, ")",
      call. = FALSE
    )
  }
  if (length(sample.path) > 1) {
    warning(
      "Multiple Cell Ranger matrices found for sample '", sample.id,
      "'; using the first: ", sample.path[[1]],
      call. = FALSE
    )
  }

  # NOTE(review): assumes Read10X_h5 returns a single matrix here; it returns
  # a list when the file holds several feature types -- confirm for this data.
  cell_ranger_mat <- Read10X_h5(sample.path[[1]])

  # Suffix each barcode with the sample id, matching the naming used for the
  # CellBender objects so that cell names can be compared between the two.
  colnames(cell_ranger_mat) <- paste0(colnames(cell_ranger_mat), "_", sample.id)

  # Named `sample.obj` rather than `sample` to avoid shadowing base::sample.
  sample.obj <- CreateSeuratObject(CreateAssayObject(cell_ranger_mat))
  sample.obj@meta.data$sample_id <- sample.id
  sample.obj[["percent.mt"]] <- PercentageFeatureSet(sample.obj, pattern = "^MT-")

  # Per-cell QC: more than 500 genes, between 1000 and 50000 UMIs
  # (exclusive), and less than 10% of counts from "^MT-" features.
  sample.obj <- subset(
    sample.obj,
    subset = nFeature_RNA > 500 &
      nCount_RNA > 1000 &
      nCount_RNA < 50000 &
      percent.mt < 10
  )
  sample.objs[[sample.id]] <- sample.obj
}
prepost.cellranger <- merge(sample.objs[[1]], y = unlist(sample.objs[-c(1)]))


In [None]:
# Report the number of cells (columns) in the merged Cell Ranger object.
print(
  paste0("Number of cells called by cellranger: ", ncol(prepost.cellranger))
)

In [9]:
# Keep a copy of the cell names as a metadata column.
prepost@meta.data$barcodes <- colnames(prepost)

# Flag each CellBender cell by whether its name also appears among the
# Cell Ranger cells, then keep only the cells present in both.
prepost@meta.data$in.cellranger <- ifelse(
  colnames(prepost) %in% colnames(prepost.cellranger),
  "Yes",
  "No"
)
prepost <- subset(prepost, subset = in.cellranger == "Yes")

In [None]:
# Report the number of cells retained after intersecting the two call sets.
print(
  paste0("Number of cells called by both cellranger AND cellbender: ", ncol(prepost))
)

In [None]:
# Drop the Cell Ranger objects that are no longer needed, then run the
# garbage collector.
rm(list = c("prepost.cellranger", "sample.objs"))
gc()

In [12]:
# Checkpoint: persist the filtered, intersected object.
saveRDS(
  object = prepost,
  file = "../ProcessedData/seurat/filtered_prepost_csf.rds"
)

#### 3. QC Stats 

In [13]:
# Reload the checkpoint so the QC section can be run on its own.
prepost <- readRDS(
  file = "../ProcessedData/seurat/filtered_prepost_csf.rds"
)

In [14]:
# Per-cell QC percentages, computed from gene-name patterns.
prepost[["percent.TCR"]] <- PercentageFeatureSet(prepost, pattern = "^TR[ABDG][VJC]")
prepost[["percent.ribo"]] <- PercentageFeatureSet(prepost, pattern = "^RP[SL]")
prepost[["percent.mt"]] <- PercentageFeatureSet(prepost, pattern = "^MT-")

# Hemoglobin subunit genes only (HBA1/2, HBB, HBD, HBE1, HBG1/2, HBM, HBQ1,
# HBZ). The previous pattern "^HB[^(P)]" was a character class that merely
# excluded "(", "P" and ")", so it also counted non-hemoglobin genes such as
# HBEGF and HBS1L towards percent.hb.
prepost[["percent.hb"]] <- PercentageFeatureSet(prepost, pattern = "^HB[ABDEGMQZ][0-9]?$")

# Ratio of log10 detected genes to log10 UMIs per cell.
prepost[["log10GenesPerUMI"]] <- log10(prepost$nFeature_RNA) / log10(prepost$nCount_RNA)

# NOTE(review): presumably adds the percentage of counts taken by the 20
# most-expressed genes of each cell -- confirm against scCustomize docs.
prepost <- Add_Top_Gene_Pct_Seurat(prepost, num_top_genes = 20)

[1m[22mCalculating percent expressing top 20 for layer: [32mcounts[39m


In [None]:
# Per-sample QC summary: cell count plus median UMIs, genes and
# mitochondrial percentage (rounded to two decimals).
summary_stats <- prepost@meta.data %>%
  group_by(sample_id) %>%
  summarise(
    num_cells = n(),
    median_nUMI = median(nCount_RNA),
    median_nGene = median(nFeature_RNA),
    median_percent_mt = round(median(percent.mt), digits = 2)
  )

# Render the summary as a markdown table.
kable(summary_stats, format = "markdown")

In [16]:
# Use the sample id as the active identity and derive patient / day / batch
# annotations from it.
Idents(prepost) <- "sample_id"

# Patient: the leading "P<digits>" of the sample id (e.g. "P5D0_run2" -> "P5").
prepost@meta.data$Patient <- sub("(P\\d+).*$", "\\1", prepost@meta.data$sample_id)

# Day: whatever follows the patient prefix.
# NOTE(review): for "P5D0_run2" this yields "D0_run2", not "D0" -- confirm
# that a separate Day level for the re-run is intended.
prepost@meta.data$Day <- sub("^P\\d+", "", prepost@meta.data$sample_id)

# Sample id -> sequencing batch. The table also lists sample ids that are not
# loaded in this notebook (P1D78, P8M2, P7M2, P8IP).
batch_mapping <- c(
  "P5D0" = "Batch1", "P6D7" = "Batch1", "P6D21" = "Batch1",
  "P6D0" = "Batch2", "P3D0" = "Batch2", "P3D7" = "Batch2",
  "P3D21" = "Batch3", "P2D7" = "Batch3", "P2D0" = "Batch3",
  "P1D7" = "Batch4", "P2D21" = "Batch4",
  "P5D0_run2" = "Batch5", "P5D7" = "Batch5", "P5D21" = "Batch5",
  "P1D21" = "Batch5", "P1D78" = "Batch5", "P4D21" = "Batch5",
  "P7D0" = "Batch6", "P7D7" = "Batch6", "P7D21" = "Batch6",
  "P8D0" = "Batch7", "P8D7" = "Batch7", "P8D21" = "Batch7",
  "P8M2" = "Batch7", "P7M2" = "Batch7",
  "P1D0" = "Batch8", "P4D0" = "Batch8", "P4D7" = "Batch8", "P8IP" = "Batch8"
)

# A sample missing from the table would silently get an NA batch; stop early
# with the offending sample ids instead.
unmapped_samples <- setdiff(
  unique(as.character(prepost$sample_id)),
  names(batch_mapping)
)
if (length(unmapped_samples) > 0) {
  stop(
    "No batch assigned for sample(s): ",
    paste(unmapped_samples, collapse = ", "),
    call. = FALSE
  )
}

# unname() so the column is not stored as a vector named by sample id.
prepost@meta.data$Batch <- unname(batch_mapping[as.character(prepost$sample_id)])

In [17]:
# Overwrite the checkpoint with the QC- and batch-annotated object.
saveRDS(
  object = prepost,
  file = "../ProcessedData/seurat/filtered_prepost_csf.rds"
)

In [None]:
# Write the object as h5Seurat, then convert that file to h5ad.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# rebound, which would silently change the overwrite behaviour.
SaveH5Seurat(
  prepost,
  filename = "../ProcessedData/seurat/filtered_prepost_csf.h5Seurat",
  overwrite = TRUE
)
Convert(
  "../ProcessedData/seurat/filtered_prepost_csf.h5Seurat",
  dest = "h5ad",
  overwrite = TRUE
)