In [1]:
## ---- Global Cell Set Preprocessing Notebook ----
## Load Analysis Parameters (Parm1)
## Download Aggregated 10x Single Cell Data (Seurat)
## Perform QC Filtering (Seurat)
## Identify Patient and Health Condition for All Cells
## Create Filtered Monocle3 CDS Object and Perform Down Sampling (Monocle3)
## Remove Zero Expressing Genes
## Perform Preliminary Pre-Processing and Clustering (Monocle3)
## Annotate Partitions (Cell Types)
## Save Result As An R Data Object (CDS1)

In [1]:
## Create the working input/output data directory if it does not exist, then
## make it the working directory so later relative paths resolve against it.
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste0(parentdir, 'CDC2')
## dir.exists() (rather than file.exists()) so a regular file with the same
## name is not mistaken for the directory; recursive = TRUE also creates any
## missing parent directories.
if (!dir.exists(datdir)) {
    dir.create(datdir, recursive = TRUE)
}
setwd(datdir)

In [2]:
## Load the analysis parameters from 'parms.txt' in the working directory.
## The first line is a header row; text columns are kept as character.
ps <- read.table('parms.txt', header = TRUE, stringsAsFactors = FALSE)

In [3]:
## Display the parameter table that was just read
ps

indir,outdir,outsdir.10x,expdatfile.10x,metadatfile.10x,conpath
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
inputdat/,outputdat/,outs/,AGGREGATEMAPPED-tisCAR6samples_expression.csv,AGGREGATEMAPPED-tisCAR6samples_metadata.csv,con345/


In [5]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)

In [6]:
## Build the paths to the six per-sample filtered feature/barcode matrices.
## Each path is <indir><conpath><patient dir>/<dis|hea>/outs/filtered_feature_bc_matrix
sample_root <- paste0(ps$indir, ps$conpath)
matrix_subdir <- '/outs/filtered_feature_bc_matrix'
pf1d <- paste0(sample_root, 'p3/dis', matrix_subdir)
pf1h <- paste0(sample_root, 'p3/hea', matrix_subdir)
pf2d <- paste0(sample_root, 'p4/dis', matrix_subdir)
pf2h <- paste0(sample_root, 'p4/hea', matrix_subdir)
pf3d <- paste0(sample_root, 'p5/dis', matrix_subdir)
pf3h <- paste0(sample_root, 'p5/hea', matrix_subdir)

In [7]:
## Read the 10x data for each of the six samples from its matrix directory.
## (The *_cds names hold the raw Read10X results until the next step replaces
## them with Seurat objects.)
p1d_cds <- Read10X(pf1d)
p1h_cds <- Read10X(pf1h)
p2d_cds <- Read10X(pf2d)
p2h_cds <- Read10X(pf2h)
p3d_cds <- Read10X(pf3d)
p3h_cds <- Read10X(pf3h)

In [8]:
## Wrap each sample's counts in a Seurat object, one project label per sample.
## min.cells = 0 and min.features = 0 are passed explicitly for every sample.
new_sample_seurat <- function(counts, project) {
    CreateSeuratObject(counts = counts, project = project,
                       min.cells = 0, min.features = 0)
}
p1d_cds <- new_sample_seurat(p1d_cds, "Cor.Pat1.Dis")
p1h_cds <- new_sample_seurat(p1h_cds, "Cor.Pat1.Hea")
p2d_cds <- new_sample_seurat(p2d_cds, "Cor.Pat2.Dis")
p2h_cds <- new_sample_seurat(p2h_cds, "Cor.Pat2.Hea")
p3d_cds <- new_sample_seurat(p3d_cds, "Cor.Pat3.Dis")
p3h_cds <- new_sample_seurat(p3h_cds, "Cor.Pat3.Hea")

In [9]:
## Display a summary of each of the six per-sample Seurat objects
p1d_cds
p1h_cds
p2d_cds
p2h_cds
p3d_cds
p3h_cds

An object of class Seurat 
33538 features across 11015 samples within 1 assay 
Active assay: RNA (33538 features, 0 variable features)

An object of class Seurat 
33538 features across 3716 samples within 1 assay 
Active assay: RNA (33538 features, 0 variable features)

An object of class Seurat 
33538 features across 15960 samples within 1 assay 
Active assay: RNA (33538 features, 0 variable features)

An object of class Seurat 
33538 features across 5523 samples within 1 assay 
Active assay: RNA (33538 features, 0 variable features)

An object of class Seurat 
33538 features across 12388 samples within 1 assay 
Active assay: RNA (33538 features, 0 variable features)

An object of class Seurat 
33538 features across 3379 samples within 1 assay 
Active assay: RNA (33538 features, 0 variable features)

In [10]:
## Add a per-cell "percent.mt" metadata column to every sample: the percentage
## of counts coming from features whose names match the pattern below.
mt_pattern <- "^MT-"
p1d_cds[["percent.mt"]] <- PercentageFeatureSet(p1d_cds, pattern = mt_pattern)
p1h_cds[["percent.mt"]] <- PercentageFeatureSet(p1h_cds, pattern = mt_pattern)
p2d_cds[["percent.mt"]] <- PercentageFeatureSet(p2d_cds, pattern = mt_pattern)
p2h_cds[["percent.mt"]] <- PercentageFeatureSet(p2h_cds, pattern = mt_pattern)
p3d_cds[["percent.mt"]] <- PercentageFeatureSet(p3d_cds, pattern = mt_pattern)
p3h_cds[["percent.mt"]] <- PercentageFeatureSet(p3h_cds, pattern = mt_pattern)

In [11]:
## Per-sample figures hardcoded from the single sample cellranger count
## summary.html files, one entry per sample (index 1 to 6).
## summaryreads: read totals; summarysat: sequencing saturation as a fraction.
summaryreads <- c(273078128, 234857523, 246707399,
                  315532869, 252278050, 270736818)
summarysat <- c(0.603, 0.834, 0.421,
                0.812, 0.464, 0.841)

In [12]:
## Build the per-sample QC summary table (one row per sample, 12 columns).

## Compute one row of the QC table for a single sample.
##   seu            - Seurat object for the sample (needs nFeature_RNA,
##                    percent.mt and the RNA assay counts)
##   label          - text stored in the Sample column
##   total_reads    - read total for the sample (from summaryreads)
##   seq_saturation - sequencing saturation as a fraction (from summarysat)
## Returns a one-row data.frame with the 12 QC columns.
summarize_sample_qc <- function(seu, label, total_reads, seq_saturation) {
    n_cells <- length(seu$nFeature_RNA)
    genes_per_cell <- seu$nFeature_RNA
    ## Per-cell UMI totals; computed once and reused for the median and the sum
    umis_per_cell <- colSums(seu@assays$RNA@counts)
    ## Percentage of the sample's cells for which `flag` is TRUE
    pct_of_cells <- function(flag) round(100 * sum(flag) / n_cells, 1)

    data.frame(
        Sample = label,
        N.Cells = as.numeric(n_cells),
        MedGenepCell = round(median(genes_per_cell), 0),
        MedUMIspCell = round(median(umis_per_cell), 0),
        MeanReadspCell = round(total_reads / n_cells, 0),
        ReadspUMI = round(total_reads / sum(umis_per_cell), 1),
        SeqSat.pct = seq_saturation * 100,
        MT.lt.5.pct = pct_of_cells(seu$percent.mt < 5),
        MT.lt.10.pct = pct_of_cells(seu$percent.mt < 10),
        pct.Genes.gt.200 = pct_of_cells(genes_per_cell > 200),
        pct.Genes.lt.2500 = pct_of_cells(genes_per_cell < 2500),
        pct.Genes.lt.4000 = pct_of_cells(genes_per_cell < 4000),
        stringsAsFactors = FALSE
    )
}

## Samples in the same order as summaryreads / summarysat
qc_objects <- list(p1d_cds, p1h_cds, p2d_cds, p2h_cds, p3d_cds, p3h_cds)
qc_labels <- c("Patient 1 Diseased", "Patient 1 Healthy",
               "Patient 2 Diseased", "Patient 2 Healthy",
               "Patient 3 Diseased", "Patient 3 Healthy")

qc_df <- do.call(rbind, Map(summarize_sample_qc, qc_objects, qc_labels,
                            summaryreads, summarysat))
## Keep plain 1..6 row names regardless of how rbind labelled the rows
rownames(qc_df) <- NULL

In [13]:
## Display the per-sample QC summary table
qc_df

Sample,N.Cells,MedGenepCell,MedUMIspCell,MeanReadspCell,ReadspUMI,SeqSat.pct,MT.lt.5.pct,MT.lt.10.pct,pct.Genes.gt.200,pct.Genes.lt.2500,pct.Genes.lt.4000
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Patient 1 Diseased,11015,1667,5162,24791,3.7,60.3,86.0,93.7,99.7,78.9,98.5
Patient 1 Healthy,3716,1968,6016,63202,9.9,83.4,70.7,84.1,97.7,75.0,99.3
Patient 2 Diseased,15960,1204,3388,15458,2.8,42.1,67.5,89.6,99.8,86.6,99.1
Patient 2 Healthy,5523,1466,4805,57131,8.6,81.2,66.9,89.5,99.7,82.4,99.2
Patient 3 Diseased,12388,1557,4566,20365,3.3,46.4,63.9,87.2,99.6,76.5,97.3
Patient 3 Healthy,3379,2213,6994,80123,10.9,84.1,69.4,85.6,99.0,60.1,97.6


In [14]:
## Display the input directory path (working directory joined with ps$indir)
paste(getwd(), ps$indir, sep = '/')

In [15]:
## Load the aggregated cellranger data under ps$indir into a Monocle cds object
stdycds <- load_cellranger_data(pipestance_path = ps$indir)

In [17]:
## Gene annotation table for later use: a one-column data frame
## ('gene_short_name') whose row names are the row names of stdycds
short_names <- setNames(fData(stdycds)@listData$gene_short_name,
                        rownames(stdycds))
gene_annot <- as.data.frame(short_names)
names(gene_annot) <- 'gene_short_name'

In [18]:
## Inspect the structure of the gene annotation table
str(gene_annot)

'data.frame':	33538 obs. of  1 variable:
 $ gene_short_name: Factor w/ 33514 levels "A1BG","A1BG-AS1",..: 22768 15766 24588 9683 9685 9684 9686 9793 24585 5853 ...


In [18]:
## Lookup vectors used to tag cells with health status and patient ID.
## Entry k applies to cells whose barcode sample index is k: status alternates
## healthy/diseased and each patient ID covers two consecutive entries.
cellgrps <- rep(c('healthy', 'diseased'), times = 3)
cellpats <- rep(paste('ID Number', 1:3), each = 2)

In [19]:
## Add the health condition and patient columns to the cell metadata.
## The integer parsed from barcode characters 18-20 indexes cellgrps/cellpats.
sample_idx <- as.integer(substr(stdycds@colData@rownames, 18, 20))
stdycds@colData$condition <- cellgrps[sample_idx]
stdycds@colData$patient <- cellpats[sample_idx]

In [20]:
## Keep a copy of the unfiltered cds object (all cells) under a separate name,
## so it remains available after stdycds is subset by QC
stdycdsall <- stdycds

In [21]:
## Lookup from gene short name to gene id (Ensembl):
## values are the feature row names, names are the gene short names
short2geneid <- setNames(fData(stdycdsall)@rownames,
                         fData(stdycdsall)@listData$gene_short_name)

In [22]:
## Lookup from gene id (Ensembl) to gene short name:
## values are the gene short names, names are the feature row names
geneid2short <- setNames(fData(stdycdsall)@listData$gene_short_name,
                         fData(stdycdsall)@rownames)

In [28]:
## Apply the QC thresholds sample by sample and collect the passing barcodes
## in filtcells (percent.mt < 10, more than 200 and fewer than 4000 features).

## Health status per loop index; the disease/healthy order is flipped relative
## to cellgrps to match the single sample ordering
sample_conds <- c('diseased', 'healthy', 'diseased', 'healthy', 'diseased', 'healthy')
passing <- vector('list', length(cellpats))
for (i in seq_along(cellpats)) {
    patid <- cellpats[i]
    condid <- sample_conds[i]
    selbars <- stdycds@colData@rownames[(stdycds@colData$patient == patid) &
                                        (stdycds@colData$condition == condid)]
    ## Create a Seurat object from this sample's cells in the Monocle object
    sam_seu <- CreateSeuratObject(counts = exprs(stdycds[, selbars]), project = '',
                                  min.cells = 0, min.features = 0)
    # Replace the Ensembl gene ids with gene short names (HUGO symbols) so the
    # "^MT-" pattern below can match. Probably not needed for the final pipeline
    sam_seu@assays$RNA@counts@Dimnames[1] <- list(fData(stdycds)@listData$gene_short_name)
    sam_seu@assays$RNA@data@Dimnames[1] <- list(fData(stdycds)@listData$gene_short_name)

    # The [[ operator can add columns to object metadata. This is a great place to stash QC stats
    sam_seu[["percent.mt"]] <- PercentageFeatureSet(sam_seu, pattern = "^MT-")

    ## Evaluate each threshold once and reuse it for filtering and reporting
    mt_ok <- sam_seu$percent.mt < 10
    min_genes_ok <- sam_seu$nFeature_RNA > 200
    max_genes_ok <- sam_seu$nFeature_RNA < 4000
    pass_all <- mt_ok & min_genes_ok & max_genes_ok

    passing[[i]] <- selbars[pass_all]
    ## Report: features, cells, then cells passing each threshold and all three
    print(c(dim(sam_seu), sum(mt_ok), sum(min_genes_ok),
        sum(max_genes_ok),
        sum(pass_all)))
}
filtcells <- unlist(passing, use.names = FALSE)

[1] 33538 11015 10331 10876 10967 10273
[1] 33538  3716  3112  3471  3713  3104
[1] 33538 15960 14296 15923 15822 14157
[1] 33538  5523  4931  5420  5512  4896
[1] 33538 12388 10806 12322 12138 10556
[1] 33538  3379  2882  3255  3372  2850


In [28]:
## Write the QC-passing barcodes to a text file in the output directory,
## one barcode per line with no quoting and no row/column names
filtbarfile <- paste0(ps$outdir, 'AggregateLevelQCFilterCells.Rev1.txt')
write.table(filtcells, file = filtbarfile,
            quote = FALSE, row.names = FALSE, col.names = FALSE,
            append = FALSE)

In [29]:
## Create the QC Monocle Cell Set Object: keep only the columns (cells) whose
## barcodes are listed in filtcells
stdycds <- stdycds[,filtcells]

In [30]:
## Prepare for downsampling the six cell groups to a common size:
## collect the barcodes, take character 18 of each barcode as its group label,
## and find the size of the smallest group
barcodes <- colnames(exprs(stdycds))
cellgroups <- substr(barcodes, 18, 18)
mincellcount <- min(table(cellgroups))
mincellcount

In [32]:
## Draw mincellcount barcodes, without replacement, from every cell group.
## Groups are taken from the labels actually present in cellgroups (sorted),
## rather than assuming they are numbered 1..k, so a group that lost all of
## its cells is simply absent instead of shifting or breaking the loop.
## The result is a matrix with one column of sampled barcodes per group.
set.seed(301)
group_labels <- sort(unique(cellgroups))
subcellset <- do.call(cbind, lapply(group_labels, function(cellgroup) {
    sample(barcodes[cellgroups == cellgroup], mincellcount, replace = FALSE)
}))

In [33]:
## Keep only the expression columns whose barcode was drawn in the downsample
boolscs <- barcodes %in% subcellset
full_expression_data <- exprs(stdycds)
downsampled_expression_data <- full_expression_data[, boolscs]

In [34]:
## Build the downsampled Monocle cell data set from the subset expression
## matrix, attaching the gene annotation table as gene metadata
down_stdycds <- new_cell_data_set(
    expression_data = downsampled_expression_data,
    gene_metadata = gene_annot
)

In [35]:
## Display the dimensions of the downsampled cell set
dim(down_stdycds)

In [26]:
## Drop genes whose expression sums to zero across the downsampled cells
gsums <- rowSums(exprs(down_stdycds))
expressed <- gsums != 0
down_stdycds <- down_stdycds[expressed, ]

In [27]:
## Add the health condition and patient columns to the downsampled cell set.
## The integer parsed from barcode characters 18-20 indexes cellgrps/cellpats.
down_sample_idx <- as.integer(substr(down_stdycds@colData@rownames, 18, 20))
down_stdycds@colData$condition <- cellgrps[down_sample_idx]
down_stdycds@colData$patient <- cellpats[down_sample_idx]

In [28]:
## Preprocess the downsampled cell set with num_dim = 100
## (no patient correction is applied in this step)
down_stdycds <- preprocess_cds(cds = down_stdycds, num_dim = 100)

In [29]:
## Compute the UMAP reduction for the downsampled cell set
down_stdycds <- reduce_dimension(cds = down_stdycds,
                                 reduction_method = "UMAP")

In [30]:
## Cluster and partition the downsampled cells on the UMAP reduction.
## NOTE(review): the cell type labels assigned later are keyed by partition
## number, so they presumably depend on this step being reproducible - confirm.
down_stdycds <- cluster_cells(cds = down_stdycds,
                              reduction_method = 'UMAP')

In [31]:
## Define the cell type labels, one per partition.
## Each label is "<partition number>-<cell type name>".
celltype_names <- c('Unidentified',
                    'Macrophages',
                    'Endothelial Cells - (a)',
                    'Lymphoid Progenitors',
                    'VSMCs - Contractile',
                    'Endothelial Cells - (b)',
                    'VSMCs - Synthetic',
                    'Natural Killer Cells',
                    'Cytotoxic T Lymphocytes',
                    'B Lymphocytes',
                    'T lymphocytes',
                    'Plasmacytoid Dendritic Cells',
                    'B1 Lymphocytes',
                    'Neutrophils',
                    'Mast Cells')
celltypes5 <- paste0(seq_along(celltype_names), '-', celltype_names)

In [32]:
## Store the annotated cell type in the cell data object: each cell's
## partition number ("1" to "15") is recoded to the matching celltypes5 label
partition_ids <- as.character(partitions(down_stdycds))
celltype_key <- setNames(celltypes5[1:15], as.character(1:15))
colData(down_stdycds)$assigned_cell_type <- dplyr::recode(partition_ids,
                                                          !!!celltype_key)

In [33]:
## Marker genes, grouped three per row, flattened into one vector
## in the order listed
marker_triples <- list(
    c('NRXN1', 'CLU', 'ICAM2'),
    c('CD14', 'CD68', 'AIF1'),
    c('VWF', 'EDN1', 'ECSCR'),
    c('MKI67', 'UBE2C', 'TOP2A'),
    c('ACTA2', 'TAGLN', 'MYL9'),
    c('ACKR1', 'SPARCL1', 'PECAM1'),
    c('CALD1', 'MGP', 'DCN'),
    c('NKG7', 'XCL1', 'CTSW'),
    c('CD8A', 'TRAC', 'CD2'),
    c('MS4A1', 'CD79A', 'BANK1'),
    c('CD69', 'CXCR4', 'IL7R'),
    c('LILRA4', 'IRF7', 'CLEC4C'),
    c('MZB1', 'JCHAIN', 'TNFRSF17'),
    c('LST1', 'FCGR3B', 'S100A8'),
    c('TPSAB1', 'CPA3', 'MS4A2')
)
toms_markers5 <- unlist(marker_triples)
## Look up the gene id (Ensembl) for every marker short name
toms_gene_ids5 <- short2geneid[toms_markers5]

In [34]:
## Create partition gene lists (3 genes per partition).
## The triples are kept in marker order, so part_genes[[k]] holds markers
## (3k - 2) to 3k of toms_markers5. Prepending each triple, as was done
## before, reversed the list so that entry 1 held the last triple.
triple_index <- (seq_along(toms_markers5) - 1) %/% 3
part_genes <- unname(split(toms_markers5, triple_index))

In [35]:
## Store the annotated partition genes in the cell data object: each cell's
## partition number ("1" to "15") is recoded to the one-element list
## part_genes[k] for that partition
partition_ids <- as.character(partitions(down_stdycds))
gene_key <- lapply(1:15, function(k) part_genes[k])
names(gene_key) <- as.character(1:15)
colData(down_stdycds)$assigned_genes <- dplyr::recode(partition_ids,
                                                      !!!gene_key)

In [36]:
## Save the preprocessed, annotated downsampled cell set as an R data object
## (CDS1 in the outline)
cds1_file <- paste0(ps$outdir, 'Aggregated.downsampled.QC.Preprocessed.rds')
saveRDS(down_stdycds, file = cds1_file)

In [37]:
## Get a boolean selector for cells based on quality: TRUE for every cell of
## the unfiltered set whose barcode is in the QC-passing list filtcells
filtbool <- stdycdsall@colData@rownames %in% filtcells

In [38]:
## Tag every cell of the unfiltered set by QC outcome:
## 'Good' where filtbool is TRUE, 'Fail QC' otherwise
stdycdsall@colData$state <- ifelse(filtbool, 'Good', 'Fail QC')

In [39]:
## Preprocess the full (unfiltered) cell set with num_dim = 100 so it can be
## plotted
stdycdsall <- preprocess_cds(cds = stdycdsall, num_dim = 100)

In [40]:
## Compute the UMAP reduction for the full cell set so it can be plotted
stdycdsall <- reduce_dimension(cds = stdycdsall,
                               reduction_method = 'UMAP')

In [41]:
## Cluster and partition the full cell set on the UMAP reduction so it can be
## plotted
stdycdsall <- cluster_cells(cds = stdycdsall,
                            reduction_method = 'UMAP')

In [42]:
## Output path for the plot of the full dataset coloured by QC outcome
plotfile <- 'Aggregated.Allcells.Partitions.ColorizedbyQC.Rev1.pdf'
plotfp <- paste0(ps$outdir, plotfile)

In [43]:
## Plot all cells on the UMAP, coloured by QC state, and write the plot to PDF.
## The plot object is built before the device is opened so that a failure in
## plot_cells() does not leave a PDF device open.
gtop3 <- plot_cells(stdycdsall, color_cells_by = "state", group_cells_by="partition",
            show_trajectory_graph = FALSE, reduction_method = "UMAP", cell_size = 0.1,
            label_cell_groups = FALSE, alpha = .4)
pdf(plotfp, width = 8, height = 6)
## Explicit print(): a bare `gtop3` is only drawn by top-level autoprinting,
## so under source() or inside a function the PDF would be left empty.
print(gtop3)
dev.off()

In [44]:
## Print the cell type labels for reference
print(celltypes5)

 [1] "1-Unidentified"                  "2-Macrophages"                  
 [3] "3-Endothelial Cells - (a)"       "4-Lymphoid Progenitors"         
 [5] "5-VSMCs - Contractile"           "6-Endothelial Cells - (b)"      
 [7] "7-VSMCs - Synthetic"             "8-Natural Killer Cells"         
 [9] "9-Cytotoxic T Lymphocytes"       "10-B Lymphocytes"               
[11] "11-T lymphocytes"                "12-Plasmacytoid Dendritic Cells"
[13] "13-B1 Lymphocytes"               "14-Neutrophils"                 
[15] "15-Mast Cells"                  
