In [1]:
## ---- Global Cell Set Characterization Notebook Doublet Identification---- 
##
## Load Analysis Parameters (Parm1)
## Load Preprocessed/Downsampled CDS Object
## Perform some preliminary doublet analyses (cut and paste outputted to spreadsheets)
## Create new remixed partitions
## Perform formal Gene Exclusion Study (Look for Genes Common in One Partition and Rare in Others)
## Apply Gene Selection Based on Gene Exclusion Study
## Perform Doublet (Multiplet) Scoring
## Save Result As An R Data Object (CDS1)
## Compute the mean of the maximum pseudo_R2 values for Tom's genes in the dot plot

In [5]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)
library(corpcor)
library(fdrtool)

In [3]:
## Create the working input/output data directory if it does not already exist,
## then make it the working directory so relative paths below resolve there
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste0(parentdir, 'CDC2')
## dir.exists() is TRUE only for directories (file.exists() is also TRUE for a
## regular file of the same name); recursive = TRUE also creates missing parents
if (!dir.exists(datdir)) {
    dir.create(datdir, recursive = TRUE)
}
setwd(datdir)

In [4]:
## Load the analysis parameters from the whitespace-delimited file with a header row
ps <- read.table('parms.txt', header = TRUE, stringsAsFactors = FALSE)

In [6]:
## Read all the aggregated data into a Monocle cds object, from the location
## given by the indir entry of the parameters file
## NOTE(review): stdycds is not referenced again in the visible part of this
## notebook; the analysis below works on down_stdycds - confirm it is still needed
stdycds <- load_cellranger_data(ps$indir)

In [7]:
## Load the previously preprocessed, downsampled cell set data object (RDS file)
down_rds_path <- paste0(ps$outdir, 'Aggregated.downsampled.QC.Preprocessed.rds')
down_stdycds <- readRDS(file = down_rds_path)

In [12]:
## Display the dimensions (rows, columns) of the downsampled cds object
dim(down_stdycds)

In [19]:
## Build the full output path of the PDF that will hold the 15 partition plot
plotfile <- 'Aggregated.downsampled.QC.15parts.WithoutLegend.Rev1.pdf'
plotfp <- paste0(ps$outdir, plotfile)

In [20]:
## Draw the UMAP of the cells, colored by assigned cell type and grouped by
## partition (with group labels), into the PDF device opened on plotfp
pdf(file = plotfp, width = 8, height = 6)
gtop1 <- plot_cells(
    down_stdycds,
    reduction_method = "UMAP",
    color_cells_by = "assigned_cell_type",
    group_cells_by = "partition",
    label_cell_groups = TRUE,
    show_trajectory_graph = FALSE,
    cell_size = 0.1,
    alpha = .4
)
gtop1
dev.off()

In [9]:
# ## Compute top markers using partitions (no need in this notebook to rerun this and the next
# ## 5 cells that follow this cell). Do run the 6th cell, which reads the previously stored results
# toppart <- top_markers(down_stdycds, group_cells_by = "partition",
#                                        genes_to_test_per_group = dim(down_stdycds)[1],
#                                        reference_cells = dim(down_stdycds)[2], cores = 8)




In [31]:
# ## Prepare the output for ultimate conversion to a spreadsheet
# toppart_GC <- toppart[,c(1,2,2,3:10)]
# colnames(toppart_GC)[2] <- 'gene_cards'
# nmarkers2 <- dim(toppart_GC)[1]
# gc_hyper2 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
# gc_hyper2 <- paste(gc_hyper2, 'A', 2:(nmarkers2 + 1), '), C', 2:(nmarkers2 + 1), ')', sep ='')
# toppart_GC$gene_cards <- gc_hyper2

In [18]:
# ## Write the file
# ##This file should be nearly identical to Aggregated.downsampled.QC.TopAllMarkersPer.Rev1.tsv,
# ## with only slight variation in the least significant digits due to the optimization or 
# ## sampling method used within the Monocle top_markers function
# topmarkerfile3 <- paste(ps$outdir, 'Aggregated.downsampled.15parts.TopAllMarkers.Rev3.tsv', sep = '')
# write.table(toppart_GC, file = topmarkerfile3,
#             row.names = FALSE, col.names = TRUE,
#             sep = '\t', quote = FALSE)

In [None]:
# ## Compute all top marker files (using assigned cell type - note that this output file is not used)
# # topmarkerfile <- paste(ps$outdir, 'Aggregated.downsampled.QC.TopAllMarkersPer.Rev2.tsv', sep = '')
# marker_test_res_topall <- top_markers(down_stdycds, group_cells_by = "assigned_cell_type",
#                                        genes_to_test_per_group = dim(down_stdycds)[1], 
#                                        reference_cells = dim(down_stdycds)[2], cores = 4)



In [10]:
# ## Save all top markers to a file
#     marker_test_res_topall_with_GC <- marker_test_res_topall[,c(1,2,2,3:10)]
#     colnames(marker_test_res_topall_with_GC)[2] <- 'gene_cards'
#     nmarkers1 <- dim(marker_test_res_topall_with_GC)[1]
#     gc_hyper1 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
#     gc_hyper1 <- paste(gc_hyper1, 'A', 2:(nmarkers1 + 1), '), C', 2:(nmarkers1 + 1), ')', sep ='')
#     marker_test_res_topall_with_GC$gene_cards <- gc_hyper1

In [11]:
# ## Write the file
# ## Should be similar to Rev1 file of the same name
# topmarkerfile2 <- paste(ps$outdir, 'Aggregated.downsampled.15parts.TopAllMarkers.Rev2.tsv', sep = '')
# write.table(marker_test_res_topall_with_GC, file = topmarkerfile2,
#             row.names = FALSE, col.names = TRUE,
#             sep = '\t', quote = FALSE)

In [9]:
# ## I've added this code from the original CSC1 code (notebook 03.00.00...)
# ## Compute top 500 marker files
# topmarkerfile <- paste(ps$outdir, 'Aggregated.downsampled.QC.Top500MarkersPer.Rev3.tsv', sep = '')
# marker_test_res_top500 <- top_markers(down_stdycds, group_cells_by = "partition",
#                                        genes_to_test_per_group = 500, # dim(down_stdycds)[1], 
#                                        reference_cells = dim(down_stdycds)[2], cores = 4)
# ## Save top 500 markers to a file
#     marker_test_res_top500_with_GC <- marker_test_res_top500[,c(1,2,2,3:10)]
#     colnames(marker_test_res_top500_with_GC)[2] <- 'gene_cards'
#     nmarkers1 <- dim(marker_test_res_top500_with_GC)[1]
#     gc_hyper1 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
#     gc_hyper1 <- paste(gc_hyper1, 'A', 2:(nmarkers1 + 1), '), C', 2:(nmarkers1 + 1), ')', sep ='')
#     marker_test_res_top500_with_GC$gene_cards = gc_hyper1
    
# write.table(marker_test_res_top500_with_GC, file = topmarkerfile,
#             row.names = FALSE, col.names = TRUE,
#             sep = '\t', quote = FALSE)



In [10]:
## Load the original top markers table (this file was created with code similar,
## nearly identical, to the code used to generate
## Aggregated.downsampled.15parts.TopAllMarkers.Rev3.tsv above)
topmarkers_path <- paste0(ps$outdir, 'Aggregated.downsampled.QC.TopAllMarkersPer.Rev1.tsv')
topmarkers <- read.table(topmarkers_path, sep = '\t', header = TRUE, row.names = NULL)

In [8]:
## Lookup vector: gene short name (names) -> gene id (Ensembl) (values)
short2geneid <- setNames(fData(down_stdycds)@rownames,
                         fData(down_stdycds)@listData$gene_short_name)

In [9]:
## Lookup vector: gene id (Ensembl) (names) -> gene short name (values)
geneid2short <- setNames(fData(down_stdycds)@listData$gene_short_name,
                         fData(down_stdycds)@rownames)

In [10]:
## Describe how the six cell sets are organized: the condition and the
## patient label of each set, in cell set order
cellgrps <- rep(c('healthy', 'diseased'), times = 3)
cellpats <- rep(paste('ID Number', 1:3), each = 2)

In [11]:
## Create variables for how cells sets are organized (condition and patient per cell set)
## NOTE(review): this cell assigns the same values to cellgrps and cellpats as the
## cell just before it, so it appears to be redundant
cellgrps <- c('healthy', 'diseased', 'healthy', 'diseased', 'healthy', 'diseased')
cellpats <- c('ID Number 1', 'ID Number 1', 'ID Number 2', 'ID Number 2', 'ID Number 3', 'ID Number 3')

In [12]:
## Define the cell type labels; each label is prefixed with its position (1 to 15)
celltype_names5 <- c('Unidentified',
                     'Macrophages',
                     'Endothelial Cells - (a)',
                     'Lymphoid Progenitors',
                     'VSMCs - Contractile',
                     'Endothelial Cells - (b)',
                     'VSMCs - Synthetic',
                     'Natural Killer Cells',
                     'Cytotoxic T Lymphocytes',
                     'B Lymphocytes',
                     'T lymphocytes',
                     'Plasmacytoid Dendritic Cells',
                     'B1 Lymphocytes',
                     'Neutrophils',
                     'Mast Cells')
celltypes5 <- paste0(seq_along(celltype_names5), '-', celltype_names5)

In [13]:
## Declare Tom's best genes for defining cell types (gene short names)
## NOTE(review): the 15 rows of three genes below appear to line up, in order,
## with the 15 cell types defined above - confirm
toms_markers5 <- c('NRXN1', 'CLU', 'ICAM2',
                 'CD14', 'CD68', 'AIF1',
                 'VWF', 'EDN1', 'ECSCR',
                 'MKI67', 'UBE2C', 'TOP2A',
                 'ACTA2', 'TAGLN', 'MYL9',
                 'ACKR1', 'SPARCL1', 'PECAM1',
                 'CALD1', 'MGP', 'DCN',
                 'NKG7', 'XCL1', 'CTSW',
                 'CD8A', 'TRAC', 'CD2',
                 'MS4A1', 'CD79A', 'BANK1',
                 'CD69', 'CXCR4', 'IL7R',
                 'LILRA4', 'IRF7', 'CLEC4C',
                 'MZB1', 'JCHAIN', 'TNFRSF17',
                 'LST1', 'FCGR3B', 'S100A8',
                 'TPSAB1', 'CPA3', 'MS4A2')
## Translate the short names to gene ids with the short2geneid lookup; a short
## name that is absent from the lookup yields NA
toms_gene_ids5 <- short2geneid[toms_markers5]

In [14]:
## This code was used to generate the cut and paste versions of the
## gene exclusion studies, prior to remixing the partitions

In [14]:
## Parameters for one partition's worth of analysis
np <- 15              ## Number of partitions
poi <- 6              ## Partition of interest
tl <- 0.90            ## Top level threshold on the fraction of cells expressing a gene
bl <- 0.10            ## Bottom level threshold on the fraction of cells expressing a gene
qp <- 1               ## Number of partitions needed at or below bl to qualify
xlpb <- TRUE          ## Exclude low cell count problem partitions from qualifying partitions
lp <- c(1, 4, 11:15)  ## Small or low expressing partitions: 1, 4, 11, 12, 13, 14, 15
## Marker rows of the partition of interest
in_poi <- topmarkers$cell_group == poi
tlgenes_f1 <- topmarkers[in_poi, ]
## ... restricted to genes expressed in at least tl of its cells
above_tl <- tlgenes_f1$fraction_expressing >= tl
tlgenes_f2 <- tlgenes_f1[above_tl, ]
tlgenes <- as.character(tlgenes_f2$gene_short_name.1)
## Marker rows, in every partition, for those top level genes
is_tlgene <- topmarkers$gene_short_name.1 %in% tlgenes
tlgenes_f3 <- topmarkers[is_tlgene, ]

In [23]:
## Build the gene x partition table of the fraction of cells expressing each
## top level gene, then keep the genes that are rare (<= bl) in at least qp partitions
gene_tab <- matrix(as.double(0), nrow = length(tlgenes), ncol = np)
rownames(gene_tab) <- tlgenes
## seq_along()/seq_len() iterate zero times on empty input (1:0 would run twice)
for (g in seq_along(tlgenes)) {
    tlgenes_f3_p <- tlgenes_f3[tlgenes_f3$gene_short_name.1 %in% tlgenes[g], ]
    for (p in seq_len(np)) {
        gene_tab[g, p] <- round(tlgenes_f3_p$fraction_expressing[tlgenes_f3_p$cell_group == p], 3)
    }
}
## Force the problem partitions to 1 so they can never count as "rare"
if (xlpb) gene_tab[, lp] <- 1
qual_parts <- rowSums(gene_tab <= bl)
## drop = FALSE keeps a matrix even when a single gene qualifies; without it
## the result collapses to a vector and rowSums() fails
qual_gene_tab <- gene_tab[qual_parts >= qp, , drop = FALSE]
qual_part_filt <- rowSums(qual_gene_tab <= bl)
qual_gene_tabx <- cbind(qual_gene_tab, qual_part_filt)
## Rank by number of qualifying partitions, then by expression in the partition of interest
qual_gene_tabx <- qual_gene_tabx[order(qual_part_filt, qual_gene_tabx[, poi], decreasing = TRUE), ,
                                 drop = FALSE]
## Label the columns from np rather than a hard-coded 15
colnames(qual_gene_tabx) <- c(paste0('p', seq_len(np)), 'match')
dim(qual_gene_tabx)

In [24]:
## Extract the expression values of the qualifying genes (rows) for the cells
## of the partition of interest (columns)
poi_genes <- short2geneid[rownames(qual_gene_tabx)]
gene_rows <- rowData(down_stdycds)$gene_short_name %in% rownames(qual_gene_tabx)
cell_cols <- partitions(down_stdycds) == poi
poi_exp <- exprs(down_stdycds[gene_rows, cell_cols])

In [26]:
## Get the co-expression (pairwise correlation) terms for this partition and genes
poi_cor <- cor(t(as.matrix(poi_exp)))
poi_cor_v <- round(sm2vec(poi_cor, diag = FALSE), 3)
poi_ind <- sm.index(poi_cor, diag = FALSE)
## cbind() with the gene names turns every column into character strings
coex <- cbind(poi_cor_v, abs(poi_cor_v),
              as.character(geneid2short[rownames(poi_cor)[poi_ind[, 1]]]),
              as.character(geneid2short[rownames(poi_cor)[poi_ind[, 2]]]))
## Rank the pairs by absolute correlation using the numeric values (ordering the
## character column of coex would sort the numbers as strings); drop = FALSE
## keeps coex a matrix even when there is only one pair of genes
coex <- coex[order(abs(poi_cor_v), decreasing = TRUE), , drop = FALSE]
## One "geneA to geneB = r" string per row of the gene table; when there are
## fewer pairs than genes the remaining entries are NA
coex_lst <- paste0(coex[, 3], ' to ', coex[, 4], ' = ', coex[, 1])[seq_len(nrow(qual_gene_tabx))]
coex_all <- as.data.frame(cbind(qual_gene_tabx, coex_lst), stringsAsFactors = FALSE)
## Keep an unformatted copy for the fold change code further down
coex_4db <- coex_all

In [19]:
## Adjust the table to be in percentages instead of fractions, and add the
## percentage for the partition of interest to the gene names (the rownames)
rownames(coex_all) <- paste0(rownames(coex_all), '(', 100 * as.numeric(coex_all[, poi]), ')')
## seq_len() iterates zero times on an empty table (1:0 would visit rows 1 and 0)
for (i in seq_len(nrow(qual_gene_tabx))) {
    for (j in seq_len(np)) {
        frac <- as.numeric(coex_all[i, j])
        ## Blank out entries above the bottom level; show the rest as percentages
        coex_all[i, j] <- if (frac > bl) '' else frac * 100
    }
}


In [25]:
## This is the cut and paste table (commented out here)
# coex_all

In [21]:
## Loop through the partitions and, for each gene of coex_4db, print a fold change
## of UMI between cells that express the gene and cells that do not
## This is trial code
## NOTE(review): ref_genes is not defined anywhere in the visible notebook, so this
## cell stops with "object 'ref_genes' not found" (see the recorded error below)
for (p in 1:np) {
    ## Expression matrix restricted to the cells of partition p
    partx <- exprs(down_stdycds[, partitions(down_stdycds) == p])
    ## Loop through the genes
    for (g in 1:dim(coex_4db)[1]) {
        test_sgene <- rownames(coex_4db)[g]
        test_gid <- short2geneid[test_sgene]
        ## TRUE for the cells of this partition with a nonzero count for the test gene
        test_cells_o <- partx[test_gid,] > 0
        ncells <- dim(partx)[2]
        ## Per-cell mean over all genes, for expressing and non-expressing cells
        pluscells_meanUMI <- colMeans(partx[, test_cells_o])
        minuscells_meanUMI <- colMeans(partx[, !test_cells_o])
        ## Ratio of the two group means
        upUMI <- mean(pluscells_meanUMI)/mean(minuscells_meanUMI)
        ## Same ratio restricted to the reference gene of this partition
        ref_sgene <- ref_genes[p]
        ref_gid <- short2geneid[ref_sgene]
        pluscells_refUMI <- mean(partx[ref_gid, test_cells_o])
        minuscells_refUMI <- mean(partx[ref_gid, !test_cells_o])       
        calUMI <- mean(pluscells_refUMI)/mean(minuscells_refUMI)
        ## Print: expressing cell count, table entry, both ratios and their quotient
        print(round(as.numeric(c(sum(test_cells_o),
                                 coex_4db[g, p],
                                 upUMI, calUMI,
                                 upUMI/calUMI)), 3))
    }
}


ERROR: Error in eval(expr, envir, enclos): object 'ref_genes' not found


In [None]:
## This is the first code for working with remixed partitions

In [22]:
## Set up the new partition id colData variable remix: every original partition
## number is replaced by the label of the remixed group that contains it
remix <- as.character(partitions(down_stdycds))
remix_groups <- list('FAILQC2.1.4.11-15' = c(1, 4, 11:15),
                     'ENDO.3.6' = c(3, 6),
                     'VSMC.5.7' = c(5, 7),
                     'MACRO.2' = 2,
                     'NKC.8' = 8,
                     'CYTOT.9' = 9,
                     'BLYMPH.10' = 10)
for (remix_label in names(remix_groups)) {
    remix[remix %in% remix_groups[[remix_label]]] <- remix_label
}
## The remixed groups kept for analysis (everything except the FAILQC2 group)
remix_keepers <- c('MACRO.2', 'ENDO.3.6', 'VSMC.5.7', 'NKC.8', 'CYTOT.9', 'BLYMPH.10')
down_stdycds@colData$remix <- remix

In [39]:
## Sanity checks: number of cells in the kept remix groups, number of cell
## barcodes, and number of remix labels
## (count by label rather than by position in the alphabetically sorted table(),
## which silently changes meaning if a group is added, renamed or empty)
sum(down_stdycds@colData$remix %in% remix_keepers)
length(colnames(down_stdycds))
length(down_stdycds@colData$remix)
## Save the remix cell barcodes and cluster name/mappings
## NOTE(review): this path is relative to the working directory, unlike the other
## outputs, which are written under ps$outdir - confirm this is intended
remix_codes <- cbind(colnames(down_stdycds), down_stdycds@colData$remix)
write.table(remix_codes, file ='outputdat/Remix_Codes.Rev1.txt', col.names = FALSE)

In [40]:
## Display the current working directory
getwd()

In [28]:
## Set up a new topmarkers-style data frame (one row per gene) for the new
## partition definitions; its four columns take their names from columns
## 1, 3, 4 and 7 of topmarkers
## (nrow is passed with "=": "nrow <-" would also create a stray global variable
## called nrow and hand the value over by position only)
newtop <- as.data.frame(matrix('', nrow = dim(down_stdycds)[1], ncol = 4),
                        stringsAsFactors = FALSE)
colnames(newtop) <- colnames(topmarkers)[c(1, 3, 4, 7)]
newtop$gene_id <- fData(down_stdycds)@rownames
newtop$gene_short_name.1 <- geneid2short[newtop$gene_id]

In [29]:
## For every remixed partition fill in a copy of newtop (partition label, mean
## expression and fraction of expressing cells per gene), then stack the copies
## into the single data object topmix
topmix_parts <- vector('list', length(remix_keepers))
for (k in seq_along(remix_keepers)) {
    p <- remix_keepers[k]
    newp_exp <- exprs(down_stdycds[, colData(down_stdycds)$remix == p])
    ncells <- dim(newp_exp)[2]
    newtop$cell_group <- p
    newtop$mean_expression <- rowMeans(newp_exp)
    newtop$fraction_expressing <- rowSums(newp_exp > 0)/ncells
    topmix_parts[[k]] <- newtop
}
topmix <- do.call(rbind, topmix_parts)

In [30]:
## Now code the gene exclusion Study Using the remixed partition frame work

In [31]:
## Parameters, again in the cut and paste scenario: a single partition at a time
np <- length(remix_keepers)     ## Number of partitions
poi <- remix_keepers[5]         ## Partition of interest
tl <- 0.78                      ## Top level threshold on the fraction of cells expressing a gene
bl <- 0.10                      ## Bottom level threshold on the fraction of cells expressing a gene
qp <- 1                         ## Number of partitions needed at or below bl to qualify
# xlpb <- TRUE                  ## Exclude low cell counts problem partitions from qualifying partitions
# lp <- c(1, 4, 11:15)          ## Small or low expressing partitions: 1, 4, 11, 12, 13, 14, 15
## Rows of topmix for the partition of interest
in_poi <- topmix$cell_group == poi
tlgenes_f1 <- topmix[in_poi, ]
## ... restricted to genes expressed in at least tl of its cells
above_tl <- tlgenes_f1$fraction_expressing >= tl
tlgenes_f2 <- tlgenes_f1[above_tl, ]
tlgenes <- as.character(tlgenes_f2$gene_short_name.1)
## Rows of topmix, in every remixed partition, for those top level genes
is_tlgene <- topmix$gene_short_name.1 %in% tlgenes
tlgenes_f3 <- topmix[is_tlgene, ]

In [32]:
## Compute the new but limited topmarker matrix: fraction of cells expressing
## each top level gene (rows) in each remixed partition (columns), then keep the
## genes that are rare (<= bl) in at least qp partitions
gene_tab <- matrix(as.double(0), nrow = length(tlgenes), ncol = np)
rownames(gene_tab) <- tlgenes
## seq_along()/seq_len() iterate zero times on empty input (1:0 would run twice)
for (g in seq_along(tlgenes)) {
    tlgenes_f3_p <- tlgenes_f3[tlgenes_f3$gene_short_name.1 %in% tlgenes[g], ]
    for (p in seq_len(np)) {
        ptxt <- remix_keepers[p]
        gene_tab[g, p] <- round(tlgenes_f3_p$fraction_expressing[tlgenes_f3_p$cell_group == ptxt], 3)
    }
}
# if (xlpb) gene_tab[, lp] <- 1 
qual_parts <- rowSums(gene_tab <= bl)
## drop = FALSE keeps a matrix even when a single gene qualifies
qual_gene_tab <- gene_tab[qual_parts >= qp, , drop = FALSE]
qual_part_filt <- rowSums(qual_gene_tab <= bl)
qual_gene_tabx <- cbind(qual_gene_tab, qual_part_filt)
## Column of the partition of interest. match() returns the position of poi within
## remix_keepers; which(poi %in% remix_keepers) is always 1 for a single poi, which
## ranked every partition's table by the first partition's column
thispart <- match(poi, remix_keepers)
qual_gene_tabx <- qual_gene_tabx[order(qual_part_filt, qual_gene_tabx[, thispart], decreasing = TRUE), ,
                                 drop = FALSE]
colnames(qual_gene_tabx) <- c(remix_keepers, 'match')
dim(qual_gene_tabx)

In [33]:
## Extract the expression values of just this gene set (rows) for the cells
## of the remixed partition of interest (columns)
poi_genes <- short2geneid[rownames(qual_gene_tabx)]
gene_rows <- rowData(down_stdycds)$gene_short_name %in% rownames(qual_gene_tabx)
cell_cols <- colData(down_stdycds)$remix == poi
poi_exp <- exprs(down_stdycds[gene_rows, cell_cols])

In [34]:
## Perform the co-expression (pairwise correlation) calculation
poi_cor <- cor(t(as.matrix(poi_exp)))
poi_cor_v <- round(sm2vec(poi_cor, diag = FALSE), 3)
poi_ind <- sm.index(poi_cor, diag = FALSE)
## cbind() with the gene names turns every column into character strings
coex <- cbind(poi_cor_v, abs(poi_cor_v),
              as.character(geneid2short[rownames(poi_cor)[poi_ind[, 1]]]),
              as.character(geneid2short[rownames(poi_cor)[poi_ind[, 2]]]))
## Rank the pairs by absolute correlation using the numeric values (ordering the
## character column of coex would sort the numbers as strings); drop = FALSE
## keeps coex a matrix even when there is only one pair of genes
coex <- coex[order(abs(poi_cor_v), decreasing = TRUE), , drop = FALSE]
## One "geneA to geneB = r" string per row of the gene table; when there are
## fewer pairs than genes the remaining entries are NA
coex_lst <- paste0(coex[, 3], ' to ', coex[, 4], ' = ', coex[, 1])[seq_len(nrow(qual_gene_tabx))]
coex_all <- as.data.frame(cbind(qual_gene_tabx, coex_lst), stringsAsFactors = FALSE)
## Keep an unformatted copy before the table is reformatted for display
coex_4db <- coex_all

In [35]:
## Reformat results into percentages, from fractions, and add the percentage for
## the partition of interest to the gene names (the rownames)
rownames(coex_all) <- paste0(rownames(coex_all), '(', 100 * as.numeric(coex_all[, poi]), ')')
## seq_len() iterates zero times on an empty table (1:0 would visit rows 1 and 0)
for (i in seq_len(nrow(qual_gene_tabx))) {
    for (j in seq_len(np)) {
        frac <- as.numeric(coex_all[i, j])
        ## Blank out entries above the bottom level; show the rest as percentages
        coex_all[i, j] <- if (frac > bl) '' else frac * 100
    }
}



In [130]:
## This is the cut and paste table (if uncommented)
# coex_all

In [37]:
## Allocate the remix partition breakdown table (cell counts by partition across
## patients, diseased and healthy): one row per distinct remix label, one column
## per cell set plus two summary columns
n_remix_groups <- length(table(remix))
part_stats2 <- as.data.frame(matrix('', nrow = n_remix_groups, ncol = length(cellgrps) + 2),
                             stringsAsFactors = FALSE)

In [38]:
## Put the patient IDs and the cell groups (conditions) in the first two rows
part_stats2 <- rbind(cellpats, cellgrps, part_stats2)
## Name the rows: two header rows followed by every remix group, kept groups first
remixall <- c(remix_keepers, 'FAILQC2.1.4.11-15')
rownames(part_stats2) <- c('Patient ID', 'Condition', remixall)
## Two-line headings of the summary columns 7 and 8
part_stats2[1, 7:8] <- c('Total', 'Mean')
part_stats2[2, 7:8] <- c('Cells', 'UMI')

In [39]:
## Collect the remix labels to summarize (those that belong to remixall)
remix_labels <- colData(down_stdycds)$remix
parts <- remix_labels[remix_labels %in% remixall]

In [40]:
## Fill in the cell counts of every remix group for each cell set, then the
## total cell count and the mean UMI per cell of every remix group
cell_remix <- colData(down_stdycds)$remix
cell_patient <- colData(down_stdycds)$patient
cell_condition <- colData(down_stdycds)$condition
n_sets <- length(cellgrps)
## Only the first n_sets columns describe cell sets; looping over every column
## would compare against cellpats/cellgrps entries that do not exist (NA).
## The per-cell remix labels are used directly so that all three comparisons
## run over the same cells.
for (s in seq_len(n_sets)) {
    for (k in seq_along(remixall)) {
        cells <- sum((cell_remix == remixall[k]) &
                     (cell_patient == cellpats[s]) &
                     (cell_condition == cellgrps[s]))
        ## Rows 1 and 2 are the header rows, so group k lives in row k + 2
        part_stats2[k + 2, s] <- cells
    }
}
for (k in seq_along(remixall)) {
    r <- k + 2
    ## Total cells over the cell sets
    part_stats2[r, n_sets + 1] <- sum(as.numeric(unlist(part_stats2[r, seq_len(n_sets)])))
    ## Mean of the per-cell UMI totals for this remix group
    part_stats2[r, n_sets + 2] <- round(mean(colSums(exprs(down_stdycds[,
                        cell_remix == remixall[k]]))), 0)
}

In [42]:
## Write the partition statistics table to the output directory (no header line)
stats2_path <- paste0(ps$outdir, 'Aggregated.downsampled.QC.Partition.CellCount.Stats2.txt')
write.table(part_stats2, file = stats2_path, col.names = FALSE)

In [44]:
## Recoded from above, in a loop form for all partitions of
## the gene exclusion Study Using the remixed partition frame work
## This is not cut and paste

In [77]:
## Compute the doublet statistics (4 tables) and count, per cell, the
## significant hits in an updated copy of the cell dataset object.
##
## For every entry of pgtab at or below bottom_level, the cells of that
## partition are split into cells that express the gene (count > 0) and cells
## that do not; the per-cell UMI totals of the two groups are compared with a
## one-sided t-test (expressing cells greater).
##
## Arguments:
##   pgtab        - table of fractions of expressing cells: one row per gene
##                  (row names are gene short names), one column per remixed
##                  partition (column names are the remix labels)
##   cdsdat       - cell dataset whose colData holds the columns remix and pcount
##   bottom_level - an entry is tested only when it is <= this value
##   pthresh      - when the test p-value is <= this value, pcount is
##                  incremented for the expressing cells
##
## Returns an unnamed list of five items: the cell count, fold change,
## t statistic and p-value tables (entries that were not tested are ''), and
## the updated cell dataset.
##
## Uses the global lookup short2geneid to translate the row names of pgtab.
doublet_stats <- function(pgtab, cdsdat, bottom_level, pthresh) {
    bl <- bottom_level
    ## The result tables start as copies of pgtab so they keep its shape and names
    cell_cnt_tab <- pgtab
    cell_umi_enr <- pgtab
    cell_tstat <- pgtab
    cell_pval <- pgtab
    expdat <- exprs(cdsdat)
    cell_parts <- colData(cdsdat)$remix
    cell_bars <- colnames(cdsdat)
    ## Work on a plain vector of counters and write it back once at the end,
    ## instead of subset-assigning into the cds for every significant gene
    pcount <- colData(cdsdat)$pcount
    stopifnot(!is.null(cell_parts), !is.null(pcount))
    gene_ids <- short2geneid[rownames(pgtab)]
    part_names <- colnames(pgtab)
    for (pcnt in seq_along(part_names)) {
        pexpdat <- expdat[, cell_parts == part_names[pcnt], drop = FALSE]
        for (gcnt in seq_along(gene_ids)) {
            ## as.numeric(): pgtab may hold its fractions as character strings,
            ## and comparing a string with a number compares them as text
            if (as.numeric(pgtab[gcnt, pcnt]) <= bl) {
                pluscells <- pexpdat[which(rownames(pexpdat) %in% gene_ids[gcnt]), ] > 0
                minuscells <- !pluscells
                cell_cnt_tab[gcnt, pcnt] <- sum(pluscells)
                ## Per-cell UMI totals of the expressing and non-expressing cells
                plusumi <- colSums(pexpdat[, pluscells, drop = FALSE])
                minusumi <- colSums(pexpdat[, minuscells, drop = FALSE])
                if (length(plusumi) < 2 || length(minusumi) < 2) {
                    ## The Welch t-test needs at least two cells in each group;
                    ## report the count only and flag no cells
                    cell_umi_enr[gcnt, pcnt] <- NA
                    cell_tstat[gcnt, pcnt] <- NA
                    cell_pval[gcnt, pcnt] <- NA
                    next
                }
                foldch <- round(mean(plusumi) / mean(minusumi), 2)
                tt <- t.test(x = minusumi, y = plusumi, alternative = 'less')
                cell_umi_enr[gcnt, pcnt] <- foldch
                cell_tstat[gcnt, pcnt] <- round(tt$statistic, 2)
                cell_pval[gcnt, pcnt] <- formatC(tt$p.value, format = "e", digits = 2)
                if (tt$p.value <= pthresh) {
                    ## Count one more hit for every cell expressing this gene
                    plusbars <- colnames(pexpdat)[pluscells]
                    hit <- cell_bars %in% plusbars
                    pcount[hit] <- pcount[hit] + 1
                }
            } else {
                cell_cnt_tab[gcnt, pcnt] <- ''
                cell_umi_enr[gcnt, pcnt] <- ''
                cell_tstat[gcnt, pcnt] <- ''
                cell_pval[gcnt, pcnt] <- ''
            }
        }
    }
    updated_cds <- cdsdat ## Make a copy
    colData(updated_cds)$pcount <- pcount
    ## paste() with its default separator puts a space between prefix and name
    colnames(cell_cnt_tab) <- paste('ccnt.', colnames(cell_cnt_tab))
    colnames(cell_umi_enr) <- paste('foldc.', colnames(cell_umi_enr))
    colnames(cell_tstat) <- paste('tstat.', colnames(cell_tstat))
    colnames(cell_pval) <- paste('pval.', colnames(cell_pval))

    list(cell_cnt_tab, cell_umi_enr, cell_tstat, cell_pval, updated_cds)
}

In [78]:
## Recompute the topmarkers data based on merged and deleted partitions,
## and without partition level DE (gene expression levels only)

## Set up new partition id colData variable remix
remix <- as.character(partitions(down_stdycds))
remix[remix %in% c(1, 4, 11:15)] <- 'FAILQC2.1.4.11-15'
remix[remix %in% c(3, 6)] <- 'ENDO.3.6'
remix[remix %in% c(5, 7)] <- 'VSMC.5.7'
remix[remix == 2] <- 'MACRO.2'
remix[remix == 8] <- 'NKC.8'
remix[remix == 9] <- 'CYTOT.9'
remix[remix == 10] <- 'BLYMPH.10'
## The remixed groups kept for analysis (everything except the FAILQC2 group)
remix_keepers <- c('MACRO.2', 'ENDO.3.6', 'VSMC.5.7', 'NKC.8', 'CYTOT.9', 'BLYMPH.10')
down_stdycds@colData$remix <- remix

## Set up new topmarkers data object (one row per gene) based on new partition
## definitions; the column names come from columns 1, 3, 4 and 7 of topmarkers
## (nrow is passed with "=": "nrow <-" would also create a stray global variable
## called nrow and hand the value over by position only)
newtop <- as.data.frame(matrix('', nrow = dim(down_stdycds)[1], ncol = 4),
                        stringsAsFactors = FALSE)
colnames(newtop) <- colnames(topmarkers)[c(1, 3, 4, 7)]
newtop$gene_id <- fData(down_stdycds)@rownames
newtop$gene_short_name.1 <- geneid2short[newtop$gene_id]

## Loop through all the remixed partitions, fill in the newtop object
## (partition label, mean expression, fraction of expressing cells per gene)
## and append into a single data object for all the remixed partitions
for (p in remix_keepers) {
    newtop$cell_group <- p
    newp_exp <- exprs(down_stdycds[, colData(down_stdycds)$remix == p])
    newtop$mean_expression <- rowMeans(newp_exp)
    ncells <- dim(newp_exp)[2]
    newtop$fraction_expressing <- rowSums(newp_exp > 0)/ncells
    if (p == remix_keepers[1]) {
        topmix <- newtop
    } else {
        topmix <- rbind(topmix, newtop)
    }
}

In [121]:
## Gene exclusion study using the remixed partition framework, this time in a loop
## over every kept partition. For each partition of interest (poi) it builds the
## table of top level genes that are rare elsewhere, runs doublet_stats() on it
## (accumulating the per-cell pcount in down_stdycds), and appends the formatted
## tables to sum_coex_all.
np <- length(remix_keepers)     ## Number of Partitions
## Top level genes expressing cells thresholds (varies by partition)
tl_gp <- c(.9, .9, .87, .9, .78, .9)
stopifnot(length(tl_gp) == np)  ## One threshold is needed per kept partition
pvalue.thresh <- 0.005          ## P-value threshold for calling doublets 
bl <- 0.10                      ## Bottom level genes expressing cells threshold
qp <- 1                         ## Number of partitions needed below bl to qualify
options(stringsAsFactors = FALSE)
colData(down_stdycds)$pcount <- 0 ## Initialize cell counter

## Put a first row above tab that repeats the column names of tab; label becomes
## the row name of that first row
add_header_row <- function(tab, label) {
    header <- matrix('', nrow = 1, ncol = dim(tab)[2])
    colnames(header) <- colnames(tab)
    rownames(header) <- label
    out <- rbind(header, tab)
    out[1, ] <- colnames(tab)
    out
}

for (cnt in seq_along(remix_keepers)) {
    poi <- remix_keepers[cnt]
    tl <- tl_gp[cnt]
    tlgenes_f1 <- topmix[topmix$cell_group == poi, ]
    tlgenes_f2 <- tlgenes_f1[tlgenes_f1$fraction_expressing >= tl, ]
    tlgenes <- as.character(tlgenes_f2$gene_short_name.1)
    tlgenes_f3 <- topmix[topmix$gene_short_name.1 %in% tlgenes, ]

    ## Compute the new but limited topmarker matrix 
    gene_tab <- matrix(as.double(0), nrow = length(tlgenes), ncol = np)
    rownames(gene_tab) <- tlgenes
    for (g in seq_along(tlgenes)) {
        tlgenes_f3_p <- tlgenes_f3[tlgenes_f3$gene_short_name.1 %in% tlgenes[g], ]
        for (p in seq_len(np)) {
            ptxt <- remix_keepers[p]
            gene_tab[g, p] <- round(tlgenes_f3_p$fraction_expressing[tlgenes_f3_p$cell_group == ptxt], 3)
        }
    }
    qual_parts <- rowSums(gene_tab <= bl)
    ## drop = FALSE keeps a matrix even when a single gene qualifies
    qual_gene_tab <- gene_tab[qual_parts >= qp, , drop = FALSE]
    qual_part_filt <- rowSums(qual_gene_tab <= bl)
    qual_gene_tabx <- cbind(qual_gene_tab, qual_part_filt)
    ## Column of the partition of interest. match() returns the position of poi in
    ## remix_keepers; which(poi %in% remix_keepers) is always 1 for a single poi,
    ## which ranked every partition's table by the first partition's column
    thispart <- match(poi, remix_keepers)
    qual_gene_tabx <- qual_gene_tabx[order(qual_part_filt, qual_gene_tabx[, thispart], decreasing = TRUE), ,
                                     drop = FALSE]
    colnames(qual_gene_tabx) <- c(remix_keepers, 'match')

    ## Get the expression data for just this gene set
    poi_genes <- short2geneid[rownames(qual_gene_tabx)]
    poi_exp <- exprs(down_stdycds[rowData(down_stdycds)$gene_short_name %in% rownames(qual_gene_tabx),
        colData(down_stdycds)$remix == poi])

    ## Perform co-expression calculation
    poi_cor <- cor(t(as.matrix(poi_exp)))
    poi_cor_v <- round(sm2vec(poi_cor, diag = FALSE), 3)
    poi_ind <- sm.index(poi_cor, diag = FALSE)
    coex <- cbind(poi_cor_v, abs(poi_cor_v),
                  as.character(geneid2short[rownames(poi_cor)[poi_ind[, 1]]]),
                  as.character(geneid2short[rownames(poi_cor)[poi_ind[, 2]]]))
    ## Rank the pairs by absolute correlation using the numeric values (ordering the
    ## character column of coex would sort the numbers as strings); drop = FALSE
    ## keeps coex a matrix even when there is only one pair of genes
    coex <- coex[order(abs(poi_cor_v), decreasing = TRUE), , drop = FALSE]
    ## One "geneA to geneB = r" string per row of the gene table
    coex_lst <- paste0(coex[, 3], ' to ', coex[, 4], ' = ', coex[, 1])[seq_len(nrow(qual_gene_tabx))]
    coex_all <- as.data.frame(cbind(qual_gene_tabx, coex_lst), stringsAsFactors = FALSE)

    ## Compute the other matrices needed to determine potential doublet enrichment
    coex_red <- coex_all[, 1:np]
    coex_stat_lst <- doublet_stats(coex_red, down_stdycds, bl, pvalue.thresh)
    colData(down_stdycds)$pcount <- colData(coex_stat_lst[[5]])$pcount ## Update CDS object

    ## Reformat results: percentages instead of fractions, blank above the bottom level
    rownames(coex_all) <- paste0(rownames(coex_all), '(', 100 * as.numeric(coex_all[, poi]), ')')
    for (i in seq_len(nrow(qual_gene_tabx))) {
        for (j in seq_len(np)) {
            frac <- as.numeric(coex_all[i, j])
            coex_all[i, j] <- if (frac > bl) '' else frac * 100
        }
    }

    ## Add a heading row to each table and place the tables side by side
    coex_all <- add_header_row(coex_all,
                               paste('Partition ', poi, ", Thresh = ", round(tl * 100, 0), '%', sep = ''))
    coex_all_cnt <- add_header_row(coex_stat_lst[[1]], 'Cell Counts')
    coex_all <- cbind(coex_all, coex_all_cnt)
    coex_all_fc <- add_header_row(coex_stat_lst[[2]], 'Fold Change')
    coex_all <- cbind(coex_all, coex_all_fc)
    coex_all_tstat <- add_header_row(coex_stat_lst[[3]], 'T Statistic')
    coex_all <- cbind(coex_all, coex_all_tstat)
    coex_all_pval <- add_header_row(coex_stat_lst[[4]], 'p Value')
    coex_all <- cbind(coex_all, coex_all_pval)

    ## Stack the tables of all partitions
    if (cnt == 1) {
        sum_coex_all <- coex_all
    } else {
        sum_coex_all <- rbind(sum_coex_all, coex_all)
    }
}

In [124]:
## Rerun the code above get the the pcount for various p-value thresholds and save in separate variables 
# colData(down_stdycds)$p0.0 <- colData(down_stdycds)$pcount
# colData(down_stdycds)$p1.0 <- colData(down_stdycds)$pcount
# colData(down_stdycds)$p0.5 <- colData(down_stdycds)$pcount
# colData(down_stdycds)$p0.05 <- colData(down_stdycds)$pcount
# colData(down_stdycds)$p0.005 <- colData(down_stdycds)$pcount
# colData(down_stdycds)$p0.0005 <- colData(down_stdycds)$pcount
# colData(down_stdycds)$p0.00005 <- colData(down_stdycds)$pcount

In [133]:
## Save the updated CDS object to a file (CDS1 in the outline)
doublet_rds_path <- paste0(ps$outdir, 'Aggregated.downsampled.QC.Doublets.Counted.rds')
saveRDS(down_stdycds, file = doublet_rds_path)

In [98]:
## Save the automated doublet findings (gene exclusion study); the file name has
## no directory part, so it is written to the current working directory
write.table(sum_coex_all, file = 'Auto.Gene.Exclusion.Study.Rev5.txt', col.names = FALSE)

In [1]:
## Run garbage collection and report memory usage
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,539119,28.8,1168712,62.5,813602,43.5
Vcells,1037983,8.0,8388608,64.0,1760016,13.5


In [48]:
## Compute the mean, over Tom's genes in the dot plot, of each gene's maximum
## pseudo_R2 (rounded to 3 digits) among its rows of the top markers table.
## Every id in toms_gene_ids5 is used, instead of a hard-coded 1:45 slice that
## would have to be kept in step with the marker list by hand.
stattrack <- vapply(unname(toms_gene_ids5), function(gene) {
    mmatch <- topmarkers$pseudo_R2[which(topmarkers$gene_id == gene)]
    round(max(mmatch), 3)
}, numeric(1), USE.NAMES = FALSE)
print(mean(stattrack))

[1] 0.5598444


In [32]:
## Display the column names of the top markers table
colnames(topmarkers)

In [35]:
## Open the R help page for vector()
help(vector)