In [1]:
## ---- Global Cell Set Characterization Notebook 15 Partition Cells Set

## Load Analysis Parameters (Parm1)
## Read R Data Object from Prior Notebook (CDS1)
## Compute Top Genes for 15 Partition Cell Set and Save Result
## Create Dot Plots For 15 Partition Cell Set

In [1]:
## Create a Working Input and Output Data Directory, If Id Does Not Exist
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste(parentdir, 'CDC2', sep = '')
if (!file.exists(datdir)) {
    dir.create(datdir)
}
setwd(datdir)

In [2]:
## Read the parameters file
ps <- read.table(file = 'parms.txt', stringsAsFactors = FALSE, header = TRUE)

In [4]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)
library(grid)

In [8]:
## Read the previously preprocessed downsampled cell set data object
down_stdycds <- readRDS(file = paste(ps$outdir, 'Aggregated.downsampled.QC.Preprocessed.rds', sep = ''))

In [5]:
datdir

In [9]:
## Build a gene short name to gene id (Ensembl) lookup
short2geneid <- fData(down_stdycds)@rownames
names(short2geneid) <- fData(down_stdycds)@listData$gene_short_name

In [10]:
## Build a gene id (Ensembl) to gene short name lookup
geneid2short <- fData(down_stdycds)@listData$gene_short_name
names(geneid2short) <- fData(down_stdycds)@rownames

In [11]:
## Create variables for how cells sets are organized
cellgrps <- c('healthy', 'diseased', 'healthy', 'diseased', 'healthy', 'diseased')
cellpats <- c('ID Number 1', 'ID Number 1', 'ID Number 2', 'ID Number 2', 'ID Number 3', 'ID Number 3')

In [12]:
## Define and Assign Cell Types
celltypes5 <- c('1-Unidentified',
                '2-Macrophages',
                '3-Endothelial Cells - (a)',
                '4-Lymphoid Progenitors',
                '5-VSMCs - Contractile',
                '6-Endothelial Cells - (b)',
                '7-VSMCs - Synthetic',
                '8-Natural Killer Cells',
                '9-Cytotoxic T Lymphocytes',
                '10-B Lymphocytes',
                '11-T lymphocytes',
                '12-Plasmacytoid Dendritic Cells',
                '13-B1 Lymphocytes',
                '14-Neutrophils',
                '15-Mast Cells')

In [13]:
## Declare Tom's best genes for definiting cell types
toms_markers5 <- c('NRXN1', 'CLU', 'ICAM2',
                 'CD14', 'CD68', 'AIF1',
                 'VWF', 'EDN1', 'ECSCR',
                 'MKI67', 'UBE2C', 'TOP2A',
                 'ACTA2', 'TAGLN', 'MYL9',
                 'ACKR1', 'SPARCL1', 'PECAM1',
                 'CALD1', 'MGP', 'DCN',
                 'NKG7', 'XCL1', 'CTSW',
                 'CD8A', 'TRAC', 'CD2',
                 'MS4A1', 'CD79A', 'BANK1',
                 'CD69', 'CXCR4', 'IL7R',
                 'LILRA4', 'IRF7', 'CLEC4C',
                 'MZB1', 'JCHAIN', 'TNFRSF17',
                 'LST1', 'FCGR3B', 'S100A8',
                 'TPSAB1', 'CPA3', 'MS4A2')
toms_gene_ids5 <- short2geneid[toms_markers5]

In [None]:
## Compute top markers using partitions (no need to rerun this and the next
## 5 cells that follow this cell). Do run the 6th cell, which reads the previously stored results
toppart <- top_markers(down_stdycd, group_cells_by = "partition",
                                       genes_to_test_per_group = dim(down_stdycds)[1],
                                       reference_cells = dim(down_stdycds)[2], cores = 8)


In [None]:
## Prepare the output for ultimate conversion to a spreadsheet
toppart_GC <- toppart[,c(1,2,2,3:10)]
colnames(toppart_GC)[2] <- 'gene_cards'
nmarkers2 <- dim(toppart_GC)[1]
gc_hyper2 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
gc_hyper2 <- paste(gc_hyper2, 'A', 2:(nmarkers2 + 1), '), C', 2:(nmarkers2 + 1), ')', sep ='')
toppart_GC$gene_cards <- gc_hyper2

In [None]:
## Write the file
##This file should be nearly identical to Aggregated.downsampled.QC.TopAllMarkersPer.Rev1.tsv,
## with only slight variation in the least significant digits due to the optimization or 
## sampling method used within the Monocle top_markers function
topmarkerfile3 <- paste(ps$outdir, 'Aggregated.downsampled.15parts.TopAllMarkers.Rev3x.tsv', sep = '')
write.table(toppart_GC, file = topmarkerfile3,
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [None]:
## Compute all top marker files (using assigned cell type - note that this output file is not used)
# topmarkerfile <- paste(ps$outdir, 'Aggregated.downsampled.QC.TopAllMarkersPer.Rev2.tsv', sep = '')
marker_test_res_topall <- top_markers(down_stdycds, group_cells_by = "assigned_cell_type",
                                       genes_to_test_per_group = dim(down_stdycds)[1], 
                                       reference_cells = dim(down_stdycds)[2], cores = 4)

In [None]:
## Save all top markers to a file
    marker_test_res_topall_with_GC <- marker_test_res_topall[,c(1,2,2,3:10)]
    colnames(marker_test_res_topall_with_GC)[2] <- 'gene_cards'
    nmarkers1 <- dim(marker_test_res_topall_with_GC)[1]
    gc_hyper1 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
    gc_hyper1 <- paste(gc_hyper1, 'A', 2:(nmarkers1 + 1), '), C', 2:(nmarkers1 + 1), ')', sep ='')
    marker_test_res_topall_with_GC$gene_cards <- gc_hyper1

In [None]:
## Write the file
## Should be similar to Rev1 file of the same name
topmarkerfile2 <- paste(ps$outdir, 'Aggregated.downsampled.15parts.TopAllMarkers.Rev2.tsv', sep = '')
write.table(marker_test_res_topall_with_GC, file = topmarkerfile2,
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [None]:
## I've added this code from the original CSC1 code (notebook 03.00.00...)
## Compute top 500 marker files
topmarkerfile <- paste(ps$outdir, 'Aggregated.downsampled.QC.Top500MarkersPer.Rev3x.tsv', sep = '')
marker_test_res_top500 <- top_markers(down_stdycds, group_cells_by = "partition",
                                       genes_to_test_per_group = 500, # dim(down_stdycds)[1], 
                                       reference_cells = dim(down_stdycds)[2], cores = 4)
## Save top 500 markers to a file
    marker_test_res_top500_with_GC <- marker_test_res_top500[,c(1,2,2,3:10)]
    colnames(marker_test_res_top500_with_GC)[2] <- 'gene_cards'
    nmarkers1 <- dim(marker_test_res_top500_with_GC)[1]
    gc_hyper1 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
    gc_hyper1 <- paste(gc_hyper1, 'A', 2:(nmarkers1 + 1), '), C', 2:(nmarkers1 + 1), ')', sep ='')
    marker_test_res_top500_with_GC$gene_cards = gc_hyper1
    
write.table(marker_test_res_top500_with_GC, file = topmarkerfile,
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [16]:
## Compute top 500 marker files
## The Rev1 version of this file was created in the first pipeline run (CDC1)
## The cod
topmarkerfile <- paste(ps$outdir, 'Aggregated.downsampled.QC.15Part.Top500MarkersPer.Rev2x.tsv', sep = '')
marker_test_res_top500 <- top_markers(down_stdycds, group_cells_by = "partition",
                                       genes_to_test_per_group = 500, # dim(down_stdycds)[1], 
                                       reference_cells = dim(down_stdycds)[2], cores = 4)



In [261]:
## Save top 500 markers to a file
    marker_test_res_top500_with_GC <- marker_test_res_top500[,c(1,2,2,3:10)]
    colnames(marker_test_res_top500_with_GC)[2] <- 'gene_cards'
    nmarkers1 <- dim(marker_test_res_top500_with_GC)[1]
    gc_hyper1 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
    gc_hyper1 <- paste(gc_hyper1, 'A', 2:(nmarkers1 + 1), '), C', 2:(nmarkers1 + 1), ')', sep ='')
    marker_test_res_top500_with_GC$gene_cards = gc_hyper1
    
write.table(marker_test_res_top500_with_GC, file = topmarkerfile,
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [174]:
top_specific_markers3 <- marker_test_res_top500_with_GC %>%
                            filter(fraction_expressing >= 0.10) %>%
                            group_by(cell_group) %>%
                            top_n(3, pseudo_R2)
top_specific_marker_ids <- unique(top_specific_markers3 %>% pull(gene_id))
top_specific_genes <- as.character(geneid2short[top_specific_marker_ids])
top_specific_marker_ids <- short2geneid[top_specific_genes]

In [155]:
source('~/code/CSC/07.0.0.Carotid.SC.Utilities.01.r')

In [184]:
## This is a dot plot of the toms list of marker genes (5th iteration)
g5 <- plot_genes_by_group2(down_stdycds,
                    axis_order = "group_marker",
                    toms_gene_ids5,
                    group_cells_by = "partition",
                    ordering_type = "as_ordered", # "cluster_row_col" "as_ordered"
                    max.size = 8,
                    norm_method = 'size_only',
                    scale_max = 3, scale_min = -3)

In [191]:
## Write the updated dot plot to a file
plotfile <- 'Aggregated.downsampled.QC.15Part.TomsGenes.DOTPlot.Rev2.pdf'
plotfp <- paste(ps$outdir, plotfile, sep = '')

In [193]:
pdf(plotfp, width = 10, height = 12)
g5
dev.off()

In [22]:
## Get the gene data for toms_markers5
GlobExpVal <- marker_test_res_topall_with_GC[marker_test_res_topall_with_GC$gene_short_name.1 %in%
                                             toms_markers5,]
GlobExpVal$gene_short_name.1 <- as.character(GlobExpVal$gene_short_name.1)

ERROR: Error in eval(expr, envir, enclos): object 'marker_test_res_topall_with_GC' not found


In [None]:
## Temporarily regroup the GlobExpVal object, and then replace it
tempGEV <- GlobExpVal[,c(3, 4, 6, 7)]
colnames(tempGEV) <- c('Gene', 'Group', 'mean', 'percentage')
tempGEV$Group <- as.integer(substr(tempGEV$Group, 1, 1))
GlobExpVal <- tempGEV

In [194]:
## This is a special dot plot created for Tom's poster
## This is all setup in this cell
##-----------------
## Intialize data frame for dot plot
dotmat <- as.data.frame(matrix(as.integer(0), nrow = dim(GlobExpVal)[1] , ncol = 6), stringsAsFactors = FALSE)
colnames(dotmat) <- c('xname', 'yname', 'x', 'y', 'size', 'color') 
## Load the data frame
dotmat$y <- as.integer(GlobExpVal$Group)
for (i in 1:dim(GlobExpVal)[1]) {
    dotmat$x[i] <- as.integer(which(toms_markers5 %in%  as.character(GlobExpVal$Gene[i]))[1])
    }
dotmat$xname <- toms_markers5[dotmat$x]
dotmat$yname <- as.character(celltypes5[dotmat$y])
dotmat$size <- GlobExpVal$percentage * 100
dotmat$color <- GlobExpVal$mean
dotmat$stroke <- as.integer(0)
dotmat$fill <- 'white'
## Some settings and precomputed values
xmargin <- .5
ymargin <- .5
dotscale <- 1.5
dotmat$scsize <- dotscale * dotmat$size
xlimlow <- min(dotmat$x) - xmargin
xlimhi <- max(dotmat$x) + xmargin
ylimlow <- min(dotmat$y) - ymargin
ylimhi <- max(dotmat$y) + ymargin
## Get the axis tick labels
xnames <- toms_markers5 # unique(dotmat$xname)
ynames <- celltypes5 # unique(dotmat$yname)
## Change the order of y axis
dotmat$y <- 16 - dotmat$y
ynames <- ynames[15:1]
## Some ggplot text formatting
red.bold.italic.text <- element_text(face = "bold.italic", color = "black")
black.90italic.text <- element_text(face = "bold.italic", hjust = 1,
                                    color = "black", angle = 90, vjust = .5, size = 8)

In [199]:
## Create the actual dot plot object
g6 <- ggplot(dotmat, aes(x = x, y = y)) + 
    geom_point(aes(col = color, size = size, stroke = size/2), stroke = 0) +
    scale_radius(range = c(1, 5)) +
    coord_cartesian(xlim = c(xlimlow, xlimhi), ylim = c(ylimlow, ylimhi)) +
    xlab("") + ylab("")  +                  
    scale_x_continuous(breaks = seq(min(dotmat$x),  max(dotmat$x), 1), labels = xnames) + 
    scale_y_continuous(breaks = seq(min(dotmat$y), max(dotmat$y), 1), labels = ynames) +
    scale_colour_gradient(high = "#132B43", low = "#56B1F7") +
    theme_bw() + # labs(subtitle="BW Theme") +
    theme_classic() + # labs(subtitle="Classic Theme") + 
    theme(axis.text.x = black.90italic.text) + 
    theme(legend.position = 'bottom', legend.direction = 'vertical') +
    theme(plot.margin = margin(.5, .5, .5, .5, "in")) + # , plot.background = element_rect(fill = "white"))
    theme(panel.border = element_rect(colour = "black", fill = NA)) + 
    labs(color = "", size = "") 

In [223]:
## set up the dot plot file
plotfile <- 'Aggregated.downsampled.QC.Part15.TomsGenes.PosterDOTPlot.Rev2.pdf'
plotfp <- paste(ps$outdir, plotfile, sep = '')

In [224]:
## Actually create the file, adding annotations
pdf(plotfp, width = 10, height = 8)
    g6
    my_text1 <- 'Percent of cells in\npartition with 1+ reads\ncorresponding to\ngene marker'
    my_grob1 <- grid.text(my_text1, x = .55 - .025,  y =.09 + .07, hjust = 1,
                        gp = gpar(col = "black", fontsize = 10, fontface = "bold"))
    my_text2 <- 'Scaled Expression'
    my_grob2 <- grid.text(my_text2, x = .7 - .06,  y =.21 + .035,
                        gp = gpar(col = "black", fontsize = 10, fontface = "bold"))
dev.off()