In [1]:
## ---- Partition Cell Set Analysis - Extended Analysis ---- 0530.00.00
## Load Analysis Parameters (Parm1)
## Loop Through Each Partition and Perform Analysis as Follows:
## Load Partition Cell Set
## Load Partition Differential Expression Gene Set Model
## Create Violin Plots Based On p-value Rank Instead of q-value
## Perform Top Genes Analysis Per Cluster and Save
## Compute Cluster Stats and Save
## Perform Other Plots as Needed

In [2]:
## Create a Working Input and Output Data Directory, If It Does Not Exist
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste0(parentdir, 'CDC2')
## dir.exists() is used instead of file.exists() so that a regular file that
## happens to be named 'CDC2' is not mistaken for the working directory;
## recursive = TRUE also creates any missing parent directories.
if (!dir.exists(datdir)) {
    dir.create(datdir, recursive = TRUE)
}
## All relative file paths used below are resolved against this directory
setwd(datdir)

In [3]:
## Load the analysis parameters from 'parms.txt' (relative to the working
## directory); the first line of the file supplies the column names and
## text columns are kept as character rather than converted to factors.
ps <- read.table('parms.txt',
                 header = TRUE,
                 stringsAsFactors = FALSE)

In [11]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)
library(grid)

In [5]:
## Load the previously preprocessed, downsampled cell set data object.
## ps$outdir is concatenated directly with the file name (no separator added).
down_stdycds <- readRDS(
    paste0(ps$outdir, 'Aggregated.downsampled.QC.NoDoublets.Repartitioned.rds')
)

In [6]:
## Lookup vector: gene short name -> gene id (Ensembl).
## Values are the feature-table row names; names are the gene short names.
short2geneid <- setNames(rownames(fData(down_stdycds)),
                         fData(down_stdycds)$gene_short_name)

In [7]:
## Lookup vector: gene id (Ensembl) -> gene short name.
## Values are the gene short names; names are the feature-table row names.
geneid2short <- setNames(fData(down_stdycds)$gene_short_name,
                         rownames(fData(down_stdycds)))

In [8]:
## Describe how the cell sets are organized. The two vectors are parallel:
## position i of cellgrps (condition) and cellpats (patient) are used
## together when cells are counted per patient/condition combination.
cellgrps <- rep(c('healthy', 'diseased'), times = 3)
cellpats <- rep(paste('ID Number', 1:3), each = 2)

In [9]:
## Define the cell type label of each partition as '<partition number>-<name>'.
## These labels are later appended to ps$outdir to locate each partition's files.
celltypes6 <- paste0(1:6, '-',
                     c('Macrophages',
                       'Endothelial Cells',
                       'VSMCs',
                       'Natural Killer Cells',
                       'Cytotoxic T Lymphocytes',
                       'B Lymphocytes'))

In [10]:
## Declare Tom's best genes for defining cell types (listed three per row)
toms_markers5 <- c(
    "NRXN1",  "CLU",     "ICAM2",
    "CD14",   "CD68",    "AIF1",
    "VWF",    "EDN1",    "ECSCR",
    "MKI67",  "UBE2C",   "TOP2A",
    "ACTA2",  "TAGLN",   "MYL9",
    "ACKR1",  "SPARCL1", "PECAM1",
    "CALD1",  "MGP",     "DCN",
    "NKG7",   "XCL1",    "CTSW",
    "CD8A",   "TRAC",    "CD2",
    "MS4A1",  "CD79A",   "BANK1",
    "CD69",   "CXCR4",   "IL7R",
    "LILRA4", "IRF7",    "CLEC4C",
    "MZB1",   "JCHAIN",  "TNFRSF17",
    "LST1",   "FCGR3B",  "S100A8",
    "TPSAB1", "CPA3",    "MS4A2"
)
## Translate the short names to gene ids via the short-name lookup;
## a short name missing from the lookup yields NA.
toms_gene_ids5 <- short2geneid[toms_markers5]


## Doug's marker genes (listed four per row), translated the same way
doug_markers1 <- c(
    "AIF1",   "LYZ",    "FCER1G",   "CD68",
    "RNASE1", "PECAM1", "IGFBP4",   "ADIRF",
    "SOD3",   "MYL9",   "CALD1",    "GSN",
    "TYROBP", "NKG7",   "CTSW",     "CD69",
    "CD3D",   "CD2",    "TRBC2",    "TRAC",
    "MS4A1",  "CD79A",  "HLA-DQA1", "CD37"
)
dougs_gene_ids1 <- short2geneid[doug_markers1]

In [14]:
## Loop through the partitions and perform extended analysis.
## For each processed partition this cell:
##   1. loads the partition cell set (and, when `de` is TRUE, the gene model fits),
##   2. loads the differential expression table and plots the top `ngenes`
##      genes by p-value for positive and for negative estimates,
##   3. writes a per-cluster cell count / mean UMI table,
##   4. runs top_markers() over all genes and writes the result as a TSV
##      with a spreadsheet hyperlink column.
np <- length(celltypes6)
gout <- FALSE
de <- TRUE
ngenes <- 10

## Helper: short names of the (at most) `n` genes with the smallest p-value
## in the DE table `tab`. head() returns fewer names when fewer rows are
## available, instead of padding with NA the way `[1:n, ]` indexing does.
top_genes_by_pvalue <- function(tab, n) {
    tab <- tab[order(tab$p_value, decreasing = FALSE), ]
    head(tab$gene_short_name, n)
}

## Helper: write a violin plot of every gene in `cds`, grouped by condition,
## to the PDF file `plotfp`. Nothing is written when `cds` has no genes.
save_condition_violin <- function(cds, plotfp, ncol) {
    if (nrow(cds) == 0) {
        warning('No genes to plot for ', plotfp, call. = FALSE)
        return(invisible(NULL))
    }
    pdf(plotfp, width = 10, height = 4)
    ## Close the device even if plotting fails, so no PDF device is leaked
    on.exit(dev.off(), add = TRUE)
    g1 <- plot_genes_violin(cds, group_cells_by = "condition", ncol = ncol) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    print(g1)
    invisible(plotfp)
}

## NOTE: only partition 6 is processed here; use seq_len(np) to process all
for (p in 6:6) {
    setwd(paste(datdir, '/', ps$outdir, celltypes6[p], sep = ''))
    ## Load the partition cell set and (optionally) the fitted gene models
    partn_cds <- readRDS('Partition.Cell.Set.rds')
    if (de) gene_all_fits_pat_cond_unadj <- readRDS('Gene.Model.Fits.rds')

    ## Without this guard a missing file would silently reuse the DE table of
    ## a previous partition (or fail with "object 'de_tab' not found").
    de_file <- 'Diff.AllHealthCoeff.RemoveFail.NoModels.txt'
    if (!file.exists(de_file)) {
        warning('Skipping partition ', celltypes6[p], ': ', de_file,
                ' not found', call. = FALSE)
        next
    }
    de_tab <- read.table(de_file, header = TRUE, stringsAsFactors = FALSE)
    ## The analysis is only performed for tables with more than 100 rows
    if (nrow(de_tab) <= 100) next

    ##################################################################
    ## Top genes by p-value, split by the sign of the estimate.
    ## which() drops rows whose estimate is NA.
    de_tab_plus <- top_genes_by_pvalue(de_tab[which(de_tab$estimate > 0), ], ngenes)
    de_tab_minus <- top_genes_by_pvalue(de_tab[which(de_tab$estimate < 0), ], ngenes)

    ## Row (gene) subsets of the partition for the two gene lists
    partn_cds_plus <- partn_cds[rowData(partn_cds)$gene_short_name %in% de_tab_plus, ]
    partn_cds_minus <- partn_cds[rowData(partn_cds)$gene_short_name %in% de_tab_minus, ]

    save_condition_violin(partn_cds_plus,
                          'Violin.Plot.Top10.Healthy.DiffExp.p-value.pdf', ngenes)
    save_condition_violin(partn_cds_minus,
                          'Violin.Plot.Top10.Diseased.p-value.DiffExp.pdf', ngenes)

    ##################################################################
    ## Get cluster breakdown (cell counts by cluster across patients, disease and healthy)
    colData(partn_cds)$clust <- clusters(partn_cds)
    clusts <- colData(partn_cds)$clust
    nclust <- length(unique(clusts))
    clusttypes <- paste('cluster-', seq_len(nclust), sep = '')

    ## Table layout: one column per patient/condition pair, then a total cell
    ## count column and a mean UMI column. Indices are derived from cellgrps
    ## so the table stays consistent if the number of groups changes.
    ngrps <- length(cellgrps)
    total_col <- ngrps + 1
    umi_col <- ngrps + 2
    clust_stats <- as.data.frame(matrix('', nrow = nclust, ncol = ngrps + 2),
                                 stringsAsFactors = FALSE)

    ## Add the patient IDs and cell groups as the first two rows; the two
    ## trailing header cells are set explicitly just below.
    clust_stats <- rbind(cellpats, cellgrps, clust_stats)
    ## Add the cluster names, and the first two row names as well
    row.names(clust_stats) <- c('Patient ID', 'Condition', clusttypes)
    clust_stats[1, c(total_col, umi_col)] <- c('Total', 'Mean')
    clust_stats[2, c(total_col, umi_col)] <- c('Cells', 'UMI')

    ## Count cells per cluster for each patient/condition pair. Cluster k is
    ## stored in table row k + 2 (after the two header rows); cluster labels
    ## are compared against 1..nclust.
    for (grp in seq_len(ngrps)) {
        in_group <- (colData(partn_cds)$patient == cellpats[grp]) &
                    (colData(partn_cds)$condition == cellgrps[grp])
        for (k in seq_len(nclust)) {
            clust_stats[k + 2, grp] <- sum((clusts == k) & in_group)
        }
    }
    ## Per-cluster total cell count and mean of the per-cell expression sums
    for (k in seq_len(nclust)) {
        clust_stats[k + 2, total_col] <-
            sum(as.numeric(unlist(clust_stats[k + 2, seq_len(ngrps)])))
        clust_stats[k + 2, umi_col] <-
            round(mean(colSums(exprs(partn_cds[, colData(partn_cds)$clust == k]))), 0)
    }
    ## Save the cluster statistics
    write.table(clust_stats,
                file = 'Aggregated.downsampled.QC.Cluster.CellCount.Stats.txt',
                col.names = FALSE)

    #####################################################################
    ## Compute all top marker files (every gene tested, all cells as reference)
    marker_test_res_topall <- top_markers(partn_cds, group_cells_by = "clust",
                                          genes_to_test_per_group = nrow(partn_cds),
                                          reference_cells = ncol(partn_cds), cores = 4)

    ## Save all top markers to a file. Column 2 is duplicated so that a
    ## 'gene_cards' column can be inserted as the new second column.
    marker_test_res_topall_with_GC <- marker_test_res_topall[, c(1, 2, 2, 3:10)]
    colnames(marker_test_res_topall_with_GC)[2] <- 'gene_cards'
    nmarkers1 <- nrow(marker_test_res_topall_with_GC)
    ## One spreadsheet formula per data row; the header occupies row 1 of the
    ## written file, so data rows start at 2. Integer row numbers keep paste0()
    ## from rendering large values in scientific notation.
    sheet_rows <- seq_len(nmarkers1) + 1L
    gc_hyper1 <- '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", '
    gc_hyper1 <- paste0(gc_hyper1, 'A', sheet_rows, '), C', sheet_rows, ')')
    marker_test_res_topall_with_GC$gene_cards <- gc_hyper1

    ## Actually write the file
    topmarkerfile <- 'Aggregated.downsampled.QC.Cluster.TopAllMarkers.Rev1.tsv'
    write.table(marker_test_res_topall_with_GC, file = topmarkerfile,
                row.names = FALSE, col.names = TRUE,
                sep = '\t', quote = FALSE)
}

“Removed 659 rows containing non-finite values (stat_summary).”



In [None]:
## Cells below this point are redundant and can be deleted if needed

In [57]:
## Get cluster breakdown (cell counts by cluster across patients, disease and healthy)
## Operates on the partn_cds object left in the session by the partition loop.
colData(partn_cds)$clust <- clusters(partn_cds)
clusts <- colData(partn_cds)$clust
nclust <- length(unique(clusts))
clusttypes <- paste('cluster-', seq_len(nclust), sep = '')

## Table layout: one column per patient/condition pair, then a total cell
## count column and a mean UMI column. Indices are derived from cellgrps
## instead of being hard-coded, so they always match the table width.
ngrps <- length(cellgrps)
total_col <- ngrps + 1
umi_col <- ngrps + 2
clust_stats <- as.data.frame(matrix('', nrow = nclust, ncol = ngrps + 2),
                             stringsAsFactors = FALSE)

## Add the patient IDs and cell groups as the first two rows; the two
## trailing header cells are set explicitly just below.
clust_stats <- rbind(cellpats, cellgrps, clust_stats)
## Add the cluster names, and the first two row names as well
row.names(clust_stats) <- c('Patient ID', 'Condition', clusttypes)
clust_stats[1, c(total_col, umi_col)] <- c('Total', 'Mean')
clust_stats[2, c(total_col, umi_col)] <- c('Cells', 'UMI')

## Count cells per cluster for each patient/condition pair. Cluster k is
## stored in table row k + 2 (after the two header rows); cluster labels
## are compared against 1..nclust.
for (grp in seq_len(ngrps)) {
    in_group <- (colData(partn_cds)$patient == cellpats[grp]) &
                (colData(partn_cds)$condition == cellgrps[grp])
    for (k in seq_len(nclust)) {
        clust_stats[k + 2, grp] <- sum((clusts == k) & in_group)
    }
}
## Per-cluster total cell count and mean of the per-cell expression sums
for (k in seq_len(nclust)) {
    clust_stats[k + 2, total_col] <-
        sum(as.numeric(unlist(clust_stats[k + 2, seq_len(ngrps)])))
    clust_stats[k + 2, umi_col] <-
        round(mean(colSums(exprs(partn_cds[, colData(partn_cds)$clust == k]))), 0)
}
## Save the cluster statistics
write.table(clust_stats,
            file = 'Aggregated.downsampled.QC.Cluster.CellCount.Stats.txt',
            col.names = FALSE)

In [106]:
## Compute marker statistics for every gene in every cluster, using all
## cells of the partition as the reference set
marker_test_res_topall <- top_markers(
    partn_cds,
    group_cells_by = "clust",
    genes_to_test_per_group = nrow(partn_cds),
    reference_cells = ncol(partn_cds),
    cores = 4
)

## Build the output table: column 2 is duplicated and its first copy is
## renamed 'gene_cards', which is then filled with one spreadsheet formula
## per row. The header takes row 1 of the written file, so the formulas
## reference rows 2 .. nmarkers1 + 1.
marker_test_res_topall_with_GC <- marker_test_res_topall[, c(1, 2, 2, 3:10)]
colnames(marker_test_res_topall_with_GC)[2] <- 'gene_cards'
nmarkers1 <- nrow(marker_test_res_topall_with_GC)
gc_hyper1 <- paste0(
    '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", ',
    'A', 2:(nmarkers1 + 1), '), C', 2:(nmarkers1 + 1), ')'
)
marker_test_res_topall_with_GC$gene_cards <- gc_hyper1

## Write the table as tab-separated text, unquoted, with a header line
topmarkerfile <- 'Aggregated.downsampled.QC.Cluster.TopAllMarkers.Rev1.tsv'
write.table(marker_test_res_topall_with_GC,
            file = topmarkerfile,
            sep = '\t',
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)




In [None]:
        ## Run the marker test over all genes of the partition, grouped by
        ## cluster, with every cell used as reference
        marker_test_res_topall <- top_markers(
            partn_cds,
            group_cells_by = "clust",
            genes_to_test_per_group = nrow(partn_cds),
            reference_cells = ncol(partn_cds),
            cores = 4
        )

        ## Insert a 'gene_cards' column as the second column (a renamed copy
        ## of column 2) and fill it with one hyperlink formula per data row;
        ## data rows start at 2 because the header line occupies row 1
        marker_test_res_topall_with_GC <- marker_test_res_topall[, c(1, 2, 2, 3:10)]
        colnames(marker_test_res_topall_with_GC)[2] <- 'gene_cards'
        nmarkers1 <- nrow(marker_test_res_topall_with_GC)
        gc_hyper1 <- sprintf(
            '=HYPERLINK(CONCAT("https://www.genecards.org/Search/Keyword?queryString=", A%d), C%d)',
            2:(nmarkers1 + 1), 2:(nmarkers1 + 1)
        )
        marker_test_res_topall_with_GC$gene_cards <- gc_hyper1

        ## Write the tab-separated marker file (header line, no row names, no quoting)
        topmarkerfile <- 'Aggregated.downsampled.QC.Cluster.TopAllMarkers.Rev1.tsv'
        write.table(marker_test_res_topall_with_GC,
                    file = topmarkerfile,
                    sep = '\t',
                    quote = FALSE,
                    row.names = FALSE,
                    col.names = TRUE)

In [15]:
## Display the current value of `p` (the index variable of the partition loop)
p