In [1]:
## ---- Partition Cell Set Analysis with Diff Expression Notebooks A ---- 0500.00.00
## Load Analysis Parameters (Parm1)
## Loop Through Each Partition and Perform Analysis as Follows:
## Load Partition Cell Set
## Load Partition Differential Expression Gene Set Model
## Plot Cell Sets as Needed
## Extract Differential Analysis Coefficients and Save
## Extract Gene Rank List and Save

In [2]:
## Create the working input/output data directory if it does not already exist,
## then make it the session working directory.
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste0(parentdir, 'CDC2')
## dir.exists() (not file.exists()) so that a regular file with the same name is
## not mistaken for the directory; recursive = TRUE also creates any missing
## parent directories instead of failing with only a warning.
if (!dir.exists(datdir)) {
    dir.create(datdir, recursive = TRUE)
}
setwd(datdir)

In [3]:
## Load the analysis parameters table from 'parms.txt' in the working directory
## (first line is the header; character columns are kept as strings)
ps <- read.table('parms.txt', header = TRUE, stringsAsFactors = FALSE)

In [5]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)
library(grid)

In [6]:
## Load the previously preprocessed, downsampled cell set data object
## stored under ps$outdir
cds_path <- paste0(ps$outdir,
                   'Aggregated.downsampled.QC.NoDoublets.Repartitioned.rds')
down_stdycds <- readRDS(file = cds_path)

In [7]:
## Lookup vector from gene short name to gene id (Ensembl):
## values are the feature row names, names are the gene short names
short2geneid <- setNames(fData(down_stdycds)@rownames,
                         fData(down_stdycds)@listData$gene_short_name)

In [8]:
## Lookup vector from gene id (Ensembl) to gene short name:
## values are the gene short names, names are the feature row names
geneid2short <- setNames(fData(down_stdycds)@listData$gene_short_name,
                         fData(down_stdycds)@rownames)

In [9]:
## Describe how the six cell sets are organized: the condition labels alternate
## healthy/diseased, and each patient id label covers two consecutive entries
cellgrps <- rep(c('healthy', 'diseased'), times = 3)
cellpats <- rep(paste('ID Number', 1:3), each = 2)

In [10]:
## Define the cell type labels; each label is '<position>-<cell type name>'
celltype_names <- c('Macrophages',
                    'Endothelial Cells',
                    'VSMCs',
                    'Natural Killer Cells',
                    'Cytotoxic T Lymphocytes',
                    'B Lymphocytes')
celltypes6 <- paste0(seq_along(celltype_names), '-', celltype_names)

In [12]:
## Tom's best marker genes for defining cell types (gene short names),
## followed by their gene ids looked up through short2geneid.
## A short name that is not present in the lookup yields NA.
toms_markers5 <- c(
    'NRXN1', 'CLU', 'ICAM2', 'CD14', 'CD68',
    'AIF1', 'VWF', 'EDN1', 'ECSCR', 'MKI67',
    'UBE2C', 'TOP2A', 'ACTA2', 'TAGLN', 'MYL9',
    'ACKR1', 'SPARCL1', 'PECAM1', 'CALD1', 'MGP',
    'DCN', 'NKG7', 'XCL1', 'CTSW', 'CD8A',
    'TRAC', 'CD2', 'MS4A1', 'CD79A', 'BANK1',
    'CD69', 'CXCR4', 'IL7R', 'LILRA4', 'IRF7',
    'CLEC4C', 'MZB1', 'JCHAIN', 'TNFRSF17', 'LST1',
    'FCGR3B', 'S100A8', 'TPSAB1', 'CPA3', 'MS4A2'
)
toms_gene_ids5 <- short2geneid[toms_markers5]

## Doug's marker genes (gene short names) and their gene ids
doug_markers1 <- c(
    'AIF1', 'LYZ', 'FCER1G', 'CD68', 'RNASE1', 'PECAM1',
    'IGFBP4', 'ADIRF', 'SOD3', 'MYL9', 'CALD1', 'GSN',
    'TYROBP', 'NKG7', 'CTSW', 'CD69', 'CD3D', 'CD2',
    'TRBC2', 'TRAC', 'MS4A1', 'CD79A', 'HLA-DQA1', 'CD37'
)
dougs_gene_ids1 <- short2geneid[doug_markers1]

In [14]:
## Loop through the partitions and perform analysis: plot the cell sets
## and export the differential expression coefficient tables.

## Write a single plot to a PDF file in the current working directory.
##   plot_obj : plot object to print
##   file     : output PDF file name
##   width, height : page size passed to pdf()
## on.exit() guarantees the graphics device is closed even when printing the
## plot fails, so one failed plot cannot leave a device open for the next ones.
save_plot_pdf <- function(plot_obj, file, width, height) {
    pdf(file, width = width, height = height)
    on.exit(dev.off(), add = TRUE)
    print(plot_obj)
    invisible(file)
}

np <- length(celltypes6)
gout <- TRUE   # when TRUE, write the UMAP plots
de <- TRUE     # when TRUE, export the differential expression tables
## seq_len() (not 1:np) so that an empty cell type list runs zero iterations
for (p in seq_len(np)) {
    ## NOTE: this changes the session working directory to the partition folder;
    ## all file names below are relative to it
    setwd(paste0(datdir, '/', ps$outdir, celltypes6[p]))
    ## Load the cell set of this partition and, if requested, its gene model fits
    partn_cds <- readRDS('Partition.Cell.Set.rds')
    if (de) gene_all_fits_pat_cond_unadj <- readRDS('Gene.Model.Fits.rds')

    if (gout) {
        ## Plot the cell set and trajectory of this partition
        g1 <- plot_cells(partn_cds, group_cells_by = "partition",
                         show_trajectory_graph = TRUE, reduction_method = "UMAP",
                         cell_size = 0.1, alpha = .4,
                         color_cells_by = "assigned_cell_type",
                         label_cell_groups = TRUE,
                         label_leaves = TRUE,
                         label_branch_points = TRUE,
                         graph_label_size = 1.5)
        save_plot_pdf(g1, 'AssignedCellType.With.Trajectory.pdf', width = 8, height = 6)

        ## Plot the new partitions of this partition, colorized by partition
        g2 <- plot_cells(partn_cds,
                         color_cells_by = "partition", group_cells_by = "partition",
                         show_trajectory_graph = FALSE, reduction_method = "UMAP",
                         cell_size = 0.1, alpha = .4)
        save_plot_pdf(g2, 'ColoredbyPartition.Numbered.pdf', width = 8, height = 6)

        ## Same plot, smaller and with a legend instead of group labels.
        ## The file name (including its spelling) is kept unchanged so that
        ## anything reading the existing outputs keeps working.
        g2l <- plot_cells(partn_cds,
                          color_cells_by = "partition", group_cells_by = "partition",
                          show_trajectory_graph = FALSE, reduction_method = "UMAP",
                          label_cell_groups = FALSE, cell_size = 0.1, alpha = .4)
        save_plot_pdf(g2l, 'ColoredbyPartition.Numbered.WithLegned.pdf', width = 4, height = 3)

        ## Plot the partitions of this partition, colorized by condition
        g3 <- plot_cells(partn_cds,
                         color_cells_by = "condition", group_cells_by = "partition",
                         show_trajectory_graph = FALSE, reduction_method = "UMAP",
                         cell_size = 0.1, alpha = .4)
        save_plot_pdf(g3, 'ColoredbyCondition.Numbered.pdf', width = 8, height = 6)

        ## Same plot, smaller and with a legend instead of group labels
        g3l <- plot_cells(partn_cds,
                          color_cells_by = "condition", group_cells_by = "partition",
                          show_trajectory_graph = FALSE, reduction_method = "UMAP",
                          label_cell_groups = FALSE, cell_size = 0.1, alpha = .4)
        save_plot_pdf(g3l, 'ColoredbyCondition.Numbered.WithLegend.pdf', width = 4, height = 3)

        ## Plot the clusters of this partition, colorized by cluster
        g4 <- plot_cells(partn_cds,
                         color_cells_by = "cluster", group_cells_by = "cluster",
                         show_trajectory_graph = FALSE, reduction_method = "UMAP",
                         cell_size = 0.1, alpha = .4)
        save_plot_pdf(g4, 'ColoredbyCluster.Numbered.pdf', width = 8, height = 6)

        ## Get a cells in a partition (later change to largest partition)
        # trajectory_start_cell <- colnames(partn_cds)[1]

        # Compute the pseudo time based on one of the cells a partition
        # partn_cds <- order_cells(partn_cds, root_cells = trajectory_start_cell, verbose = TRUE)

        ## Plot the trajectory/pseudotime
        # g5 <- plot_cells(partn_cds,
        #        color_cells_by = "pseudotime",
        #        label_cell_groups=FALSE,
        #        label_leaves=FALSE,
        #        label_branch_points=FALSE,
        #        graph_label_size=1.5
    }

    if (de) {
        all_coef3 <- NULL
        ## A fits object of length 1 is skipped.
        ## NOTE(review): presumably a length-1 placeholder is saved when no models
        ## could be fit for the partition - confirm against the notebook that
        ## writes 'Gene.Model.Fits.rds'.
        allow_de <- length(gene_all_fits_pat_cond_unadj) != 1
        if (allow_de) {
            ## Columns written to the text tables (2 and 3 are left out).
            ## NOTE(review): judging by the 'NoModels' file names these are
            ## presumably the model object columns - confirm.
            coef_cols <- c(1, 4:12)

            ## Get the DE coefficients for every gene and model term
            all_coef3 <- coefficient_table(gene_all_fits_pat_cond_unadj)
            write.table(all_coef3[, coef_cols], file = 'Diff.AllCoeff.NoModels.txt',
                        col.names = TRUE, row.names = FALSE)

            ## Keep only the rows of the "conditionhealthy" term
            condition_terms3 <- all_coef3 %>% filter(term == "conditionhealthy")
            write.table(condition_terms3[, coef_cols],
                        'Diff.AllHealthCoeff.NoModels.txt', col.names = TRUE, row.names = FALSE)

            ## Of those, keep only the rows whose status is 'OK'
            condition_terms3b <- condition_terms3 %>% filter(status == 'OK')
            write.table(condition_terms3b[, coef_cols],
                        'Diff.AllHealthCoeff.RemoveFail.NoModels.txt', col.names = TRUE, row.names = FALSE)
        }
    }
}

In [24]:
## Loop through the partitions and export the gene rank file (needed for GSEA Tool).
## For every partition that has a filtered DE coefficient table with more than
## 100 rows, each gene is scored as (1 - |p_value|) * sign(estimate) and the
## genes are written, sorted by decreasing score, as a headerless two-column
## tab-separated file.
## seq_along(celltypes6) is used so this cell does not depend on 'np' having
## been set by an earlier cell, and runs zero iterations for an empty list.
for (p in seq_along(celltypes6)) {
    ## NOTE: this changes the session working directory to the partition folder
    setwd(paste0(datdir, '/', ps$outdir, celltypes6[p]))
    de_file <- 'Diff.AllHealthCoeff.RemoveFail.NoModels.txt'
    if (!file.exists(de_file)) next

    de_tab <- read.table(de_file, header = TRUE, stringsAsFactors = FALSE)
    ## Partitions with 100 rows or fewer get no rank file
    if (nrow(de_tab) <= 100) next

    ## Build the table directly as a data frame so the scores stay numeric
    ## (no round trip through a character matrix)
    rank_list <- data.frame(gene = de_tab$gene_short_name,
                            score = (1 - abs(de_tab$p_value)) * sign(de_tab$estimate),
                            stringsAsFactors = FALSE)
    ## Drop genes without a usable score so no 'NA' entries end up in the file
    rank_list <- rank_list[!is.na(rank_list$score), ]
    rank_list <- rank_list[order(rank_list$score, decreasing = TRUE), ]
    write.table(rank_list, file = 'Diff.Exp.Gene.Rank.List.pvalue.rnk', sep = '\t',
                col.names = FALSE, row.names = FALSE, quote = FALSE)
}

In [25]:
## Loop through the partitions and generate violin plots, but only if DE exists.
## For each partition with a filtered DE table of more than 100 rows, the
## 'ngenes' lowest-p-value genes with a positive estimate and with a negative
## estimate are plotted, grouped by condition, into two PDF files.

## Write a single plot to a PDF file in the current working directory.
## on.exit() guarantees the graphics device is closed even if printing fails.
save_plot_pdf <- function(plot_obj, file, width, height) {
    pdf(file, width = width, height = height)
    on.exit(dev.off(), add = TRUE)
    print(plot_obj)
    invisible(file)
}

## Return the gene short names of (at most) the n rows of 'tab' with the
## smallest p_value. head() returns fewer than n names when fewer rows are
## available, instead of padding the result with NA.
top_de_genes <- function(tab, n) {
    tab <- tab[order(tab$p_value, decreasing = FALSE), ]
    head(tab$gene_short_name, n)
}

np <- length(celltypes6)
ngenes <- 10
gout <- TRUE
for (p in seq_len(np)) {
    ## NOTE: this changes the session working directory to the partition folder
    setwd(paste0(datdir, '/', ps$outdir, celltypes6[p]))

    ## Skip partitions without a (large enough) DE table before loading any data
    de_file <- 'Diff.AllHealthCoeff.RemoveFail.NoModels.txt'
    if (!file.exists(de_file)) next
    de_tab <- read.table(de_file, header = TRUE, stringsAsFactors = FALSE)
    if (nrow(de_tab) <= 100) next

    ## Load the partition cell set only when it is actually going to be plotted
    partn_cds <- readRDS('Partition.Cell.Set.rds')

    ## which() drops rows whose estimate is NA instead of turning them into NA rows
    de_tab_plus <- top_de_genes(de_tab[which(de_tab$estimate > 0), ], ngenes)
    de_tab_minus <- top_de_genes(de_tab[which(de_tab$estimate < 0), ], ngenes)

    ## Restrict the cell set to the selected genes (row subsetting)
    gene_names <- rowData(partn_cds)$gene_short_name
    partn_cds_plus <- partn_cds[gene_names %in% de_tab_plus, ]
    partn_cds_minus <- partn_cds[gene_names %in% de_tab_minus, ]

    ## Genes with a positive "conditionhealthy" estimate
    if (nrow(partn_cds_plus) > 0) {
        plotfp <- 'Violin.Plot.Top10.Healthy.DiffExp.pdf'
        g1 <- plot_genes_violin(partn_cds_plus, group_cells_by = "condition", ncol = ngenes) +
            theme(axis.text.x = element_text(angle = 45, hjust = 1))
        save_plot_pdf(g1, plotfp, width = 10, height = 4)
    }

    ## Genes with a negative "conditionhealthy" estimate
    if (nrow(partn_cds_minus) > 0) {
        plotfp <- 'Violin.Plot.Top10.Diseased.DiffExp.pdf'
        g1 <- plot_genes_violin(partn_cds_minus, group_cells_by = "condition", ncol = ngenes) +
            theme(axis.text.x = element_text(angle = 45, hjust = 1))
        save_plot_pdf(g1, plotfp, width = 10, height = 4)
    }
}

“Removed 659 rows containing non-finite values (stat_summary).”

In [27]:
## Run garbage collection and report the session's memory usage
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,59087923,3155.7,163826731,8749.3,163826731,8749.3
Vcells,181370434,1383.8,1204212160,9187.5,1505265200,11484.3


In [26]:
## Interactive lookup of the write.table documentation (no effect on the analysis)
help(write.table)