In [1]:
## ---- Partition Cell Set Partial Correlation Analysis-Network Notebooks B ----
## Load Analysis Parameters (Parm1)
## Loop Through Each Partition and Perform Analysis as Follows:
## Load Partition Cell Set
## Perform Partial Correlation Analysis
## Perform FDR Scoring
## Create 50000 Edge Networks and Save
## Save Tabulated and Summarized Partition Results

In [2]:
## Create the working input/output data directory if it does not exist,
## then make it the working directory for the rest of the notebook
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste0(parentdir, 'CDC2')
## dir.exists() (rather than file.exists()) so a regular file with the same
## name is not mistaken for the directory
if (!dir.exists(datdir)) {
    ## recursive = TRUE also creates any missing parent directories; without
    ## it dir.create() only warns and the setwd() below would then fail
    dir.create(datdir, recursive = TRUE)
}
setwd(datdir)

In [3]:
## Load the analysis parameters table from the working directory.
## The first line of parms.txt supplies the column names; text columns are
## kept as character (not factor).
ps <- read.table('parms.txt', header = TRUE, stringsAsFactors = FALSE)

In [4]:
## Read the GENCODE v27 Data (both files live in the input directory ps$indir)
v27_gen <- read.table(paste0(ps$indir, 'v27_Embl_Hugo.txt'), header = FALSE)
## Keep only the first 15 characters of the first column
## NOTE(review): presumably this strips the version suffix from the Ensembl
## gene id -- confirm against the file contents
v27_gen[, 1] <- substr(v27_gen[, 1], 1, 15)
v27 <- read.table(paste0(ps$indir, 'v27_Embl_Hugo_Type.txt'))

In [6]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)
library(grid)
library(igraph)
library(corpcor)
library(fdrtool)
## Get the igraph utilities
source('/gpfs/home/sjsmith/code/xtissue_paper/notebooks/Utilities/setup_igraph.r')
## Alternate to the stat_parm utility in the igraph utilities.
## Unlike that utility, the expression data is used as given (no antilog is
## taken, i.e. the data is not assumed to be log10 transformed).
##
## Arguments:
##   gf      - igraph network whose vertex names are columns of expdata and
##             whose edges carry a 'pcor' attribute
##   expdata - expression matrix with one column per gene
## Returns:
##   gf with vertex attributes mean, sd, cv, med and strength added
stat_parm2 <- function(gf, expdata) {

    ## Restrict the expression data to the genes present in the network
    node_expr <- expdata[, V(gf)$name]

    ## Per-gene (column-wise) expression statistics stored on the vertices;
    ## cv is the coefficient of variation (sd / mean)
    V(gf)$mean <- apply(node_expr, 2, mean)
    V(gf)$sd <- apply(node_expr, 2, sd)
    V(gf)$cv <- V(gf)$sd / V(gf)$mean
    V(gf)$med <- apply(node_expr, 2, median)

    ## Node strength, weighting each edge by its absolute partial correlation
    edge_weights <- abs(E(gf)$pcor)
    V(gf)$strength <- strength(gf, weights = edge_weights)

    gf
}
## Alternate to the plot function in the utilities.
## Decorates a network with plotting attributes and returns it; nothing is
## drawn here.
##
## Arguments:
##   gf - igraph network with edge attribute 'pcor' and vertex attributes
##        'v27' and 'strength'
## Returns:
##   gf with color, label, label.cex, width and size attributes set
plot_parm2 <- function(gf) {

    ## The partial correlations drive edge colour, width and label
    edge_pcor <- E(gf)$pcor

    ## All vertices red; positive edges green, negative edges orange
    V(gf)$color <- 'red'
    E(gf)$color[edge_pcor > 0] <- 'green'
    E(gf)$color[edge_pcor < 0] <- 'orange'

    ## Small label text for both vertices and edges
    V(gf)$label.cex <- .1
    E(gf)$label.cex <- .1

    ## Vertex label: first 10 characters of v27 over the rounded strength
    short_name <- substr(V(gf)$v27, 1, 10)
    V(gf)$label <- paste(short_name, round(V(gf)$strength, 3), sep = '\n')

    ## Edge width scales with |pcor|; edge label is the rounded pcor
    E(gf)$width <- 100 * abs(edge_pcor)
    E(gf)$label <- round(edge_pcor, 4)
    V(gf)$size <- 1

    gf
}

In [7]:
## Load the previously preprocessed, downsampled cell set data object from
## the output directory named in the parameters file
down_stdycds <- readRDS(
    paste0(ps$outdir, 'Aggregated.downsampled.QC.NoDoublets.Repartitioned.rds')
)

In [8]:
## Lookup vector: gene short name -> gene id (Ensembl).
## Values are the feature-data row names; names are the gene short names.
short2geneid <- setNames(fData(down_stdycds)@rownames,
                         fData(down_stdycds)@listData$gene_short_name)

In [9]:
## Lookup vector: gene id (Ensembl) -> gene short name.
## Values are the gene short names; names are the feature-data row names.
geneid2short <- setNames(fData(down_stdycds)@listData$gene_short_name,
                         fData(down_stdycds)@rownames)

In [10]:
## Describe how the six cell sets are organized: the condition alternates
## healthy/diseased, and each patient id covers two consecutive cell sets
cellgrps <- rep(c('healthy', 'diseased'), times = 3)
cellpats <- rep(c('ID Number 1', 'ID Number 2', 'ID Number 3'), each = 2)

In [11]:
## Names of the six partitions (cell types). Later cells use these strings
## both as per-partition directory names and as row labels, so the order and
## spelling matter.
celltypes6 <- c(
    '1-Macrophages', '2-Endothelial Cells',
    '3-VSMCs', '4-Natural Killer Cells',
    '5-Cytotoxic T Lymphocytes', '6-B Lymphocytes'
)

In [12]:
## Declare Tom's best genes for defining cell types.
## The markers are listed three per group, in the original order.
toms_markers5 <- unlist(list(
    c('NRXN1', 'CLU', 'ICAM2'),
    c('CD14', 'CD68', 'AIF1'),
    c('VWF', 'EDN1', 'ECSCR'),
    c('MKI67', 'UBE2C', 'TOP2A'),
    c('ACTA2', 'TAGLN', 'MYL9'),
    c('ACKR1', 'SPARCL1', 'PECAM1'),
    c('CALD1', 'MGP', 'DCN'),
    c('NKG7', 'XCL1', 'CTSW'),
    c('CD8A', 'TRAC', 'CD2'),
    c('MS4A1', 'CD79A', 'BANK1'),
    c('CD69', 'CXCR4', 'IL7R'),
    c('LILRA4', 'IRF7', 'CLEC4C'),
    c('MZB1', 'JCHAIN', 'TNFRSF17'),
    c('LST1', 'FCGR3B', 'S100A8'),
    c('TPSAB1', 'CPA3', 'MS4A2')
))
## Translate the short names to gene ids (Ensembl) via the lookup vector
toms_gene_ids5 <- short2geneid[toms_markers5]

## Doug's marker set, listed four per group, in the original order
doug_markers1 <- unlist(list(
    c('AIF1', 'LYZ', 'FCER1G', 'CD68'),
    c('RNASE1', 'PECAM1', 'IGFBP4', 'ADIRF'),
    c('SOD3', 'MYL9', 'CALD1', 'GSN'),
    c('TYROBP', 'NKG7', 'CTSW', 'CD69'),
    c('CD3D', 'CD2', 'TRBC2', 'TRAC'),
    c('MS4A1', 'CD79A', 'HLA-DQA1', 'CD37')
))
dougs_gene_ids1 <- short2geneid[doug_markers1]

In [None]:
## Place for saving partition network related statistics: one row per
## partition (cell type), one column per statistic.
## The row count is taken directly from celltypes6 because `np` is only
## assigned later, in the analysis loop cell; using it here fails on a
## top-to-bottom run.
netstats <- matrix(0, nrow = length(celltypes6), ncol = 6)
colnames(netstats) <- c('nodes', 'edges', 'max_qval', 'max_pval', 'min_pcor', 'max_pcor')

In [32]:
## Loop through the partitions and perform analysis, making networks.
## For each partition: load the cell set, filter genes/cells, compute shrinkage
## partial correlations, score them with fdrtool, keep at most max_fdr_cut
## edges, build an igraph network, save it, and record summary statistics in
## the corresponding row of netstats (which must already exist).

## Determine the number of partitions
np <- length(celltypes6)

## Some control parameters (many are not used in this variation of the network notebooks)
gout <- TRUE
gene_cut <- .05              # Minimum average UMIs per cell needed to keep a gene (.05 = 1 UMI per 20 cells)
max_fdr_cut <- 50000         # The maximum number of significant edges to save (treated as hardcoded)
graph_edges <- 1000          # The number of edges to plot
cutoff <- 1                  # FDR p-value cutoff for partial correlations, forced high for this notebook
diff_exp_qval_cut <- 0.05    # Differential Expression q-value cutoff
min_sig_fdr <- 10            # There needs to be at least 10 significant edges to form a network

for (p in seq_len(np)) {
    ## Work inside this partition's directory; the files read and written
    ## below are addressed relative to it
    setwd(paste0(datdir, '/', ps$outdir, celltypes6[p]))

    ## Load the partition cell set and its expression matrix (genes x cells)
    partn_cds <- readRDS('Partition.Cell.Set.rds')
    partn_exp <- exprs(partn_cds)

    ## There are lots of low expressing genes. Keep a gene only when its
    ## total count exceeds gene_cut times the number of cells
    sig_genes <- (rowSums(partn_exp) > ncol(partn_exp) * gene_cut)
    partn_exp <- partn_exp[sig_genes, ]

    ## There may also be some cells with no variance. Remove those too
    sig_cells <- (apply(partn_exp, 2, var) > 0)
    partn_exp <- partn_exp[, sig_cells]

    ## Transpose (cells x genes) for the next steps
    partn_exp <- t(partn_exp)

    ## Compute the partial correlations
    pc_partn_exp <- pcor.shrink(partn_exp)

    ## Convert them to a vector, with the matching (row, col) matrix indices
    pcv_partn <- sm2vec(pc_partn_exp)
    pcv_partn_ind <- sm.index(pc_partn_exp)

    ## Compute the FDR scores; fdrtool's diagnostic plot goes to the PDF.
    ## The device is closed in `finally` so it is not leaked if fdrtool fails
    pdf('FDR.Tool.Results.50000.pdf', width = 8, height = 6)
    pcv_partn_fdr <- tryCatch(
        fdrtool(pcv_partn, statistic = "correlation"),
        finally = dev.off()
    )

    ## Get the expression data gene names
    pcor_v_names <- colnames(partn_exp)

    # Create an ordered set of the correlation data and fdrtools data. Include columns
    # with pointers to the partial correlation matrix row and column so that the original
    # data can be referenced.
    pcor_order <- order(pcv_partn, decreasing = FALSE)
    composite <- cbind(pcv_partn, pcv_partn_fdr$pval, pcv_partn_fdr$qval, pcv_partn_fdr$lfdr,
                       pcor_order, pcv_partn_ind[, 1], pcv_partn_ind[, 2])

    # Sort the array by partial correlation values (drop = FALSE keeps a
    # matrix even when there is a single row)
    com_s <- composite[pcor_order, , drop = FALSE]

    # Add a column with the row number (after sort)
    com_s <- cbind(com_s, seq_len(nrow(com_s)))

    # Make nicer column names
    colnames(com_s) <- c('pcor', 'pval', 'qval', 'lfdr', 'ord', 'row', 'col', 'idx')

    # Keep the rows whose p-value passes the cutoff
    cutset <- com_s[, 'pval'] <= cutoff
    fdr_cut <- as.data.frame(com_s[cutset, , drop = FALSE])

    ## Skip this partition (and continue with the next one) when fewer than
    ## min_sig_fdr edges pass; its netstats row keeps its initial values
    if (nrow(fdr_cut) < min_sig_fdr) {
        message('Skipping partition ', celltypes6[p], ': only ', nrow(fdr_cut),
                ' edges pass the cutoff')
        next
    }

    ## Cut down the number of edges if it is greater than max_fdr_cut:
    ## keep the max_fdr_cut smallest p-values, then restore the pcor ordering
    if (nrow(fdr_cut) > max_fdr_cut) {
        qorder <- order(fdr_cut$pval, decreasing = FALSE)
        fdr_cut <- fdr_cut[qorder[seq_len(max_fdr_cut)], ]
        pcorder <- order(fdr_cut$pcor, decreasing = FALSE)
        fdr_cut <- fdr_cut[pcorder, ]
    }

    # Add a column in the fdr data for the TO node (for edge definition)
    fdr_cut$to <- pcor_v_names[fdr_cut$row]
    # Add a column in the fdr data for the FROM node (for edge definition)
    fdr_cut$from <- pcor_v_names[fdr_cut$col]

    # Rank edges by absolute partial correlation (rank 1 = largest |pcor|)
    arank_v <- order(abs(fdr_cut$pcor), decreasing = TRUE)
    fdr_cut$arank[arank_v] <- seq_len(nrow(fdr_cut))

    # Every retained gene becomes a vertex, whether or not it has an edge
    nodes <- pcor_v_names

    # Convert the fdr data and node list into an undirected graph; the first
    # two columns define the edges, the rest become edge attributes
    net <- graph_from_data_frame(d = fdr_cut[, c('to', 'from', 'pcor', 'pval', 'qval', 'lfdr',
                                                 'ord', 'row', 'col', 'idx', 'arank')],
                                 vertices = nodes, directed = FALSE)

    ## Write the network object and the FDR object
    saveRDS(net, 'Network.50000.rds')
    saveRDS(fdr_cut, "FDR_CUT.50000.rds")

    ## Record nodes, edges, max q-value, max p-value, min |pcor|, max |pcor|
    netstats[p, ] <- c(length(V(net)$name), length(E(net)$pcor),
                       max(E(net)$qval), max(E(net)$pval),
                       min(abs(E(net)$pcor)), max(abs(E(net)$pcor)))
}

Estimating optimal shrinkage intensity lambda (correlation matrix): 0.7037 

Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Step 5... prepare for plotting



In [54]:
## Write the per-partition network statistics to the root output data
## directory as a tab-separated table.

## Label each row with its cell type
rownames(netstats) <- celltypes6

## Destination file
netstatsfile <- paste0(datdir, '/', ps$outdir,
                       'Netstats.Partition.50000.Edges.Rev1.txt')

## Round to the 5th digit, then write with row and column names, unquoted
netstats2 <- round(netstats, 5)
write.table(netstats2, file = netstatsfile, sep = '\t', quote = FALSE,
            row.names = TRUE, col.names = TRUE)