In [16]:
## ---- Partition Cell Set Network Module Analysis ---- 0671.00.00
## Load Analysis Parameters (Parm1)
## Load Network Utilities
## Loop Through Each Partition and Perform Analysis as Follows:
## Load Partition Cell Set
## Load Large 50,000 Edge Networks Created in 0620.00.00.Carotid.SC.NotebookPartitionAnalysisNetwork.B.01.ipynb (0620.00.00)
## Cut networks to 2,000 edges
## Create Clusters Using Recursive Louvain (Rev 2 plots use weight = pcor, Rev1 plots use 1/pcor)
## Plot the Network with Clusters (2,000 Edges Each)
## The following items will be performed in another notebook
## Perform Module GO Enrichment Analysis on Selected Clusters (topGO, with Tables and GO Plots)
## Note: Use GO Biological Process Terms for Enrichment Analysis
## Save Module Analysis Results to 'Modules.2000' Directory 

In [17]:
## Make sure the working input/output data directory exists (create it if it
## does not), then make it the current working directory
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste0(parentdir, 'CDC2')
if (!file.exists(datdir)) {
    dir.create(datdir)
}
setwd(datdir)

In [18]:
## Load the analysis parameters table; the first line of the file holds the
## column names and text columns are kept as character
ps <- read.table(
    file = 'parms.txt',
    header = TRUE,
    stringsAsFactors = FALSE
)

In [19]:
## Read the GENCODE v27 Data
## v27_gen: headerless table read from v27_Embl_Hugo.txt; column 1 is trimmed
## to its first 15 characters so it can be matched against network node names
## (presumably this strips an Ensembl version suffix - confirm)
v27_gen <- read.table(paste0(ps$indir, 'v27_Embl_Hugo.txt'), header = FALSE)
v27_gen[, 1] <- substr(v27_gen[, 1], 1, 15)
## v27: companion table read from v27_Embl_Hugo_Type.txt (its third column is
## used later as the gene type)
v27 <- read.table(paste0(ps$indir, 'v27_Embl_Hugo_Type.txt'))

In [20]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)
library(grid)
library(igraph)
library(corpcor)
library(fdrtool)

## Get the igraph utilities
source('/gpfs/home/devans/code/Utilities/fromSara/setup_igraph.r')
source('/gpfs/home/devans/code/Utilities/fromSara/recursive_louvain.r')
source('/gpfs/home/devans/code/Utilities/fromSara/cluster_score.r')
source('/gpfs/home/devans/code/Utilities/fromSara/cluster_overlap.r')
source('/gpfs/home/devans/code/Utilities/fromSara/plot_by_cluster.r')

## Alternate to stat_parm utility in igraph utilities (does not assume log10 data)
##
## Arguments:
##   gf      - igraph network; its vertex names must be column names of
##             expdata and its edges must carry a 'pcor' attribute
##   expdata - expression matrix with one column per gene; values are used
##             as-is (no antilog is taken)
## Returns:
##   gf with the vertex attributes mean, sd, cv, med and strength set
stat_parm2 <- function(gf, expdata) {

    ## Remove expression values not in the network.
    ## drop = FALSE keeps a one-column matrix when the network has a single
    ## node, so the apply() calls below still see two dimensions
    expdata_filt <- expdata[, V(gf)$name, drop = FALSE]

    ## Compute the node expression stats, including coeff of variation, add to network
    V(gf)$mean <- apply(expdata_filt, 2, mean)
    V(gf)$sd <- apply(expdata_filt, 2, sd)
    V(gf)$cv <- V(gf)$sd / V(gf)$mean
    V(gf)$med <- apply(expdata_filt, 2, median)

    ## Compute the strength, weighting each edge by its absolute partial correlation
    V(gf)$strength <- strength(gf, weights = abs(E(gf)$pcor))
    gf
}
## Alternate to plot function in utility
##
## Sets the plotting attributes of a network.
## Arguments:
##   gf - igraph network whose edges carry 'pcor' and whose vertices carry
##        'v27' (gene label) and 'strength'
## Returns:
##   gf with vertex color/label/label.cex/size and edge
##   color/width/label/label.cex set
plot_parm2 <- function(gf) {

    V(gf)$color <- 'red'

    ## Build the edge colour vector at full length before assigning it.
    ## Sub-assigning into a missing attribute (NULL) would only grow the vector
    ## up to the last matching edge, which does not fit the edge count.
    ## Edges with pcor == 0 keep whatever colour they had (NA if none).
    edge_pcor <- E(gf)$pcor
    edge_color <- E(gf)$color
    if (is.null(edge_color)) {
        edge_color <- rep(NA_character_, ecount(gf))
    }
    edge_color[edge_pcor > 0] <- 'green'
    edge_color[edge_pcor < 0] <- 'orange'
    E(gf)$color <- edge_color

    ## Edge thickness is proportional to the absolute partial correlation
    E(gf)$width <- 100 * abs(edge_pcor)
    E(gf)$label <- round(edge_pcor, 4)
    E(gf)$label.cex <- .1

    ## Node label: first 10 characters of the gene label over the rounded strength
    V(gf)$label <- paste(substr(V(gf)$v27, 1, 10),
                         round(V(gf)$strength, 3), sep = '\n')
    V(gf)$label.cex <- .1
    V(gf)$size <- 1

    gf
}

In [21]:
## Load the previously preprocessed, downsampled cell set data object from
## the output directory named in the parameters table
down_stdycds <- readRDS(
    file = paste0(ps$outdir, 'Aggregated.downsampled.QC.NoDoublets.Repartitioned.rds')
)

In [22]:
## Lookup vector: values are the feature row names (Ensembl gene ids),
## names are the matching gene short names
short2geneid <- setNames(fData(down_stdycds)@rownames,
                         fData(down_stdycds)@listData$gene_short_name)

In [23]:
## Lookup vector: values are the gene short names,
## names are the matching feature row names (Ensembl gene ids)
geneid2short <- setNames(fData(down_stdycds)@listData$gene_short_name,
                         fData(down_stdycds)@rownames)

In [24]:
## Describe how the six cell sets are organized: condition alternates
## healthy/diseased, and each patient id covers two consecutive sets
cellgrps <- rep(c('healthy', 'diseased'), times = 3)
cellpats <- rep(c('ID Number 1', 'ID Number 2', 'ID Number 3'), each = 2)

In [25]:
## Cell type label for each partition, in partition order; these labels are
## also used as the per-partition directory names
celltypes6 <- c(
    '1-Macrophages', '2-Endothelial Cells', '3-VSMCs',
    '4-Natural Killer Cells', '5-Cytotoxic T Lymphocytes', '6-B Lymphocytes'
)

In [26]:
## Tom's best genes for defining cell types, listed three markers per row
toms_markers5 <- c(
    'NRXN1', 'CLU', 'ICAM2',
    'CD14', 'CD68', 'AIF1',
    'VWF', 'EDN1', 'ECSCR',
    'MKI67', 'UBE2C', 'TOP2A',
    'ACTA2', 'TAGLN', 'MYL9',
    'ACKR1', 'SPARCL1', 'PECAM1',
    'CALD1', 'MGP', 'DCN',
    'NKG7', 'XCL1', 'CTSW',
    'CD8A', 'TRAC', 'CD2',
    'MS4A1', 'CD79A', 'BANK1',
    'CD69', 'CXCR4', 'IL7R',
    'LILRA4', 'IRF7', 'CLEC4C',
    'MZB1', 'JCHAIN', 'TNFRSF17',
    'LST1', 'FCGR3B', 'S100A8',
    'TPSAB1', 'CPA3', 'MS4A2'
)
## Translate the short names to gene ids through the short-name lookup
toms_gene_ids5 <- short2geneid[toms_markers5]


## Doug's marker list, four markers per row, translated the same way
doug_markers1 <- c(
    'AIF1', 'LYZ', 'FCER1G', 'CD68',
    'RNASE1', 'PECAM1', 'IGFBP4', 'ADIRF',
    'SOD3', 'MYL9', 'CALD1', 'GSN',
    'TYROBP', 'NKG7', 'CTSW', 'CD69',
    'CD3D', 'CD2', 'TRBC2', 'TRAC',
    'MS4A1', 'CD79A', 'HLA-DQA1', 'CD37'
)
dougs_gene_ids1 <- short2geneid[doug_markers1]

In [28]:
## Loop through the partitions and perform analysis, making networks
##
## For each partition directory this reads the partition cell set, the optional
## differential expression table and the saved 50,000 edge network, cuts the
## network down to the edges_to_keep most significant edges, clusters it with
## Louvain, colours the nodes by differential expression, saves the results
## and writes the cluster plots.

## Remove every connected component of gf that has fewer than min_edges edges
## and return the pruned network (vertices are addressed by name)
drop_small_components <- function(gf, min_edges) {
    comps <- decompose(gf, min.vertices = 0)
    doomed <- unlist(lapply(comps, function(cmp) {
        if (ecount(cmp) < min_edges) V(cmp)$name else NULL
    }))
    if (length(doomed) > 0) {
        gf <- delete_vertices(gf, v = doomed)
    }
    gf
}

## Determine the number of partitions
np <- length(celltypes6)
## When trimming, decomposing, or re-aggregating the network,
## what are the fewest edges of subnets to re-aggregate
min_edges <- 4
diff_exp_qval_cut <- 0.05    # Differential Expression q-value cutoff for displaying on graph
plotflag <- TRUE
edges_to_keep <- 2000
netstat_m <- NULL

## NOTE(review): the loop starts at partition 2, so the first cell type is not
## processed here - confirm this is intentional.
## seq_len(np)[-1] is 2:np, but stays empty instead of counting down when np < 2
for (p in seq_len(np)[-1]) {
    setwd(paste0(datdir, '/', ps$outdir, celltypes6[p]))

    ## Read partition cell data set, expression data, and diff expression data
    partn_cds <- readRDS('Partition.Cell.Set.rds')
    partn_exp <- t(exprs(partn_cds))

    ## Reset the DE table on every pass so that a partition without a DE file
    ## never inherits the table read for the previous partition
    de_file <- 'Diff.AllHealthCoeff.RemoveFail.NoModels.txt'
    de_tab <- NULL
    if (file.exists(de_file)) {
        de_tab <- read.table(de_file, header = TRUE, stringsAsFactors = FALSE)
    }

    ## Read the saved network
    net2 <- readRDS('Network.50000.rds')

    ## Remove the singleton nodes
    net2 <- delete_vertices(net2, v = V(net2)$name[degree(net2) == 0])

    ## Remove the small disconnected modules
    net2 <- drop_small_components(net2, min_edges)

    ## Delete Sufficient Edges to Get Down to 2,000, and then remove singletons.
    ## With edges_to_keep or fewer edges there is nothing to cut (and the
    ## cutoff lookup below would be NA), so the network is left as it is
    if (ecount(net2) > edges_to_keep) {
        p_value_cutoff_deletion <- sort(E(net2)$pval, decreasing = FALSE)[edges_to_keep]
        net2 <- delete_edges(net2,
                             edges = E(net2)[!(E(net2)$pval <= p_value_cutoff_deletion)])
    }

    ## Remove the singleton nodes
    net2 <- delete_vertices(net2, v = V(net2)$name[degree(net2) == 0])

    ## Remove the small disconnected modules
    net2 <- drop_small_components(net2, min_edges)

    if (plotflag) {
        ## Get the FDR scores
        fdr_cut <- readRDS('FDR_CUT.50000.rds')
        V(net2)$v27 <- as.vector(v27_gen[match(V(net2)$name, v27_gen[, 1]), 2])
        ## Lookup the HUGO gene names and types for the network nodes
        hv27types <- as.vector(v27[match(V(net2)$name, substr(v27[, 1], 1, 15)), 3])
        ## Save the gene types for V27 as attributes in the network
        V(net2)$v27type <- hv27types
        ## Set the weight equal to the absolute pcor values
        ## This is an interpretation of weight = closer (larger values are closer nodes)
        E(net2)$weight <- abs(E(net2)$pcor)
        ## Add degree and strength node attributes
        V(net2)$strength <- strength(net2, weights = abs(E(net2)$pcor))
        V(net2)$degree <- degree(net2, normalized = TRUE)
        ## NOTE(review): stat_parm2 in this file is described as the alternate
        ## that does not assume log10 data, but stat_parm is what is called
        ## here - confirm which one is intended for partn_exp
        net2 <- stat_parm(net2, partn_exp)
        V(net2)$gene <- V(net2)$name
        ## Get the annotation information, force single tissue (cell type)
        net2 <- annot_parm(net2)
        ## Get the plot information
        net2 <- plot_parm2(net2)

        ## This was mistakenly used in the pcor = 1E-1 dense plot
        net2_clust <- cluster_louvain(net2)
        clust_sc <- cluster_score(net2, clusters = membership(net2_clust),
                                  sizes = TRUE, max.str = TRUE)

#         ## The other cluster option to consider using
#         net2_reclust <- recursive_louvain(net2, biggest = 150)
#         reclust_sc <- cluster_score(net2, clusters = net2_reclust, sizes = TRUE, max.str = TRUE)

        ## Get the significant differential expressed terms (none when this
        ## partition has no DE table)
        qsig <- NULL
        if (!is.null(de_tab)) {
            qsig <- de_tab %>%
                dplyr::filter(q_value < diff_exp_qval_cut) %>%
                dplyr::select(gene_short_name, normalized_effect, q_value)
        }

        ## If there are any significant DE genes, colorize them in the network
        V(net2)$color <- rgb(0, 0, 1, alpha = .75)
        if (!is.null(qsig) && nrow(qsig) >= 1) {
            ## Normalized effect per gene, scaled so the largest magnitude is 1
            gene2ne <- qsig$normalized_effect
            names(gene2ne) <- qsig$gene_short_name
            gene2ne <- gene2ne / max(abs(gene2ne))

            ## Look the effect up per node, in node order, so that every colour
            ## lands on the gene it was computed for (NA = not significant)
            node_effect <- unname(gene2ne[match(V(net2)$v27, names(gene2ne))])
            node_color <- V(net2)$color
            up_nodes <- which(node_effect > 0)
            down_nodes <- which(node_effect < 0)
            if (length(up_nodes) > 0) {
                up_level <- round(node_effect[up_nodes], 3)
                node_color[up_nodes] <- rgb(1, up_level, up_level, alpha = .5)
            }
            if (length(down_nodes) > 0) {
                down_level <- round(-node_effect[down_nodes], 3)
                node_color[down_nodes] <- rgb(down_level, 1, down_level, alpha = .5)
            }
            V(net2)$color <- node_color
        }

        ## Save everything before plotting
        saveRDS(net2, 'Network.2k.rds')
        saveRDS(net2_clust, 'Louvain.Clust.2k.rds')
        saveRDS(clust_sc, 'Louvain.Clust.Score.2k.rds')
#         saveRDS(net2_reclust, 'Recuvsive.Louvain.Clust.2k.rds')
#         saveRDS(reclust_sc, 'Recursive.Louvain.Clust.Score.2k.rds')

        ## Plot the clusters selected by score cutoff, with and without crosses
        plot_by_cluster(graph = net2, clusts = membership(net2_clust), scores = clust_sc,
                mode = c('number','cutoff','specify')[2],
                numclusts = 5, minscore = .01, specifyclusts = c(1,3,5,9), crosses = TRUE,
                filename = 'Medium.Dense.2k.Network.Rev2.wCrosses.pdf')

        plot_by_cluster(graph = net2, clusts = membership(net2_clust), scores = clust_sc,
                mode = c('number','cutoff','specify')[2],
                numclusts = 5, minscore = .01, specifyclusts = c(1,3,5,9), crosses = FALSE,
                filename = 'Medium.Dense.2k.Network.Rev2.woCrosses.pdf')
    }
}

In [15]:
## Display the current value of p, the name used as the partition loop index
## NOTE(review): this cell's execution count (15) is lower than the loop
## cell's, so it may be left over from an earlier run - confirm it is needed
p