In [1]:
## ---- Partition Cell Set(s) Extraction Notebook ---
## Read R Data Object from Prior Notebook (CDS1)
## Load Analysis Parameters (Parm1)
## Loop Through Each Partition and Perform Network Analysis as Follows:
##    Extract Partition Cell Subsets and Save Them
##    Extract Differential Expression Gene Set Models and Save Them 

In [2]:
## Create the working input/output data directory if it does not exist,
## then make it the working directory (relative paths used after this point
## resolve against datdir).
parentdir <- '/gpfs/group/torkamani/devans/'
datdir <- paste0(parentdir, 'CDC2')
## dir.exists() rather than file.exists(), so a regular file named 'CDC2' is
## not mistaken for the directory; recursive = TRUE also creates any missing
## parent directories instead of failing on them.
if (!dir.exists(datdir)) {
    dir.create(datdir, recursive = TRUE)
}
setwd(datdir)

In [3]:
## Load the analysis parameters table from the working directory
## (the first line of the file is the header row; strings stay as characters)
ps <- read.table('parms.txt', header = TRUE, stringsAsFactors = FALSE)

In [5]:
## Load Monocle3 and Seurat Libraries
library(monocle3)
library(Seurat)
library(dplyr)
library(magrittr)
library(ggplot2)
library(gridExtra)
library(Matrix)
library(rhdf5)
library(grid)

In [6]:
## Load the previously preprocessed, downsampled cell data set object;
## the file is looked up under the output directory given in the parameters
down_stdycds <- readRDS(
    paste0(ps$outdir, 'Aggregated.downsampled.QC.NoDoublets.Repartitioned.rds')
)

In [7]:
## Lookup vector: gene short name -> gene id (Ensembl).
## Values are the feature table's row names; names are its gene_short_name column.
short2geneid <- setNames(fData(down_stdycds)@rownames,
                         fData(down_stdycds)@listData$gene_short_name)

In [8]:
## Lookup vector: gene id (Ensembl) -> gene short name.
## Values are the feature table's gene_short_name column; names are its row names.
geneid2short <- setNames(fData(down_stdycds)@listData$gene_short_name,
                         fData(down_stdycds)@rownames)

In [9]:
## Describe how the cell sets are organized, as two parallel vectors of
## length six: the condition group and the patient label of each cell set
## (presumably one healthy and one diseased set per patient - confirm)
cellgrps <- rep(c('healthy', 'diseased'), times = 3)
cellpats <- rep(c('ID Number 1', 'ID Number 2', 'ID Number 3'), each = 2)

In [10]:
## Define and Assign Cell Types
## Six cell type labels, each prefixed with its number. These exact strings
## are also used to name the partition directories and to select cells by
## their assigned_cell_type value.
celltypes6 <- c(
    "1-Macrophages",
    "2-Endothelial Cells",
    "3-VSMCs",
    "4-Natural Killer Cells",
    "5-Cytotoxic T Lymphocytes",
    "6-B Lymphocytes"
)

In [11]:
## Tom's best genes for defining cell types, listed three per row,
## followed by their gene ids looked up by short name
toms_markers5 <- c(
    "NRXN1", "CLU", "ICAM2",
    "CD14", "CD68", "AIF1",
    "VWF", "EDN1", "ECSCR",
    "MKI67", "UBE2C", "TOP2A",
    "ACTA2", "TAGLN", "MYL9",
    "ACKR1", "SPARCL1", "PECAM1",
    "CALD1", "MGP", "DCN",
    "NKG7", "XCL1", "CTSW",
    "CD8A", "TRAC", "CD2",
    "MS4A1", "CD79A", "BANK1",
    "CD69", "CXCR4", "IL7R",
    "LILRA4", "IRF7", "CLEC4C",
    "MZB1", "JCHAIN", "TNFRSF17",
    "LST1", "FCGR3B", "S100A8",
    "TPSAB1", "CPA3", "MS4A2"
)
## Genes with no matching short name come back as NA entries
toms_gene_ids5 <- short2geneid[toms_markers5]


## A second marker gene list (four genes per row), followed by the
## corresponding gene ids looked up by short name
doug_markers1 <- c(
    "AIF1", "LYZ", "FCER1G", "CD68",
    "RNASE1", "PECAM1", "IGFBP4", "ADIRF",
    "SOD3", "MYL9", "CALD1", "GSN",
    "TYROBP", "NKG7", "CTSW", "CD69",
    "CD3D", "CD2", "TRBC2", "TRAC",
    "MS4A1", "CD79A", "HLA-DQA1", "CD37"
)
dougs_gene_ids1 <- short2geneid[doug_markers1]

In [14]:
## Make sure each cell type has its own partition directory under the
## output directory; paths that already exist are left alone
for (cell_type in celltypes6) {
    type_dir <- paste0(ps$outdir, cell_type)
    if (file.exists(type_dir)) {
        next
    }
    dir.create(type_dir)
}

In [18]:
## Loop through the partitions and perform partition level analysis.
## For each cell type, two files are written into that cell type's directory:
##   Partition.Cell.Set.rds - the re-processed partition cell data set
##   Gene.Model.Fits.rds    - the fitted gene models, or a message string
##                            when no model could be fitted
np <- length(celltypes6)

## seq_len() instead of 1:np so an empty cell type list runs zero iterations
for (p in seq_len(np)) {
    ## Work inside this partition's directory so the saveRDS() calls land there
    ## (the working directory is left pointing at the last partition afterwards)
    setwd(paste0(datdir, '/', ps$outdir, celltypes6[p]))
    ## Extract the cells assigned to this partition's cell type
    partn_cds <- down_stdycds[, colData(down_stdycds)$assigned_cell_type == celltypes6[p]]
    ## Number of dimensions used in preprocessing. Currently fixed at 100; the
    ## variant that caps it at the partition's cell count is commented out.
    num_parts <- 100 # min(dim(partn_cds)[2] / 1, 100)
    ## If the partition has more than one patient, regress out the patient ID
    num_pats <- length(unique(colData(partn_cds)$patient))
    if (num_pats > 1) {
        partn_cds <- preprocess_cds(partn_cds, num_dim = num_parts,
                                    residual_model_formula_str = '~patient')
    } else {
        partn_cds <- preprocess_cds(partn_cds, num_dim = num_parts)
    }
    partn_cds <- reduce_dimension(partn_cds, reduction_method = "UMAP")
    partn_cds <- cluster_cells(partn_cds, reduction_method = 'UMAP')
    partn_cds <- learn_graph(partn_cds)
    saveRDS(partn_cds, 'Partition.Cell.Set.rds')

    ## Fit a model to the condition and patient variables (if possible).
    ## The default result is a message string, saved when the partition has
    ## only one condition and so no model is fitted.
    num_conds <- length(unique(colData(partn_cds)$condition))
    gene_all_fits_pat_cond_unadj <- paste0('Unable to Perform Differential Expression on ',
                                           celltypes6[p], ' Due to Lack of Condition Variance')
    ## num_pats and num_conds are scalars, so the scalar `&&` is the right
    ## operator for these conditions; the two cases are mutually exclusive.
    if (num_pats > 1 && num_conds > 1) {
        gene_all_fits_pat_cond_unadj <- fit_models(partn_cds, cores = 4,
                                                   model_formula_str = "~patient + condition")
    } else if (num_pats == 1 && num_conds > 1) {
        gene_all_fits_pat_cond_unadj <- fit_models(partn_cds, cores = 4,
                                                   model_formula_str = "~condition")
    }
    saveRDS(gene_all_fits_pat_cond_unadj, 'Gene.Model.Fits.rds')
}


“RANN counts the point itself, k must be smaller than
the total number of points - 1 (all other points) - 1 (itself)!”



In [16]:
## Special case: differential expression for the VSMC partition only
## (partition 3), comparing C3-positive and C3-negative cells.
## NOTE(review): this header previously read "CD3+/-", but the tagging code
## further down keys on the gene 'C3' - confirm which gene is intended.
p <- 3

## Work inside this partition's directory
setwd(paste0(datdir, '/', ps$outdir, celltypes6[p]))
## Keep only the cells assigned to this partition's cell type
partn_cds <- down_stdycds[, colData(down_stdycds)$assigned_cell_type == celltypes6[p]]
## Number of preprocessing dimensions: fixed at 100 (the min() cap is disabled)
num_parts <- 100 # min(dim(partn_cds)[2] / 1, 100)
## With more than one patient in the partition, regress out the patient ID
num_pats <- length(unique(partn_cds@colData$patient))
partn_cds <- if (num_pats > 1) {
    preprocess_cds(partn_cds, num_dim = num_parts,
                   residual_model_formula_str = '~patient')
} else {
    preprocess_cds(partn_cds, num_dim = num_parts)
}
## UMAP reduction followed by clustering on the UMAP coordinates
partn_cds <- partn_cds %>%
    reduce_dimension(reduction_method = "UMAP") %>%
    cluster_cells(reduction_method = 'UMAP')
## Graph learning and saving the partition are left disabled in this cell
# partn_cds <- learn_graph(partn_cds)

## saveRDS(partn_cds, 'Partition.Cell.Set.rds')

In [26]:
## Tag each cell as C3-positive (TRUE) when its C3 expression value is
## greater than zero, and C3-negative (FALSE) otherwise
if (is.na(short2geneid['C3'])) {
    ## Fail with a clear message rather than an obscure subscript error
    stop("Gene 'C3' was not found in the gene short name lookup", call. = FALSE)
}
c3pos_bool <- exprs(partn_cds)[short2geneid['C3'], ] > 0
## Fixed: this line used to assign `cpos_bool`, a name that is never defined
## in this notebook (it only worked if a stale variable was in the session)
colData(partn_cds)$C3plus <- c3pos_bool
## Show how many cells fall in each group
table(colData(partn_cds)$C3plus)
## Fit a model per gene on the patient and C3plus variables
gene_all_fits_pat_cond_unadj <- fit_models(partn_cds, cores = 4,
                                           model_formula_str = "~patient + C3plus")



FALSE  TRUE 
 2703   969 

In [31]:
## Pull the coefficient table out of the fitted gene models
all_coef3 <- gene_all_fits_pat_cond_unadj %>% coefficient_table()

In [34]:
## Preview the first four rows of the coefficient table
## (column 1 plus columns 4 through 12)
all_coef3[seq_len(4), c(1, 4:12)]

gene_short_name,status,term,estimate,std_err,test_val,p_value,normalized_effect,model_component,q_value
<fct>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
AL627309.1,OK,(Intercept),-7.168496,0.9219573,-7.7753,9.71e-15,0.0,count,4.20443e-11
AL627309.1,OK,patientID Number 2,-16.1545806,2148.3291467,-0.0075,0.994,-0.10708262,count,1.0
AL627309.1,OK,patientID Number 3,0.4458419,1.0921626,0.4082,0.683,0.05684629,count,1.0
AL627309.1,OK,C3plusTRUE,0.1214251,1.0921626,0.1112,0.911,0.01326312,count,1.0


In [41]:
# Keep only the C3plusTRUE term rows whose status is 'OK' (genes that failed
# DE are dropped), then write column 1 plus columns 4 through 12 to a file
C3plus_terms3 <- all_coef3 %>%
    filter(term == "C3plusTRUE", status == 'OK')
write.table(C3plus_terms3[, c(1, 4:12)],
            file = 'Diff.C3plus.Coeff.RemoveFail.NoModels.txt',
            row.names = FALSE, col.names = TRUE)

In [40]:
## Preview the first five filtered rows (column 1 plus columns 4 through 12),
## then report the dimensions of the full filtered table
C3plus_terms3[seq_len(5), c(1, 4:12)]
dim(C3plus_terms3)

gene_short_name,status,term,estimate,std_err,test_val,p_value,normalized_effect,model_component,q_value
<fct>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
AL627309.1,OK,C3plusTRUE,0.1214251,1.0921626,0.1112,0.911,0.01326312,count,1
AL669831.5,OK,C3plusTRUE,-0.7770789,0.3443259,-2.2568,0.0241,-0.803829,count,1
FAM87B,OK,C3plusTRUE,0.2422225,0.5306329,0.4565,0.648,0.14083818,count,1
LINC00115,OK,C3plusTRUE,0.2060703,0.4220268,0.4883,0.625,0.14244887,count,1
FAM41C,OK,C3plusTRUE,-0.4294373,0.4673052,-0.919,0.358,-0.33490497,count,1
