In [2]:
library(Seurat)
library(Signac)
library(SeuratWrappers)
library(RENIN)
library(tidyverse)
library(chromVARmotifs)
library(BSgenome.Hsapiens.UCSC.hg19)

## define the two ident to compare in FindMarkers function

In [5]:
FindMarkers_ident_1 = "Papilla"
FindMarkers_ident_2 = "Medulla"

celltype_subset = c("tL1", "tL2")

In [6]:
de_gene_file = "../../processed_data/RENIN/de_genes/446267_cells_RNA/tL_rank_genes_groups_p_vs_m_20230425.RData"

In [7]:
processed_dir = file.path("..", "..", "processed_data", "RENIN", "by_regions", paste0(gsub(pattern = "\\/", "_", paste(celltype_subset, collapse = "_"))))
processed_dir
dir.create(processed_dir, recursive = T, showWarnings = F)

### load previously calculated image which includes expr_mat, peak_mat, novaseq.sub. This could save time for dealing with new de_gene for different clusters while sharing the same novaseq.sub object

In [8]:
Sys.time()
load("../../processed_data/RENIN/RENIN_324701_cells_preprocess.RData")
Sys.time()

[1] "2023-07-01 12:27:02 CDT"

[1] "2023-07-01 12:31:55 CDT"

In [9]:
novaseq.sub
table(novaseq.sub$celltype5_rna)

An object of class Seurat 
237522 features across 324701 samples within 2 assays 
Active assay: peaks (189184 features, 189184 variable features)
 1 other assay present: SCT
 6 dimensional reductions calculated: pca, harmony_RNA, lsi, harmony_peaks, umap.peaks, WNN.UMAP


      JGA       POD       PEC        PT PT_dediff  PT_VCAM1       tL1       tL2 
     2465      3179      1239     64880     29494      8445     25438      9029 
   tL-TAL      TAL1      TAL2      TAL3       DCT       CNT       PC1       PC2 
    15485     29910     19724      8900     14150     13197     16140      4049 
      ICA       ICB      Uro1      Uro2      ENDO      SMC1      SMC2      SMC3 
    10579      4120      3210       682     12847      4272       156        86 
     Fib1      Fib2      Fib3        Ma        BT 
     3786      7191      3095      3673      5280 

In [10]:
ls()

In [11]:
## subset novaseq.sub again to only include the celltypes of interest

In [12]:
novaseq.sub = subset(novaseq.sub, celltype5_rna %in% celltype_subset)
novaseq.sub
table(novaseq.sub$celltype5_rna)

An object of class Seurat 
237522 features across 34467 samples within 2 assays 
Active assay: peaks (189184 features, 189184 variable features)
 1 other assay present: SCT
 6 dimensional reductions calculated: pca, harmony_RNA, lsi, harmony_peaks, umap.peaks, WNN.UMAP


      JGA       POD       PEC        PT PT_dediff  PT_VCAM1       tL1       tL2 
        0         0         0         0         0         0     25438      9029 
   tL-TAL      TAL1      TAL2      TAL3       DCT       CNT       PC1       PC2 
        0         0         0         0         0         0         0         0 
      ICA       ICB      Uro1      Uro2      ENDO      SMC1      SMC2      SMC3 
        0         0         0         0         0         0         0         0 
     Fib1      Fib2      Fib3        Ma        BT 
        0         0         0         0         0 

In [14]:
ls()

### remove the following variables because they will be regenerated for new comparison

In [15]:
rm(list = c("de_genes", "de_genes_BK", "gene_list", "peak_results", "aen_lists", "tf_results", "regulator_tf_names", "centrality_rankings"))

“object 'de_genes' not found”
“object 'de_genes_BK' not found”
“object 'gene_list' not found”
“object 'peak_results' not found”
“object 'aen_lists' not found”
“object 'tf_results' not found”
“object 'regulator_tf_names' not found”
“object 'centrality_rankings' not found”


In [16]:
ls()

In [17]:
Idents(novaseq.sub) = novaseq.sub$renal_region_new
head(Idents(novaseq.sub))

### subset mpt from novaseq.sub because comparison is to be made bewtween two celltypes

In [19]:
mpt <- subset(novaseq.sub, renal_region_new %in% c(FindMarkers_ident_1, FindMarkers_ident_2))
mpt
table(mpt$celltype5_rna)

An object of class Seurat 
237522 features across 30959 samples within 2 assays 
Active assay: peaks (189184 features, 189184 variable features)
 1 other assay present: SCT
 6 dimensional reductions calculated: pca, harmony_RNA, lsi, harmony_peaks, umap.peaks, WNN.UMAP


      JGA       POD       PEC        PT PT_dediff  PT_VCAM1       tL1       tL2 
        0         0         0         0         0         0     22871      8088 
   tL-TAL      TAL1      TAL2      TAL3       DCT       CNT       PC1       PC2 
        0         0         0         0         0         0         0         0 
      ICA       ICB      Uro1      Uro2      ENDO      SMC1      SMC2      SMC3 
        0         0         0         0         0         0         0         0 
     Fib1      Fib2      Fib3        Ma        BT 
        0         0         0         0         0 

## load hli_446267_cells de_gene data frame

In [20]:
Sys.time()
options(future.globals.maxSize = 8000 * 1024^2)
load(de_gene_file)
Sys.time()

[1] "2023-07-01 12:41:39 CDT"

[1] "2023-07-01 12:41:39 CDT"

In [21]:
rownames(de_genes) = de_genes$names
de_genes_BK = de_genes
print(head(de_genes))
dim(de_genes)

        names           scores                 pvals             pvals_adj
CA8       CA8 53.3390846252441                     0                     0
NEBL     NEBL 45.4913749694824                     0                     0
SH3GL3 SH3GL3 42.8328742980957                     0                     0
ANK2     ANK2 38.8889694213867                     0                     0
GLS       GLS 32.5694999694824 1.10904285491029e-232 2.39989942284745e-229
OSBPL3 OSBPL3 31.0615062713623 7.97782863205628e-212 1.31531600346516e-208
          logfoldchanges
CA8     2.97676587104797
NEBL    1.01005041599274
SH3GL3  2.10418319702148
ANK2    1.25412750244141
GLS     1.02023696899414
OSBPL3 0.997186601161957


In [22]:
### keep DEGs with pvals_adj<0.05
p_val_cutoff = 0.05
peak_assay = "peaks"
de_genes <- de_genes[which(de_genes$pvals_adj < p_val_cutoff), ]
DefaultAssay(novaseq.sub) <- peak_assay
DefaultAssay(mpt) <- peak_assay

dim(de_genes)

In [23]:
Sys.time()
# gene_coords <- Signac:::CollapseToLongestTranscript(Annotation(novaseq.sub))
gene_coords <- Signac:::CollapseToLongestTranscript(Annotation(mpt))
Sys.time()

[1] "2023-07-01 12:42:16 CDT"

[1] "2023-07-01 12:42:19 CDT"

In [24]:
de_genes <- de_genes[which(rownames(de_genes) %in% gene_coords$gene_name), ]
dim(de_genes)
gene_list = rownames(de_genes)

## Next, make two de_gene sublists: gene_list_neg, gene_list_pos.

In [25]:
gene_list_neg = rownames(de_genes)[de_genes$logfoldchanges < 0]
print(length(gene_list_neg))
gene_list_pos = rownames(de_genes)[de_genes$logfoldchanges > 0]
print(length(gene_list_pos))

[1] 358
[1] 262


In [26]:
# de_genes[order(de_genes$logfoldchanges, decreasing = T),]

## run_peak_aen

In [27]:
Sys.time()

options(future.globals.maxSize = 8000 * 1024^2)

peak_results <- run_peak_aen(novaseq.sub, expr_mat, peak_mat, gene_list, lambda2 = 0.5, max_distance = 5e+05, num_bootstraps = 100)

aen_lists <- make_aen_lists(peak_results)

Sys.time()

[1] "2023-07-01 12:51:26 CDT"

Loading required package: gcdnet


Attaching package: ‘gcdnet’


The following object is masked from ‘package:stats4’:

    coef


The following objects are masked from ‘package:stats’:

    coef, predict


Loading required package: future


Attaching package: ‘future’


The following object is masked from ‘package:rtracklayer’:

    values


The following object is masked from ‘package:GenomicRanges’:

    values


The following object is masked from ‘package:IRanges’:

    values


The following object is masked from ‘package:S4Vectors’:

    values


Loading required package: future.apply



[1] "Omitted (mitochondrial) genes with no genomic peaks: MT-CO1"
[2] "Omitted (mitochondrial) genes with no genomic peaks: TBL1Y" 
[1] "AEN completed in 8.59863569736481"


[1] "2023-07-01 13:00:19 CDT"

## prepare_pseudocell_matrix for mpt

In [28]:
expr_mat_2 <- prepare_pseudocell_matrix(mpt, 
                                      assay = "SCT", 
                                      slot = "data", 
                                      cells_per_partition = 10, 
                                      find_neighbors = FALSE,
                                      reduction1 = "harmony_peaks",
                                      reduction2 = "harmony_RNA",
                                      dim_list = list(1:50, 1:50),
                                      k.nn = 5,
                                      seed = 489284)

Loading required package: Matrix


Attaching package: ‘Matrix’


The following object is masked from ‘package:S4Vectors’:

    expand


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Loading required package: VISION

Loading required package: SingleCellExperiment

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘matrixStats’


The following object is masked from ‘package:dplyr’:

    count



Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDif

In [29]:
length(gene_list)
sum(gene_list %in% colnames(expr_mat_2))

### run_tf_aen

In [30]:
Sys.time()

tf_results <- run_tf_aen(mpt, expr_mat_2, peak_results, gene_list, lambda2 = 0.5)

regulator_tf_names <- unlist(novaseq.sub@assays$peaks@motifs@motif.names)

regulator_tf_names <- regulator_tf_names[which(regulator_tf_names %in% rownames(GetAssayData(novaseq.sub, assay = "SCT")))]

centrality_rankings <- rank_tfs_by_centrality(tf_results, novaseq.sub)

Sys.time()

[1] "2023-07-01 13:05:54 CDT"

[1] "Genes without any candidate TFs--most likely no linked peaks or promoter region peaks: IL1RL1"
[2] "Genes without any candidate TFs--most likely no linked peaks or promoter region peaks: NAIP"  
[1] "AEN completed in 13.337763774395"


Loading required package: igraph


Attaching package: ‘igraph’


The following objects are masked from ‘package:future’:

    %->%, %<-%


The following objects are masked from ‘package:rtracklayer’:

    blocks, path


The following object is masked from ‘package:Biostrings’:

    union


The following object is masked from ‘package:XVector’:

    path


The following object is masked from ‘package:GenomicRanges’:

    union


The following object is masked from ‘package:IRanges’:

    union


The following object is masked from ‘package:S4Vectors’:

    union


The following objects are masked from ‘package:BiocGenerics’:

    normalize, path, union


The following objects are masked from ‘package:dplyr’:

    as_data_frame, groups, union


The following objects are masked from ‘package:purrr’:

    compose, simplify


The following object is masked from ‘package:tidyr’:

    crossing


The following object is masked from ‘package:tibble’:

    as_data_frame


The following objects a

[1] "2023-07-01 13:19:43 CDT"

In [31]:
print(Sys.time())
    
print("centrality_rankings - Betweenness:")
print(head(centrality_rankings$Betweenness,20))
print("centrality_rankings - PageRank:")
print(head(centrality_rankings$PageRank,20))

print(Sys.time())
print("calculating rank_tfs...")
tf_ranks = rank_tfs(tf_results, 
                    negative_genes = rownames(de_genes_BK)[which(de_genes_BK$logfoldchanges < 0)], 
                    seurat = mpt, 
                    num_cores =  1
    )
print(Sys.time())

[1] "2023-07-01 13:19:43 CDT"
[1] "centrality_rankings - Betweenness:"
      ELF1      STAT3     HIVEP3      KLF12    SMARCC1      CREB5      ATOH8 
4031.56667 3748.48333 3267.48333 2891.20000 2883.31667 1808.00000  838.25000 
      ELF2        HLF      ESRRB       RARB      CEBPD      NFKB1      HNF1B 
 765.20000  456.66667  262.41667  176.33333  170.16667  143.66667  111.00000 
      KLF7      MLXIP     TCF7L1       BPTF      HOXD3        VDR 
  79.00000   76.83333   65.83333   63.75000   26.50000   22.08333 
[1] "centrality_rankings - PageRank:"
      RUNX1       MECOM      TFAP2B      ARNTL2        SOX6        BCL6 
0.020807855 0.012433656 0.012324934 0.011593641 0.011104470 0.010330234 
      HNF4G        PBX1        NFIB       PROX1       BACH1       TEAD1 
0.009937950 0.009138496 0.007899867 0.007498011 0.007103922 0.007073588 
      STAT1       SMAD9        ELF5       NR1H4       PPARG        ZEB1 
0.006709501 0.006482994 0.006245102 0.005943381 0.005827495 0.005723419 
      E

In [32]:
head(tf_ranks,20)
tail(tf_ranks,20)

Unnamed: 0_level_0,TF_name,Score,SE
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
MECOM,MECOM,3.8227974,0.08809677
RUNX1,RUNX1,2.9182348,0.05806189
TEAD1,TEAD1,1.9878145,0.09978262
ARNTL2,ARNTL2,1.4782638,0.1507156
ELF1,ELF1,1.3470883,0.06634797
BACH1,BACH1,1.2330211,0.05141307
BCL6,BCL6,1.0596308,0.03963175
TFAP2B,TFAP2B,1.0353967,0.02490451
GLIS3,GLIS3,0.8470593,0.10959604
HNF4G,HNF4G,0.8336927,0.02847953


Unnamed: 0_level_0,TF_name,Score,SE
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
CUX1,CUX1,-0.1700801,0.02712446
PLAGL1,PLAGL1,-0.1757637,0.01707298
RREB1,RREB1,-0.1855252,0.02569827
HNF1B,HNF1B,-0.2532379,0.04447832
MITF,MITF,-0.2879897,0.0437247
GLI3,GLI3,-0.3175795,0.01645019
ZNF148,ZNF148,-0.3754478,0.03808196
FOXO3,FOXO3,-0.4421478,0.02870272
ARID5B,ARID5B,-0.5095355,0.0387706
PPARG,PPARG,-0.5604226,0.02564116


In [33]:
save(list=c("gene_list", "aen_lists", "de_genes_BK", "tf_results", "tf_ranks", "regulator_tf_names", "centrality_rankings", "peak_results"), 
     file = file.path(processed_dir, paste0("RENIN_324701_cells_", FindMarkers_ident_1, "_vs_", FindMarkers_ident_2, ".RData")), compress = T)

In [34]:
Sys.time()

[1] "2023-07-01 13:21:23 CDT"