In [1]:
library(Seurat)
library(Signac)
library(SeuratWrappers)
library(RENIN)
library(tidyverse)
library(chromVARmotifs)
library(BSgenome.Hsapiens.UCSC.hg19)

Attaching SeuratObject

Attaching sp


Attaching package: ‘Signac’


The following object is masked from ‘package:Seurat’:

    FoldChange


── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.1     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mreduce()[39m masks [34mSignac[39m::reduce()


“no function found corresponding to methods exports fro

## define the two ident to compare in FindMarkers function

In [2]:
FindMarkers_ident_1 = "Papilla"
FindMarkers_ident_2 = "Medulla"

celltype_subset = c("TAL1", "TAL2", "TAL3")

In [3]:
de_gene_file = "../../processed_data/RENIN/de_genes/446267_cells_RNA/TAL_rank_genes_groups_p_vs_m_20230425.RData"

In [4]:
processed_dir = file.path("..", "..", "processed_data", "RENIN", "by_regions", paste0(gsub(pattern = "\\/", "_", paste(celltype_subset, collapse = "_"))))
processed_dir
dir.create(processed_dir, recursive = T, showWarnings = F)

### load previously calculated image which includes expr_mat, peak_mat, novaseq.sub. This could save time for dealing with new de_gene for different clusters while sharing the same novaseq.sub object

In [5]:
Sys.time()
load("../../processed_data/RENIN/RENIN_324701_cells_preprocess.RData")
Sys.time()

[1] "2023-07-01 13:19:14 CDT"

[1] "2023-07-01 13:24:13 CDT"

In [6]:
novaseq.sub
table(novaseq.sub$celltype5_rna)

An object of class Seurat 
237522 features across 324701 samples within 2 assays 
Active assay: peaks (189184 features, 189184 variable features)
 1 other assay present: SCT
 6 dimensional reductions calculated: pca, harmony_RNA, lsi, harmony_peaks, umap.peaks, WNN.UMAP


      JGA       POD       PEC        PT PT_dediff  PT_VCAM1       tL1       tL2 
     2465      3179      1239     64880     29494      8445     25438      9029 
   tL-TAL      TAL1      TAL2      TAL3       DCT       CNT       PC1       PC2 
    15485     29910     19724      8900     14150     13197     16140      4049 
      ICA       ICB      Uro1      Uro2      ENDO      SMC1      SMC2      SMC3 
    10579      4120      3210       682     12847      4272       156        86 
     Fib1      Fib2      Fib3        Ma        BT 
     3786      7191      3095      3673      5280 

In [7]:
ls()

In [8]:
## subset novaseq.sub again to only include the celltypes of interest

In [9]:
novaseq.sub = subset(novaseq.sub, celltype5_rna %in% celltype_subset)
novaseq.sub
table(novaseq.sub$celltype5_rna)

An object of class Seurat 
237522 features across 58534 samples within 2 assays 
Active assay: peaks (189184 features, 189184 variable features)
 1 other assay present: SCT
 6 dimensional reductions calculated: pca, harmony_RNA, lsi, harmony_peaks, umap.peaks, WNN.UMAP


      JGA       POD       PEC        PT PT_dediff  PT_VCAM1       tL1       tL2 
        0         0         0         0         0         0         0         0 
   tL-TAL      TAL1      TAL2      TAL3       DCT       CNT       PC1       PC2 
        0     29910     19724      8900         0         0         0         0 
      ICA       ICB      Uro1      Uro2      ENDO      SMC1      SMC2      SMC3 
        0         0         0         0         0         0         0         0 
     Fib1      Fib2      Fib3        Ma        BT 
        0         0         0         0         0 

In [10]:
ls()

### remove the following variables because they will be regenerated for new comparison

In [11]:
rm(list = c("de_genes", "de_genes_BK", "gene_list", "peak_results", "aen_lists", "tf_results", "regulator_tf_names", "centrality_rankings"))

“object 'de_genes' not found”
“object 'de_genes_BK' not found”
“object 'gene_list' not found”
“object 'peak_results' not found”
“object 'aen_lists' not found”
“object 'tf_results' not found”
“object 'regulator_tf_names' not found”
“object 'centrality_rankings' not found”


In [12]:
ls()

In [13]:
Idents(novaseq.sub) = novaseq.sub$renal_region_new
head(Idents(novaseq.sub))

### subset mpt from novaseq.sub because comparison is to be made bewtween two celltypes

In [14]:
mpt <- subset(novaseq.sub, renal_region_new %in% c(FindMarkers_ident_1, FindMarkers_ident_2))
mpt
table(mpt$celltype5_rna)

An object of class Seurat 
237522 features across 42982 samples within 2 assays 
Active assay: peaks (189184 features, 189184 variable features)
 1 other assay present: SCT
 6 dimensional reductions calculated: pca, harmony_RNA, lsi, harmony_peaks, umap.peaks, WNN.UMAP


      JGA       POD       PEC        PT PT_dediff  PT_VCAM1       tL1       tL2 
        0         0         0         0         0         0         0         0 
   tL-TAL      TAL1      TAL2      TAL3       DCT       CNT       PC1       PC2 
        0     28219     13135      1628         0         0         0         0 
      ICA       ICB      Uro1      Uro2      ENDO      SMC1      SMC2      SMC3 
        0         0         0         0         0         0         0         0 
     Fib1      Fib2      Fib3        Ma        BT 
        0         0         0         0         0 

## load hli_446267_cells de_gene data frame

In [15]:
Sys.time()
options(future.globals.maxSize = 8000 * 1024^2)
load(de_gene_file)
Sys.time()

[1] "2023-07-01 13:25:32 CDT"

[1] "2023-07-01 13:25:32 CDT"

In [16]:
rownames(de_genes) = de_genes$names
de_genes_BK = de_genes
print(head(de_genes))
dim(de_genes)

                          names           scores                 pvals
MT-RNR2                 MT-RNR2 57.4203147888184                     0
RUNX1                     RUNX1 45.6538124084473                     0
ANK2                       ANK2 42.5474662780762                     0
ESRRG                     ESRRG 41.2324142456055                     0
ENSG00000287176 ENSG00000287176 36.9767036437988 2.71235518939549e-299
MT-RNR1                 MT-RNR1 35.9948806762695 1.00589312376237e-283
                            pvals_adj    logfoldchanges
MT-RNR2                             0 0.857780754566193
RUNX1                               0 0.961069822311401
ANK2                                0 0.724869072437286
ESRRG                               0 0.460754990577698
ENSG00000287176 1.00194400696269e-295 0.549769937992096
MT-RNR1         3.40612176591336e-280 0.561120629310608


In [17]:
### keep DEGs with pvals_adj<0.05
p_val_cutoff = 0.05
peak_assay = "peaks"
de_genes <- de_genes[which(de_genes$pvals_adj < p_val_cutoff), ]
DefaultAssay(novaseq.sub) <- peak_assay
DefaultAssay(mpt) <- peak_assay

dim(de_genes)

In [18]:
Sys.time()
# gene_coords <- Signac:::CollapseToLongestTranscript(Annotation(novaseq.sub))
gene_coords <- Signac:::CollapseToLongestTranscript(Annotation(mpt))
Sys.time()

[1] "2023-07-01 13:25:32 CDT"

[1] "2023-07-01 13:25:36 CDT"

In [19]:
de_genes <- de_genes[which(rownames(de_genes) %in% gene_coords$gene_name), ]
dim(de_genes)
gene_list = rownames(de_genes)

## Next, make two de_gene sublists: gene_list_neg, gene_list_pos.

In [20]:
gene_list_neg = rownames(de_genes)[de_genes$logfoldchanges < 0]
print(length(gene_list_neg))
gene_list_pos = rownames(de_genes)[de_genes$logfoldchanges > 0]
print(length(gene_list_pos))

[1] 111
[1] 863


In [21]:
# de_genes[order(de_genes$logfoldchanges, decreasing = T),]

## run_peak_aen

In [22]:
Sys.time()

options(future.globals.maxSize = 8000 * 1024^2)

peak_results <- run_peak_aen(novaseq.sub, expr_mat, peak_mat, gene_list, lambda2 = 0.5, max_distance = 5e+05, num_bootstraps = 100)

aen_lists <- make_aen_lists(peak_results)

Sys.time()

[1] "2023-07-01 13:25:36 CDT"

Loading required package: gcdnet


Attaching package: ‘gcdnet’


The following object is masked from ‘package:stats4’:

    coef


The following objects are masked from ‘package:stats’:

    coef, predict


Loading required package: future


Attaching package: ‘future’


The following object is masked from ‘package:rtracklayer’:

    values


The following object is masked from ‘package:GenomicRanges’:

    values


The following object is masked from ‘package:IRanges’:

    values


The following object is masked from ‘package:S4Vectors’:

    values


Loading required package: future.apply



[1] "Omitted (mitochondrial) genes with no genomic peaks: MT-ND1"   
[2] "Omitted (mitochondrial) genes with no genomic peaks: MT-CYB"   
[3] "Omitted (mitochondrial) genes with no genomic peaks: MT-CO3"   
[4] "Omitted (mitochondrial) genes with no genomic peaks: MT-ND6"   
[5] "Omitted (mitochondrial) genes with no genomic peaks: LINC00278"
[1] "AEN completed in 11.3628739476204"


[1] "2023-07-01 13:37:26 CDT"

## prepare_pseudocell_matrix for mpt

In [23]:
expr_mat_2 <- prepare_pseudocell_matrix(mpt, 
                                      assay = "SCT", 
                                      slot = "data", 
                                      cells_per_partition = 10, 
                                      find_neighbors = FALSE,
                                      reduction1 = "harmony_peaks",
                                      reduction2 = "harmony_RNA",
                                      dim_list = list(1:50, 1:50),
                                      k.nn = 5,
                                      seed = 489284)

Loading required package: Matrix


Attaching package: ‘Matrix’


The following object is masked from ‘package:S4Vectors’:

    expand


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Loading required package: VISION

Loading required package: SingleCellExperiment

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘matrixStats’


The following object is masked from ‘package:dplyr’:

    count



Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDif

In [24]:
length(gene_list)
sum(gene_list %in% colnames(expr_mat_2))

### run_tf_aen

In [25]:
Sys.time()

tf_results <- run_tf_aen(mpt, expr_mat_2, peak_results, gene_list, lambda2 = 0.5)

regulator_tf_names <- unlist(novaseq.sub@assays$peaks@motifs@motif.names)

regulator_tf_names <- regulator_tf_names[which(regulator_tf_names %in% rownames(GetAssayData(novaseq.sub, assay = "SCT")))]

centrality_rankings <- rank_tfs_by_centrality(tf_results, novaseq.sub)

Sys.time()

[1] "2023-07-01 13:47:12 CDT"

[1] "Genes without any candidate TFs--most likely no linked peaks or promoter region peaks: NAIP"
[1] "AEN completed in 28.5997781515121"


Loading required package: igraph


Attaching package: ‘igraph’


The following objects are masked from ‘package:future’:

    %->%, %<-%


The following objects are masked from ‘package:rtracklayer’:

    blocks, path


The following object is masked from ‘package:Biostrings’:

    union


The following object is masked from ‘package:XVector’:

    path


The following object is masked from ‘package:GenomicRanges’:

    union


The following object is masked from ‘package:IRanges’:

    union


The following object is masked from ‘package:S4Vectors’:

    union


The following objects are masked from ‘package:BiocGenerics’:

    normalize, path, union


The following objects are masked from ‘package:dplyr’:

    as_data_frame, groups, union


The following objects are masked from ‘package:purrr’:

    compose, simplify


The following object is masked from ‘package:tidyr’:

    crossing


The following object is masked from ‘package:tibble’:

    as_data_frame


The following objects a

[1] "2023-07-01 14:16:31 CDT"

In [26]:
print(Sys.time())
    
print("centrality_rankings - Betweenness:")
print(head(centrality_rankings$Betweenness,20))
print("centrality_rankings - PageRank:")
print(head(centrality_rankings$PageRank,20))

print(Sys.time())
print("calculating rank_tfs...")
tf_ranks = rank_tfs(tf_results, 
                    negative_genes = rownames(de_genes_BK)[which(de_genes_BK$logfoldchanges < 0)], 
                    seurat = mpt, 
                    num_cores =  1
    )
print(Sys.time())

[1] "2023-07-01 14:16:31 CDT"
[1] "centrality_rankings - Betweenness:"
      RUNX1       ESRRG        RORA       BACH2       NR3C2       TCF12 
23573.11818 16668.31126  7360.92641  5692.97771  5567.87489  5152.23939 
       ELF1        MITF      HIVEP2         EHF       ARNT2        CBFB 
 3392.91234  3150.98312  1834.32273  1377.04286  1066.13030   761.55455 
       PBX3      NFE2L1        BCL6        HLTF        RBPJ      NFE2L2 
  520.38939   281.36667   272.78831   254.50000   218.89524   164.35000 
       CTCF       SMAD2 
   79.08333    63.41667 
[1] "centrality_rankings - PageRank:"
     ARNTL2       RUNX1       FOXP2      PRDM16       PPARA        PAX8 
0.028460088 0.026970866 0.013895078 0.013573576 0.011212116 0.009046006 
      ESRRB       ESRRG       NFAT5       TEAD1        NFIA        PAX2 
0.008821998 0.008696940 0.007712785 0.007630041 0.007118031 0.006841714 
    TFCP2L1       RREB1        SIM2       BACH2      SREBF2       HIF1A 
0.006606413 0.006431036 0.006297322 0.

In [27]:
head(tf_ranks,20)
tail(tf_ranks,20)

Unnamed: 0_level_0,TF_name,Score,SE
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
ESRRG,ESRRG,5.9962512,0.24196152
RUNX1,RUNX1,3.4609585,0.07424302
FOXP2,FOXP2,2.8509847,0.10363236
PRDM16,PRDM16,2.6075067,0.07837923
ESRRB,ESRRB,2.4783194,0.08960972
PPARA,PPARA,1.7210423,0.05726383
KLF12,KLF12,1.2722377,0.07488487
TEAD1,TEAD1,1.2257371,0.06274342
PAX2,PAX2,1.2028204,0.05853798
PAX8,PAX8,1.0655422,0.05260465


Unnamed: 0_level_0,TF_name,Score,SE
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
ARID5B,ARID5B,-0.07052518,0.02170564
NR1H4,NR1H4,-0.07533001,0.017798969
BACH1,BACH1,-0.08419297,0.018889417
SMAD4,SMAD4,-0.09573162,0.012336805
PKNOX2,PKNOX2,-0.09931334,0.009963007
TFAP2B,TFAP2B,-0.10134528,0.014510564
STAT3,STAT3,-0.10720098,0.03763549
HNF1B,HNF1B,-0.11316163,0.032839089
PBX1,PBX1,-0.12615911,0.030437457
ARNT2,ARNT2,-0.1412301,0.02221255


In [28]:
save(list=c("gene_list", "aen_lists", "de_genes_BK", "tf_results", "tf_ranks", "regulator_tf_names", "centrality_rankings", "peak_results"), 
     file = file.path(processed_dir, paste0("RENIN_324701_cells_", FindMarkers_ident_1, "_vs_", FindMarkers_ident_2, ".RData")), compress = T)

In [29]:
Sys.time()

[1] "2023-07-01 14:18:08 CDT"