In [1]:
library(ArchR)
library(tidyverse)
library(BSgenome.Hsapiens.UCSC.hg38)
library(SingleCellExperiment)
library(anndata)
# library(sceasy)
# library(reticulate)
# use_condaenv('cellpymc')
# loompy <- reticulate::import('loompy')


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _  \     
         /  ^  \    |  |_) 

In [2]:
# Work from the ArchR analysis directory so that relative outputs (e.g. the
# ArchRLogs/ folder) are written there, then echo the directory to confirm.
setwd("/nfs/team205/heart/anndata_objects/8regions/ArchR")
getwd()

In [3]:
# ArchR requires a reference genome (and a default thread count, set in the
# next cell) to be configured before working with a project.
# Select hg38 as the session-wide default genome.
addArchRGenome(genome = "hg38")

Setting default genome to Hg38.



In [4]:
# Set the default number of parallel threads used by ArchR to 10.
addArchRThreads(threads = 10) 

Setting default number of Parallel threads to 10.



# Read in ArchR project

In [5]:
# Load the saved ArchR project from its output directory (showLogo = FALSE)
# and print the project summary.
archr_project_path <- "/nfs/team205/heart/anndata_objects/8regions/ArchR/project_output"
proj <- loadArchRProject(path = archr_project_path, showLogo = FALSE)
proj

Successfully loaded ArchRProject!


           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _  \     
         /  ^  \    |  |_)  |    |  ,----'|  |__|  | |  |_)  |    
        /  /_\  \   |      /     |  |     |   __   | |      /     
       /  _____  \  |  |\  \\___ |  `----.|  |  |  | |  |\  \\___.
      /__/     \__\ | _| `._____| \______||__|  |__| | _| `._____|
    



class: ArchRProject 
outputDirectory: /nfs/team205/heart/anndata_objects/8regions/ArchR/project_output 
samples(47): HCAHeart9508627_HCAHeart9508819
  HCAHeart9508628_HCAHeart9508820 ...
  HCAHeartST13180618_HCAHeartST13177115
  HCAHeartST13180619_HCAHeartST13177116
sampleColData names(1): ArrowFiles
cellColData names(52): Sample TSSEnrichment ... Gex_MitoRatio
  Gex_RiboRatio
numberOfCells(1): 139835
medianTSS(1): 8.699
medianFrags(1): 9459

# Get Peak2Gene links, atrial CMs

https://www.archrproject.com/bookdown/peak2genelinkage-with-archr.html

* look for correlations between peak accessibility and gene expression.

In [6]:
table(proj$cell_type)


                 Adipocyte       Atrial Cardiomyocyte 
                      1374                      14163 
          Endothelial cell                 Fibroblast 
                     16104                      42479 
Lymphatic Endothelial cell                   Lymphoid 
                       334                       4399 
                 Mast cell           Mesothelial cell 
                       274                         62 
                Mural cell                    Myeloid 
                     10121                      13660 
               Neural cell  Ventricular Cardiomyocyte 
                      1581                      35284 

In [7]:
# Peak-to-gene linkage restricted to atrial cardiomyocytes (aCMs).
#
# For each entry of `celltypes` the loop:
#   1. subsets the project to the selected cell states,
#   2. recomputes the iterative LSI embedding on that subset,
#   3. runs addPeak2GeneLinks against the GeneExpressionMatrix,
#   4. stores the resulting sub-project in `proj_list`, keyed by cell type.
# The link table itself is extracted and written to CSV in later cells.
celltypes <- c("Atrial Cardiomyocyte")
proj_list <- list()

# NOTE(review): the subset is defined by these hard-coded `cell_state` labels,
# not by the loop variable `cell`. That is fine while `celltypes` holds only
# the atrial entry; adding more cell types requires a per-type state mapping.
atrial_states <- c("aCM1", "aCM2", "aCM3", "aCM4", "SAN_P_cell", "AVN_P_cell")

for (cell in celltypes) {
    print(paste0("##### ", cell, " #####"))

    # subset the project to the cells whose state is in `atrial_states`
    idxSample <- BiocGenerics::which(proj$cell_state %in% atrial_states)
    if (length(idxSample) == 0) {
        # fail early with a clear message instead of an obscure downstream error
        stop("No cells matched the requested cell states for ", cell, call. = FALSE)
    }
    cellsSample <- proj$cellNames[idxSample]
    proj_sub <- proj[cellsSample, ]
    print(table(proj_sub$cell_state))

    # redo the dimensionality reduction on the subset only;
    # force = TRUE overwrites any existing "IterativeLSI" reducedDims entry
    proj_sub <- addIterativeLSI(
        ArchRProj = proj_sub,
        useMatrix = "TileMatrix",
        name = "IterativeLSI",
        iterations = 2,
        clusterParams = list( # see Seurat::FindClusters
            resolution = c(0.2),
            sampleCells = 10000,
            n.start = 10
        ),
        varFeatures = 25000,
        dimsToUse = 1:30,
        force = TRUE
    )

    # peak2gene: link peaks to genes using the subset-specific LSI
    proj_sub <- addPeak2GeneLinks(
        ArchRProj = proj_sub,
        reducedDims = "IterativeLSI",
        useMatrix = "GeneExpressionMatrix"
    )

    # Store the sub-project under its cell-type name. `[[<-` is required to
    # place a single (S4) object into one list slot; `[<-` treats the
    # right-hand side as a vector of replacement elements. This also matches
    # the later read via proj_list[[cell]].
    proj_list[[cell]] <- proj_sub
}

[1] "##### Atrial Cardiomyocyte #####"

      aCM1       aCM2       aCM3       aCM4 AVN_P_cell SAN_P_cell 
      3443       7420       1021       1993         91        169 


Checking Inputs...

ArchR logging to : ArchRLogs/ArchR-addIterativeLSI-341e18de06d4-Date-2023-01-09_Time-12-04-54.log
If there is an issue, please report to github with logFile!

2023-01-09 12:04:56 : Computing Total Across All Features, 0.02 mins elapsed.

2023-01-09 12:05:11 : Computing Top Features, 0.261 mins elapsed.

###########
2023-01-09 12:05:12 : Running LSI (1 of 2) on Top Features, 0.289 mins elapsed.
###########

2023-01-09 12:05:12 : Sampling Cells (N = 10013) for Estimated LSI, 0.291 mins elapsed.

2023-01-09 12:05:12 : Creating Sampled Partial Matrix, 0.291 mins elapsed.

2023-01-09 12:05:45 : Computing Estimated LSI (projectAll = FALSE), 0.829 mins elapsed.

2023-01-09 12:06:08 : Identifying Clusters, 1.224 mins elapsed.

2023-01-09 12:06:33 : Identified 5 Clusters, 1.634 mins elapsed.

2023-01-09 12:06:33 : Saving LSI Iteration, 1.634 mins elapsed.

Length of unique values greater than palette, interpolating..

2023-01-09 12:07:00 : Creating Cluster Matrix on the tota

In [13]:
# Build the peak-to-gene link table for the cell type held in `cell`
# (the value left over from the loop above) and annotate it with names.
#
# Optional filtering is currently disabled; the thresholds that were tried are
#   Correlation > 0.2 and FDR < 1e-04.

# Link table stored in the metadata of the sub-project's peak set.
p2geneDF <- metadata(proj_list[[cell]]@peakSet)$Peak2GeneLinks

# Gene name for each link: index the gene set by idxRNA, stored as character.
p2geneDF$geneName <- as.character(
    mcols(metadata(p2geneDF)$geneSet)$name[p2geneDF$idxRNA]
)

# Peak name for each link, formatted "<seqname>_<start>-<end>", indexed by
# idxATAC. local() keeps the temporary peak set out of the global environment.
p2geneDF$peakName <- local({
    peaks <- metadata(p2geneDF)$peakSet
    peak_labels <- paste0(seqnames(peaks), "_", start(peaks), "-", end(peaks))
    peak_labels[p2geneDF$idxATAC]
})

p2geneDF

DataFrame with 3815446 rows and 8 columns
          idxATAC    idxRNA Correlation       FDR  VarQATAC   VarQRNA
        <integer> <integer>   <numeric> <numeric> <numeric> <numeric>
1               1        10 -0.02040158  0.793400 0.0282625  0.115077
2               2        10  0.03320061  0.649272 0.4983249  0.115077
3               3        10 -0.04713256  0.491986 0.6815563  0.115077
4               4        10  0.00170392  0.984771 0.3979638  0.115077
5               5        10  0.02780257  0.710586 0.5194427  0.115077
...           ...       ...         ...       ...       ...       ...
3815442    429824     32587 -0.01345811 0.8683935  0.396128  0.143063
3815443    429825     32587 -0.00917579 0.9128302  0.938089  0.143063
3815444    429826     32587  0.10594846 0.0658652  0.493846  0.143063
3815445    429827     32587 -0.03272107 0.6547628  0.426382  0.143063
3815446    429828     32587  0.11271067 0.0473720  0.123007  0.143063
           geneName               peakName
     

In [20]:
# Save the peak-to-gene table as CSV, named after the cell type in `cell`
# with every space removed (e.g. "Atrial Cardiomyocyte" ->
# "AtrialCardiomyocyte.csv"). gsub() is used rather than sub() so labels
# containing more than one space also yield a space-free file name.
p2g_out_dir <- "/nfs/team205/heart/anndata_objects/8regions/ArchR/project_output/csv/peak2gene"
# Make sure the target directory exists; a no-op when it is already there.
dir.create(p2g_out_dir, recursive = TRUE, showWarnings = FALSE)
write.csv(p2geneDF,
          file = file.path(p2g_out_dir,
                           paste0(gsub(" ", "", cell, fixed = TRUE), ".csv")),
          row.names = FALSE)