# Settings

In [1]:
# Load Reticulate function
# RETICULATE_PYTHON is set before reticulate is loaded; the same conda
# environment is then selected again via use_python()/use_condaenv().
Sys.setenv(RETICULATE_PYTHON="/home/luca/anaconda3/envs/reticulate/bin/python")
library(reticulate)
reticulate::use_python("/home/luca/anaconda3/envs/reticulate/bin/python")
reticulate::use_condaenv("/home/luca/anaconda3/envs/reticulate")
# Sanity checks: both Python modules must be importable from this environment
reticulate::py_module_available(module='anndata') #needs to be TRUE
reticulate::import('anndata') #good to make sure this doesn't error
reticulate::py_module_available(module='leidenalg') #needs to be TRUE
reticulate::import('leidenalg') #good to make sure this doesn't error

Module(anndata)

Module(leidenalg)

In [2]:
## Patch for annotations in R4.1
# BiocManager::install("Bioconductor/GenomeInfoDb",lib = "/home/luca/R/x86_64-pc-linux-gnu-library/4.1",force = TRUE)
# library(GenomeInfoDb,lib.loc="/home/luca/R/x86_64-pc-linux-gnu-library/4.1")

In [3]:
# Load packages
pacman::p_load(dplyr, stringr, data.table, tidyr, data.table, Matrix, future, 
               hdf5r, Seurat, Signac,harmony, knitr, SoupX, 
               EnsDb.Hsapiens.v86, 
               logr, parallel, 
               ggplot2, ggpubr, ggrepel, ggbreak, gridExtra, patchwork, grid, ggh4x)

In [4]:
# Load genome
# One-off generation of the annotation object (kept for reference, not run):
#suppressMessages(annotations <- GetGRangesFromEnsDb(ensdb=EnsDb.Hsapiens.v86))
#genome(annotations) <- 'hg38'
#seqlevelsStyle(annotations) <- 'UCSC'
# Save table
# saveRDS(annotations, "/nfs/lab/Luca/Assets/references/Cellranger/hg38.annotations.rds")

# Load table
# Read the cached annotations and (re)apply UCSC sequence naming and the hg38 genome tag
annotations = readRDS("/nfs/lab/Luca/Assets/references/Cellranger/hg38.annotations.rds")
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- 'hg38'

# Seq info downloaded from: https://github.com/broadinstitute/ichorCNA/issues/84
seq.info = readRDS("/nfs/lab/Luca/Assets/references/Cellranger/seqinfo_hg38_ucsc.rds")

In [5]:
# Set options
options(stringsAsFactors = FALSE)
# Remember the current warning level, then silence all warnings (warn = -1).
# NOTE(review): warnLevel is not restored in the visible part of this notebook;
# use options(warn = warnLevel) to re-enable warnings.
warnLevel <- getOption('warn')
options(warn = -1)
opts_chunk$set(tidy=TRUE)

# set Future
plan("multicore", workers = 4)
# set RAM threshold
## value is in bytes: 1000 * 1024^2 is ~1 GB, so 10000 * 1024^2 allows ~10 GB
RAM.tresh = 10000 * 1024^2
options(future.globals.maxSize = RAM.tresh)

In [7]:
# Set directories
# All paths end with "/" because file names are appended with paste0()/paste()
base.dir = "/nfs/lab/projects/mega_heart/"
assets.dir = "/nfs/lab/projects/mega_heart/Assets/"

# step1.dir: input of this notebook; step2.dir: root of the outputs written here;
# PEAKS.dir: per-cell-type "<celltype>_UnifiedFiltVariable.bed" peak files
step1.dir = "/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/"
step2.dir = "/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/"
PEAKS.dir = "/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/PeakCalling/2_PeaksMap_Peakcalls/unified_peaks/"


# Output folders; note that TPM.dir points at the "CPM/" folder
counts.dir = paste0(step2.dir, "ATAC/4Chambers/COUNTS/")
TPM.dir = paste0(step2.dir, "ATAC/4Chambers/CPM/")
DESEQ.dir = paste0(step2.dir, "ATAC/4Chambers/DESEQ/")

In [8]:
# Create those directories
# counts.dir, TPM.dir and DESEQ.dir sit under "ATAC/4Chambers/", which is never
# created on its own, so a non-recursive dir.create() fails on a fresh tree.
# recursive = TRUE also creates missing parents; showWarnings = FALSE keeps
# re-runs quiet when the directory already exists.
dir.create(step2.dir, recursive = TRUE, showWarnings = FALSE)
dir.create(counts.dir, recursive = TRUE, showWarnings = FALSE)
dir.create(TPM.dir, recursive = TRUE, showWarnings = FALSE)
dir.create(DESEQ.dir, recursive = TRUE, showWarnings = FALSE)

In [9]:
# Start the run log for this notebook (used by the log_print() calls below)
log_open(file_name = paste0(base.dir, "4chambers_ATAC_DownstreamFiles.log"))

# Load assay

In [10]:
# Read the curated multiome object written by the preprocessing step
log_print(" Loading data")
adata <- readRDS(paste0(step1.dir, "LV_RV_LA_RA.multiome.mrg.filt.MTless.silQC.curated.peaks.rds"))
log_print("Done")

[1] " Loading data"
[1] "Done"


In [None]:
# Backup then indent
# adata.bckp = adata

In [11]:
adata

An object of class Seurat 
392885 features across 329255 samples within 4 assays 
Active assay: ATAC (285873 features, 285873 variable features)
 2 layers present: counts, data
 3 other assays present: RNA, RNA_raw, SCT
 7 dimensional reductions calculated: pca, harmony.rna, umap.rna, lsi, harmony.atac, umap.atac, umap.wnn

# Cell count matrix  - ATAC

In [12]:
# Donor identifiers found in the object metadata, in order of first appearance
samples <- unique(as.character(adata$donor))
samples

In [13]:
######## SET TO WHATEVER YOUR ASSIGNMENTS ARE STORED UNDER ########
# Use the "cell.major_types" metadata column as the active identity
Idents(object = adata) <- "cell.major_types"
head(Idents(adata))
#### OUTPUT DIRECTORY #####
# Count tables are written to counts.dir; outdir is reassigned later for CPM
outdir = counts.dir

#pull out list of all cell types
# (the removal of an unwanted label below is commented out, so nothing is dropped)
unique_cell_types <- unique(adata$cell.major_types)
#unique_cell_types <- unique_cell_types[-c(11)]
print(unique_cell_types)


# Map each donor to the barcodes (metadata row names) that belong to it
sample_bcs <- list()
for (sample in samples){
    sample_bcs[[sample]] <- row.names(adata[[]][adata[[]]$donor == sample,])
}

##############
#### SET TO WHATEVER ASSAY YOU WANT TO USE ######
# Raw counts of the ATAC assay; the "gex" name is a leftover, these are peak counts
DefaultAssay(adata) <- 'ATAC'
gex.counts <- GetAssayData(adata, slot='counts')
dim(gex.counts)
head(gex.counts)
# Globals read by get_per_sample_gex_SUMS(): adata_matrices, gex.counts,
# samples and sample_bcs
adata_matrices <- adata

 [1] "Fibroblast"  "Endothelial" "vCM"         "Myeloid"     "Pericyte"   
 [6] "Endocardial" "Lymphoid"    "SM"          "Neuronal"    "Adipocyte"  
[11] "Epicardial"  "aCM"        


  [[ suppressing 34 column names 'QY_2193_1_2_QY_2192_1_2_AAACAGCCAACTAGGG-1', 'QY_2193_1_2_QY_2192_1_2_AAACAGCCACTTACAG-1', 'QY_2193_1_2_QY_2192_1_2_AAACAGCCAGTTTGTG-1' ... ]]



6 x 329255 sparse Matrix of class "dgCMatrix"
                                                                              
chr1-181260-181560 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
chr1-191308-191608 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
chr1-779626-779926 . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . .
chr1-807726-808026 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
chr1-818649-818949 . . . . . . . . . . . . . . . . . . . . . . 1 . . . . . . .
chr1-832295-832595 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                                 
chr1-181260-181560 . . . . ......
chr1-191308-191608 . . . . ......
chr1-779626-779926 . . . . ......
chr1-807726-808026 . . . . ......
chr1-818649-818949 . . . . ......
chr1-832295-832595 . . . . ......

 .....suppressing 329221 columns in show(); maybe adjust options(max.print=, width=)
 ..............................

In [14]:
# Show the files available in the unified-peaks directory
dir(PEAKS.dir)

# Show the cell types so they can be compared against the peak file names
unique_cell_types

message("NOTE: celltypes and peak names should match")

NOTE: celltypes and peak names should match



In [15]:
# Write per-donor pseudobulk count sums for one cell type.
#
# For every donor in `samples`, sums the raw counts over all barcodes whose
# active identity equals `cell.type`, keeps only the peaks listed in
# "<cell.type>_UnifiedFiltVariable.bed" under PEAKS.dir, and writes the
# peaks x donors table to `filename` (tab-separated, unquoted). Donors with no
# barcode of this cell type get an all-zero column.
#
# Relies on globals defined earlier in the notebook: adata_matrices,
# gex.counts, samples, sample_bcs and PEAKS.dir.
#
# Args:
#   cell.type: identity label used to select barcodes and the peak file.
#   filename:  path of the table to write.
get_per_sample_gex_SUMS <- function(cell.type, filename){
    print(paste(cell.type,Sys.time()))

    # Barcodes whose active identity is this cell type (which() skips NA idents)
    idents <- Idents(adata_matrices)
    bcs <- names(idents)[which(idents == cell.type)]

    # drop = FALSE keeps a matrix even when zero or one barcode matches, so the
    # one-barcode and no-barcode cases need no special handling below
    counts <- gex.counts[, colnames(gex.counts) %in% bcs, drop = FALSE]

    # Preallocate the peaks x donors matrix instead of growing a data frame
    sums <- matrix(0, nrow = nrow(counts), ncol = length(samples),
                   dimnames = list(rownames(counts), NULL))
    for (i in seq_along(samples)){
        sample_cols <- colnames(counts) %in% sample_bcs[[samples[i]]]
        sums[, i] <- rowSums(counts[, sample_cols, drop = FALSE])
    }
    fin.counts.df <- as.data.frame(sums)
    colnames(fin.counts.df) <- samples

    # Load celltype specific peaks
    peaks.file <- paste0(PEAKS.dir, cell.type, "_UnifiedFiltVariable.bed")
    if (!file.exists(peaks.file)){
        stop("Peak file not found for cell type '", cell.type, "': ", peaks.file,
             call. = FALSE)
    }
    ct_specific_peaks <- read.table(peaks.file, sep="\t", header = FALSE)
    # Rebuild "chr-start-end" names to match the row names of the count matrix
    ct_specific_peaks <- paste0(ct_specific_peaks$V1, "-", ct_specific_peaks$V2, "-", ct_specific_peaks$V3)
    # drop = FALSE keeps a data frame even when there is a single donor column
    fin.counts.df <- fin.counts.df[rownames(fin.counts.df) %in% ct_specific_peaks, , drop = FALSE]
    print(paste0(" - peaks: ", nrow(fin.counts.df)))
    #export df
    write.table(fin.counts.df, filename, sep='\t',quote=FALSE)
}

In [16]:
##### NAME YOUR FILES #####
# One "<celltype>_perdonor.ATAC.counts" table per cell type, written to outdir
for (cell.type in unique_cell_types){
    filename <- paste0(outdir, cell.type, '_perdonor.ATAC.counts')
    get_per_sample_gex_SUMS(cell.type, filename)
}

[1] "Fibroblast 2025-01-27 11:50:40.444866"
[1] " - peaks: 129296"
[1] "Endothelial 2025-01-27 11:51:00.771338"
[1] " - peaks: 89791"
[1] "vCM 2025-01-27 11:51:14.906292"
[1] " - peaks: 162084"
[1] "Myeloid 2025-01-27 11:51:53.254793"
[1] " - peaks: 98275"
[1] "Pericyte 2025-01-27 11:52:08.174736"
[1] " - peaks: 65897"
[1] "Endocardial 2025-01-27 11:52:17.744501"
[1] " - peaks: 62275"
[1] "Lymphoid 2025-01-27 11:52:24.960312"
[1] " - peaks: 46008"
[1] "SM 2025-01-27 11:52:32.926787"
[1] " - peaks: 52412"
[1] "Neuronal 2025-01-27 11:52:39.228435"
[1] " - peaks: 29658"
[1] "Adipocyte 2025-01-27 11:52:44.378869"
[1] " - peaks: 35501"
[1] "Epicardial 2025-01-27 11:52:50.773793"
[1] " - peaks: 49313"
[1] "aCM 2025-01-27 11:52:57.183414"
[1] " - peaks: 139616"


# CPM

In [17]:
# Convert a per-donor pseudobulk count table into counts-per-million.
#
# Reads the table at `pseudobulk.filename`, divides every column by its own
# total and multiplies by 1e6, then writes the result to `cpm.filename`
# (tab-separated, unquoted). A column whose total is zero becomes NaN.
#
# Args:
#   cell.type:           not used inside the function.
#   cpm.filename:        path of the CPM table to write.
#   pseudobulk.filename: path of the count table to read.
calculate_CPM <- function(cell.type, cpm.filename, pseudobulk.filename){
    raw.counts <- read.table(pseudobulk.filename)

    # Scale each column independently; `cpm[] <-` keeps row and column names
    cpm <- raw.counts
    cpm[] <- lapply(raw.counts, function(donor.counts) {
        1e6 * (donor.counts / sum(donor.counts))
    })

    #export df
    write.table(cpm, cpm.filename, sep='\t',quote=FALSE)
}

In [18]:
# Count tables are read from counts.dir and CPM tables are written to TPM.dir;
# outdir is reassigned here (it pointed at counts.dir for the counts step)
indir <- counts.dir
outdir <- TPM.dir

In [19]:
# Cell types to convert to CPM and to combine; this replaces the list derived
# from the object earlier in the notebook.
# NOTE(review): 'vCM' had a counts file written above but is not listed here, so
# it gets no CPM table and no column in the combined output — confirm this is
# intentional.
unique_cell_types = c('Fibroblast', 'Myeloid', 'Pericyte', 'Endocardial', 'Endothelial',
                      'aCM', 'Lymphoid', 'Epicardial', 'SM', 'Neuronal', 'Adipocyte')

In [20]:
##### NAME YOUR FILES #####
# For each cell type, read its count table from indir and write CPM to outdir
for (cell.type in unique_cell_types){
    pseudobulk.filename <- paste0(indir, cell.type, '_perdonor.ATAC.counts')
    cpm.filename <- paste0(outdir, cell.type, '_perdonor.ATAC.cpm')
    calculate_CPM(cell.type, cpm.filename, pseudobulk.filename)
}

In [21]:
# SAVE mean CPM per celltype (the object keeps its historical "TPM" name)
# Each column is the across-donor mean CPM of one cell type; donors whose CPM is
# NA/NaN for a peak are ignored by na.rm = TRUE.
# Rows are the union of the peaks found in any cell type's CPM table, in order
# of first appearance. Previously the rows were fixed to the first cell type's
# peaks, which silently dropped peaks present only in other cell types. A peak
# that is absent from a cell type's table is NA in that column.
ct.mean.cpm <- lapply(unique_cell_types, function(ct) {
    message("Processing ", ct)
    rowMeans(read.table(paste0(outdir, ct, '_perdonor.ATAC.cpm')), na.rm = TRUE)
})
all.peaks <- unique(unlist(lapply(ct.mean.cpm, names), use.names = FALSE))

TPM.by.cell <- data.frame(row.names = all.peaks)
for (i in seq_along(unique_cell_types)) {
    # Index by peak name so every column is aligned to the same row order
    TPM.by.cell[[unique_cell_types[i]]] <- unname(ct.mean.cpm[[i]][all.peaks])
}

dim(TPM.by.cell)
head(TPM.by.cell)

write.table(TPM.by.cell, 
            paste0(outdir, 'AllCellTypes..ATAC.cpm'),
           quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t')

Processing Fibroblast

Processing Myeloid

Processing Pericyte

Processing Endocardial

Processing Endothelial

Processing aCM

Processing Lymphoid

Processing Epicardial

Processing SM

Processing Neuronal

Processing Adipocyte



Unnamed: 0_level_0,Fibroblast,Myeloid,Pericyte,Endocardial,Endothelial,aCM,Lymphoid,Epicardial,SM,Neuronal,Adipocyte
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1-181260-181560,9.632933,10.777465,10.66474,15.341643,12.481735,10.24369,14.303689,6.00566,13.585399,27.77125,25.79901
chr1-191308-191608,5.959459,7.877625,6.872459,17.288083,12.457476,14.899194,7.671111,,6.580643,22.94376,44.23927
chr1-779626-779926,5.237409,4.158349,3.847444,4.712415,5.137712,3.187757,,,4.842369,,
chr1-818649-818949,8.449336,,,,2.830931,2.99876,,,,,12.49027
chr1-836289-836554,1.146947,,1.46159,,,,,,,,
chr1-844035-844335,1.32443,,,,,3.278159,2.803032,,,,
