# Settings

In [1]:
# Configure reticulate to use the dedicated "reticulate" conda environment and
# check that the Python modules needed by this notebook can be imported.
# Keep the statement order: RETICULATE_PYTHON is set before the package is
# loaded, and the interpreter is selected before any module is imported.
Sys.setenv(RETICULATE_PYTHON="/home/luca/anaconda3/envs/reticulate/bin/python")
library(reticulate)
reticulate::use_python("/home/luca/anaconda3/envs/reticulate/bin/python")
reticulate::use_condaenv("/home/luca/anaconda3/envs/reticulate")
reticulate::py_module_available(module='anndata') # must print TRUE
reticulate::import('anndata') # sanity check: this import must not error
reticulate::py_module_available(module='leidenalg') # must print TRUE
reticulate::import('leidenalg') # sanity check: this import must not error

Module(anndata)

Module(leidenalg)

In [2]:
## Patch for annotations in R4.1
# BiocManager::install("Bioconductor/GenomeInfoDb",lib = "/home/luca/R/x86_64-pc-linux-gnu-library/4.1",force = TRUE)
# library(GenomeInfoDb,lib.loc="/home/luca/R/x86_64-pc-linux-gnu-library/4.1")

In [13]:
# Load (and install, if missing) every package used by this notebook.
# Each package is listed exactly once.
pacman::p_load(dplyr, stringr, data.table, tidyr, Matrix, future,
               hdf5r, Seurat, Signac, harmony, knitr, SoupX,
               EnsDb.Hsapiens.v86, SeuratDisk,
               logr, parallel,
               ggplot2, ggpubr, ggrepel, ggbreak, gridExtra, patchwork, grid, ggh4x)

In [4]:
# Genome annotation
# Commands kept for reference (presumably used once to build the cached file):
#   suppressMessages(annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86))
#   genome(annotations) <- "hg38"
#   seqlevelsStyle(annotations) <- "UCSC"
#   saveRDS(annotations, "/nfs/lab/Luca/Assets/references/Cellranger/hg38.annotations.rds")

# Read the cached annotation, then re-apply the UCSC sequence naming style
# and the hg38 genome label.
annotations <- readRDS("/nfs/lab/Luca/Assets/references/Cellranger/hg38.annotations.rds")
seqlevelsStyle(annotations) <- "UCSC"
genome(annotations) <- "hg38"

# Sequence info downloaded from: https://github.com/broadinstitute/ichorCNA/issues/84
seq.info <- readRDS("/nfs/lab/Luca/Assets/references/Cellranger/seqinfo_hg38_ucsc.rds")

In [5]:
# Global options
options(stringsAsFactors = FALSE)
# Remember the current warning level, then switch warnings off (warn = -1)
warnLevel <- getOption("warn")
options(warn = -1)
opts_chunk$set(tidy = TRUE)

# Parallel backend for the future framework: 4 multicore workers
plan("multicore", workers = 4)
# Size limit for globals exported to future workers, in bytes.
# 1000 * 1024^2 is roughly 1 GB, so the value below is roughly 10 GB.
RAM.tresh <- 10000 * 1024^2
# Apply the limit and strongly favour fixed over scientific notation
options(future.globals.maxSize = RAM.tresh, scipen = 999)

In [6]:
# Project directories (every path ends with a trailing slash so it can be
# concatenated directly with paste0)
base.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/"
assets.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/Assets/"

# Cell Ranger outputs
cell.ranger.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/cellranger.symlinks/"

# One directory per analysis step, plus the log directory
step1.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/1_preprocessing/"
step2.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/2_PeaksReformat/"
step3.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/3_SoupX/"
step4.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/4_Doublet_cleanup/"
log.dir <- "/nfs/lab/projects/mega_heart/CAREHF/multiome/log/"

# Parent folder for the per-sample MatrixMarket exports
dir.create(paste0(step4.dir, "MM_counts/"))

In [7]:
# Logging setup: switch on logr together with its notes, autolog, compact
# and traceback options
options(
    logr.on = TRUE,
    logr.notes = TRUE,
    logr.autolog = TRUE,
    logr.compact = TRUE,
    logr.traceback = TRUE
)
# Log file name: <base.dir><current date>.7_Upload_part1.log
log.file <- paste0(base.dir, Sys.Date(), ".7_Upload_part1.log")

In [8]:
# Open the logr log at the path built in `log.file`; later log_print() calls
# write to it.
log_open(log.file)

In [12]:
# Sample sheet: tab-separated file with a header row, stored in the assets folder
sample.info <- read.table(paste0(assets.dir, "sample.info"),
                          sep = "\t",
                          header = TRUE)
# Derived variables: sample IDs and the Cell Ranger "outs" folder of each sample
sample.ls <- sample.info$ID
cellranger.outs.ls <- paste0(sample.info$CellRanger, sample.info$Chamber, "/", sample.info$ID, "/outs/")

# Sanity check: show the first sample, its path, and the number of samples
sample.ls[1]
cellranger.outs.ls[1]
length(sample.ls)

In [78]:
# Final barcode list: read the RNA metadata table
meta <- fread("/nfs/lab/tscc/luca/MEGA_Heart/CAREHF_rna.meta")
# Barcodes in column V1 use ":" as separator; switch to "_" so they match the
# cell names used in the Seurat objects
meta$V1 <- gsub(":", "_", meta$V1, fixed = TRUE)
# Use the barcodes as row names.
# NOTE(review): fread returns a data.table, which does not officially support
# row names - matching on meta$V1 directly is the safer option; confirm.
rownames(meta) <- meta$V1

In [None]:
# Export, for every sample, the ATAC count matrix restricted to the final
# barcode list in `meta`:
#   <step4.dir>/MM_counts/<sample>/ATAC_counts.mtx    (MatrixMarket)
#   <step4.dir>/MM_counts/<sample>/ATAC_features.tsv  (row names of the matrix)
#   <step4.dir>/MM_counts/<sample>/Barcodes.tsv       (column names of the matrix)
for (i in seq_along(sample.ls)) {
    gc(reset = TRUE)
    # Set sample variable
    sample <- sample.ls[i]
    out.dir <- paste0(step4.dir, "MM_counts/", sample, "/")
    # Safe to re-run, and does not depend on the parent folder already existing
    dir.create(out.dir, showWarnings = FALSE, recursive = TRUE)
    log_print(paste("Processing: ", sample))
    # Load sample
    adata <- readRDS(file = paste0(step4.dir, sample, "_prefilt.Peaks.SoupX.RmMult.rds"))
    # Create new idents: <samples>_<barcode>, the format used in meta$V1
    barcodes_library <- paste0(adata@meta.data$samples,
                               "_",
                               adata@meta.data$barcode)

    names(barcodes_library) <- colnames(adata)
    adata <- RenameCells(adata, new.names = barcodes_library)

    # Keep only the cells listed in the final barcode table.
    # Match on the V1 column itself: `meta` comes from fread(), and data.table
    # does not support row names, so rownames(meta) is not a reliable key.
    log_print(paste("   - cells before cleanup: ", ncol(adata)))
    cell.keep <- meta[meta$V1 %in% colnames(adata), ]
    if (nrow(cell.keep) == 0) {
        stop("No barcode of sample ", sample, " is present in the final barcode list",
             call. = FALSE)
    }
    adata_sub <- subset(adata, cells = cell.keep$V1)
    log_print(paste("   - cells after cleanup: ", ncol(adata_sub)))

    # Get counts from the filtered object, so that the exported files contain
    # exactly the cells reported in the "after cleanup" log line
    counts_matrix <- GetAssayData(adata_sub, assay = "ATAC", slot = "counts")

    writeMM(counts_matrix,
            file = paste0(out.dir, "ATAC_counts.mtx"))

    # Write feature names
    write.table(rownames(counts_matrix),
                file = paste0(out.dir, "ATAC_features.tsv"),
                sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

    # Write cell barcodes
    write.table(colnames(counts_matrix),
                file = paste0(out.dir, "Barcodes.tsv"),
                sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
}