**This notebook sets up a detailed workflow for single-cell ATAC-seq window-based clustering. It involves:**

- Environment Setup: Initiates an R environment and loads necessary libraries like Seurat, Signac, and EnsDb.Hsapiens.v86.
- Data Reading: Processes ATAC-seq data from various samples to meet Seurat's format requirements.
- Pre-processing: Adjusts barcode formats and prepares data for sparse matrix conversion, focusing on highly variable windows.
- Sparse Matrices Creation: Merges sample matrices into one sparse matrix, handling missing data.
- Seurat Object Creation: Constructs a chromatin assay and a Seurat object, annotating it with sample metadata.
- Data Analysis: Executes TF-IDF and SVD for dimensionality reduction, uses Harmony for batch effect correction, and performs UMAP for visualization and clustering.
- Quality Control and Visualization: Adds QC metrics to the Seurat object and generates various plots for data exploration.
- Final Steps: Saves the Seurat object and produces plots to analyze cell distribution across samples and clusters.

**Key aspects include data integration, dimensionality reduction, batch effect correction, and advanced visualization to identify distinct cell populations within ATAC-seq data.**

### Environment Setup

In [None]:
#source activate newEnv
suppressMessages(library(hdf5r)) 
suppressMessages(library(Seurat))
suppressMessages(library(Signac))
suppressMessages(library(EnsDb.Hsapiens.v86))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(Matrix))
suppressMessages(library(harmony))
suppressMessages(library(data.table))
suppressMessages(library(ggpubr))
suppressMessages(library(future))
library(dplyr)
library(Seurat)
library(ggplot2)
library(sctransform)
library(scater)
library(reticulate)
library(future)
library('Biobase')
library(pheatmap)
library(gplots)
library('hdf5r')
library(EnsDb.Hsapiens.v86)
library(BiocParallel)
library(tictoc)
library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
library(ggplot2)
library(cowplot)
library("Signac")

In [None]:
# Force a (re)install of GenomeInfoDb from the Bioconductor GitHub repository
# into a user-specific library directory (force = TRUE reinstalls even when the
# package is already present, so this cell only needs to run once per environment).
BiocManager::install("Bioconductor/GenomeInfoDb",lib = "/home/parulk/R/x86_64-pc-linux-gnu-library/4.1",force = TRUE)

In [None]:
# Attach GenomeInfoDb from that same library directory.
library(GenomeInfoDb,lib.loc="/home/parulk/R/x86_64-pc-linux-gnu-library/4.1")

In [None]:
# Report the GenomeInfoDb version found in that library directory.
packageVersion("GenomeInfoDb",lib.loc="/home/parulk/R/x86_64-pc-linux-gnu-library/4.1")

In [None]:
# Request the hg38 chromosome table from UCSC
# (presumably a smoke test of the reinstalled package -- confirm).
GenomeInfoDb::getChromInfoFromUCSC("hg38")

# Upstream analysis

### Read in the window long-format matrices (lfm) from Josh's pipeline (5K default)
### First read in 4-6 samples to get HVWs


In [None]:
# Sample IDs used for the first pass, from which the highly variable windows (HVWs) are derived.
samples <- c('HPAP-035','HPAP-051', 'HPAP-055', 'HPAP-061', 'HPAP-062')


### Read in the long-format matrices; after loading, V1 should hold the windows and V2 the barcodes

In [None]:
#### READ IN JOSHS MATRICES
# Load every sample's long-format matrix (lfm) into a named list.
# Each file is parsed once (it used to be parsed twice per sample), and the
# first two columns are swapped: file column 2 becomes V1 and file column 1
# becomes V2, both stored as factors. Any further columns are kept unchanged.
atacs_OG <- list()      # kept for compatibility; it was always emptied again after each sample
atacs_FINAL <- list()

wd <- '/nfs/lab/parulk/HPAP_scATAC/lfm1'
for (sample in samples) {
    lfm_path <- file.path(wd, sprintf('/%s.long_fmt_mtx.txt.gz', sample))
    lfm <- read.table(lfm_path, sep='\t', header=FALSE, stringsAsFactors=FALSE)

    swapped <- lfm
    swapped$V1 <- as.factor(lfm$V2)
    swapped$V2 <- as.factor(lfm$V1)
    atacs_FINAL[[sample]] <- swapped
}

In [None]:
# Work on a copy so the freshly loaded list stays untouched.
atac_mod <- atacs_FINAL
# head() on a list returns its first six *elements* (complete data frames), so
# preview the first rows of every sample instead.
lapply(atac_mod, head)

In [None]:
#### ADD '-1' TO THE BARCODES FROM JOSHS MATRICES
# THIS IS ALL BECAUSE THE FRAGMENT FILE FROM CELLRANGER IS IN THIS FORMAT AND SEURAT NEEDS THEM TO MATCH
# Loop over whatever samples were loaded instead of repeating one statement per
# sample ID. paste0() turns the barcode factor into a character vector.
# NOTE: re-running this cell appends a second "-1".
for (sample in names(atac_mod)) {
    atac_mod[[sample]]$V2 <- paste0(atac_mod[[sample]]$V2, "-1")
}

### Read in windows file from pipeline for initial run, but replace with HVWs file afterwards

In [None]:
### GRAB THE WINDOWS FILE FROM JOSH'S PIPELINE 
# Read the window names, one per line, as a character vector. For the first
# pass this is the full windows file; later passes use the HVW file instead.
hvw_fp2 <- '/nfs/lab/parulk/HPAP_scATAC/lfm1/windows_modified.txt'
hvws2 <- scan(hvw_fp2, what="", sep="\n")
print(head(hvws2))

#sort alphanumerically
# The sorted vector defines the row order of the merged matrix built below.
hvws_fin2 <- sort(hvws2)
print(head(hvws_fin2))

In [None]:
# Convert per-sample long-format count tables into sparse matrices and
# column-bind them into one windows-by-barcodes matrix.
#
# Args:
#   dfs:     named list of data frames, one per sample, with columns
#            V1 (window name), V2 (barcode) and V3 (count).
#   windows: character vector of window names. It defines the rows of the
#            result and their order; rows of `dfs` for other windows are dropped.
#
# Returns:
#   A sparse matrix with one row per element of `windows` and one column per
#   barcode, samples concatenated in the order of names(dfs).
#
# Windows that a sample never reports are added as explicit zero counts for the
# sample's first barcode, so every per-sample matrix has the same rows.
merge_sparse_matrices_hvws <- function(dfs, windows){
    samples <- names(dfs)
    if (length(samples) == 0) {
        stop("'dfs' must be a non-empty named list of long-format data frames", call. = FALSE)
    }

    overall_sm <- NULL
    for (sample in samples){
        print(paste(sample,Sys.time(),sep=': '))
        df <- dfs[[sample]]

        # windows without a single row in this sample (works for factor or character V1)
        mis_windows <- windows[!windows %in% unique(as.character(df$V1))]

        if (length(mis_windows) > 0){
            print('Adding missing windows')
            filler_bc <- as.character(df$V2[[1]])
            print(paste("Using filler BC:",filler_bc,sep=" "))
            # one zero-count row per missing window; rep() now receives the
            # repeat count itself instead of it being passed to as.data.frame()
            new_rows <- data.frame(V1 = mis_windows,
                                   V2 = rep(filler_bc, length(mis_windows)),
                                   V3 = rep(0, length(mis_windows)),
                                   stringsAsFactors = FALSE)
            lfm <- rbind(df, new_rows)
        } else {
            print('No windows were missing')
            lfm <- df
        }

        # keep only the requested windows and order rows by position in `windows`
        lfm_cut <- lfm[lfm$V1 %in% windows,]
        lfm_cut$V1 <- factor(lfm_cut$V1, levels=windows)
        lfm2 <- lfm_cut[order(lfm_cut$V1),]
        lfm2$V2 <- as.factor(lfm2$V2)

        # factor codes are the row/column indices; factor levels are the dimnames
        sm <- with(lfm2, sparseMatrix(i=as.numeric(V1), j=as.numeric(V2), x=V3,
                                      dimnames=list(levels(V1), levels(V2))))
        print(dim(sm))

        overall_sm <- if (is.null(overall_sm)) sm else cbind(overall_sm, sm)
    }
    return(overall_sm)
}

In [None]:
# Build the merged windows-by-barcodes sparse matrix for all loaded samples.
overall_sm2 <- merge_sparse_matrices_hvws(atac_mod,hvws_fin2)
dim(overall_sm2)
head(overall_sm2)

In [None]:
#check if the windows of overall_sm are sorted (they should be)
# table() tallies the positions where the row names match their sorted order;
# only TRUE is expected.
sorted_windows2 <- sort(row.names(overall_sm2))
table(sorted_windows2 == row.names(overall_sm2))

In [None]:
## atac == giant matrix of windows (rows) by barcodes (columns)
atac <- overall_sm2


In [None]:
#continue to make Seurat compatible object
# Gene annotations from EnsDb.Hsapiens.v86, switched to UCSC-style sequence
# names and labelled as hg38; passed to CreateChromatinAssay() below.
suppressMessages(annotations <- GetGRangesFromEnsDb(ensdb=EnsDb.Hsapiens.v86))
seqlevelsStyle(annotations) <- "UCSC"
genome(annotations) <- 'hg38'


### Create the merged fragment file for the 4-6 samples <br>
<code>for SAMPLE in HPAP-035 HPAP-051 HPAP-055 HPAP-061 HPAP-062 ; do zcat /nfs/lab/projects/multiomic_islet/data/hpap/atac/${SAMPLE}/Upenn_scATACseq/cellranger_RME/${SAMPLE}/outs/fragments.tsv.gz | awk -v SAMPLE=$SAMPLE \ 'BEGIN{FS=OFS="\t"} {print $1,$2,$3,SAMPLE"_"$4,$5}'; done | sort -k1,1 -k2,2n -S 64G | bgzip -c -@ 16 > /nfs/lab/parulk/HPAP_scATAC/merged_HPAP-035_HPAP-051_HPAP-055_HPAP-061_HPAP-062.bed.gz
zcat merged_HPAP-035_HPAP-051_HPAP-055_HPAP-061_HPAP-062.bed.gz | grep -v '^#' > merged_HPAP-035_HPAP-051_HPAP-055_HPAP-061_HPAP-062-new.bed
bgzip merged_HPAP-035_HPAP-051_HPAP-055_HPAP-061_HPAP-062-new.bed

tabix -p bed /nfs/lab/parulk/HPAP_scATAC/merged_HPAP-035_HPAP-051_HPAP-055_HPAP-061_HPAP-062-new.bed.gz </code>
<br> pass the path to the fragment.tsv from cellranger <br>

In [None]:
# Preview the first rows of the merged count matrix.
head(atac)

In [None]:
# Path of the merged fragment file for the first-pass samples.
frag.file ='/nfs/lab/parulk/HPAP_scATAC/merged_HPAP-035_HPAP-051_HPAP-055_HPAP-061_HPAP-062-new.bed.gz'
head(frag.file)

In [None]:
# Wrap the count matrix in a Signac chromatin assay. Window names are parsed
# with ':' and '-' as separators; min.cells=0 and min.features=-1 are set so
# that no window or barcode is filtered out here (confirm against Signac docs).
atac_assay <- CreateChromatinAssay(counts=atac, sep=c(':', '-'), genome='hg38', fragments=frag.file, min.cells=0, min.features=-1, annotation=annotations)
head(atac_assay)

In [None]:
head(atac_assay)

In [None]:
# Create the Seurat object with the chromatin assay stored as "ATAC_windows".
# NOTE(review): names.field = 2 with names.delim = "_" takes the part of the
# cell name after the first "_" as the initial identity; if cell names are
# "<sample>_<barcode>" that is the barcode, not the sample -- confirm that
# names.field = 1 was not intended.
Atac <-CreateSeuratObject(atac_assay, project = "HPAP", assay = "ATAC_windows",
  min.cells = 0, min.features = 0, names.field = 2,
  names.delim = "_")
head(Atac)
str(Atac)

In [None]:
### Our cell names start with the sample ID (HPAP-xxx); drop the barcode part and store the sample ID as the library
# Strip everything from the first "_" onwards rather than a fixed number of
# trailing characters, so the result does not depend on the barcode length.
# Assumes cell names look like "<sampleID>_<barcode>" (the "_" delimiter is the
# one passed to CreateSeuratObject) -- TODO confirm on head(Atac$library).
Atac$library <- sub('_.*$', '', rownames(Atac@meta.data))
head(Atac$library)

In [None]:
### add whatever metadata for harmony
# Sample IDs grouped by sex and by disease status.
sex_F <- list('HPAP-051', 'HPAP-062')
cond_t1d <- list('HPAP-055')
cond_t2d <- list('HPAP-051', 'HPAP-061', 'HPAP-062')
cond_ctrl <- list('HPAP-035')

# Per-cell sample ID, looked up once and reused for every assignment below.
cell_library <- Atac@meta.data$library

# Sex: samples listed in sex_F are 'F', every other sample is 'M'.
is_female <- cell_library %in% sex_F
Atac@meta.data$sex[is_female] <- 'F'
Atac@meta.data$sex[!is_female] <- 'M'

# Condition: cells of samples in none of the three lists stay NA.
Atac@meta.data$condition[cell_library %in% cond_t1d] <- 'T1D'
Atac@meta.data$condition[cell_library %in% cond_t2d] <- 'T2D'
Atac@meta.data$condition[cell_library %in% cond_ctrl] <- 'Control'

head(Atac@meta.data)
head(Atac@meta.data$condition)
head(Atac@meta.data$sex)

In [None]:
#### I save everything up to here because Seurat likes to crash and its annoying to re do everything
# Checkpoint: the object with library/sex/condition metadata, before any normalisation.
saveRDS(Atac, file='/nfs/lab/parulk/HPAP_scATAC/HVW.rds')



In [None]:
### For questions google Signac : https://satijalab.org/signac/index.html

In [None]:
# Reload the checkpoint written above and preview the ATAC_windows assay matrix.
Atac <- readRDS('/nfs/lab/parulk/HPAP_scATAC/HVW.rds')
head(Atac[['ATAC_windows']][])

In [None]:
DefaultAssay(Atac) <- 'ATAC_windows'
Atac <- RunTFIDF(Atac)
# Rank the windows using the assay matrix directly.
# NOTE(review): this runs after RunTFIDF, so the matrix extracted with [] is
# presumably the TF-IDF-normalised data rather than raw counts -- confirm this
# is the intended input for the ranking.
hvw1 <- FindTopFeatures(object = Atac[['ATAC_windows']][], min.cutoff = "q9")
# Keep at most the first 50000 rows. head() is used instead of
# hvw1[seq(1,50000),] so that no all-NA rows are produced when fewer than
# 50000 windows are available.
hvw1_cut <- head(hvw1, 50000)
hvw1_fin <- row.names(hvw1_cut) #write this out to a file in the correct format!
head(hvw1_fin)
# NOTE: write.table() with these arguments also writes a header line and
# quotes each window name; the file needs reformatting before it is read back.
write.table(hvw1_fin, file = "hvw1_fin", sep = "\t",
            row.names = FALSE)
#Atac <- FindTopFeatures(Atac, min.cutoff='q0', verbose=FALSE)

### At this point you can save out the HVWs for 4-6 samples
### then restart the notebook from the top with all the samples

colnames(hvw1)

In [None]:
# Display the full ranking table.
hvw1

In [None]:
# Write the selected window names to the shared project directory
# (same content as the "hvw1_fin" file written to the working directory above).
write.table(hvw1_fin, file = "/nfs/lab/parulk/HPAP_scATAC/hvw_fin", sep = "\t",
            row.names = FALSE)

### STOP HERE FOR INITIAL 4-6 SAMPLE RUN
#### SAVE OUT HVWs AND RESTART FROM THE TOP WITH ALL THE SAMPLES 

In [None]:
# Full list of sample IDs for the second pass (all samples, restricted to the HVWs saved above).
samples <- c('HPAP-035', 'HPAP-036', 'HPAP-039', 'HPAP-040', 'HPAP-044', 'HPAP-045', 'HPAP-047', 'HPAP-049', 'HPAP-050', 'HPAP-051', 'HPAP-052', 'HPAP-053', 'HPAP-054', 'HPAP-055', 'HPAP-056', 'HPAP-059', 'HPAP-061', 'HPAP-062', 'HPAP-063', 'HPAP-064', 'HPAP-067', 'HPAP-069', 'HPAP-072', 'HPAP-075', 'HPAP-077', 'HPAP-079', 'HPAP-080', 'HPAP-081', 'HPAP-083', 'HPAP-084', 'HPAP-085', 'HPAP-088', 'HPAP-092', 'HPAP-099', 'HPAP-100', 'HPAP-101', 'HPAP-103', 'HPAP-104', 'HPAP-105', 'HPAP-106', 'HPAP-109')


In [None]:
#### READ IN JOSHS MATRICES
# Load every sample's long-format matrix (lfm) into a named list.
# Each file is parsed once (it used to be parsed twice per sample), and the
# first two columns are swapped: file column 2 becomes V1 and file column 1
# becomes V2, both stored as factors. Any further columns are kept unchanged.
atacs_OG <- list()      # kept for compatibility; it was always emptied again after each sample
atacs_FINAL <- list()

# Plain string: the former sprintf() call had no format specifier and an
# unused argument, which makes recent R versions emit a warning.
wd <- '/nfs/lab/parulk/HPAP_scATAC/lfm1'
for (sample in samples) {
    lfm_path <- file.path(wd, sprintf('/%s.long_fmt_mtx.txt.gz', sample))
    lfm <- read.table(lfm_path, sep='\t', header=FALSE, stringsAsFactors=FALSE)

    swapped <- lfm
    swapped$V1 <- as.factor(lfm$V2)
    swapped$V2 <- as.factor(lfm$V1)
    atacs_FINAL[[sample]] <- swapped
}

In [None]:
# Work on a copy so the freshly loaded list stays untouched.
atac_mod <- atacs_FINAL
# head() on a list returns its first six *elements* (complete data frames), so
# preview the first rows of every sample instead.
lapply(atac_mod, head)

In [None]:
#### ADD '-1' TO THE BARCODES FROM JOSHS MATRICES
# THIS IS ALL BECAUSE THE FRAGMENT FILE FROM CELLRANGER IS IN THIS FORMAT AND SEURAT NEEDS THEM TO MATCH
# Loop over whatever samples were loaded instead of repeating one statement per
# sample ID. paste0() turns the barcode factor into a character vector.
# NOTE: re-running this cell appends a second "-1".
for (sample in names(atac_mod)) {
    atac_mod[[sample]]$V2 <- paste0(atac_mod[[sample]]$V2, "-1")
}

In [None]:
# Read in the HVW set; all samples are cut down to these windows.
# hvw.txt was produced from the saved hvw_fin file with some basic formatting:
#   awk -F'\t' '{print $1,":",$2,"-",$3}' hvw_fin > hvw.txt
hvw_fp2 <- '/nfs/lab/parulk/HPAP_scATAC/hvw.txt'
hvws2 <- scan(hvw_fp2, what="", sep="\n")
print(head(hvws2))

#sort alphanumerically
# The sorted vector defines the row order of the merged matrix built below.
hvws_fin2 <- sort(hvws2)
print(head(hvws_fin2))

In [None]:
# Convert per-sample long-format count tables into sparse matrices and
# column-bind them into one windows-by-barcodes matrix.
#
# Args:
#   dfs:     named list of data frames, one per sample, with columns
#            V1 (window name), V2 (barcode) and V3 (count).
#   windows: character vector of window names. It defines the rows of the
#            result and their order; rows of `dfs` for other windows are dropped.
#
# Returns:
#   A sparse matrix with one row per element of `windows` and one column per
#   barcode, samples concatenated in the order of names(dfs).
#
# Windows that a sample never reports are added as explicit zero counts for the
# sample's first barcode, so every per-sample matrix has the same rows.
merge_sparse_matrices_hvws <- function(dfs, windows){
    samples <- names(dfs)
    if (length(samples) == 0) {
        stop("'dfs' must be a non-empty named list of long-format data frames", call. = FALSE)
    }

    overall_sm <- NULL
    for (sample in samples){
        print(paste(sample,Sys.time(),sep=': '))
        df <- dfs[[sample]]

        # windows without a single row in this sample (works for factor or character V1)
        mis_windows <- windows[!windows %in% unique(as.character(df$V1))]

        if (length(mis_windows) > 0){
            print('Adding missing windows')
            filler_bc <- as.character(df$V2[[1]])
            print(paste("Using filler BC:",filler_bc,sep=" "))
            # one zero-count row per missing window; rep() now receives the
            # repeat count itself instead of it being passed to as.data.frame()
            new_rows <- data.frame(V1 = mis_windows,
                                   V2 = rep(filler_bc, length(mis_windows)),
                                   V3 = rep(0, length(mis_windows)),
                                   stringsAsFactors = FALSE)
            lfm <- rbind(df, new_rows)
        } else {
            print('No windows were missing')
            lfm <- df
        }

        # keep only the requested windows and order rows by position in `windows`
        lfm_cut <- lfm[lfm$V1 %in% windows,]
        lfm_cut$V1 <- factor(lfm_cut$V1, levels=windows)
        lfm2 <- lfm_cut[order(lfm_cut$V1),]
        lfm2$V2 <- as.factor(lfm2$V2)

        # factor codes are the row/column indices; factor levels are the dimnames
        sm <- with(lfm2, sparseMatrix(i=as.numeric(V1), j=as.numeric(V2), x=V3,
                                      dimnames=list(levels(V1), levels(V2))))
        print(dim(sm))

        overall_sm <- if (is.null(overall_sm)) sm else cbind(overall_sm, sm)
    }
    return(overall_sm)
}

In [None]:
# Build the merged windows-by-barcodes sparse matrix for all samples.
overall_sm2 <- merge_sparse_matrices_hvws(atac_mod,hvws_fin2)
dim(overall_sm2)
overall_sm2

In [None]:
#check if the windows of overall_sm are sorted (they should be)
# table() tallies the positions where the row names match their sorted order;
# only TRUE is expected.
sorted_windows2 <- sort(row.names(overall_sm2))
table(sorted_windows2 == row.names(overall_sm2))

In [None]:
## atac == giant matrix of highly variable windows (rows) by barcodes (columns)
atac <- overall_sm2

In [None]:
#continue to make Seurat compatible object
# Gene annotations from EnsDb.Hsapiens.v86, switched to UCSC-style sequence
# names and labelled as hg38; passed to CreateChromatinAssay() below.
suppressMessages(annotations <- GetGRangesFromEnsDb(ensdb=EnsDb.Hsapiens.v86))
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- 'hg38'


In [None]:
# head() on a list returns its first six *elements* (complete data frames), so
# preview the first rows of every sample instead.
lapply(atac_mod, head)

### Create the merged fragment file for all samples

<code>for SAMPLE in HPAP-035 HPAP-036 HPAP-039 HPAP-040 HPAP-044 HPAP-045 HPAP-047 HPAP-049 HPAP-050 HPAP-051 HPAP-052 HPAP-053 HPAP-054 HPAP-055 HPAP-056 HPAP-059 HPAP-061 HPAP-062 HPAP-063 HPAP-064 HPAP-067 HPAP-069 HPAP-072 HPAP-075 HPAP-077 HPAP-079 HPAP-080 HPAP-081 HPAP-083 HPAP-084 HPAP-085 HPAP-088 HPAP-092 HPAP-099 HPAP-100 HPAP-101 HPAP-103 HPAP-104 HPAP-105 HPAP-106 HPAP-109 ; do zcat /nfs/lab/hpap_data/atac/${SAMPLE}/Upenn_scATACseq/cellranger_RME/${SAMPLE}/outs/fragments.tsv.gz | awk -v SAMPLE=$SAMPLE \ 'BEGIN{FS=OFS="\t"} {print $1,$2,$3,SAMPLE"_"$4,$5}'; done | sort -k1,1 -k2,2n -S 64G | bgzip -c -@ 16 > /nfs/lab/parulk/HPAP_scATAC/merged_samples.bed.gz</code>

<code>zcat merged_samples.bed.gz | grep -v '^#' > merged_samples-new.bed</code>

<code>bgzip -c merged_samples-new.bed > merged_samples-new.bed.gz</code>

<code>tabix -p bed merged_samples-new.bed.gz</code>


In [None]:
# Path of the merged fragment file covering all samples.
frag.file ='/nfs/lab/parulk/HPAP_scATAC/merged_samples-new.bed.gz'

In [None]:
# Wrap the count matrix in a Signac chromatin assay. Window names are parsed
# with ':' and '-' as separators; min.cells=0 and min.features=-1 are set so
# that no window or barcode is filtered out here (confirm against Signac docs).
atac_assay <- CreateChromatinAssay(counts=atac, sep=c(':', '-'), genome='hg38', fragments=frag.file, min.cells=0, min.features=-1, annotation=annotations)
head(atac_assay)

In [None]:
# Create the Seurat object with the chromatin assay stored as "ATAC_windows".
# NOTE(review): names.field = 2 with names.delim = "_" takes the part of the
# cell name after the first "_" as the initial identity; if cell names are
# "<sample>_<barcode>" that is the barcode, not the sample -- confirm that
# names.field = 1 was not intended.
Atac <-CreateSeuratObject(atac_assay, project = "HPAP", assay = "ATAC_windows",
  min.cells = 0, min.features = 0, names.field = 2,
  names.delim = "_")
head(Atac)
str(Atac)

In [None]:
### Our cell names start with the sample ID (HPAP-xxx); drop the barcode part and store the sample ID as the library
# Strip everything from the first "_" onwards rather than a fixed number of
# trailing characters, so the result does not depend on the barcode length.
# Assumes cell names look like "<sampleID>_<barcode>" (the "_" delimiter is the
# one passed to CreateSeuratObject) -- TODO confirm on head(Atac$library).
Atac$library <- sub('_.*$', '', rownames(Atac@meta.data))
head(Atac$library)

In [None]:
### add whatever metadata for harmony
# Sample IDs grouped by sex and by disease status.
sex_F <- list('HPAP-036', 'HPAP-039', 'HPAP-044','HPAP-045', 'HPAP-050', 'HPAP-051', 'HPAP-053', 'HPAP-054', 'HPAP-062', 'HPAP-063', 'HPAP-069', 'HPAP-079', 'HPAP-081', 'HPAP-084', 'HPAP-085', 'HPAP-099', 'HPAP-101', 'HPAP-103', 'HPAP-105', 'HPAP-109')
cond_t1d <- list('HPAP-055', 'HPAP-064', 'HPAP-084')
cond_t2d <- list('HPAP-051', 'HPAP-061', 'HPAP-062', 'HPAP-079', 'HPAP-081', 'HPAP-083', 'HPAP-085', 'HPAP-088', 'HPAP-100', 'HPAP-106', 'HPAP-109')
cond_ctrl <- list('HPAP-035', 'HPAP-036', 'HPAP-039', 'HPAP-040', 'HPAP-044', 'HPAP-045', 'HPAP-047', 'HPAP-049', 'HPAP-050', 'HPAP-052', 'HPAP-053', 'HPAP-054', 'HPAP-056', 'HPAP-059', 'HPAP-063', 'HPAP-067', 'HPAP-069', 'HPAP-072', 'HPAP-075', 'HPAP-077', 'HPAP-080', 'HPAP-092', 'HPAP-099', 'HPAP-101', 'HPAP-103', 'HPAP-104', 'HPAP-105')

# Per-cell sample ID, looked up once and reused for every assignment below.
cell_library <- Atac@meta.data$library

# Sex: samples listed in sex_F are 'F', every other sample is 'M'.
is_female <- cell_library %in% sex_F
Atac@meta.data$sex[is_female] <- 'F'
Atac@meta.data$sex[!is_female] <- 'M'

# Condition: cells of samples in none of the three lists stay NA.
Atac@meta.data$condition[cell_library %in% cond_t1d] <- 'T1D'
Atac@meta.data$condition[cell_library %in% cond_t2d] <- 'T2D'
Atac@meta.data$condition[cell_library %in% cond_ctrl] <- 'Control'

head(Atac@meta.data)
head(Atac@meta.data$condition)
head(Atac@meta.data$sex)

In [None]:
# Store sex as a factor and keep its levels for inspection in the next cell.
Atac@meta.data$sex <- as.factor(Atac@meta.data$sex)
unique_values <- levels(Atac@meta.data$sex)


In [None]:
unique_values

In [None]:
# Checkpoint: all-sample object with metadata, before normalisation.
saveRDS(Atac, file='/nfs/lab/parulk/HPAP_scATAC/HVW_all_samples.rds')

In [None]:
# Reload the checkpoint (lets the notebook be resumed from here).
Atac <- readRDS('/nfs/lab/parulk/HPAP_scATAC/HVW_all_samples.rds')

In [None]:
Atac

In [None]:
DefaultAssay(Atac) <- 'ATAC_windows'
# TF-IDF normalisation followed by top-feature selection with min.cutoff='q0'
# (presumably keeps every window, since the matrix is already cut to the HVWs -- confirm).
Atac <- RunTFIDF(Atac)
Atac <- FindTopFeatures(Atac, min.cutoff='q0', verbose=FALSE)

In [None]:
# List the dimensional reductions currently stored on the object.
Atac@reductions

In [None]:
# Run SVD; the result is read back below as the 'lsi' reduction.
Atac <- RunSVD(Atac)

In [None]:
# Plot the correlation between each component and sequencing depth.
DepthCor(Atac)

In [None]:
# Run Harmony on the LSI embeddings, correcting for library and sex;
# do_pca=FALSE because the input is already a reduced embedding.
hm_atac <- HarmonyMatrix(Embeddings(Atac, reduction='lsi'),Atac@meta.data,  c("library","sex"), do_pca=FALSE,plot_convergence = TRUE, verbose = TRUE)

In [None]:
# Store the corrected embeddings as a new reduction named 'harmony.atac'.
# NOTE(review): key='LSI_' looks identical to the key of the existing 'lsi'
# reduction; Seurat expects unique keys -- confirm, or use a distinct key.
Atac[['harmony.atac']] <- CreateDimReducObject(embeddings=hm_atac, key='LSI_', assay= 'ATAC_windows')

In [None]:
# UMAP on the corrected components 2-30 (component 1 is skipped; see the DepthCor plots).
Atac <- RunUMAP(Atac, dims=2:30, reduction='harmony.atac', reduction.name='umap.atac', reduction.key='atacUMAP_')


In [None]:
options(repr.plot.width=10, repr.plot.height=10)
# UMAP of the Harmony-corrected embedding, coloured and labelled by library.
# The former ggtitle('WNN') was dropped: it was overwritten by ggtitle('ATAC')
# on the next line and never shown.
p3 <- DimPlot(Atac, reduction='umap.atac', group.by = 'library', label=TRUE, label.size=6, repel=TRUE, raster=FALSE) +
    xlab('UMAP 1') + ylab('UMAP 2') + ggtitle('ATAC')
p3

In [None]:
# List the stored reductions (should now include harmony.atac and umap.atac).
Atac@reductions

In [None]:
# Depth correlation of the Harmony-corrected components.
DepthCor(Atac, reduction = 'harmony.atac')

In [None]:
# Neighbour graph on the corrected components 2-30, then clustering at resolution 0.5.
# algorithm=4 selects Leiden in Seurat's FindClusters (needs the leidenalg
# Python module via reticulate -- confirm it is installed in this environment).
Atac <- FindNeighbors(object = Atac, reduction = 'harmony.atac', dims = 2:30)
Atac <- FindClusters(object = Atac, algorithm=4,resolution = 0.5,method = "igraph") 

# Plot the embedding labelled by the active identities (the new clusters).
DimPlot(object = Atac, label = TRUE) + NoLegend()


In [None]:
options(repr.plot.width=10, repr.plot.height=10)
# Same embedding coloured by library and by condition, to check for residual batch structure.
DimPlot(object = Atac, group.by = 'library',label = FALSE)# + NoLegend()
DimPlot(object = Atac, group.by = 'condition',label = FALSE)# + NoLegend()


In [None]:
#Save object after processing
# Checkpoint: object with Harmony reduction, UMAP and clusters.
saveRDS(Atac, file='/nfs/lab/parulk/HPAP_scATAC/HVW_all_samples_harmony_reduced_final.rds')

In [None]:
# Reload the checkpoint (lets the notebook be resumed from here).
Atac <- readRDS('/nfs/lab/parulk/HPAP_scATAC/HVW_all_samples_harmony_reduced_final.rds')

In [None]:
DimPlot(object = Atac, label = TRUE) + NoLegend()


In [None]:
#gene activity
# Compute a gene activity matrix for the object with Signac's GeneActivity().
gene.activities <- GeneActivity(Atac)
# add the gene activity matrix to the Seurat object as a new assay and normalize it
# The scale factor is the median per-cell total of the new 'RNA' assay.
Atac[['RNA']] <- CreateAssayObject(counts = gene.activities)
Atac <- NormalizeData(
  object = Atac,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(Atac$nCount_RNA)
)

In [None]:
# Add Signac's per-cell QC metrics (nucleosome signal and TSS enrichment) to the metadata.
# fast = FALSE is the slower TSSEnrichment mode (presumably chosen to keep the
# data needed for TSS plots -- confirm).
Atac <- NucleosomeSignal(object = Atac)
Atac <- TSSEnrichment(object = Atac, fast = FALSE)
head(Atac@meta.data)

In [None]:
# Checkpoint: object including the gene activity assay and Signac QC metrics.
saveRDS(Atac, file='/nfs/lab/parulk/HPAP_scATAC/HVW_all_samples_harmony_reduced_final_gene_activity.rds')

In [None]:
# Reload the checkpoint (lets the notebook be resumed from here).
Atac <- readRDS('/nfs/lab/parulk/HPAP_scATAC/HVW_all_samples_harmony_reduced_final_gene_activity.rds')

In [None]:
#check
# add qc metrics from josh
# Read one <sample>.qc_metrics.txt table per sample, stack them, and attach the
# metric columns (column 6 onwards) to the Seurat metadata, matched by cell name.
samples <- c('HPAP-035', 'HPAP-036', 'HPAP-039', 'HPAP-040', 'HPAP-044', 'HPAP-045', 'HPAP-047', 'HPAP-049', 'HPAP-050', 'HPAP-051', 'HPAP-052', 'HPAP-053', 'HPAP-054', 'HPAP-055', 'HPAP-056', 'HPAP-059', 'HPAP-061', 'HPAP-062', 'HPAP-063', 'HPAP-064', 'HPAP-067', 'HPAP-069', 'HPAP-072', 'HPAP-075', 'HPAP-077', 'HPAP-079', 'HPAP-080', 'HPAP-081', 'HPAP-083', 'HPAP-084', 'HPAP-085', 'HPAP-088', 'HPAP-092', 'HPAP-099', 'HPAP-100', 'HPAP-101', 'HPAP-103', 'HPAP-104', 'HPAP-105', 'HPAP-106', 'HPAP-109')
# Plain string; the former sprintf() call had no format specifier.
wd <- '/nfs/lab/parulk/HPAP_scATAC/lfm1/'
qcs <- list()
for (sample in samples) {
    # one %s, one argument: the duplicated `sample` argument triggered an
    # "argument not used by format" warning in recent R versions
    qc <- read.table(file.path(wd, sprintf('%s.qc_metrics.txt', sample)), sep='\t', header=TRUE, stringsAsFactors=FALSE)
    #qc <- qc[qc$is_cell==1,]
    #qc$X <- paste0(sample, '_', qc$barcodes)
    qcs[[sample]] <- qc
}
qc <- as.data.frame(rbindlist(qcs))
# Column X holds the cell name; add the "-1" suffix so it matches Cells(Atac).
qc$X <- paste0(qc$X, "-1")

head(qc)
rownames(qc) <- qc$X
# Reorder/subset rows to the cells in the object and keep the metric columns.
# NOTE(review): cells without a QC row come back as all-NA rows -- confirm every cell is covered.
qc <- qc[Cells(Atac), 6:ncol(qc)]
Atac <- AddMetaData(object=Atac, metadata=qc)
# Release the large temporary tables.
qc <- qcs <- NULL
gc()
head(Atac)
metadata <- Atac@meta.data
head(Atac@meta.data)

In [None]:
# Print the object summary.
Atac

In [None]:
# Number of cells per active identity (cluster).
table(Idents(Atac))

In [None]:
options(repr.plot.width=20, repr.plot.height=15)
# Violin + box plots of three QC metrics, grouped by the active identities, on a log scale.
# The dashed line is the overall median of the metric that is actually plotted.
# Previously p1 and p2 drew the median of frac_reads_in_peaks /
# frac_reads_in_promoters on plots of reads_in_peaks / reads_in_promoters.
# NOTE(review): if the fractions were the intended feature, change `features` instead.
p1 <- VlnPlot(Atac, features='reads_in_peaks',  pt.size=0, log=TRUE) + geom_boxplot(width=.6, fill='white', alpha=.6) + geom_hline(yintercept=median(Atac$reads_in_peaks), linetype='dashed')
p2 <- VlnPlot(Atac, features='reads_in_promoters', pt.size=0, log=TRUE) + geom_boxplot(width=.6, fill='white', alpha=.6) + geom_hline(yintercept=median(Atac$reads_in_promoters), linetype='dashed')
p3 <- VlnPlot(Atac, features='tss_used',  pt.size=0, log=TRUE) + geom_boxplot(width=.6, fill='white', alpha=.6) + geom_hline(yintercept=median(Atac$tss_used), linetype='dashed')
# Stack the three panels vertically.
p1 / p2 / p3

In [None]:
# Record R and package versions for reproducibility.
sessionInfo()

In [None]:
# Install the CRAN plotting helpers only when they are missing, instead of
# reinstalling them on every run. "grid" is not installed: it is a base
# package that ships with R. "ggforce" is included because it is loaded below.
cran_pkgs <- c("ggpubr", "ggbreak", "gridExtra", "ggh4x", "ggforce")
is_installed <- vapply(cran_pkgs, requireNamespace, logical(1), quietly = TRUE)
if (any(!is_installed)) {
    install.packages(cran_pkgs[!is_installed])
}
library(ggpubr)
library(ggbreak)
library(gridExtra)
library(grid)
library(ggh4x)
library(ggplot2)
library(ggforce)

In [None]:
library('tidyr')

In [None]:
# Load markers list
project.dir <- "/nfs/lab/parulk/HPAP_scATAC/"
cell.markers <- read.table("/nfs/lab/parulk/HPAP_scATAC/Cell.markers.txt", sep = ',', header = TRUE)
# Make it long, remove useless column and void markers
# (columns 3..n hold the markers; the generated Key column is dropped again)
cell.markers <- cell.markers %>% gather(Key, marker, c(3:ncol(cell.markers)))
cell.markers <- cell.markers[,-3]
cell.markers <- cell.markers[cell.markers$marker != "", ]
head(cell.markers)

# Factorize columns
# The ordering used to be applied to `cell.markers$cell.markersCompartment` and
# `cell.markers$cell.markersCellType`, which do not exist, so it never reached
# the data frame. It is now applied to the Compartment and CellType columns
# that the dot plot facets on.
stopifnot(all(c("Compartment", "CellType") %in% colnames(cell.markers)))
compartment_order <- c("Endocrine cells", "Non-endocrine cells")
celltype_order <- c("Beta", "Alpha", "Delta", "Gamma", "Epsilon", "Ductal", "MUC5B Ductal", "Acinar", "Stellate", "Act. Stellate", "Q. Stellate", "Endothelial", "T Cell", "Schwann", "Macrophages", "Dividing Cells")
# union() keeps any label missing from the preferred order (appended at the
# end) instead of turning it into NA.
cell.markers$Compartment <- factor(cell.markers$Compartment,
                                   levels = union(compartment_order, unique(as.character(cell.markers$Compartment))))
cell.markers$CellType <- factor(cell.markers$CellType,
                                levels = union(celltype_order, unique(as.character(cell.markers$CellType))))
# Stand-alone copies kept for any code that still refers to these names.
cell.markersCompartment <- cell.markers$Compartment
cell.markersCellType <- cell.markers$CellType

In [None]:
# Use Seurat's DotPlot only to compute the per-cluster summary table for the
# marker genes, then draw a custom dot plot faceted by compartment and cell type.
g <- DotPlot(
  Atac,
  assay = 'RNA',
  features = cell.markers$marker,
  cluster.idents = TRUE,
  col.min = 0
) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab('') +
  ylab('')

# Summary table behind the plot; its third column is renamed to "marker" so it
# can be joined with the marker annotation.
meta_summary <- g$data
colnames(meta_summary)[3] <- "marker"
meta_summary <- merge(meta_summary, cell.markers, by = "marker")

options(repr.plot.width = 25, repr.plot.height = 10)

# Dot size = percentage of cells with signal, fill = scaled average activity.
fill_guide <- guide_colorbar(nbin = 200, ticks.colour = "black", frame.colour = "black")
figure <- ggplot(meta_summary, aes(x = marker, y = id)) +
  geom_point(
    aes(size = pct.exp, fill = avg.exp.scaled, stroke = NA),
    shape = 21
  ) +
  scale_size("% detected", range = c(0, 6)) +
  scale_fill_gradient(
    low = "lightgray",
    high = "blue",
    guide = fill_guide,
    name = "Average\nexpression"
  ) +
  ylab("Cluster") +
  xlab("") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 100),
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1, color = "black"),
    strip.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14)
  ) +
  facet_nested(~ Compartment + CellType, scales = "free")

figure

In [None]:
options(repr.plot.width = 8, repr.plot.height = 10)

# Cells per (cluster, library) pair in long format.
sample.abundance <- setNames(
  as.data.frame(table(Atac$seurat_clusters, Atac$library)),
  c("cluster", "sample", "Freq")
)

# Horizontal bars, one per cluster, filled by the fraction of cells from each sample.
ggplot(sample.abundance, aes(x = cluster, y = Freq, fill = sample)) +
  geom_bar(
    stat = 'identity',
    position = position_fill(reverse = TRUE),
    color = 'black',
    size = 0.2
  ) +
  coord_flip() +
  theme_bw() +
  xlab('') +
  ylab('percentage')

In [None]:
# Metadata columns whose distribution across clusters is plotted below (also used as plot titles).
covariant.ls = c("library", "sex", "condition")

In [None]:
# Draw one horizontal, filled bar chart of cluster composition.
#   counts: data frame with columns cluster, covariant and Freq
#   title:  plot title
plot_cluster_composition <- function(counts, title) {
  ggplot(counts, aes(fill = covariant, y = Freq, x = cluster)) +
    theme_bw() +
    coord_flip() +
    geom_bar(position = position_fill(reverse = TRUE), stat = 'identity', color = 'black', size = 0.2) +
    labs(y = "\n Percentage", x = "", title = title) +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 12, face = "bold"),
          axis.text.x = element_text(angle = 90),
          plot.title = element_text(size = 18, face = "bold", hjust = 0.5))
}

# One plot per entry of covariant.ls (library, sex, condition), built in a loop
# instead of three copy-pasted blocks. Each entry is used both as the metadata
# column to tabulate against the clusters and as the plot title.
gg.ls <- list()
for (i in seq_along(covariant.ls)) {
  covariant <- covariant.ls[i]
  Covariant.table <- as.data.frame(table(Atac$seurat_clusters, Atac@meta.data[[covariant]]))
  colnames(Covariant.table) <- c("cluster", "covariant", "Freq")
  gg.ls[[i]] <- plot_cluster_composition(Covariant.table, covariant)
}

In [None]:
options(repr.plot.height = 20, repr.plot.width = 40)
# Combine the three composition plots into one figure.
# NOTE(review): adding ggplot objects to each other relies on patchwork, which
# this notebook never attaches explicitly (presumably loaded through Seurat -- confirm).
cp <- gg.ls[[1]] + gg.ls[[2]] + gg.ls[[3]]
cp

In [None]:
# Number of cells per library; the first column is renamed, the count column stays "Freq".
cell.sample = as.data.frame(table(Atac$library))
colnames(cell.sample)[1] = "sample.ID"

In [None]:
options(repr.plot.width = 15, repr.plot.height = 15)

# Bar geometry shared by the bars and their count labels.
width <- 0.8
position <- position_dodge(width = width)

# Total number of cells, shown in the title.
n_final_cells <- length(colnames(Atac[["RNA"]]))

# Horizontal bar chart of cells per sample, each bar labelled with its count.
gg1 <- ggplot(cell.sample, aes(x = sample.ID, y = Freq, label = Freq)) +
  theme_bw() +
  labs(y = "", x = "", title = paste("Final Cells: ", n_final_cells)) +
  geom_bar(
    stat = "identity",
    position = position,
    width = width,
    colour = "black"
  ) +
  ylim(0, 20000) +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5)
  ) +
  geom_text(hjust = -0.3, size = 2.9, position = position_dodge(width = width)) +
  coord_flip()
gg1

In [None]:
# Manual cell-type labels, one per cluster in the order of levels(Atac);
# the numeric labels ('4', '8', '12') are left as numbers rather than named.
# NOTE: the vector has 15 entries, so this only works while the clustering
# yields exactly 15 identities -- re-check after any re-clustering.
new.cluster.ids <- c('Alpha', 'Beta', 'Acinar', '4', 'Ductal', 'Delta', 'Alpha', '8', 'Stellate', 'Endothelial', 'Ductal', '12', 'Stellate', 'Macrophage', 'Beta')
names(new.cluster.ids) <- levels(Atac)
# The renamed identities live on a new object; Atac keeps the numeric clusters.
atac_obj <- RenameIdents(Atac, new.cluster.ids)

In [None]:
options(repr.plot.height = 10, repr.plot.width = 14)

# Embedding labelled with the manual cell-type annotation.
DimPlot(object = atac_obj, label = TRUE,pt.size = 1 ) + NoLegend()

In [None]:
# Cells per cluster on the original object (identities here are still the numeric clusters, not the renamed ones).
table(Idents(Atac))