# ChromVAR  
  
This notebook runs chromVAR on peaks derived from scATAC-seq data stored in a Seurat object. It can be reworked to use peaks that are not stored in a Seurat object. chromVAR needs three inputs: 1) a count matrix that is peak x barcode (or peak x sample), i.e. peaks as rows and cells/samples as columns, 2) a GRanges object identifying the peaks, and 3) motifs. Motifs can be a custom set or downloaded from JASPAR. I have code in another location to pull and save JASPAR motifs into a usable format. JASPAR seems to update yearly. The motifs I pulled are all human.  

From here there are follow-up notebooks to compare cell types and disease states.

A reference for ChromVAR can be found here: https://greenleaflab.github.io/chromVAR/articles/Introduction.html

Throughout, the code uses functions from the Seurat, Signac, and chromVAR packages to handle single-cell ATAC-seq data, integrating genomic data with single-cell expression data, identifying transcription factor motifs, and assessing variability across cells. The focus on motif variability can uncover insights into the regulatory landscape affecting gene expression in different cell types or conditions.

## Suppressing Messages & Library Imports:

Suppresses package startup messages for a cleaner output.
Loads necessary libraries for the analysis. Libraries like Signac, Seurat, and JASPAR2020 are crucial for single-cell ATAC-seq analysis, motif analysis, and accessing DNA-binding profiles, respectively.

In [None]:
# Core chromVAR stack; startup messages are suppressed to keep the output clean
suppressMessages(library(chromVAR))
suppressMessages(library(motifmatchr))
suppressMessages(library(SummarizedExperiment))

# Single-cell object handling, motif resources, the hg38 reference genome,
# and general data-wrangling helpers
library(Signac)
library(Seurat)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork)
library(readr)
library(stringr)
library(dplyr)
# Fix the random seed so steps that involve random sampling are reproducible
set.seed(1234)

library(Matrix)
library(BiocParallel)
# Register a multicore BiocParallel backend with 8 workers
register(MulticoreParam(8))

# Setting Up Environment:

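The random seed (for reproducibility) and the 8-core parallel backend (to speed up computation) are set in the library cell above.
The cell below loads additional packages (parallel, plyr, ggpubr, car, qvalue) used for downstream statistics and plotting.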

In [None]:
#### Send to channel code
# Additional packages for downstream work.
# NOTE(review): plyr is attached after dplyr in this notebook, so for function
# names exported by both packages (e.g. mutate, summarise, rename) the plyr
# version is found first; call dplyr:: explicitly where that matters.
library(parallel)
library(plyr)
library(ggpubr)
library(car)
library(qvalue)

# Data Loading and Pre-processing:

Reads a Seurat object containing ATAC-seq data.
Sets the default assay to 'Unified_Peaks'.
Extracts the assay matrix and the peak locations, formatting the latter into a GRanges object that holds the genomic coordinates of each peak.

In [None]:
# Load the Seurat ATAC object that carries the unified peak set
atac <- readRDS('HPAP_atac_obj_withUnifiedpeaks.rds')
atac

DefaultAssay(atac) <- 'Unified_Peaks'

# Prep inputs for SummarizedExperiment

# Extract the peak x barcode matrix from the Seurat object.
# chromVAR models raw fragment counts, so pull the 'counts' slot; the 'data'
# slot holds normalized values once a normalization (e.g. TF-IDF) has been run.
sc.data <- GetAssayData(atac, slot = 'counts', assay = "Unified_Peaks")
if (nrow(sc.data) == 0 || ncol(sc.data) == 0) {
    # Assay was built without raw counts; fall back to the previous behaviour
    warning("'counts' slot of Unified_Peaks is empty; falling back to the 'data' slot.", call. = FALSE)
    sc.data <- GetAssayData(atac, slot = 'data', assay = "Unified_Peaks")
}
sc.data.stored <- sc.data

# Extract peak locations and reformat into a GRanges object.
# Peak names are expected to look like <chrom>-<start>-<end>; this may need to
# be modified depending on your peak naming convention.
bed <- str_split_fixed(rownames(sc.data), "\\-", 3)
# The hg38 BSgenome uses 'chr'-prefixed sequence names, so add the prefix only
# where it is missing (an unconditional paste0 would produce 'chrchr1').
needs.prefix <- !startsWith(bed[, 1], "chr")
bed[needs.prefix, 1] <- paste0('chr', bed[needs.prefix, 1])

peak.start <- suppressWarnings(as.numeric(bed[, 2]))
peak.end <- suppressWarnings(as.numeric(bed[, 3]))
if (anyNA(peak.start) || anyNA(peak.end)) {
    stop("Could not parse peak coordinates from the matrix rownames; expected <chrom>-<start>-<end>.",
         call. = FALSE)
}

gr <- GRanges(seqnames = bed[, 1], ranges = IRanges(start = peak.start, end = peak.end))

str(atac)

options(repr.plot.width=20, repr.plot.height=15)

# Embedding plot of the cells, coloured by the active identity
gg1_1 <- DimPlot(atac, shuffle = TRUE, label.size = 6.5, repel = TRUE, pt.size = 0.8) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.key.size = unit(2, "line"),
          legend.text = element_text(size = 20),
          plot.title = element_text(size = 30, face = "bold", hjust = 0.5)) +
    xlab('') +
    ylab('')
gg1_1

# SummarizedExperiment Object Creation:

Creates a SummarizedExperiment object from the extracted matrix and GRanges, which is used to store the count data and associated genomic ranges.

In [None]:
# Bundle the peak matrix (stored as the 'counts' assay) with its peak
# coordinates in a SummarizedExperiment, the input container for chromVAR.
fragment.counts <- SummarizedExperiment(
    assays = list(counts = sc.data),
    rowRanges = gr
)
# Keep a second handle on the freshly built object
fragment.counts.stored <- fragment.counts

# Metadata Addition:
Adds additional metadata from the Seurat object to the SummarizedExperiment object, which might be useful for downstream analyses.

In [None]:
# Copy per-cell metadata from the Seurat object into colData of the
# SummarizedExperiment. Adjust the column list to include anything you will
# use downstream; it is easier to include more now.
# dplyr::select is called explicitly in case another attached package masks select().
metrics <- dplyr::select(atac[[]], orig.ident, nCount_ATAC_peaks, nFeature_ATAC_peaks, library, sex,
                         condition, seurat_clusters, Cell.Type)

# Barcodes of the count matrix columns; metadata rows are looked up by these
se.barcodes <- colnames(fragment.counts)
if (!is.null(se.barcodes) && all(se.barcodes %in% rownames(metrics))) {
    # Index by barcode so the metadata rows follow the column order of the
    # counts, whatever order the Seurat metadata happens to be in.
    colData(fragment.counts) <- cbind(colData(fragment.counts),
                                      metrics[se.barcodes, , drop = FALSE])
    print("Success adding meta data")
} else {
    print("Failed to add meta data, check column names.")
    # Surface the problem as a warning so it is not lost in the cell output
    warning("Meta data not added: cell barcodes in the Seurat metadata do not cover the columns of fragment.counts.",
            call. = FALSE)
}

# Bias Correction and Motif Analysis:

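Computes the GC content of each peak with `addGCBias`; chromVAR uses this later to choose background peaks when correcting for technical bias.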
Loads JASPAR motifs and matches them to the genomic ranges using motifmatchr, preparing for chromVAR analysis.

In [None]:
# Annotate every peak with its GC content, computed against the hg38 reference
fragment.counts <- addGCBias(
    object = fragment.counts,
    genome = BSgenome.Hsapiens.UCSC.hg38
)
fragment.counts

In [None]:
# Read the saved JASPAR motif collection and scan every peak for motif matches
jaspar.motifs <- readRDS('/nfs/lab/welison/multiome/chromvar/jaspar_2022_object.Rdata')
motif.ix <- matchMotifs(
    pwms = jaspar.motifs,
    subject = fragment.counts,
    genome = BSgenome.Hsapiens.UCSC.hg38
)

# Inspect the motif-match result
motif.ix

# Inspect the count object that was scanned
fragment.counts

# ChromVAR Analysis:
Runs chromVAR to compute deviation scores, which indicate how much each motif's accessibility varies from what is expected. This can highlight motifs that are particularly variable across conditions or cell types.

In [None]:
# Run chromVAR: per-cell deviations for every motif annotation.
# (A custom `expectation` can be supplied as an extra argument if needed.)
dev <- computeDeviations(
    object = fragment.counts,
    annotations = motif.ix
)
head(dev)

In [None]:
# Persist the chromVAR result so later sessions can skip the computation
saveRDS(
    object = dev,
    file = "/nfs/lab/parulk/HPAP_scATAC/motif_analysis/ChromVAR_Object.RDS"
)

In [None]:
# Collect details for downstream analysis

# When resuming in a fresh session, uncomment the next line to restore the
# saved chromVAR result instead of recomputing it.
# dev <- readRDS("/nfs/lab/parulk/HPAP_scATAC/motif_analysis/ChromVAR_Object.RDS")
jaspar.motifs <- readRDS('/nfs/lab/welison/multiome/chromvar/jaspar_2022_object.Rdata')

# Show both objects
dev
jaspar.motifs

In [None]:
# Deviation z-score (aka accessibility) matrix: one row per motif, one column per cell
devtab <- deviationScores(dev)
head(devtab)

# Variability of each motif: the standard deviation of its z-scores across
# cells. Under the null the expected value is about 1.
variability <- computeVariability(dev)
head(variability)

# Motif lookup table: column 1 = motif name, column 2 = matrix class.
# vapply() guarantees exactly one string per motif; multi-valued or empty
# fields are collapsed with "; " so the result is always a character matrix
# (sapply() would silently return a list in that case).
motif.list <- as.list(jaspar.motifs)
motif.names <- vapply(motif.list, function(x) paste(unlist(x@name), collapse = "; "), character(1))
motif.classes <- vapply(motif.list, function(x) paste(unlist(x@matrixClass), collapse = "; "), character(1))
# deparse.level = 0 keeps the columns unnamed, as before
motifdata <- cbind(motif.names, motif.classes, deparse.level = 0)
# Other annotations could be added the same way, e.g. x@tags$symbol or x@tags$family
motifdata

head(atac)

In [None]:
# Per-cell annotation table: library, barcode, condition and active identity
info <- data.frame(
    def = atac$library,
    cells = Cells(atac),
    groups = atac$condition,
    cluster = Idents(atac),
    samples = atac$library
)
head(info)
write.table(
    x = info,
    file = '/nfs/lab/parulk/HPAP_scATAC/motif_analysis/info_table_ctrlvdiabetes.txt',
    quote = FALSE,
    sep = '\t'
)

# Number of cells per library
table(atac[[]]$library)

In [None]:
output_dir <- '/nfs/lab/parulk/HPAP_scATAC/motif_analysis/'
#dir.create(output_dir)

# Save devscores, motifdata, info and variability as text tables in output_dir.
# devscores.txt uses write.table's default separator; the others are tab-separated.
write.table(
    x = devtab,
    file = paste0(output_dir, 'devscores.txt'),
    quote = FALSE, col.names = TRUE, row.names = TRUE
)
write.table(
    x = motifdata,
    file = paste0(output_dir, 'motifdata.txt'),
    quote = FALSE, col.names = FALSE, sep = '\t'
)
write.table(
    x = info,
    file = paste0(output_dir, 'info_table.txt'),
    quote = FALSE, sep = '\t'
)
write.table(
    x = variability,
    file = paste0(output_dir, 'variability.txt'),
    quote = FALSE, col.names = TRUE, row.names = TRUE, sep = '\t'
)

In [None]:
in_dir <- '/nfs/lab/parulk/HPAP_scATAC/motif_analysis/'

# variability.txt was written tab-separated with a header and row names.
# Read it back with the same separator and with quote/comment parsing turned
# off so motif names containing spaces, quotes or '#' cannot break the parse.
variability <- read.table(file = paste0(in_dir, 'variability.txt'),
                          header = TRUE, sep = '\t', quote = "", comment.char = "")
variability

# Variability distribution, with a reference line at 1.2
ggplot(variability, aes(x = variability)) +
    geom_histogram() +
    geom_vline(xintercept = 1.2)

# I often work with highly variable motifs, they in theory have some sort of
# difference greater than the null hypothesis.
# Number of motifs above 1.2, followed by the total number of motifs.
n.motifs <- nrow(variability)
sum(variability$variability > 1.2, na.rm = TRUE)
n.motifs

# How many pass different cutoffs
var_filts <- c(0, .5, 1, 1.2, 1.25, 1.3, 1.4, 1.5, 2, 2.5)

print(n.motifs)

# One count per cutoff; comparing the motif vector against the whole cutoff
# vector at once would recycle the cutoffs and give a meaningless result.
n.passing <- vapply(var_filts,
                    function(cutoff) sum(variability$variability > cutoff, na.rm = TRUE),
                    integer(1))
print(data.frame(cutoff = var_filts, n_motifs = n.passing))

# Highly variable motifs. which() drops NA comparisons instead of returning
# all-NA rows.
# NOTE(review): the plot and count above use 1.2 but this filter uses 1.25 -
# confirm which cutoff is intended.
variability[which(variability$variability > 1.25), ]