In [None]:
library(Seurat)
library(Signac)
library(GenomeInfoDb)
library(EnsDb.Mmusculus.v79)
library(ggplot2)
library(ggpubr)
library(glue)
library(scDblFinder)
library(RColorBrewer)
library(dplyr)
library(ggridges)
library(CopyscAT)
library(BSgenome.Mmusculus.UCSC.mm10)
library(harmony)
# Fix the random number generator seed so that any stochastic steps in this
# notebook give the same result on every run.
set.seed(123)

# Goals: 
## 1. Identify differentially accessible regions (DARs) between KO and WT groups in atac_subset_gex obj using Logistic Regression (LR) method
## 2. Find the enriched motifs using DARs

# <span style="color:green"> Part 1a: Find accessible regions in KO compared to the WT group (only.pos = TRUE) with LR. Use "nCount_peaks" and "nFeature_peaks" as latent.vars</span>

In [None]:
# Load the saved object from disk and print its dimensions.
dc1_atac_obj <- readRDS("atac_subGex_clustered_obj.RDS")
dim(dc1_atac_obj)

In [None]:
# Tabulate the values of the `origin` field of the object.
table(dc1_atac_obj$origin)

### check nFeature_peaks and nCount_peaks

In [None]:
# Notebook figure size for the two-panel QC plot
# (colour note kept from the original cell: "#9ed4a0").
options(repr.plot.width = 12, repr.plot.height = 6)

# One violin plot per QC metric, grouped by `origin`.
one <- VlnPlot(
  object = dc1_atac_obj,
  features = "nCount_peaks",
  group.by = "origin"
)
two <- VlnPlot(
  object = dc1_atac_obj,
  features = "nFeature_peaks",
  group.by = "origin"
)

# Show both plots in a single arranged figure.
ggarrange(one, two)

In [None]:
# Number of entries per experimental group.
table(dc1_atac_obj$experimental_groups)

## Run LR test

In [None]:
# Logistic-regression (LR) test on the "peaks" assay comparing KO (ident.1)
# against WT (ident.2), grouped by `experimental_groups`, with nCount_peaks
# and nFeature_peaks as latent variables. only.pos = TRUE restricts the
# result to the positive direction of the comparison.
DARs <- FindMarkers(
  object = dc1_atac_obj,
  ident.1 = "KO",
  ident.2 = "WT",
  group.by = "experimental_groups",
  assay = "peaks",
  test.use = "LR",
  latent.vars = c("nCount_peaks", "nFeature_peaks"),
  min.pct = 0.05,
  only.pos = TRUE
)

In [None]:
dim(DARs)


In [None]:
# Keep the rows whose adjusted p-value is below 0.05.
DARs_sig <- DARs[DARs$p_val_adj < 0.05, ]
dim(DARs_sig)

In [None]:
# Full result table.
write.csv(x = DARs, file = "LR_DARs_df_atacSUBgex.csv")

In [None]:
# Significant subset: more accessible in KO compared to WT.
write.csv(x = DARs_sig, file = "LR_sigDARs_df_atacSUBgex.csv")

## annotate DARs - find the gene closest to each region

In [None]:
# Turn the peak names (row names of DARs_sig) into genomic ranges and show them.
regions <- StringToGRanges(regions = rownames(DARs_sig))
regions

In [None]:
# Look up the closest feature for each of those ranges.
closest_DAR <- ClosestFeature(object = dc1_atac_obj, regions = regions)

In [None]:
closest_DAR


### Merge DARs_df with annotated_DARs_df to have both pval and annotation in one df

In [None]:
# Copy the peak names into a `query_region` column so they can be used as the
# join key against the closest-feature table.
DARs_sig[["query_region"]] <- rownames(DARs_sig)

# Join test statistics and annotation on `query_region`, then preview 5 rows.
annotated_sigDARs_df <- merge(
  x = DARs_sig,
  y = closest_DAR,
  by = "query_region"
)
head(annotated_sigDARs_df, n = 5)

In [None]:
write.csv(
  x = annotated_sigDARs_df,
  file = "LR_annotated_sigDARs_df_atacSUBgex.csv"
)

# <span style="color:green"> Part 1b: Find DARs between KO and WT (both ways) with LR. Use "nCount_peaks" and "nFeature_peaks" as latent.vars</span>

In [None]:
# Same LR test as Part 1a (KO as ident.1, WT as ident.2, same latent
# variables and min.pct) but without only.pos, so both directions are kept.
DARs_full <- FindMarkers(
  object = dc1_atac_obj,
  ident.1 = "KO",
  ident.2 = "WT",
  group.by = "experimental_groups",
  assay = "peaks",
  test.use = "LR",
  latent.vars = c("nCount_peaks", "nFeature_peaks"),
  min.pct = 0.05
)

In [None]:
dim(DARs_full)

In [None]:
# Rows with adjusted p-value below 0.05.
sig_DARs_full <- DARs_full[DARs_full$p_val_adj < 0.05, ]
dim(sig_DARs_full)

In [None]:
# Significant rows with positive avg_log2FC.
sig_DARs_up <- sig_DARs_full[sig_DARs_full$avg_log2FC > 0, ]
dim(sig_DARs_up)

In [None]:
# Significant rows with negative avg_log2FC.
sig_DARs_down <- sig_DARs_full[sig_DARs_full$avg_log2FC < 0, ]
dim(sig_DARs_down)

## annotate DARs in WT compared to KO - find the gene closest to each region

In [None]:
# Genomic ranges for the negative-fold-change DARs and their closest features.
regions <- StringToGRanges(regions = rownames(sig_DARs_down))
closest_DAR <- ClosestFeature(object = dc1_atac_obj, regions = regions)

In [None]:
dim(closest_DAR)

In [None]:
# Peak names as an explicit column, used as the join key below.
sig_DARs_down[["query_region"]] <- rownames(sig_DARs_down)

In [None]:
annotated_sig_DARs_down <- merge(
  x = sig_DARs_down,
  y = closest_DAR,
  by = "query_region"
)

In [None]:
head(annotated_sig_DARs_down)

In [None]:
# DARs in WT compared to KO
write.csv(
  x = annotated_sig_DARs_down,
  file = "LR_annotated_sigDARs_down_df_atacSUBgex.csv"
)

# <span style="color:green"> Part 2: Motif enrichment analysis - for regions more accessible in KO compared to WT</span>

In [None]:
# Reload the object and the significant-DAR table saved in Part 1a, so that
# Part 2 can be run without re-running the differential test.
dc1_atac_obj <- readRDS("atac_subGex_clustered_obj.RDS")
DARs <- read.csv("LR_sigDARs_df_atacSUBgex.csv") # accessible regions in KO compared to WT

# Print the dimensions of both as a quick check of what was loaded.
dim(dc1_atac_obj)
dim(DARs)

## Adding motif information to the Seurat object using a list of motif position frequency matrices from the JASPAR database

In [None]:
# pfm: position frequency matrices from the JASPAR "CORE" collection,
# vertebrates tax group, with all_versions = FALSE.
# getMatrixSet() belongs to TFBSTools and the database object to JASPAR2020.
# Neither package is attached by the library() calls in this notebook, so both
# are referenced through their namespaces; otherwise this cell fails in a
# fresh session with "could not find function" / "object not found".
pfm <- TFBSTools::getMatrixSet(
  x = JASPAR2020::JASPAR2020,
  opts = list(
    collection = "CORE",
    tax_group = "vertebrates",
    all_versions = FALSE
  )
)


In [None]:
pfm

#### add motif information

In [None]:
# Add motif information to the object, using the mm10 genome and the
# position frequency matrices in `pfm`.
dc1_atac_obj <- AddMotifs(
  dc1_atac_obj,
  pfm = pfm,
  genome = BSgenome.Mmusculus.UCSC.mm10
)


In [None]:
# First sequence levels of the genome package ...
head(seqlevels(BSgenome.Mmusculus.UCSC.mm10))

In [None]:
# ... and first sequence names of the object, for a side-by-side look.
head(seqnames(dc1_atac_obj))

In [None]:
# Write the updated object back to disk. This is the same file name the
# object was read from, so the stored copy is replaced.
saveRDS(object = dc1_atac_obj, file = "atac_subGex_clustered_obj.RDS")

## Find motifs enriched in DARs

In [None]:
# Preview the first 3 rows of the DAR table read from CSV.
head(DARs, 3)

In [None]:
# Take column `X` of the table as the DAR identifiers.
# NOTE(review): presumably `X` holds the peak names that write.csv() stored as
# row names when the table was saved - confirm against the head() output above.
DARs_locations <- DARs$X

### match the background peaks:  further restrict the background peaks to those that are accessible in the groups of cells compared when finding differentially accessible peaks.

In [None]:
# Per-peak metadata of the "peaks" assay and the set of accessible peaks.
meta.feature <- GetAssayData(
  object = dc1_atac_obj,
  assay = "peaks",
  layer = "meta.features"
)
open.peaks <- AccessiblePeaks(object = dc1_atac_obj)

# Select 50000 background peaks from the accessible set, matched to the
# metadata of the DARs used as the query.
peaks.matched <- MatchRegionStats(
  query.feature = meta.feature[DARs_locations, ],
  meta.feature = meta.feature[open.peaks, ],
  n = 50000
)

In [None]:
# Motif enrichment in the DARs, tested against the matched background peaks.
enriched.motifs <- FindMotifs(
  object = dc1_atac_obj,
  background = peaks.matched,
  features = DARs_locations
)

In [None]:
# Keep motifs whose adjusted p-value is below 0.05 and report how many remain.
sig_enriched.motifs <- enriched.motifs[enriched.motifs$p.adjust < 0.05, ]
dim(sig_enriched.motifs)

In [None]:
# Notebook figure size for the motif logo grid.
options(repr.plot.width = 20, repr.plot.height = 20)

# Plot up to 12 motifs from the top of the significant table.
# head() replaces the original `[1:12]` index: with fewer than 12 significant
# motifs, `[1:12]` pads the selection with NA, which are not valid motif names.
MotifPlot(
  object = dc1_atac_obj,
  motifs = head(rownames(sig_enriched.motifs), 12)
)

In [None]:
# Save the significant motif table. The original call referenced
# `sig_enriched.motifs2`, which is not defined anywhere in this notebook and
# fails with "object not found"; the filtered table is `sig_enriched.motifs`.
write.csv(sig_enriched.motifs, "sig_enriched_motifs_inKO_atacSUBgex_df.csv")