# 2024-026 Peak Annotator
This notebook details methods to re-annotate peaks using ChIPseeker.

## Initialize Environment

Load the required libraries, define the paths and run settings, and set the working directory to the experiment folder.

In [1]:
# Load Libraries
library(ChIPseeker)
library(GenomicFeatures)
library(dplyr)
library(tidyr)

# Project layout; relative paths used later are resolved against exp_dir
exp_dir    <- "/home/dalbao/2024-026-Tcf7ATAC"
nfcore_dir <- "01_nfcore_241228"
nb_dir     <- "02_bp_notebooks"
prefix     <- "peakAnno"    # base name of output files; date and peak type are prepended below

# Peak filter table, stored under nfcore_dir
filter_file <- file.path(nfcore_dir, "241231-bp-peakFilter.txt")

# nf-core ATAC-seq pipeline calls peaks from two different libraries.
# NOTE: this variable shares its name with base::library(); calls such as
# library(pkg) still work because R looks up a function in call position.
library <- "merged_library"
# Short tag: mLb for merged_library, mRp otherwise (merged_replicates)
lib_sh <- if (library == "merged_library") "mLb" else "mRp"

# nf-core ATAC-seq pipeline calls peaks in two types, broad_peak and narrow_peak.
peak_type <- "broad_peak"
# Short tag: bp for broad_peak, np otherwise
peak_sh <- if (peak_type == "broad_peak") "bp" else "np"

# Final output prefix: <yymmdd>-<peak tag>-<base name>
prefix <- sprintf("%s-%s-%s", format(Sys.Date(), "%y%m%d"), peak_sh, prefix)

# Move to working directory
setwd(exp_dir)



ChIPseeker v1.38.0

If you use ChIPseeker in published research, please cite:
Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring epigenomic datasets by ChIPseeker. Current Protocols 2022, 2(10): e585

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

Loading required package: stats4


Attaching packa

## Import GTF Data

In [2]:
# Build the TxDb annotation object from the reference GTF (GRCm38)
gtf_file <- "ref/genes.gtf"
txdb <- makeTxDbFromGFF(file = gtf_file, format = "gtf")

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



## Annotate Peaks with Flanks

In [3]:
# Locate the unfiltered consensus peak BED file inside the nf-core output tree
consensus_file <- paste0("consensus_peaks.", lib_sh, ".clN.bed")
consensus_loc  <- file.path(
    nfcore_dir, "bwa", library, "macs2", peak_type, "consensus", consensus_file
)

# Annotate every consensus peak against the TxDb built from the GTF
peak_anno <- annotatePeak(
    peak             = consensus_loc,
    TxDb             = txdb,             # Ensembl TxDb for GRCm38
    tssRegion        = c(-3000, 3000),   # Region to consider as TSS proximal
    addFlankGeneInfo = TRUE,             # Also report flanking genes (flank_* columns)
    flankDistance    = 150000            # Flanking window in bp used for the flank_* columns
)

>> loading peak file...				 2025-01-13 06:16:03 AM 
>> preparing features information...		 2025-01-13 06:16:04 AM 
>> identifying nearest features...		 2025-01-13 06:16:04 AM 
>> calculating distance from peak to TSS...	 2025-01-13 06:16:07 AM 
>> assigning genomic annotation...		 2025-01-13 06:16:07 AM 
>> adding flank feature information from peaks...	 2025-01-13 06:16:17 AM 
>> assigning chromosome lengths			 2025-01-13 06:17:33 AM 
>> done...					 2025-01-13 06:17:33 AM 


In [4]:
# Display the annotation object: prints how many peaks were annotated and the
# genomic annotation summary (percentage of peaks per feature class)
peak_anno

Annotated peaks generated by ChIPseeker
285417/285417  peaks were annotated
Genomic Annotation Summary:
              Feature   Frequency
9    Promoter (<=1kb)  9.65569675
10   Promoter (1-2kb)  3.32706181
11   Promoter (2-3kb)  3.11053651
4              5' UTR  0.12052541
3              3' UTR  1.42493264
1            1st Exon  0.02908026
7          Other Exon  2.77839092
2          1st Intron  8.30644285
8        Other Intron 21.07092430
6  Downstream (<=300)  0.10475900
5   Distal Intergenic 50.07164955

In [5]:
# Flatten the annotation object into a plain data frame
peak_anno_df <- as.data.frame(peak_anno)

# V4 carries the peak names (Interval_N); rename it to interval_id with dplyr
peak_anno_df <- dplyr::rename(peak_anno_df, interval_id = "V4")
head(peak_anno_df)

Unnamed: 0_level_0,seqnames,start,end,width,strand,interval_id,V5,V6,annotation,geneChr,geneStart,geneEnd,geneLength,geneStrand,geneId,transcriptId,distanceToTSS,flank_txIds,flank_geneIds,flank_gene_distances
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<fct>,<chr>,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
1,1,3008717,3009098,382,*,Interval_1,0,+,Distal Intergenic,29,3073253,3074322,1070,1,ENSMUSG00000102693,ENSMUST00000193812,-64155,ENSMUST00000193812;ENSMUST00000082908,ENSMUSG00000102693;ENSMUSG00000064842,-64155;-92918
2,1,3026197,3027005,809,*,Interval_2,0,+,Distal Intergenic,29,3073253,3074322,1070,1,ENSMUSG00000102693,ENSMUST00000193812,-46248,ENSMUST00000193812;ENSMUST00000082908,ENSMUSG00000102693;ENSMUSG00000064842,-46248;-75011
3,1,3062294,3062545,252,*,Interval_3,0,+,Distal Intergenic,29,3073253,3074322,1070,1,ENSMUSG00000102693,ENSMUST00000193812,-10708,ENSMUST00000193812;ENSMUST00000082908;ENSMUST00000162897;ENSMUST00000159265,ENSMUSG00000102693;ENSMUSG00000064842;ENSMUSG00000051951;ENSMUSG00000051951,-10708;-39471;153799;153087
4,1,3062945,3063216,272,*,Interval_4,0,+,Distal Intergenic,29,3073253,3074322,1070,1,ENSMUSG00000102693,ENSMUST00000193812,-10037,ENSMUST00000193812;ENSMUST00000082908;ENSMUST00000162897;ENSMUST00000159265,ENSMUSG00000102693;ENSMUSG00000064842;ENSMUSG00000051951;ENSMUSG00000051951,-10037;-38800;153128;152416
5,1,3063384,3064024,641,*,Interval_5,0,+,Distal Intergenic,29,3073253,3074322,1070,1,ENSMUSG00000102693,ENSMUST00000193812,-9229,ENSMUST00000193812;ENSMUST00000082908;ENSMUST00000162897;ENSMUST00000159265,ENSMUSG00000102693;ENSMUSG00000064842;ENSMUSG00000051951;ENSMUSG00000051951,-9229;-37992;152320;151608
6,1,3084439,3085013,575,*,Interval_6,0,+,Distal Intergenic,29,3073253,3074322,1070,1,ENSMUSG00000102693,ENSMUST00000193812,11186,ENSMUST00000193812;ENSMUST00000082908;ENSMUST00000162897;ENSMUST00000159265;ENSMUST00000070533,ENSMUSG00000102693;ENSMUSG00000064842;ENSMUSG00000051951;ENSMUSG00000051951;ENSMUSG00000051951,11186;-17003;131331;130619;0


## Import Kept Peaks

In [6]:
# Import peak filter
kept_peaks <- read.table(filter_file, header = TRUE)
# The flag column is named keep_peak; `$keep` only resolved through partial
# matching. `%in% TRUE` also discards rows whose flag is NA, whereas `== TRUE`
# would return all-NA rows for them.
stopifnot("keep_peak" %in% colnames(kept_peaks))
kept_peaks <- kept_peaks[kept_peaks$keep_peak %in% TRUE, ]
head(kept_peaks)

# Drop BED leftovers (V5, V6) and transcript-level columns.
# dplyr::select is spelled out because AnnotationDbi (attached through
# GenomicFeatures) exports a select() generic as well.
peak_anno_df <- peak_anno_df %>%
    dplyr::select(-V5, -V6, -transcriptId, -flank_txIds, -flank_gene_distances)

# Collapse repeated gene IDs: a gene appears once per flanking transcript, so
# keep only the unique IDs within each peak, preserving their order.
geneIds <- strsplit(peak_anno_df$flank_geneIds, ";", fixed = TRUE)
geneIds <- lapply(geneIds, unique)
peak_anno_df$flank_geneIds <- vapply(geneIds, paste, character(1), collapse = ";")

# Save all annotated peaks (before filtering, Ensembl IDs only).
# quote = TRUE is required: detailed exon/intron annotation strings contain
# commas, which would split into extra fields in an unquoted CSV.
fn <- file.path(nfcore_dir, paste0(prefix, "-Unfiltered-NoSymbol.csv"))
write.csv(peak_anno_df, file = fn, quote = TRUE, row.names = FALSE)

# Only keep annotated peaks whose interval_id passed the peak filter
peak_anno_df <- peak_anno_df[peak_anno_df$interval_id %in% kept_peaks$interval_id, ]

head(peak_anno_df)

Unnamed: 0_level_0,chr,start,end,interval_id,keep_peak
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<lgl>
98,1,4496008,4497219,Interval_98,True
103,1,4747606,4748936,Interval_103,True
105,1,4768351,4770866,Interval_105,True
107,1,4779308,4780669,Interval_107,True
108,1,4785113,4786365,Interval_108,True
110,1,4807125,4808614,Interval_110,True


Unnamed: 0_level_0,seqnames,start,end,width,strand,interval_id,annotation,geneChr,geneStart,geneEnd,geneLength,geneStrand,geneId,distanceToTSS,flank_geneIds
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<fct>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<chr>
98,1,4496009,4497219,1211,*,Interval_98,Promoter (<=1kb),29,4492458,4496330,3873,2,ENSMUSG00000025902,0,ENSMUSG00000025900;ENSMUSG00000104123;ENSMUSG00000025902;ENSMUSG00000104238;ENSMUSG00000102269;ENSMUSG00000096126;ENSMUSG00000103003;ENSMUSG00000104328;ENSMUSG00000102735
103,1,4747607,4748936,1330,*,Interval_103,Distal Intergenic,29,4735046,4735676,631,2,ENSMUSG00000103265,-11931,ENSMUSG00000102735;ENSMUSG00000098104;ENSMUSG00000102175;ENSMUSG00000088000;ENSMUSG00000103265;ENSMUSG00000103922;ENSMUSG00000033845;ENSMUSG00000102275;ENSMUSG00000025903;ENSMUSG00000104217;ENSMUSG00000033813;ENSMUSG00000062588
105,1,4768352,4770866,2515,*,Interval_105,Promoter (<=1kb),29,4771131,4772199,1069,1,ENSMUSG00000103922,-265,ENSMUSG00000098104;ENSMUSG00000102175;ENSMUSG00000088000;ENSMUSG00000103265;ENSMUSG00000103922;ENSMUSG00000033845;ENSMUSG00000102275;ENSMUSG00000025903;ENSMUSG00000104217;ENSMUSG00000033813;ENSMUSG00000062588;ENSMUSG00000103280;ENSMUSG00000002459
107,1,4779309,4780669,1361,*,Interval_107,Promoter (<=1kb),29,4778063,4779212,1150,2,ENSMUSG00000102275,-97,ENSMUSG00000098104;ENSMUSG00000102175;ENSMUSG00000088000;ENSMUSG00000103265;ENSMUSG00000103922;ENSMUSG00000033845;ENSMUSG00000102275;ENSMUSG00000025903;ENSMUSG00000104217;ENSMUSG00000033813;ENSMUSG00000062588;ENSMUSG00000103280;ENSMUSG00000002459;ENSMUSG00000091305
108,1,4785114,4786365,1252,*,Interval_108,Promoter (<=1kb),29,4783572,4785692,2121,2,ENSMUSG00000033845,0,ENSMUSG00000098104;ENSMUSG00000102175;ENSMUSG00000088000;ENSMUSG00000103265;ENSMUSG00000103922;ENSMUSG00000033845;ENSMUSG00000102275;ENSMUSG00000025903;ENSMUSG00000104217;ENSMUSG00000033813;ENSMUSG00000062588;ENSMUSG00000103280;ENSMUSG00000002459;ENSMUSG00000091305
110,1,4807126,4808614,1489,*,Interval_110,Promoter (<=1kb),29,4807788,4848410,40623,1,ENSMUSG00000025903,0,ENSMUSG00000098104;ENSMUSG00000102175;ENSMUSG00000088000;ENSMUSG00000103265;ENSMUSG00000103922;ENSMUSG00000033845;ENSMUSG00000102275;ENSMUSG00000025903;ENSMUSG00000104217;ENSMUSG00000033813;ENSMUSG00000062588;ENSMUSG00000103280;ENSMUSG00000002459;ENSMUSG00000091305;ENSMUSG00000102653


## Convert EnsemblIDs to Gene Symbols

In [7]:
# Read the tab-separated Ensembl ID -> gene symbol table and give its columns
# the names used by the conversion code below
ensembl2symbol <- read.table(
    file = "ref/ensembl_102.txt",
    sep = "\t",
    header = TRUE,
    stringsAsFactors = FALSE
)
names(ensembl2symbol) <- c("GeneID", "GeneSymbol")

# Preview the first rows
head(ensembl2symbol)

Unnamed: 0_level_0,GeneID,GeneSymbol
Unnamed: 0_level_1,<chr>,<chr>
1,ENSMUSG00000064372,mt-Tp
2,ENSMUSG00000064371,mt-Tt
3,ENSMUSG00000064370,mt-Cytb
4,ENSMUSG00000064369,mt-Te
5,ENSMUSG00000064368,mt-Nd6
6,ENSMUSG00000064367,mt-Nd5


In [8]:
# Function to convert EnsemblIDs to GeneSymbols
#
# Arguments:
#   ids            Character vector. Each element holds one or more Ensembl
#                  gene IDs separated by ";".
#   conversion_df  Data frame with columns GeneID and GeneSymbol.
#
# Returns:
#   Character vector of the same length as `ids`, named by `ids`. Every ID is
#   replaced by its symbol; an ID with no entry, or with an NA or empty symbol,
#   is kept as-is. Empty input returns character(0).
#
# The lookup is one vectorised match() over all IDs instead of a scan of the
# table per ID. If a GeneID occurs more than once in conversion_df, the first
# row is used, so each ID always yields exactly one token.
convert_ids <- function(ids, conversion_df) {
    ids <- as.character(ids)
    if (length(ids) == 0) {
        return(character(0))
    }

    # Split every element into its IDs and look all of them up at once
    id_parts <- strsplit(ids, ";", fixed = TRUE)
    flat_ids <- unlist(id_parts, use.names = FALSE)

    # incomparables keeps an NA id from matching an NA GeneID row
    hit <- match(
        flat_ids,
        as.character(conversion_df$GeneID),
        incomparables = NA_character_
    )
    flat_symbols <- as.character(conversion_df$GeneSymbol)[hit]

    # Fall back to the original ID when no usable symbol exists
    unresolved <- is.na(flat_symbols) | flat_symbols == ""
    flat_symbols[unresolved] <- flat_ids[unresolved]

    # Regroup the flat symbols by the element they came from; fixing the
    # levels keeps elements that split into zero IDs (e.g. "") as ""
    owner <- factor(
        rep(seq_along(id_parts), lengths(id_parts)),
        levels = seq_along(id_parts)
    )
    converted <- vapply(
        split(flat_symbols, owner),
        paste,
        character(1),
        collapse = ";",
        USE.NAMES = FALSE
    )
    names(converted) <- ids
    converted
}

In [9]:
# Keep only the rows of ensembl2symbol whose GeneID is used in peak_anno_df,
# either as a flanking gene or as the nearest gene.
flank_ids <- unlist(strsplit(peak_anno_df$flank_geneIds, ";", fixed = TRUE))
# The nearest gene can lie outside the flank window, so geneId must be added
# explicitly. `$flank_geneId` partial-matched flank_geneIds and added nothing.
uniqueids <- unique(c(flank_ids, peak_anno_df$geneId))

ensembl2symbol <- ensembl2symbol[ensembl2symbol$GeneID %in% uniqueids, ]

In [10]:
# Translate the nearest-gene IDs, then the flanking-gene IDs, into symbols
nearest_symbols <- convert_ids(peak_anno_df$geneId, ensembl2symbol)
peak_anno_df$geneSymbol <- nearest_symbols

flank_symbols <- convert_ids(peak_anno_df$flank_geneIds, ensembl2symbol)
peak_anno_df$flank_geneSymbols <- flank_symbols

print("Done Annotation")

[1] "Done Annotation"


In [16]:
# Run a full garbage collection before writing; without it the write was
# observed to be slow (cause not established).
gc(full = TRUE)

# Write the filtered, symbol-annotated peaks with character fields quoted
fn <- file.path(nfcore_dir, paste0(prefix, "-Filtered.csv"))
write.csv(peak_anno_df, file = fn, row.names = FALSE, quote = TRUE)
print(fn)

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,11708379,625.3,33603878,1794.7,52506058,2804.2
Vcells,46129721,352.0,135546944,1034.2,264740105,2019.9


[1] "01_nfcore_241228/250113-bp-peakAnno-Filtered.csv"


In [14]:
# Sanity check: collect peaks whose nearest-gene ID does not contain "ENS"
has_ens_id <- grepl("ENS", peak_anno_df$geneId)
test <- peak_anno_df[!has_ens_id, ]

In [15]:
# Display the sanity-check rows; an empty table means every geneId contains "ENS"
test

seqnames,start,end,width,strand,interval_id,annotation,geneChr,geneStart,geneEnd,geneLength,geneStrand,geneId,distanceToTSS,flank_geneIds,geneSymbol,flank_geneSymbols
<fct>,<int>,<int>,<int>,<fct>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>
