**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


**Check data**

In [3]:
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "annotation_chipseq_histone")
fnames = dir(fdiry)
fnames

In [4]:
lst = str_split(string = fnames, pattern = "\\.")
lst = lapply(lst, function(vec){vec[6]})
labels = unlist(lst)
labels

## Import data

In [6]:
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "annotation_chipseq_histone")
fname = "description.tsv"
fpath = file.path(fdiry, fname)

dat = read_tsv(fpath, show_col_types = FALSE)

dat_cnames = dat
print(dim(dat))
dat

[1] 10  2


Name,Description
<chr>,<chr>
chrom,"Name of the chromosome (or contig, scaffold, etc.)."
Start,The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
End,"The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99."
Name,Name given to a region (preferably unique). Use '.' if no name is assigned.
Score,"Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were '0' when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000."
Strand,+/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
SignalValue,"Measurement of overall (usually, average) enrichment for the region."
pValue,Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
qValue,Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
peak,Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.


In [7]:
annotation = "ChIPseq_Histone"
cnames = dat_cnames$Name
cnames = c("Chrom_ATAC", "Start_ATAC", "End_ATAC", cnames, "Overlap")

fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "annotation_chipseq_histone")

lst = lapply(labels, function(label){
    ### set file path
    fname = paste("*", label, "bed.gz", sep = ".")
    fglob = file.path(fdiry, fname)
    fpath = Sys.glob(fglob)
    
    ### read data
    dat = read_tsv(fpath, col_names = cnames, show_col_types = FALSE)
    dat = dat %>% dplyr::mutate(
        Peak_ATAC  = paste(Chrom_ATAC, Start_ATAC, End_ATAC, sep="_"),
        #Peak_ATAC  = paste0(Chrom_ATAC, ":", Start_ATAC, "_", End_ATAC),
        Annotation = annotation,
        Label      = label
    )
    return(dat)
})

lst_peak_annot = lst
print(length(lst))
head(lst[[1]])

[1] 19


Chrom_ATAC,Start_ATAC,End_ATAC,chrom,Start,End,Name,Score,Strand,SignalValue,pValue,qValue,peak,Overlap,Peak_ATAC,Annotation,Label
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
chr1,137737,139544,chr1,137650,138169,Peak_16228,317,.,9.24885,31.74444,29.32013,338,432,chr1_137737_139544,ChIPseq_Histone,H3K27ac
chr1,137737,139544,chr1,138356,139236,Peak_19531,228,.,7.55215,22.82356,20.52066,178,880,chr1_137737_139544,ChIPseq_Histone,H3K27ac
chr1,137737,139544,chr1,139285,139490,Peak_33716,88,.,4.2713,8.85756,6.82864,172,205,chr1_137737_139544,ChIPseq_Histone,H3K27ac
chr1,777949,779437,chr1,777712,778823,Peak_6597,939,.,19.33186,93.91751,90.8652,823,874,chr1_777949_779437,ChIPseq_Histone,H3K27ac
chr1,777949,779437,chr1,778887,779412,Peak_16912,296,.,8.99031,29.6308,27.2331,306,525,chr1_777949_779437,ChIPseq_Histone,H3K27ac
chr1,826754,828040,chr1,826565,827901,Peak_8064,783,.,17.13224,78.32069,75.40894,869,1147,chr1_826754_828040,ChIPseq_Histone,H3K27ac


## Arrange and summarize

In [8]:
lst = lst_peak_annot

lst = lapply(lst, function(dat){
    dat = dat %>% 
        dplyr::group_by(Peak_ATAC, Annotation, Label) %>%
        dplyr::summarise(Count = n(), .groups = "drop")
    return(dat)
})

dat = bind_rows(lst)
dat = dat %>% dplyr::rename("Peak" = "Peak_ATAC")

dat_peak_annot_count = dat
print(dim(dat))
head(dat)

[1] 701933      4


Peak,Annotation,Label,Count
<chr>,<chr>,<chr>,<int>
chr10_100009096_100010466,ChIPseq_Histone,H3K27ac,1
chr10_100185017_100187275,ChIPseq_Histone,H3K27ac,1
chr10_100228452_100230090,ChIPseq_Histone,H3K27ac,1
chr10_100267066_100268374,ChIPseq_Histone,H3K27ac,1
chr10_100285974_100287341,ChIPseq_Histone,H3K27ac,1
chr10_100289686_100290090,ChIPseq_Histone,H3K27ac,1


In [10]:
dat = dat_peak_annot_count
table(dat$Count)


     1      2      3      4      5      6      7      8      9     10     11 
455033 148046  37935  47376   8327   3134   1272    562    144     88     16 

In [12]:
dat = dat_peak_annot_count
dat %>% dplyr::filter(Count == 11)

Peak,Annotation,Label,Count
<chr>,<chr>,<chr>,<int>
chr11_2890447_2893046,ChIPseq_Histone,H3K4me3,11
chr17_38255901_38257878,ChIPseq_Histone,H3K4me3,11
chr17_79809894_79811714,ChIPseq_Histone,H3K4me3,11
chr8_8227407_8229235,ChIPseq_Histone,H3K4me3,11
chr11_2890447_2893046,ChIPseq_Histone,H3K4me3,11
chr17_38255901_38257878,ChIPseq_Histone,H3K4me3,11
chr17_79809894_79811714,ChIPseq_Histone,H3K4me3,11
chr8_8227407_8229235,ChIPseq_Histone,H3K4me3,11
chr11_2890447_2893046,ChIPseq_Histone,H3K4me3,11
chr17_38255901_38257878,ChIPseq_Histone,H3K4me3,11


## Save results

In [9]:
fdiry = file.path(FD_RES, "results", "region", "KS91_K562_ASTARRseq_peak_macs_input", "summary")
fname = "peak.summary.chipseq_histone.tsv"
fpath = file.path(fdiry, fname)

dat = dat_peak_annot_count
write_tsv(dat, fpath)