## Set environment

In [1]:
### run the project configuration script; its messages and warnings are silenced
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
### print the directory settings (FD_BASE, FD_WORK, FD_CODE, FD_PRJ, FD_RES, FD_LOG)
### NOTE(review): show_env() is presumably defined by the sourced config script -- confirm
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Check data

In [2]:
### name of the annotation sub-folder used to build the input paths in the cells below
FOLDER = "annotation_ccre"

In [3]:
### list the files in the annotation folder under results/region
fdiry = file.path(FD_RES, "results", "region", FOLDER)
for (fname in list.files(fdiry)) {
    print(fname)
}

[1] "ccre_v3.bed.gz"
[1] "ccre_v4.bed.gz"
[1] "description.tsv"
[1] "summary"


In [4]:
### list the files in the annotation folder of the ASTARR-seq input peak set
fdiry = file.path(
    FD_RES, "results", "region",
    "KS91_K562_ASTARRseq_peak_macs_input",
    FOLDER)
for (fname in list.files(fdiry)) {
    print(fname)
}

[1] "peak.annotation.ccre_v3.bed.gz"
[1] "peak.annotation.ccre_v4.bed.gz"


## Import data

In [5]:
### set file path of the column description table
fdiry = file.path(FD_RES, "results", "region", FOLDER)
fname = "description.tsv"
fpath = file.path(fdiry, fname)

### read the table; it is kept as dat_cnames for the column names used later
dat_cnames = read_tsv(fpath, show_col_types = FALSE)
dat = dat_cnames

### show dimensions and content
print(dim(dat))
dat

[1] 11  2


Name,Description
<chr>,<chr>
Chrom,Chromosome
Start,Start position
End,End position
Name,Name
Score,Score
Strand,[+-.]; Use '.' if no strand is assigned.
ThickStart,The starting position at which the feature is drawn thickly
ThickEnd,The ending position at which the feature is drawn thickly
ItemRgb,"An RGB value of the form R,G,B (e.g. 255,0,0)"
Category,Label of cCREs type


In [6]:
### init: set column names
### layout: peak coordinates first, then the columns listed in the
### description table (dat_cnames), then the trailing Overlap column
cnames = dat_cnames$Name
cnames = c("Chrom_ATAC", "Start_ATAC", "End_ATAC", cnames, "Overlap")

### init: set file path
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    FOLDER)
fname  = paste("peak.annotation", "bed.gz", sep="*")
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
fnames = basename(fpaths)

### check: fail early with a clear message when the glob matches nothing
### (otherwise the label extraction below fails with an obscure error)
if (length(fpaths) == 0) {
    stop("No annotated peak files found: ", fglob)
}

### init: annotation and label
### file names look like peak.annotation.<annotation>.bed.gz, so the
### annotation is the third dot-separated field; extracting it per file
### avoids the silent recycling of rbind() when field counts differ
annotations = vapply(
    str_split(fnames, "\\."),
    function(parts){parts[3]},
    character(1))

### check: every file name must provide the annotation field
if (anyNA(annotations)) {
    stop(
        "Cannot extract annotation from file name(s): ",
        paste(fnames[is.na(annotations)], collapse = ", "))
}

### named vectors so that lapply() over annotations returns a named list
names(fnames) = annotations
names(annotations) = annotations
print(fnames)

                         ccre_v3                          ccre_v4 
"peak.annotation.ccre_v3.bed.gz" "peak.annotation.ccre_v4.bed.gz" 


In [7]:
### Import data: one table per annotation, returned as a named list
lst = lapply(annotations, function(annotation){
    ### locate and read the annotated peak table (no header; names from cnames)
    fpath_annot = file.path(fdiry, fnames[annotation])
    dat_annot   = read_tsv(fpath_annot, col_names = cnames, show_col_types = FALSE)
    
    ### add peak / region identifiers and the generic label, value, note columns
    ### (Note is already among the file columns and is replaced in place)
    dat_annot = dplyr::mutate(
        dat_annot,
        Peak_ATAC  = paste0(Chrom_ATAC, ":", Start_ATAC, "-", End_ATAC),
        Annotation = annotation,
        Label      = Category,
        Region     = paste0(Chrom, ":", Start, "-", End),
        Value      = Score,
        Note       = paste0(Name, ":", Category)
    )
    dat_annot
})

### keep the imported tables under a dedicated name
lst_peak_annot_import = lst

### show number of tables, their names, and the head of the first one
print(length(lst))
cat("=========================\n")
print(names(lst))
cat("=========================\n")
head(lst[[1]])

[1] 2
[1] "ccre_v3" "ccre_v4"


Chrom_ATAC,Start_ATAC,End_ATAC,Chrom,Start,End,Name,Score,Strand,ThickStart,ThickEnd,ItemRgb,Category,Note,Overlap,Peak_ATAC,Annotation,Label,Region,Value
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>
chr1,180982,182087,chr1,181251,181601,EH38E1310153,0,.,181251,181601,6218147,DNase-only,EH38E1310153:DNase-only,350,chr1:180982-182087,ccre_v3,DNase-only,chr1:181251-181601,0
chr1,777949,779437,chr1,778562,778912,EH38E1310158,0,.,778562,778912,25500,"PLS,CTCF-bound","EH38E1310158:PLS,CTCF-bound",350,chr1:777949-779437,ccre_v3,"PLS,CTCF-bound",chr1:778562-778912,0
chr1,777949,779437,chr1,779086,779355,EH38E1310159,0,.,779086,779355,25500,PLS,EH38E1310159:PLS,269,chr1:777949-779437,ccre_v3,PLS,chr1:779086-779355,0
chr1,816774,817547,chr1,817080,817403,EH38E1310166,0,.,817080,817403,225225225,Low-DNase,EH38E1310166:Low-DNase,323,chr1:816774-817547,ccre_v3,Low-DNase,chr1:817080-817403,0
chr1,817905,818348,chr1,817903,818252,EH38E1310167,0,.,817903,818252,225225225,Low-DNase,EH38E1310167:Low-DNase,347,chr1:817905-818348,ccre_v3,Low-DNase,chr1:817903-818252,0
chr1,818602,819380,chr1,818718,818872,EH38E1310168,0,.,818718,818872,225225225,Low-DNase,EH38E1310168:Low-DNase,154,chr1:818602-819380,ccre_v3,Low-DNase,chr1:818718-818872,0


In [8]:
### check the distribution of the Score column in the first imported table
summary(lst[[1]]$Score)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0       0       0       0       0       0 

## Arrange and summarize

In [9]:
### start from the imported tables
lst = lst_peak_annot_import

lst = lapply(lst, function(dat_annot){
    ### collapse the overlapping regions of each peak by annotation label:
    ### count them, join their regions / notes with "|", and average the value
    dat_group = dplyr::group_by(
        dat_annot, 
        Chrom_ATAC, Start_ATAC, End_ATAC, Peak_ATAC, Annotation, Label)
    dat_summary = dplyr::summarise(
        dat_group,
        Count  = n(), 
        Region = paste(Region, collapse="|"),
        Score  = mean(Value),
        Note   = paste(Note, collapse="|"),
        .groups = "drop")
    
    ### sort by peak position, drop the coordinate columns, and
    ### keep the peak id as the Peak column
    dat_summary %>% 
        dplyr::arrange(Chrom_ATAC, Start_ATAC, End_ATAC) %>%
        dplyr::select(-c(Chrom_ATAC, Start_ATAC, End_ATAC)) %>%
        dplyr::rename(Peak = Peak_ATAC)
})

### keep the summarized tables under a dedicated name
lst_peak_annot_summary = lst

### show number of tables, their names, and the head of the first one
print(length(lst))
cat("=========================\n")
print(names(lst))
cat("=========================\n")
head(lst[[1]])

[1] 2
[1] "ccre_v3" "ccre_v4"


Peak,Annotation,Label,Count,Region,Score,Note
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>
chr1:180982-182087,ccre_v3,DNase-only,1,chr1:181251-181601,0,EH38E1310153:DNase-only
chr1:777949-779437,ccre_v3,PLS,1,chr1:779086-779355,0,EH38E1310159:PLS
chr1:777949-779437,ccre_v3,"PLS,CTCF-bound",1,chr1:778562-778912,0,"EH38E1310158:PLS,CTCF-bound"
chr1:816774-817547,ccre_v3,Low-DNase,1,chr1:817080-817403,0,EH38E1310166:Low-DNase
chr1:817905-818348,ccre_v3,Low-DNase,1,chr1:817903-818252,0,EH38E1310167:Low-DNase
chr1:818602-819380,ccre_v3,Low-DNase,1,chr1:818718-818872,0,EH38E1310168:Low-DNase


## Explore and check results

In [10]:
### tabulate, per annotation, how many rows have each Count value
lst = lapply(lst_peak_annot_summary, function(dat_summary){
    table(dat_summary$Count)
})
print(lst)

$ccre_v3

     1      2      3      4      5      6      7      8      9     10     11 
106905  27655   8345   2711    915    356    120     51     14      6      2 

$ccre_v4

     1      2      3      4      5      6      7      8      9     10     11 
156242  47366  17294   6408   2429   1051    450    176     79     32     15 
    12     13     14 
     7      4      1 



## Save results

In [11]:
### init: set output directory
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "summary")

### make sure the output directory exists; write_tsv() does not create it
### (no warning is raised if it is already there)
dir.create(fdiry, showWarnings = FALSE, recursive = TRUE)

### write one summary table per annotation: peak.summary.<annotation>.tsv
lst = lst_peak_annot_summary
for (idn in names(lst)){
    fname = paste("peak.summary", idn,"tsv", sep=".")
    fpath = file.path(fdiry, fname)
    
    ### report progress immediately
    print(fname)
    flush.console()
    
    dat = lst[[idn]]
    write_tsv(dat, fpath)
}

[1] "peak.summary.ccre_v3.tsv"
[1] "peak.summary.ccre_v4.tsv"
