## Set environment

In [1]:
### load the project configuration quietly (messages and warnings are muted);
### later cells rely on names it provides, e.g. FD_RES and show_env()
config_script = "../config/config_sing.R"
suppressMessages(suppressWarnings(source(config_script)))

### print the resolved project directories
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Check data

In [2]:
### directory holding the per-label peak/annotation intersection files
fdiry = file.path(FD_RES, "results", "region", "KS91_K562_ASTARRseq_peak_macs_input", "annotation_tss_pol2_span")

### list every file in that directory and preview the first few names
fnames = list.files(fdiry)
head(fnames)

In [3]:
### The label of each file is the 6th dot-separated field of its name.
### The import cell later globs "*.<label>.bed.gz" once per label, so every
### file name must yield a label and no two files may share one.
idx_label = 6
lst = str_split(string = fnames, pattern = "\\.")
labels = vapply(lst, function(vec){vec[idx_label]}, character(1))

### fail early on file names that do not follow the expected pattern;
### indexing past the end of the split name yields NA
if (anyNA(labels)) {
    stop(
        "File name(s) with fewer than ", idx_label, " dot-separated fields: ",
        paste(fnames[is.na(labels)], collapse = ", "))
}

### a repeated label would make the glob match several files and the same
### rows would be imported more than once
if (anyDuplicated(labels) > 0) {
    stop(
        "Duplicated label(s) derived from file names: ",
        paste(unique(labels[duplicated(labels)]), collapse = ", "))
}

head(labels)

## Import data

In [4]:
### path of the table describing the columns of the annotation file
fdiry = file.path(FD_RES, "results", "region", "annotation_tss_pol2_span")
fname = "description.tsv"
fpath = file.path(fdiry, fname)

### read the description table; its Name column supplies the column names
### used when the annotation files are imported
dat_cnames = read_tsv(fpath, show_col_types = FALSE)
dat = dat_cnames

### show the dimensions and the table itself
print(dim(dat))
dat

[1] 6 2


Name,Description
<chr>,<chr>
Chrom,Chromosome
Start,Start position
End,End position
Name,Name
Score,Pol2 score
Gene,Gene of the TSS


In [5]:
### annotation tag stored in the Annotation column of every imported row
annotation = "TSS_POL2_Span"

### column names of the headerless files: ATAC peak coordinates, then the
### annotation columns listed in description.tsv, then the overlap column
### (presumably the overlap length in bp -- TODO confirm)
cnames = c("Chrom_ATAC", "Start_ATAC", "End_ATAC", dat_cnames$Name, "Overlap")

fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "annotation_tss_pol2_span")

### import one table per label
lst = lapply(labels, function(label){
    ### set file path
    fname = paste("*", label, "bed.gz", sep = ".")
    fglob = file.path(fdiry, fname)
    fpath = Sys.glob(fglob)
    
    ### each label must resolve to exactly one file: no match means there is
    ### nothing to read, and several matches would be row-bound by read_tsv
    ### (readr >= 2.0 accepts a vector of paths), silently duplicating rows
    if (length(fpath) != 1) {
        stop(
            "Expected exactly one file matching '", fglob, 
            "' but found ", length(fpath))
    }
    
    ### read data, add the peak id and annotation tag, then split the
    ### annotation name "<chrom>:<loc>:<label>:<direction>" into its parts
    dat = read_tsv(fpath, col_names = cnames, show_col_types = FALSE) %>% 
        dplyr::mutate(
            Peak_ATAC  = paste(Chrom_ATAC, Start_ATAC, End_ATAC, sep="_"),
            Annotation = annotation
        ) %>%
        tidyr::separate(Name, c("Chrom_TSS", "Loc_TSS", "Label", "Direction"), sep=":", remove = FALSE) %>%
        dplyr::mutate(TSS = paste(Chrom_TSS, Loc_TSS, sep=":"))
    
    return(dat)
})

lst_peak_annot = lst
print(length(lst))
head(lst[[1]])

[1] 500


Chrom_ATAC,Start_ATAC,End_ATAC,Chrom,Start,End,Name,Chrom_TSS,Loc_TSS,Label,Direction,Score,Gene,Overlap,Peak_ATAC,Annotation,TSS
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>
chr1,30803,31072,chr1,30365,31365,chr1:30365-30366:0-1K:+,chr1,30365-30366,0-1K,+,0.562995,MIR1302-10,269,chr1_30803_31072,TSS_POL2_Span,chr1:30365-30366
chr1,30803,31072,chr1,30365,31365,chr1:30365-30366:0-1K:+,chr1,30365-30366,0-1K,+,0.562995,MIR1302-11,269,chr1_30803_31072,TSS_POL2_Span,chr1:30365-30366
chr1,30803,31072,chr1,30365,31365,chr1:30365-30366:0-1K:+,chr1,30365-30366,0-1K,+,0.562995,MIR1302-2,269,chr1_30803_31072,TSS_POL2_Span,chr1:30365-30366
chr1,30803,31072,chr1,30365,31365,chr1:30365-30366:0-1K:+,chr1,30365-30366,0-1K,+,0.562995,MIR1302-9,269,chr1_30803_31072,TSS_POL2_Span,chr1:30365-30366
chr1,180982,182087,chr1,181387,182387,chr1:182387-182388:0-1K:-,chr1,182387-182388,0-1K,-,0.00779966,DDX11L17,700,chr1_180982_182087,TSS_POL2_Span,chr1:182387-182388
chr1,777949,779437,chr1,778634,779634,chr1:778634-778635:0-1K:+,chr1,778634-778635,0-1K,+,230.561,LOC100288069,803,chr1_777949_779437,TSS_POL2_Span,chr1:778634-778635


## Arrange and summarize

In [8]:
### stack the per-label tables and keep only the summary columns,
### renaming Peak_ATAC to Peak while selecting
lst = lst_peak_annot
dat_peak_annot = bind_rows(lst) %>% 
    dplyr::select("Peak" = "Peak_ATAC", Annotation, Label, Direction, TSS, Gene)

### show the dimensions and the first rows of the combined table
dat = dat_peak_annot
print(dim(dat))
head(dat)

[1] 3270027       6


Peak,Annotation,Label,Direction,TSS,Gene
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chr1_30803_31072,TSS_POL2_Span,0-1K,+,chr1:30365-30366,MIR1302-10
chr1_30803_31072,TSS_POL2_Span,0-1K,+,chr1:30365-30366,MIR1302-11
chr1_30803_31072,TSS_POL2_Span,0-1K,+,chr1:30365-30366,MIR1302-2
chr1_30803_31072,TSS_POL2_Span,0-1K,+,chr1:30365-30366,MIR1302-9
chr1_180982_182087,TSS_POL2_Span,0-1K,-,chr1:182387-182388,DDX11L17
chr1_777949_779437,TSS_POL2_Span,0-1K,+,chr1:778634-778635,LOC100288069


## Save results

In [9]:
### output location of the peak annotation summary
fdiry = file.path(FD_RES, "results", "region", "KS91_K562_ASTARRseq_peak_macs_input", "summary")
fname = "peak.summary.tss_pol2_span.tsv"
fpath = file.path(fdiry, fname)

### write_tsv does not create missing directories, so make sure the output
### directory exists (no-op, warning suppressed, when it already does)
dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)

### write the summary table
dat = dat_peak_annot
write_tsv(dat, fpath)