**Set environment**

In [1]:
### load the project configuration script with its messages and warnings silenced
### NOTE(review): config_sing.R is not visible here; presumably it defines the FD_* paths
### and attaches the packages (readr, dplyr) used by the cells below -- confirm
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
### print the environment / directory settings (see the cell output)
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


**Check data**

In [2]:
### name of the annotation sub-folder; reused to build the input paths in the cells below
FOLDER = "annotation_tss_pol2"

In [3]:
### show the files stored in the annotation folder under results/region
fdiry = file.path(FD_RES, "results", "region", FOLDER)
for (fname in dir(fdiry)) {
    print(fname)
}

[1] "description.tsv"
[1] "K562.TSS.selected_by_highest_Pol2_signal.bed"
[1] "K562.TSS.selected_by_highest_Pol2_signal.bed.gz"
[1] "K562.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz"


In [4]:
### show the annotated peak files of the KS91 K562 ASTARR-seq peak set for this annotation
fdiry = file.path(FD_RES, "results", "region", "KS91_K562_ASTARRseq_peak_macs_input", FOLDER)
for (fname in dir(fdiry)) {
    print(fname)
}

[1] "peak.annotation.K562.TSS.selected_by_highest_Pol2_signal.bed.gz"
[1] "peak.annotation.K562.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz"


## Import data

**Get column names**

In [5]:
### locate the table that describes the columns of the annotation files
fdiry = file.path(FD_RES, "results", "region", FOLDER)
fname = "description.tsv"
fpath = file.path(fdiry, fname)

### read it and keep a copy under a dedicated name for the import step
dat_cnames = read_tsv(fpath, show_col_types = FALSE)
dat        = dat_cnames

### show dimensions and content
print(dim(dat))
dat

[1] 5 2


Name,Description
<chr>,<chr>
Chrom,Chromosome
Start,TSS position
End,TSS position
Name,Gene name
Score,"highest level of pol2 chip seq (ENCFF914WIS.bigWig) at [TSS-500, TSS+500] among TSS isoform"


**Import data**

In [29]:
### init: set column names
### layout of the annotated peak files: the three ATAC peak coordinates, then the
### annotation columns listed in description.tsv, then a trailing "Overlap" column
cnames = c("Chrom_ATAC", "Start_ATAC", "End_ATAC", dat_cnames$Name, "Overlap")

### init: set file path
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    FOLDER)

### init: annotation and label
### each annotation is tied to its file by name instead of by the position of a glob
### match, so an extra, missing, or renamed file cannot silently mislabel the tables
fnames = c(
    "TSS_POL2"        = "peak.annotation.K562.TSS.selected_by_highest_Pol2_signal.bed.gz",
    "TSS_POL2_RNAseq" = "peak.annotation.K562.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz")
annotations   = names(fnames)
label         = "TSS"
fpaths        = file.path(fdiry, fnames)
names(fpaths) = annotations

### check: stop early and report the paths that cannot be found
is_missing = !file.exists(fpaths)
if (any(is_missing)){
    stop("Missing annotation file(s): ", paste(fpaths[is_missing], collapse = ", "))
}
print(fpaths)

### import data: one table per annotation
lst = lapply(annotations, function(annotation){
    ### init: get fpath ([[ ]] returns the bare path without the name attribute)
    fpath = fpaths[[annotation]]

    ### read table (the files carry no header line; column names are supplied)
    dat = read_tsv(fpath, col_names = cnames, show_col_types = FALSE)
    
    ### arrange columns: add peak/region identifiers and the generic
    ### Annotation / Label / Value / Note columns used by the summary step
    dat = dat %>% dplyr::mutate(
        Peak_ATAC  = paste0(Chrom_ATAC, ":", Start_ATAC, "-", End_ATAC),
        Annotation = annotation,
        Label      = label,
        Region     = paste0(Chrom, ":", Start, "-", End),
        Value      = Score,
        Note       = Name
    )
    return(dat)
})
names(lst) = annotations

### assign and show
lst_dat_peak_annot_import = lst

                                                                                                                                                                                                          TSS_POL2 
                       "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/annotation_tss_pol2/peak.annotation.K562.TSS.selected_by_highest_Pol2_signal.bed.gz" 
                                                                                                                                                                                                   TSS_POL2_RNAseq 
"/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/annotation_tss_pol2/peak.annotation.K562.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz" 


In [30]:
### preview the first rows of the first imported table
lst = lst_dat_peak_annot_import
dat = lst[[1]]
head(dat, n = 3)

Chrom_ATAC,Start_ATAC,End_ATAC,Chrom,Start,End,Name,Score,Overlap,Peak_ATAC,Annotation,Label,Region,Value,Note
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
chr1,17237,17772,chr1,17436,17437,MIR6859-1,9.43812,1,chr1:17237-17772,TSS_POL2,TSS,chr1:17436-17437,9.43812,MIR6859-1
chr1,17237,17772,chr1,17436,17437,MIR6859-2,9.43812,1,chr1:17237-17772,TSS_POL2,TSS,chr1:17436-17437,9.43812,MIR6859-2
chr1,17237,17772,chr1,17436,17437,MIR6859-3,9.43812,1,chr1:17237-17772,TSS_POL2,TSS,chr1:17436-17437,9.43812,MIR6859-3


In [31]:
### preview the first rows of the second imported table
lst = lst_dat_peak_annot_import
dat = lst[[2]]
head(dat, n = 3)

Chrom_ATAC,Start_ATAC,End_ATAC,Chrom,Start,End,Name,Score,Overlap,Peak_ATAC,Annotation,Label,Region,Value,Note
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
chr1,28903,29613,chr1,29370,29371,WASH7P,0.00023,1,chr1:28903-29613,TSS_POL2_RNAseq,TSS,chr1:29370-29371,0.00023,WASH7P
chr1,826754,828040,chr1,827522,827523,LINC00115,64.4656,1,chr1:826754-828040,TSS_POL2_RNAseq,TSS,chr1:827522-827523,64.4656,LINC00115
chr1,826754,828040,chr1,827590,827591,LINC01128,64.4603,1,chr1:826754-828040,TSS_POL2_RNAseq,TSS,chr1:827590-827591,64.4603,LINC01128


## Arrange and summarize

In [32]:
### start from the imported peak-annotation tables
lst = lst_dat_peak_annot_import

### collapse every table to one row per ATAC peak:
###   Count  = number of annotation rows for the peak
###   Region = annotation regions joined by "|"
###   Score  = mean of Value across those rows
###   Note   = notes (gene names) joined by "|"
### then sort by peak coordinates and keep only the peak identifier as "Peak"
lst = lapply(lst, function(dat_annot){
    dat_peak = dat_annot %>%
        dplyr::group_by(Chrom_ATAC, Start_ATAC, End_ATAC, Peak_ATAC, Annotation, Label) %>%
        dplyr::summarise(
            Count   = n(),
            Region  = paste(Region, collapse="|"),
            Score   = mean(Value),
            Note    = paste(Note, collapse="|"),
            .groups = "drop") %>%
        dplyr::arrange(Chrom_ATAC, Start_ATAC, End_ATAC) %>%
        dplyr::select(-c(Chrom_ATAC, Start_ATAC, End_ATAC)) %>%
        dplyr::rename(Peak = Peak_ATAC)
    return(dat_peak)
})

### store the result and list the annotations it contains
lst_dat_peak_annot_summary = lst
print(names(lst))

[1] "TSS_POL2"        "TSS_POL2_RNAseq"


In [33]:
### preview the first rows of the first summarized table
lst = lst_dat_peak_annot_summary
dat = lst[[1]]
head(dat, n = 3)

Peak,Annotation,Label,Count,Region,Score,Note
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>
chr1:17237-17772,TSS_POL2,TSS,4,chr1:17436-17437|chr1:17436-17437|chr1:17436-17437|chr1:17436-17437,9.43812,MIR6859-1|MIR6859-2|MIR6859-3|MIR6859-4
chr1:28903-29613,TSS_POL2,TSS,1,chr1:29370-29371,0.00023,WASH7P
chr1:777949-779437,TSS_POL2,TSS,1,chr1:778634-778635,230.561,LOC100288069


In [34]:
### preview the first rows of the second summarized table
lst = lst_dat_peak_annot_summary
dat = lst[[2]]
head(dat, n = 3)

Peak,Annotation,Label,Count,Region,Score,Note
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>
chr1:28903-29613,TSS_POL2_RNAseq,TSS,1,chr1:29370-29371,0.00023,WASH7P
chr1:826754-828040,TSS_POL2_RNAseq,TSS,2,chr1:827522-827523|chr1:827590-827591,64.46295,LINC00115|LINC01128
chr1:876474-878030,TSS_POL2_RNAseq,TSS,1,chr1:876802-876803,0.00788399,FAM41C


## Explore and check results

In [36]:
### for each annotation, tabulate how many peaks have 1, 2, 3, ... annotation rows
lst = lst_dat_peak_annot_summary

for (txt in names(lst)) {
    ### header line with the annotation name
    cat(txt, "\n")

    ### frequency table of the per-peak Count column
    dat = lst[[txt]]
    print(table(dat$Count))

    ### blank lines between annotations
    cat("\n\n")
}

TSS_POL2 

    1     2     3     4     5     6    10 
12784  2283   107    12     2     2     1 


TSS_POL2_RNAseq 

   1    2    3    5 
8622 1142   27    1 




## Save results

In [37]:
### set the output path of the TSS_POL2 peak summary
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "summary")
fname = "peak.summary.tss_pol2.tsv"
fpath = file.path(fdiry, fname)

### make sure the output folder exists: write_tsv does not create missing
### directories, so the write would otherwise fail on a fresh results tree
dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)

### write the summary table
lst = lst_dat_peak_annot_summary
dat = lst[["TSS_POL2"]]
write_tsv(dat, fpath)

### show what was written
print(dim(dat))
head(dat, 3)

[1] 15191     7


Peak,Annotation,Label,Count,Region,Score,Note
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>
chr1:17237-17772,TSS_POL2,TSS,4,chr1:17436-17437|chr1:17436-17437|chr1:17436-17437|chr1:17436-17437,9.43812,MIR6859-1|MIR6859-2|MIR6859-3|MIR6859-4
chr1:28903-29613,TSS_POL2,TSS,1,chr1:29370-29371,0.00023,WASH7P
chr1:777949-779437,TSS_POL2,TSS,1,chr1:778634-778635,230.561,LOC100288069


In [38]:
### set the output path of the TSS_POL2_RNAseq peak summary
fdiry = file.path(
    FD_RES, 
    "results", 
    "region", 
    "KS91_K562_ASTARRseq_peak_macs_input", 
    "summary")
fname = "peak.summary.tss_pol2_rnaseq.tsv"
fpath = file.path(fdiry, fname)

### make sure the output folder exists: write_tsv does not create missing
### directories, so the write would otherwise fail on a fresh results tree
dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)

### write the summary table
lst = lst_dat_peak_annot_summary
dat = lst[["TSS_POL2_RNAseq"]]
write_tsv(dat, fpath)

### show what was written
print(dim(dat))
head(dat, 3)

[1] 9792    7


Peak,Annotation,Label,Count,Region,Score,Note
<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>
chr1:28903-29613,TSS_POL2_RNAseq,TSS,1,chr1:29370-29371,0.00023,WASH7P
chr1:826754-828040,TSS_POL2_RNAseq,TSS,2,chr1:827522-827523|chr1:827590-827591,64.46295,LINC00115|LINC01128
chr1:876474-878030,TSS_POL2_RNAseq,TSS,1,chr1:876802-876803,0.00788399,FAM41C
