**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()

You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 



## Prepare

**Set global variable**

In [2]:
vec = c(
    "fcc_astarr_macs_input_overlap",
    "fcc_astarr_macs_input_union"
)
names(vec) = vec

VEC_TXT_FOLDER = vec
for(txt in vec){cat(txt, "\n")}

fcc_astarr_macs_input_overlap 
fcc_astarr_macs_input_union 


In [3]:
TXT_FNAME_INP = "region.closest.summary.genome_tss.tsv"

**View files**

In [4]:
txt_fdiry = file.path(FD_RES, "region_closest", "*", "summary")
txt_fname = TXT_FNAME_INP
txt_fglob = file.path(txt_fdiry, txt_fname)

vec = Sys.glob(txt_fglob)
for(txt in vec){cat(txt, "\n")}

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary/region.closest.summary.genome_tss.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary/region.closest.summary.genome_tss.tsv 


## Import data

In [5]:
### loop to import data
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### set file directory
    txt_fdiry = file.path(FD_RES, "region_closest", txt_folder, "summary")
    txt_fname = TXT_FNAME_INP
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### read table
    dat = read_tsv(txt_fpath, show_col_types = FALSE)
    return(dat)
})

### assign and show
lst_dat_region_annot_import = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))

$fcc_astarr_macs_input_overlap
[1] 304915     11

$fcc_astarr_macs_input_union
[1] 499336     11



Chrom,ChromStart,ChromEnd,Region,Annotation_A,Annotation_B,Region_TSS,Score_Pol2,Gene,Distance2TSS,TSS_Proximity
chr1,10038,10405,chr1:10038-10405,fcc_astarr_macs_input_overlap,genome_tss_pol2,chr1:11873-11874,0.00023,DDX11L1,1469,Proximal
chr1,10038,10405,chr1:10038-10405,fcc_astarr_macs_input_overlap,genome_tss_pol2_rnaseq,chr1:29370-29371,0.00023,WASH7P,18966,Distal
chr1,14282,14614,chr1:14282-14614,fcc_astarr_macs_input_overlap,genome_tss_pol2,chr1:11873-11874,0.00023,DDX11L1,2409,Distal


## Arrange table

In [6]:
lst = lst_dat_region_annot_import
lst = lapply(lst, function(dat){
    dat = dat %>% 
        dplyr::group_by(Chrom, ChromStart, ChromEnd, Region, Annotation_A, Annotation_B) %>%
        dplyr::summarise(
            Distance2TSS  = mean(Distance2TSS),
            TSS_Proximity = paste(unique(TSS_Proximity), collapse = ","),
            Count   = n(),
            .groups = "drop"
        )
    return(dat)
})

### assign and show
lst_dat_region_annot_arrange = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat))

$fcc_astarr_macs_input_overlap
[1] 300084      9

$fcc_astarr_macs_input_union
[1] 493704      9



Chrom,ChromStart,ChromEnd,Region,Annotation_A,Annotation_B,Distance2TSS,TSS_Proximity,Count
chr1,10038,10405,chr1:10038-10405,fcc_astarr_macs_input_overlap,genome_tss_pol2,1469,Proximal,1
chr1,10038,10405,chr1:10038-10405,fcc_astarr_macs_input_overlap,genome_tss_pol2_rnaseq,18966,Distal,1
chr1,14282,14614,chr1:14282-14614,fcc_astarr_macs_input_overlap,genome_tss_pol2,2409,Distal,1
chr1,14282,14614,chr1:14282-14614,fcc_astarr_macs_input_overlap,genome_tss_pol2_rnaseq,14757,Distal,1
chr1,16025,16338,chr1:16025-16338,fcc_astarr_macs_input_overlap,genome_tss_pol2,1099,Proximal,4
chr1,16025,16338,chr1:16025-16338,fcc_astarr_macs_input_overlap,genome_tss_pol2_rnaseq,13033,Distal,1


**Check: Count**

In [7]:
lst = lst_dat_region_annot_arrange
lst = lapply(lst, function(dat){
    res = table(dat$Count, dnn = "Count")
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Folder")
dat = dat %>% tidyr::spread(Folder, Freq)
fun_display_table(dat)

Count,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
1,295552.0,488447
2,4346.0,5035
3,148.0,161
4,27.0,46
6,1.0,2
10,7.0,7
11,1.0,3
16,2.0,2
5,,1


**Check: TSS_Proximity**

In [8]:
lst = lst_dat_region_annot_arrange
lst = lapply(lst, function(dat){
    res = table(dat$TSS_Proximity, dnn = "TSS_Proximity")
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Folder")
dat = dat %>% tidyr::spread(Folder, Freq)
fun_display_table(dat)

TSS_Proximity,fcc_astarr_macs_input_overlap,fcc_astarr_macs_input_union
Distal,267693,457441
Proximal,32391,36263


## Export results

In [9]:
for (txt_folder in VEC_TXT_FOLDER){
    ### get table
    dat_region_annot_result = lst_dat_region_annot_arrange[[txt_folder]]

    ### split by TSS annotation
    dat = dat_region_annot_result
    lst = split(dat, dat$Annotation_B)

    ### for each TSS region set, export data
    for (idx in names(lst)){
        ### arrange table
        dat = lst[[idx]]
        dat = dat %>% dplyr::select(
            Chrom:Region, 
            Annotation_A, Annotation_B, 
            Distance2TSS, TSS_Proximity
        ) %>%
        dplyr::arrange(Chrom, ChromStart, ChromEnd)

        ### set file directory
        txt_annot = idx
        txt_fdiry = file.path(FD_RES, "region_closest", txt_folder, "summary")
        txt_fname = paste("region", "annotation", txt_annot, "tss_proximity", "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)

        ### write table
        write_tsv(dat, txt_fpath)

        ### show progress
        cat("Save file:", "\n")
        cat("Folder:", txt_folder, "\n")
        cat(txt_fpath, "\n")
        cat("\n")
        flush.console()
    }
}

Save file: 
Folder: fcc_astarr_macs_input_overlap 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary/region.annotation.genome_tss_pol2.tss_proximity.tsv 

Save file: 
Folder: fcc_astarr_macs_input_overlap 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary/region.annotation.genome_tss_pol2_rnaseq.tss_proximity.tsv 

Save file: 
Folder: fcc_astarr_macs_input_union 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary/region.annotation.genome_tss_pol2.tss_proximity.tsv 

Save file: 
Folder: fcc_astarr_macs_input_union 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary/region.annotation.genome_tss_pol2_rnaseq.tss_proximity.tsv 

