**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


**Check data files**

In [4]:
FOLDER_REG = "hic_intact_deep"

fdiry = file.path(FD_RES, "results", "region", FOLDER_REG, "summary")
for (fname in dir(fdiry)){cat(fname, "\n")}

peak.summary.crispri_growth.dhs.active.tsv 
peak.summary.crispri_hcrff.casa.merge.tsv 
peak.summary.encode_e2g_benchmark.active.merge.tsv 
peak.summary.encode_e2g_prediction.active.merge.tsv 
peak.summary.enhancer_zscore_junke.concat.tsv 
peak.summary.KS91_K562_ASTARRseq_peak_macs_input.tsv 


## Import data

In [35]:
### get file paths
fdiry  = file.path(FD_RES, "results", "region", FOLDER_REG, "summary")
fname  = "peak.summary.*"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)

### read tables
lst = lapply(fpaths, function(fpath){
    dat = read_tsv(fpath, show_col_types = FALSE)
    return(dat)
})

dat = bind_rows(lst)

### assign and show
dat_peak_summary_combine = dat
print(dim(dat))
head(dat)

[1] 1424119       6


Peak,Loop,Distance,Annotation,Label_Peak,Label_Loop
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
chr1:605550-605627,chr1:778000-779000,172374,CRISPRi,CRISPRi-Growth,Loop_A
chr1:826642-827902,chr1:868000-870000,40099,CRISPRi,CRISPRi-Growth,Loop_A
chr1:964946-965136,chr1:955000-960000,4947,CRISPRi,CRISPRi-Growth,Loop_A
chr1:964946-965136,chr1:958000-960000,4947,CRISPRi,CRISPRi-Growth,Loop_A
chr1:964946-965136,chr1:959000-960000,4947,CRISPRi,CRISPRi-Growth,Loop_A
chr1:995761-996190,chr1:1000000-1001000,3811,CRISPRi,CRISPRi-Growth,Loop_A


In [31]:
dat = dat_peak_summary_combine
lst = split(dat$Peak, dat$Label_Peak)
for (idx in names(lst)){
    cat(idx, "\n")
    vec = lst[[idx]]
    vec = unique(vec)
    cat(length(vec), "\n")
    cat("\n")
}

ASTARR_A 
9368 

ASTARR_AB 
3123 

ASTARR_R 
17897 

ATAC 
230509 

CRISPRi-Growth 
6242 

CRISPRi-HCRFF 
80 

ENCODE-E2G_Benchmark 
409 

ENCODE-E2G_Prediction 
51829 

LMPRA_A 
40096 

LMPRA_AB 
26732 

LMPRA_R 
1525 

TMPRA_A 
8294 

TMPRA_R 
363 

WSTARR_A 
79738 

WSTARR_AB 
25505 

WSTARR_R 
62201 



## Filter to get looped peaks

In [32]:
dat = dat_peak_summary_combine

### filter
dat = dat %>% 
    dplyr::filter(Distance == 0) %>%
    dplyr::select(Peak, Label_Peak) %>%
    tidyr::separate(Peak, c("Chrom", "Start", "End"), remove = FALSE)

### split by assays and arrange
lst = split(dat, dat$Label_Peak)
lst = lapply(lst, function(dat){
    dat = dat %>% 
        dplyr::select(Chrom, Start, End, Peak) %>%
        dplyr::arrange(Chrom, Start, End) %>%
        dplyr::distinct()
    return(dat)
})

### assign and show
lst_dat_peak_summary_filter = lst

for (idx in names(lst)){
    cat(idx, "\n")
    dat = lst[[idx]]
    cat(nrow(dat), "\n")
    cat("\n")
}

ASTARR_A 
6106 

ASTARR_AB 
2095 

ASTARR_R 
12235 

ATAC 
65084 

CRISPRi-Growth 
3183 

CRISPRi-HCRFF 
55 

ENCODE-E2G_Benchmark 
342 

ENCODE-E2G_Prediction 
32449 

LMPRA_A 
21085 

LMPRA_AB 
14303 

LMPRA_R 
579 

TMPRA_A 
1391 

TMPRA_R 
51 

WSTARR_A 
22816 

WSTARR_AB 
10454 

WSTARR_R 
6789 



In [33]:
lst = lst_dat_peak_summary_filter
dat = lst[[1]]
head(dat)

Chrom,Start,End,Peak
<chr>,<chr>,<chr>,<chr>
chr1,100133020,100133120,chr1:100133020-100133120
chr1,100322940,100323170,chr1:100322940-100323170
chr1,100347430,100348010,chr1:100347430-100348010
chr1,100352390,100352520,chr1:100352390-100352520
chr1,100785600,100785700,chr1:100785600-100785700
chr1,10415490,10415720,chr1:10415490-10415720


## Save results

In [34]:
lst = lst_dat_peak_summary_filter

for (idx in names(lst)){
    
    ### get table
    dat = lst[[idx]]
    
    ### set file name
    txt = idx
    txt = tolower(txt)
    txt = str_replace(string = txt, pattern = "-", replacement = "_")
    
    ### set file path
    fdiry = file.path(FD_RES, "results", "region", FOLDER_REG, "summary")
    fname = paste("peak.looped", txt, "bed.gz", sep=".")
    fpath = file.path(fdiry, fname)
    
    ### write table
    write_tsv(dat, fpath)
}