**Set environment**

In [1]:
### Source the project configuration while silencing the warnings and messages
### it emits, then print the environment summary.
### NOTE(review): FD_RES, read_tsv and %>% are used by later cells but are not
### defined in this notebook, so they presumably come from this config -- confirm.
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Import data

In [2]:
### Path of the gzipped BED file holding the peak regions
fdiry = file.path(FD_RES, "results", "region", "KS91_K562_ASTARRseq_peak_macs_input")
fname = "KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.bed.gz"
fpath = file.path(fdiry, fname)

### The file has no header line, so column names are supplied here;
### Name is built as "Chrom:Start-End" and serves as the peak identifier later
cnames = c("Chrom", "Start", "End")
dat_peak_ocr = fpath %>%
    read_tsv(col_names = cnames, show_col_types = FALSE) %>%
    dplyr::mutate(Name = paste0(Chrom, ":", Start, "-", End))

### Keep the generic alias `dat` pointing at the same table and preview it
dat = dat_peak_ocr
print(dim(dat))
head(dat)

[1] 247520      4


Chrom,Start,End,Name
<chr>,<dbl>,<dbl>,<chr>
chr1,10015,10442,chr1:10015-10442
chr1,14253,14645,chr1:14253-14645
chr1,16015,16477,chr1:16015-16477
chr1,17237,17772,chr1:17237-17772
chr1,28903,29613,chr1:28903-29613
chr1,30803,31072,chr1:30803-31072


In [3]:
### Path of the gzipped BED file listing the screened/selected regions
fdiry = file.path(FD_RES, "results", "region")
fname = "region_screened_selected.bed.gz"
fpath = file.path(fdiry, fname)

### Four header-less columns: Chrom, Start, End, Name
cnames = c("Chrom", "Start", "End", "Name")
dat_region_subset = read_tsv(fpath, col_names = cnames, show_col_types = FALSE)

### Keep the generic alias `dat` pointing at the same table and display it
dat = dat_region_subset
print(dim(dat))
dat

[1] 16  4


Chrom,Start,End,Name
<chr>,<dbl>,<dbl>,<chr>
chr11,4000000,6600000,chr11:4091884-6505900
chr11,32800000,35000000,chr11:32869701-34870100
chr11,61700000,62000000,chr11:61787329-61898348
chr12,54290000,54310000,chr12:54300766-54301042
chr12,110000000,113000000,chr12:110726151-112325737
chr16,0,1300000,chr16:10001-1173100
chr19,12880000,12900000,chr19:12887110-12887237
chr2,59000000,62000000,chr2:59553301-61553700
chr20,55500000,59000000,chr20:56391301-58391700
chr3,128475000,128512000,chr3:128487571-128487937


In [4]:
### Collect the distinct chromosomes covered by the selected regions,
### sorted as character strings (hence "chr2" after "chr19" in the output)
dat       = dat_region_subset
vec       = dat[["Chrom"]]
vec_chrom = vec %>% unique %>% sort
print(vec_chrom)

 [1] "chr11" "chr12" "chr16" "chr19" "chr2"  "chr20" "chr3"  "chr4"  "chr5" 
[10] "chr6"  "chr8"  "chrX" 


## Create combinations of OCRs for selected chromosomes

In [5]:
### Group the peak names by chromosome
dat = dat_peak_ocr
lst = split(dat$Name, dat$Chrom)

### Keep only the selected chromosomes that actually have peaks.
### Indexing the list with a name it does not contain would produce a NULL
### entry (named NA) and make the pairing step below fail.
vec = vec_chrom
vec_found   = intersect(vec, names(lst))
vec_missing = setdiff(vec, names(lst))
if (length(vec_missing) > 0){
    warning(
        "No peaks found for selected chromosome(s): ",
        paste(vec_missing, collapse = ", "))
}
lst = lst[vec_found]

### For each chromosome, build every unordered pair of peaks, including each
### peak paired with itself: n peaks -> n * (n + 1) / 2 rows.
### Each element of the result is a data frame with character columns V1, V2.
lst = lapply(lst, function(vec_name){
    ### Show progress (first peak name and number of peaks)
    cat(vec_name[1], "\n", length(vec_name), "\n\n")
    flush.console()
    
    ### self pairs: (a, a)
    dat_comb2_self = data.frame(V1 = vec_name, V2 = vec_name)
    
    ### combn(x, 2) stops with "n < m" when there are fewer than two peaks,
    ### so in that case the self pairs are the complete answer
    if (length(vec_name) < 2){
        return(dat_comb2_self)
    }
    
    ### distinct pairs without permutation: (a, b) is kept, (b, a) is not
    dat_comb2_pair  = combn(vec_name, 2) %>% t %>% as.data.frame
    dat_comb2_total = dplyr::bind_rows(
        dat_comb2_self, 
        dat_comb2_pair) %>% 
        dplyr::arrange(V1, V2)
    return(dat_comb2_total)
})

lst_peak_comb2 = lst
print(names(lst))
head(lst[[1]])

chr11:113408-113701 
 12010 

chr12:10606-11448 
 10725 

chr16:10131-11496 
 8999 

chr19:70732-71295 
 8447 

chr2:11292-11934 
 19379 

chr20:129971-130487 
 6046 

chr3:10230-10640 
 14422 

chr4:11715-12387 
 10120 

chr5:10627-11783 
 13453 

chr6:126583-127039 
 18166 

chr8:191891-192124 
 10120 

chrX:19079-20855 
 4214 

 [1] "chr11" "chr12" "chr16" "chr19" "chr2"  "chr20" "chr3"  "chr4"  "chr5" 
[10] "chr6"  "chr8"  "chrX" 


Unnamed: 0_level_0,V1,V2
Unnamed: 0_level_1,<chr>,<chr>
1,chr11:1002205-1002620,chr11:1002205-1002620
2,chr11:1002205-1002620,chr11:100285883-100286264
3,chr11:1002205-1002620,chr11:100451963-100452380
4,chr11:1002205-1002620,chr11:100547023-100547250
5,chr11:1002205-1002620,chr11:100563051-100563718
6,chr11:1002205-1002620,chr11:100602670-100604359


In [6]:
### Report the number of peak pairs per chromosome, in millions of rows
lst = lst_peak_comb2
for (idn in names(lst)){
    dat = lst[[idn]]
    num_million = nrow(dat) / 10^6
    cat(idn, "|", num_million, "\n")
}

chr11 | 72.12605 
chr12 | 57.51817 
chr16 | 40.4955 
chr19 | 35.68013 
chr2 | 187.7825 
chr20 | 18.28008 
chr3 | 104.0043 
chr4 | 51.21226 
chr5 | 90.49833 
chr6 | 165.0109 
chr8 | 51.21226 
chrX | 8.881005 


## Split each dataset into subsets and save the results

In [7]:
### Write each chromosome's pair table to disk in pieces of at most
### `num_row_chunk` rows, one gzipped TSV (no header) per piece:
###   <FD_RES>/results/region/KS91_K562_ASTARRseq_peak_macs_input/
###       region_pair/region_pair_<chrom>/region_pair.<chrom>.SUBSET<k>.tsv.gz
lst_dat = lst_peak_comb2
num_row_chunk = 10^6

### Iterate over the names of the list that is indexed below, not over `lst`
### left behind by an earlier cell
for (idn in names(lst_dat)){

    ### set file directory
    fdiry = file.path(
        FD_RES, "results", "region", 
        "KS91_K562_ASTARRseq_peak_macs_input", 
        "region_pair",
        paste("region_pair", idn, sep="_"))
    
    ### create the directory (and parents) without going through a shell,
    ### so paths with spaces or special characters are handled correctly
    if (!dir.exists(fdiry)){
        dir.create(fdiry, recursive = TRUE)
    }
    
    ### extract data and split into consecutive row blocks;
    ### seq_len() (unlike seq()) yields an empty index for an empty table
    dat = lst_dat[[idn]]
    lst_tmp = split(dat, (seq_len(nrow(dat)) - 1) %/% num_row_chunk)
    
    ### show progress
    cat("==========================\n")
    cat(idn, "|", nrow(dat)/10^6, "\n")
    cat("Splitted into", length(lst_tmp), "pieces", "\n")
    cat("\n")
    flush.console()
    
    ### save each piece; SUBSET indices start at 1
    for (idx in seq_along(lst_tmp)){
        ### extract data (separate name so the full table `dat` is not clobbered)
        dat_sub = lst_tmp[[idx]]
        txt     = paste0("SUBSET", idx)
        
        ### set file path
        fname = paste("region_pair", idn, txt, "tsv.gz", sep=".")
        fpath = file.path(fdiry, fname)
        
        ### save the results
        write_tsv(dat_sub, fpath, col_names=FALSE)
        
        ### show progress
        cat("File name: ", fname,        "\n")
        cat("Data shape:", dim(dat_sub), "\n")
        cat("\n")
        flush.console()
    }
}

chr11 | 72.12605 
Splitted into 73 pieces 

File name:  region_pair.chr11.SUBSET1.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET2.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET3.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET4.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET5.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET6.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET7.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET8.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET9.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET10.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET11.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET12.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET13.tsv.gz 
Data shape: 1000000 2 

File name:  region_pair.chr11.SUBSET14.tsv