**Set environment**

In [1]:
### load the project configuration script; messages and warnings raised
### while sourcing are silenced to keep the cell output clean
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### print the environment summary (show_env is not defined in this notebook;
### presumably it comes from the sourced config script)
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_CombEffect_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_CombEffect_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_CombEffect_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_CombEffect_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_CombEffect_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_CombEffect_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_CombEffect_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_CombEffect_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_CombEffect_ENCODE_FCC/references 



In [4]:
### list the entries found under the FCC assay results folder
txt_fdiry = file.path(FD_RES, "assay_fcc")
list.files(path = txt_fdiry)

In [6]:
### assay folders (under FD_RES/assay_fcc) looped over when building count matrices
VEC_TXT_ASSAY = c("STARR_ATAC_K562_Reddy_KS91", "STARR_WHG_K562_Reddy_A001")

## Import region data

**Read region metadata table**

In [14]:
### locate the region label metadata table
txt_fname = "metadata_region_label.tsv"
txt_fdiry = file.path(FD_RES, "region", "summary")
txt_fpath = file.path(txt_fdiry, txt_fname)

### import the table and keep it under a dedicated name
dat_metadata_region = read_tsv(txt_fpath, show_col_types = FALSE)
dat = dat_metadata_region

### show the imported table
fun_display_table(dat_metadata_region)

Folder,FName,Label
encode_open_chromatin,K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz,dnase_ENCFF274YGF
encode_open_chromatin,K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz,dnase_ENCFF185XRG
encode_open_chromatin,K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz,atac_ENCFF558BLC
encode_open_chromatin,K562.hg38.ENCSR483RKN.ENCFF925CYR.ATAC.bed.gz,atac_ENCFF925CYR
encode_open_chromatin,K562.hg38.ENCSR868FGK.ENCFF333TAT.ATAC.bed.gz,atac_ENCFF333TAT
encode_open_chromatin,K562.hg38.ENCSR868FGK.ENCFF948AFM.ATAC.bed.gz,atac_ENCFF948AFM
fcc_astarr_macs,ASTARRseq_K562_KS91.hg38.Input.rep_all.max_overlaps.q5.bed,astarr_macs_input_overlap
fcc_astarr_macs,ASTARRseq_K562_KS91.hg38.Input.rep_all.union.q5.bed,astarr_macs_input_union


**Create region list for looping**

In [17]:
### group the region labels by the folder they belong to:
### one list element per folder, each holding that folder's labels
dat = dat_metadata_region
lst_region = split(x = dat[["Label"]], f = dat[["Folder"]])

lst = lst_region
print(lst_region)

$encode_open_chromatin
[1] "dnase_ENCFF274YGF" "dnase_ENCFF185XRG" "atac_ENCFF558BLC" 
[4] "atac_ENCFF925CYR"  "atac_ENCFF333TAT"  "atac_ENCFF948AFM" 

$fcc_astarr_macs
[1] "astarr_macs_input_overlap" "astarr_macs_input_union"  



## Loop through regions and assays to create count matrix

**Define helper function**

In [51]:
### Merge two count tables on the shared region key columns.
### A full join is used, so rows present in only one of the tables are
### kept, with NA filled in for the columns of the other table.
###   dat1, dat2: tables containing Chrom, ChromStart, ChromEnd and Region
###   returns:    the joined table
fun = function(dat1, dat2){
    vec_txt_key = c("Chrom", "ChromStart", "ChromEnd", "Region")
    dplyr::full_join(dat1, dat2, by = vec_txt_key)
}

**Generate count matrix for each region set for each assay**

In [54]:
### For each region folder / region label / assay: import the per-sample
### coverage files, merge them into one count matrix (one row per region,
### one column per sample), and save the matrix (TSV) together with a
### list holding the matrix and the assay metadata (RDS).
### NOTE(review): the `[1]` below (and on the region label loop) restricts
### the run to the first region folder and its first label; drop the index
### to process every region set -- confirm the restriction is intended.
for (txt_region_folder in names(lst_region)[1]){
    
    ### get region label
    vec_txt_region_label = lst_region[[txt_region_folder]]

    ### set directory for region description
    txt_fdiry  = file.path(FD_RES, "region", txt_region_folder)
    txt_fname  = "description.tsv"
    txt_fpath  = file.path(txt_fdiry, txt_fname)

    ### read region description and set column names/types
    ### (column names of the coverage files = names listed in the description
    ###  plus a trailing "Count"; Count is read as text because it may hold
    ###  the placeholder ".", which is converted to 0 during import)
    dat = read_tsv(txt_fpath, show_col_types = FALSE)
    vec = dat$Name
    vec_txt_cname = c(vec, "Count")
    vec_col_ctype = cols(Count = col_character())
    
    ### loop through the region labels of this folder
    for (txt_region_label in vec_txt_region_label[1]){

        for (txt_assay in VEC_TXT_ASSAY) {

            ### show progress: init
            cat("\n====================\n")
            cat("Region Folder:", txt_region_folder, "\n")
            cat("Region Label: ", txt_region_label,  "\n")
            cat("Assay  Folder:", txt_assay,         "\n")
            cat("\n")
            flush.console()
            
            ### set directory for assay metadata
            txt_fdiry = file.path(FD_RES, "assay_fcc", txt_assay, "fragment_counts", "summary")
            txt_fname = "metadata.tsv"
            txt_fpath = file.path(txt_fdiry, txt_fname)
        
            ### read metadata
            dat_metadata_assay = read_tsv(txt_fpath, show_col_types = FALSE)

            ### setup sample names (values = file names, names = sample labels;
            ### used below to rename file-name columns to sample labels)
            vec1 = dat_metadata_assay$FName
            vec2 = dat_metadata_assay$Sample
            vec_txt_sample = setNames(vec1, vec2)
            
            ### set directory for assay counts
            txt_fdiry = file.path(FD_RES, "assay_fcc", txt_assay, "coverage", txt_region_folder, txt_region_label)
            txt_fname = "*WGS*bed.gz"
            txt_fglob = file.path(txt_fdiry, txt_fname)

            ### get count files and show
            vec_txt_fpath = Sys.glob(txt_fglob)
            vec_txt_fname = basename(vec_txt_fpath)

            ### show progress: prepare
            ### (the table displayed under this label is the assay metadata)
            cat("Region Metatable:\n")
            flush.console()
            fun_display_table(dat_metadata_assay)
            cat("\n")
            flush.console()
            
            cat("Import Counts:\n")
            flush.console()
            print(vec_txt_fname)
            cat("\n")
            flush.console()

            ### fail early with a clear message when no count file is found;
            ### otherwise Reduce() below returns NULL and the rename step
            ### fails with an uninformative error
            if (length(vec_txt_fpath) == 0) {
                stop("No count files match: ", txt_fglob, call. = FALSE)
            }
            
            ### loop through count files and import them
            lst_dat = lapply(vec_txt_fpath, function(txt_fpath){

                ### read table and arrange the columns:
                ### add a "chrom:start-end" region id, turn "." counts into 0,
                ### convert counts to integer, and drop duplicated rows
                dat = read_tsv(
                    txt_fpath, 
                    col_names = vec_txt_cname, 
                    col_types = vec_col_ctype, 
                    show_col_types = FALSE)
                dat = dat %>% 
                    dplyr::select(Chrom, ChromStart, ChromEnd, Count) %>%
                    dplyr::mutate(Region = paste0(Chrom, ":", ChromStart, "-", ChromEnd)) %>%
                    dplyr::mutate(Count  = replace(Count, Count == ".", "0")) %>%
                    dplyr::mutate(Count  = as.integer(Count)) %>%
                    dplyr::select(Chrom, ChromStart, ChromEnd, Region, Count) %>%
                    dplyr::distinct()
    
                ### rename column to include the file name
                txt_fname = basename(txt_fpath)
                vec = c("Chrom", "ChromStart", "ChromEnd", "Region", txt_fname)
                colnames(dat) = vec
                
                return(dat)
            })

            ### combine imported counts into a matrix
            ### (columns named by file are renamed to their sample labels;
            ###  any_of() ignores metadata files that were not imported)
            dat_cnt = Reduce(fun, lst_dat)
            dat_cnt = dplyr::rename(dat_cnt, any_of(vec_txt_sample))

            ### combine count matrix and column data
            lst_dat = list(
                "data_cnt" = dat_cnt,
                "data_col" = dat_metadata_assay)

            ### set directory for saving results
            ### (created from R rather than via a shell `mkdir -p`, so paths
            ###  with spaces or shell metacharacters are handled correctly;
            ###  an already existing directory is silently accepted)
            txt_fdiry = file.path(FD_RES, "assay_fcc", txt_assay, "coverage", txt_region_folder, txt_region_label, "summary")
            dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

            ### save count matrix
            txt_fname = "matrix.raw.count.WGS.tsv"
            txt_fpath = file.path(txt_fdiry, txt_fname)
            
            dat = dat_cnt
            write_tsv(dat, txt_fpath)

            ### save data list
            txt_fname = "data_list_count_column.rds"
            txt_fpath = file.path(txt_fdiry, txt_fname)

            obj = lst_dat
            saveRDS(obj, txt_fpath)

            ### show progress: results
            cat("Save count matrix and data list.\n")
            flush.console()
            fun_display_table(head(dat_cnt))
            fun_display_table(tail(dat_cnt))
            cat("\n")
            flush.console()
        }   
    }
}


Region Folder: encode_open_chromatin 
Region Label:  dnase_ENCFF274YGF 
Assay  Folder: STARR_ATAC_K562_Reddy_KS91 

Region Metatable:


FName,Assay,Prefix,Group,Sample
ASTARRseq_K562_KS91.hg38.Input.rep1.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep1
ASTARRseq_K562_KS91.hg38.Input.rep2.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep2
ASTARRseq_K562_KS91.hg38.Input.rep3.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep3
ASTARRseq_K562_KS91.hg38.Input.rep4.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep4
ASTARRseq_K562_KS91.hg38.Input.rep5.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep5
ASTARRseq_K562_KS91.hg38.Input.rep6.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Input,Input.rep6
ASTARRseq_K562_KS91.hg38.Output.rep1.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep1
ASTARRseq_K562_KS91.hg38.Output.rep2.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep2
ASTARRseq_K562_KS91.hg38.Output.rep3.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep3
ASTARRseq_K562_KS91.hg38.Output.rep4.WGS.unstranded.bed.gz,STARR_ATAC_K562_Reddy_KS91,ASTARRseq_K562_KS91,Output,Output.rep4



Import Counts:
 [1] "ASTARRseq_K562_KS91.hg38.Input.rep1.WGS.unstranded.bed.gz" 
 [2] "ASTARRseq_K562_KS91.hg38.Input.rep2.WGS.unstranded.bed.gz" 
 [3] "ASTARRseq_K562_KS91.hg38.Input.rep3.WGS.unstranded.bed.gz" 
 [4] "ASTARRseq_K562_KS91.hg38.Input.rep4.WGS.unstranded.bed.gz" 
 [5] "ASTARRseq_K562_KS91.hg38.Input.rep5.WGS.unstranded.bed.gz" 
 [6] "ASTARRseq_K562_KS91.hg38.Input.rep6.WGS.unstranded.bed.gz" 
 [7] "ASTARRseq_K562_KS91.hg38.Output.rep1.WGS.unstranded.bed.gz"
 [8] "ASTARRseq_K562_KS91.hg38.Output.rep2.WGS.unstranded.bed.gz"
 [9] "ASTARRseq_K562_KS91.hg38.Output.rep3.WGS.unstranded.bed.gz"
[10] "ASTARRseq_K562_KS91.hg38.Output.rep4.WGS.unstranded.bed.gz"

Save count matrix and data list.


Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
chr1,181400,181530,chr1:181400-181530,166,229,227,218,220,180,7,11,16,8
chr1,778660,778800,chr1:778660-778800,3014,3520,3601,3482,3447,3189,1937,2413,2644,4819
chr1,779137,779200,chr1:779137-779200,1137,1390,1465,1383,1383,1268,84,128,182,343
chr1,827460,827554,chr1:827460-827554,2402,2947,3119,2971,2918,2721,636,717,820,1657
chr1,842880,843060,chr1:842880-843060,448,526,566,547,510,471,99,190,217,284
chr1,869800,869980,chr1:869800-869980,880,1034,1068,1034,994,935,46,38,51,105


Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
chrX,156001660,156001860,chrX:156001660-156001860,363,478,464,446,422,402,64,190,175,330
chrX,156002520,156002780,chrX:156002520-156002780,336,423,499,448,439,392,92,134,117,295
chrY,11103140,11103280,chrY:11103140-11103280,0,0,0,0,0,0,0,0,0,0
chrY,11712280,11712332,chrY:11712280-11712332,0,0,0,0,0,0,0,0,0,0
chrY,12347360,12347500,chrY:12347360-12347500,0,0,0,0,0,0,0,0,0,0
chrY,26563060,26563204,chrY:26563060-26563204,0,0,0,0,0,0,0,0,0,0




Region Folder: encode_open_chromatin 
Region Label:  dnase_ENCFF274YGF 
Assay  Folder: STARR_WHG_K562_Reddy_A001 

Region Metatable:


FName,Assay,Prefix,Group,Sample
WSTARRseq_K562_A001.hg38.Input.rep1.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep1
WSTARRseq_K562_A001.hg38.Input.rep2.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep2
WSTARRseq_K562_A001.hg38.Input.rep3.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep3
WSTARRseq_K562_A001.hg38.Input.rep4.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Input,Input.rep4
WSTARRseq_K562_A001.hg38.Output.rep1.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Output,Output.rep1
WSTARRseq_K562_A001.hg38.Output.rep2.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Output,Output.rep2
WSTARRseq_K562_A001.hg38.Output.rep3.WGS.unstranded.bed.gz,STARR_WHG_K562_Reddy_A001,WSTARRseq_K562_A001,Output,Output.rep3



Import Counts:
[1] "WSTARRseq_K562_A001.hg38.Input.rep1.WGS.unstranded.bed.gz" 
[2] "WSTARRseq_K562_A001.hg38.Input.rep2.WGS.unstranded.bed.gz" 
[3] "WSTARRseq_K562_A001.hg38.Input.rep3.WGS.unstranded.bed.gz" 
[4] "WSTARRseq_K562_A001.hg38.Input.rep4.WGS.unstranded.bed.gz" 
[5] "WSTARRseq_K562_A001.hg38.Output.rep1.WGS.unstranded.bed.gz"
[6] "WSTARRseq_K562_A001.hg38.Output.rep2.WGS.unstranded.bed.gz"
[7] "WSTARRseq_K562_A001.hg38.Output.rep3.WGS.unstranded.bed.gz"

Save count matrix and data list.


Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
chr1,181400,181530,chr1:181400-181530,3,18,12,15,34,29,51
chr1,778660,778800,chr1:778660-778800,10,35,27,30,195,181,312
chr1,779137,779200,chr1:779137-779200,8,31,17,23,56,77,114
chr1,827460,827554,chr1:827460-827554,10,25,24,25,99,89,128
chr1,842880,843060,chr1:842880-843060,7,21,19,14,48,40,96
chr1,869800,869980,chr1:869800-869980,9,25,46,33,43,41,79


Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
chrX,156001660,156001860,chrX:156001660-156001860,6,16,28,27,80,40,108
chrX,156002520,156002780,chrX:156002520-156002780,7,28,23,23,68,54,119
chrY,11103140,11103280,chrY:11103140-11103280,5,8,7,6,10,11,26
chrY,11712280,11712332,chrY:11712280-11712332,2,1,4,4,4,4,6
chrY,12347360,12347500,chrY:12347360-12347500,0,0,0,1,0,0,0
chrY,26563060,26563204,chrY:26563060-26563204,2,4,3,4,5,4,16



