**Set environment**

In [1]:
# Source the project configuration; messages and warnings raised while sourcing are silenced.
# NOTE(review): presumably this script attaches readr/dplyr/stringr and defines FD_RES and
# show_env(), all of which later cells use without loading them here — confirm.
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
# Print the environment / directory settings (see the output below).
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Helper functions

In [2]:
# Return the first "Input" or "Output" label found in each file name
# (NA where neither label occurs).
get_group = function(fpath){
    group_pattern = "Input|Output"
    str_extract(fpath, group_pattern)
}
# Extract the sample label from each file name: "Input.rep<N>", "Input.mean",
# "Output.rep<N>" or "Output.mean" (NA where none of them occurs).
get_sample = function(fpath){
    # `\\d+` rather than `\\d`: a single-digit match truncated "rep10" to "rep1",
    # silently merging that replicate with replicate 1.
    # The unescaped `.` (any character) is kept on purpose so that file names
    # using another separator keep matching as before.
    return(str_extract(fpath, "Input.rep\\d+|Input.mean|Output.rep\\d+|Output.mean"))
}
# Return the leading part of each file name, up to but not including the first ".".
get_prefix = function(fpath){
    prefix_pattern = "^[^\\.]*"
    str_extract(fpath, prefix_pattern)
}

# Label each file name "norm" when it contains the substring "norm", otherwise "raw".
get_type = function(fpath){
    is_norm = str_detect(fpath, "norm")
    ifelse(is_norm, "norm", "raw")
}

# Label each file name "hg19" when it contains the substring "hg19"; every other
# file name (including those without any genome tag) is labelled "hg38".
get_genome = function(fpath){
    is_hg19 = str_detect(fpath, "hg19")
    ifelse(is_hg19, "hg19", "hg38")
}

## Summarize library size information for each assay

In [3]:
### init
ASSAYS = c("KS91_K562_ASTARRseq", "A001_K562_WSTARRseq", "Tewhey_K562_TileMPRA")
cnames = c("FName", "Size")
### build the column specification with cols(): c(col_character(), col_double())
### concatenates two classed empty lists into a plain empty list, so the intended
### types were never applied and readr silently guessed the column types instead
ctypes = cols(FName = col_character(), Size = col_double())

### loop through each assay: read the library size counts, annotate each file
### name, and write two summary tables next to the input file
for (assay in ASSAYS) {
    
    ### set file path
    fdiry = file.path(FD_RES, "results", assay, "coverage", "summary")
    fname = "library_size_count.csv"
    fpath = file.path(fdiry, fname)
    
    ### import (file has no header row) and add information parsed from the file name
    dat_lib = read_csv(fpath, col_names = cnames, col_types = ctypes)
    dat_lib = dat_lib %>% 
        dplyr::mutate(
            Prefix = get_prefix(FName),
            Group  = get_group(FName),
            Sample = get_sample(FName),
            Type   = get_type(FName),
            Genome = get_genome(FName)
        ) %>%
        dplyr::select(FName, Size, Prefix, Sample, Group, Type, Genome)
    
    ### store table
    fname = "library_size_summary.tsv"
    fpath = file.path(fdiry, fname)
    write_tsv(dat_lib, fpath)
    
    ### create sorted version without header for bash join command
    ### NOTE(review): `join` expects its input sorted in the collation it runs under;
    ### confirm the shell side uses the same ordering as arrange() (e.g. LC_ALL=C)
    dat_lib = dat_lib %>% arrange(FName)
    
    fname = "library_size_summary_sorted_no_cnames.tsv"
    fpath = file.path(fdiry, fname)
    write_tsv(dat_lib, fpath, col_names = FALSE)
    
    ### show progress
    print(assay)
    print(dat_lib)
    cat("++++++++++++++++++++++++++\n")

}

[1] "KS91_K562_ASTARRseq"
[90m# A tibble: 10 × 7[39m
   FName                                   Size Prefix Sample Group Type  Genome
   [3m[90m<chr>[39m[23m                                  [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m [3m[90m<chr>[39m[23m [3m[90m<chr>[39m[23m 
[90m 1[39m KS91_K562_ASTARRseq.Input.rep1.WGS.u… 3.49[90me[39m8 KS91_… Input… Input raw   hg38  
[90m 2[39m KS91_K562_ASTARRseq.Input.rep2.WGS.u… 4.51[90me[39m8 KS91_… Input… Input raw   hg38  
[90m 3[39m KS91_K562_ASTARRseq.Input.rep3.WGS.u… 4.88[90me[39m8 KS91_… Input… Input raw   hg38  
[90m 4[39m KS91_K562_ASTARRseq.Input.rep4.WGS.u… 4.56[90me[39m8 KS91_… Input… Input raw   hg38  
[90m 5[39m KS91_K562_ASTARRseq.Input.rep5.WGS.u… 4.44[90me[39m8 KS91_… Input… Input raw   hg38  
[90m 6[39m KS91_K562_ASTARRseq.Input.rep6.WGS.u… 3.97[90me[39m8 KS91_… Input… Input raw   hg38  
[90m 7[39m KS91_K562_ASTARRseq.Output.rep1

In [4]:
# Display the current contents of `dat_lib`.
dat_lib

FName,Size,Prefix,Sample,Group,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
OL13_20220512.hg19.norm.Input.rep1.stranded_pos.bed,6418470,OL13_20220512,Input.rep1,Input,norm,hg19
OL13_20220512.hg19.norm.Input.rep2.stranded_pos.bed,6412311,OL13_20220512,Input.rep2,Input,norm,hg19
OL13_20220512.hg19.norm.Input.rep3.stranded_pos.bed,6413568,OL13_20220512,Input.rep3,Input,norm,hg19
OL13_20220512.hg19.norm.Input.rep4.stranded_pos.bed,6410822,OL13_20220512,Input.rep4,Input,norm,hg19
OL13_20220512.hg19.norm.Output.rep1.stranded_pos.bed,24214237,OL13_20220512,Output.rep1,Output,norm,hg19
OL13_20220512.hg19.norm.Output.rep2.stranded_pos.bed,21967607,OL13_20220512,Output.rep2,Output,norm,hg19
OL13_20220512.hg19.norm.Output.rep3.stranded_pos.bed,20773782,OL13_20220512,Output.rep3,Output,norm,hg19
OL13_20220512.hg19.norm.Output.rep4.stranded_pos.bed,19239234,OL13_20220512,Output.rep4,Output,norm,hg19
OL13_20220512.hg19.raw.Input.rep1.stranded_pos.bed,11710957,OL13_20220512,Input.rep1,Input,raw,hg19
OL13_20220512.hg19.raw.Input.rep2.stranded_pos.bed,6544172,OL13_20220512,Input.rep2,Input,raw,hg19


In [4]:
### set the assay explicitly instead of relying on whatever value `assay` was left
### with by previously run cells (the recorded output of this cell is for KS91)
assay = "KS91_K562_ASTARRseq"

### set file path
fdiry = file.path(FD_RES, "results", assay, "coverage", "summary")
fname = "library_size_count.csv"
fpath = file.path(fdiry, fname)

### import and add additional information
cnames = c("FName", "Size")
### use cols(): c(col_character(), col_double()) collapses to an empty list,
### which makes readr guess the column types instead of enforcing them
ctypes = cols(FName = col_character(), Size = col_double())
dat_lib = read_csv(fpath, col_names = cnames, col_types = ctypes)

### annotate each row with fields parsed from the file name
dat_lib = dat_lib %>% 
    dplyr::mutate(
        Prefix = get_prefix(FName),
        Group  = get_group(FName),
        Sample = get_sample(FName),
        Type   = get_type(FName),
        Genome = get_genome(FName)
    )
dat_lib

FName,Size,Prefix,Group,Sample,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
KS91_K562_ASTARRseq.Input.rep1.WGS.unstranded.bed.gz,348695063,KS91_K562_ASTARRseq,Input,Input.rep1,raw,hg38
KS91_K562_ASTARRseq.Input.rep2.WGS.unstranded.bed.gz,451369741,KS91_K562_ASTARRseq,Input,Input.rep2,raw,hg38
KS91_K562_ASTARRseq.Input.rep3.WGS.unstranded.bed.gz,487579055,KS91_K562_ASTARRseq,Input,Input.rep3,raw,hg38
KS91_K562_ASTARRseq.Input.rep4.WGS.unstranded.bed.gz,456246254,KS91_K562_ASTARRseq,Input,Input.rep4,raw,hg38
KS91_K562_ASTARRseq.Input.rep5.WGS.unstranded.bed.gz,444268950,KS91_K562_ASTARRseq,Input,Input.rep5,raw,hg38
KS91_K562_ASTARRseq.Input.rep6.WGS.unstranded.bed.gz,397333562,KS91_K562_ASTARRseq,Input,Input.rep6,raw,hg38
KS91_K562_ASTARRseq.Output.rep1.WGS.unstranded.bed.gz,44103844,KS91_K562_ASTARRseq,Output,Output.rep1,raw,hg38
KS91_K562_ASTARRseq.Output.rep2.WGS.unstranded.bed.gz,97471282,KS91_K562_ASTARRseq,Output,Output.rep2,raw,hg38
KS91_K562_ASTARRseq.Output.rep3.WGS.unstranded.bed.gz,84103298,KS91_K562_ASTARRseq,Output,Output.rep3,raw,hg38
KS91_K562_ASTARRseq.Output.rep4.WGS.unstranded.bed.gz,183115379,KS91_K562_ASTARRseq,Output,Output.rep4,raw,hg38


In [5]:
# Display the table ordered by file name (the stored `dat_lib` is not modified).
arrange(dat_lib, FName)

FName,Size,Prefix,Group,Sample,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
KS91_K562_ASTARRseq.Input.rep1.WGS.unstranded.bed.gz,348695063,KS91_K562_ASTARRseq,Input,Input.rep1,raw,hg38
KS91_K562_ASTARRseq.Input.rep2.WGS.unstranded.bed.gz,451369741,KS91_K562_ASTARRseq,Input,Input.rep2,raw,hg38
KS91_K562_ASTARRseq.Input.rep3.WGS.unstranded.bed.gz,487579055,KS91_K562_ASTARRseq,Input,Input.rep3,raw,hg38
KS91_K562_ASTARRseq.Input.rep4.WGS.unstranded.bed.gz,456246254,KS91_K562_ASTARRseq,Input,Input.rep4,raw,hg38
KS91_K562_ASTARRseq.Input.rep5.WGS.unstranded.bed.gz,444268950,KS91_K562_ASTARRseq,Input,Input.rep5,raw,hg38
KS91_K562_ASTARRseq.Input.rep6.WGS.unstranded.bed.gz,397333562,KS91_K562_ASTARRseq,Input,Input.rep6,raw,hg38
KS91_K562_ASTARRseq.Output.rep1.WGS.unstranded.bed.gz,44103844,KS91_K562_ASTARRseq,Output,Output.rep1,raw,hg38
KS91_K562_ASTARRseq.Output.rep2.WGS.unstranded.bed.gz,97471282,KS91_K562_ASTARRseq,Output,Output.rep2,raw,hg38
KS91_K562_ASTARRseq.Output.rep3.WGS.unstranded.bed.gz,84103298,KS91_K562_ASTARRseq,Output,Output.rep3,raw,hg38
KS91_K562_ASTARRseq.Output.rep4.WGS.unstranded.bed.gz,183115379,KS91_K562_ASTARRseq,Output,Output.rep4,raw,hg38


In [4]:
# Select the assay whose library size table is imported in the next cell.
assay = "A001_K562_WSTARRseq"

In [5]:
### set file path (uses the `assay` value chosen in the preceding cell)
fdiry = file.path(FD_RES, "results", assay, "coverage", "summary")
fname = "library_size_count.csv"
fpath = file.path(fdiry, fname)

### import and add additional information
cnames = c("FName", "Size")
### use cols(): c(col_character(), col_double()) collapses to an empty list,
### which makes readr guess the column types instead of enforcing them
ctypes = cols(FName = col_character(), Size = col_double())
dat_lib = read_csv(fpath, col_names = cnames, col_types = ctypes)

### annotate each row with fields parsed from the file name
dat_lib = dat_lib %>% 
    dplyr::mutate(
        Prefix = get_prefix(FName),
        Group  = get_group(FName),
        Sample = get_sample(FName),
        Type   = get_type(FName),
        Genome = get_genome(FName)
    )
dat_lib

FName,Size,Prefix,Group,Sample,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz,26908970,A001_K562_WSTARRseq,Input,Input.rep1,raw,hg38
A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz,99899775,A001_K562_WSTARRseq,Input,Input.rep2,raw,hg38
A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz,105623984,A001_K562_WSTARRseq,Input,Input.rep3,raw,hg38
A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz,108635002,A001_K562_WSTARRseq,Input,Input.rep4,raw,hg38
A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz,160349140,A001_K562_WSTARRseq,Output,Output.rep1,raw,hg38
A001_K562_WSTARRseq.Output.rep2.WGS.unstranded.bed.gz,157326312,A001_K562_WSTARRseq,Output,Output.rep2,raw,hg38
A001_K562_WSTARRseq.Output.rep3.WGS.unstranded.bed.gz,328185275,A001_K562_WSTARRseq,Output,Output.rep3,raw,hg38


In [42]:
# Select the assay whose library size table is imported in the next cell
# (same value as the earlier A001 cell).
assay = "A001_K562_WSTARRseq"

In [43]:
### NOTE(review): this cell repeats the earlier A001 import, but the recorded size for
### Output.rep3 differs between the two outputs — presumably the count file changed
### between runs; confirm which result is current

### set file path (uses the `assay` value chosen in the preceding cell)
fdiry = file.path(FD_RES, "results", assay, "coverage", "summary")
fname = "library_size_count.csv"
fpath = file.path(fdiry, fname)

### import and add additional information
cnames = c("FName", "Size")
### use cols(): c(col_character(), col_double()) collapses to an empty list,
### which makes readr guess the column types instead of enforcing them
ctypes = cols(FName = col_character(), Size = col_double())
dat_lib = read_csv(fpath, col_names = cnames, col_types = ctypes)

### annotate each row with fields parsed from the file name
dat_lib = dat_lib %>% 
    dplyr::mutate(
        Prefix = get_prefix(FName),
        Group  = get_group(FName),
        Sample = get_sample(FName),
        Type   = get_type(FName),
        Genome = get_genome(FName)
    )
dat_lib

FName,Size,Prefix,Group,Sample,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz,26908970,A001_K562_WSTARRseq,Input,Input.rep1,raw,hg38
A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz,99899775,A001_K562_WSTARRseq,Input,Input.rep2,raw,hg38
A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz,105623984,A001_K562_WSTARRseq,Input,Input.rep3,raw,hg38
A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz,108635002,A001_K562_WSTARRseq,Input,Input.rep4,raw,hg38
A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz,160349140,A001_K562_WSTARRseq,Output,Output.rep1,raw,hg38
A001_K562_WSTARRseq.Output.rep2.WGS.unstranded.bed.gz,157326312,A001_K562_WSTARRseq,Output,Output.rep2,raw,hg38
A001_K562_WSTARRseq.Output.rep3.WGS.unstranded.bed.gz,38522888,A001_K562_WSTARRseq,Output,Output.rep3,raw,hg38


In [6]:
# Select the assay whose library size table is imported in the next cell.
assay = "Tewhey_K562_TileMPRA"

In [7]:
### set file path (uses the `assay` value chosen in the preceding cell)
fdiry = file.path(FD_RES, "results", assay, "coverage", "summary")
fname = "library_size_count.csv"
fpath = file.path(fdiry, fname)

### import and add additional information
cnames = c("FName", "Size")
### use cols(): c(col_character(), col_double()) collapses to an empty list,
### which makes readr guess the column types instead of enforcing them
ctypes = cols(FName = col_character(), Size = col_double())
dat_lib = read_csv(fpath, col_names = cnames, col_types = ctypes)

### annotate each row with fields parsed from the file name
dat_lib = dat_lib %>% 
    dplyr::mutate(
        Prefix = get_prefix(FName),
        Group  = get_group(FName),
        Sample = get_sample(FName),
        Type   = get_type(FName),
        Genome = get_genome(FName)
    )
dat_lib

FName,Size,Prefix,Group,Sample,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
OL13_20220512.hg19.norm.Input.rep1.stranded_pos.bed,6418470,OL13_20220512,Input,Input.rep1,norm,hg19
OL13_20220512.hg19.norm.Input.rep2.stranded_pos.bed,6412311,OL13_20220512,Input,Input.rep2,norm,hg19
OL13_20220512.hg19.norm.Input.rep3.stranded_pos.bed,6413568,OL13_20220512,Input,Input.rep3,norm,hg19
OL13_20220512.hg19.norm.Input.rep4.stranded_pos.bed,6410822,OL13_20220512,Input,Input.rep4,norm,hg19
OL13_20220512.hg19.norm.Output.rep1.stranded_pos.bed,24214237,OL13_20220512,Output,Output.rep1,norm,hg19
OL13_20220512.hg19.norm.Output.rep2.stranded_pos.bed,21967607,OL13_20220512,Output,Output.rep2,norm,hg19
OL13_20220512.hg19.norm.Output.rep3.stranded_pos.bed,20773782,OL13_20220512,Output,Output.rep3,norm,hg19
OL13_20220512.hg19.norm.Output.rep4.stranded_pos.bed,19239234,OL13_20220512,Output,Output.rep4,norm,hg19
OL13_20220512.hg19.raw.Input.rep1.stranded_pos.bed,11710957,OL13_20220512,Input,Input.rep1,raw,hg19
OL13_20220512.hg19.raw.Input.rep2.stranded_pos.bed,6544172,OL13_20220512,Input,Input.rep2,raw,hg19


In [8]:
# Display the table ordered by file name (the stored `dat_lib` is not modified).
arrange(dat_lib, FName)

FName,Size,Prefix,Group,Sample,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
OL13_20220512.hg19.norm.Input.rep1.stranded_pos.bed,6418470,OL13_20220512,Input,Input.rep1,norm,hg19
OL13_20220512.hg19.norm.Input.rep2.stranded_pos.bed,6412311,OL13_20220512,Input,Input.rep2,norm,hg19
OL13_20220512.hg19.norm.Input.rep3.stranded_pos.bed,6413568,OL13_20220512,Input,Input.rep3,norm,hg19
OL13_20220512.hg19.norm.Input.rep4.stranded_pos.bed,6410822,OL13_20220512,Input,Input.rep4,norm,hg19
OL13_20220512.hg19.norm.Output.rep1.stranded_pos.bed,24214237,OL13_20220512,Output,Output.rep1,norm,hg19
OL13_20220512.hg19.norm.Output.rep2.stranded_pos.bed,21967607,OL13_20220512,Output,Output.rep2,norm,hg19
OL13_20220512.hg19.norm.Output.rep3.stranded_pos.bed,20773782,OL13_20220512,Output,Output.rep3,norm,hg19
OL13_20220512.hg19.norm.Output.rep4.stranded_pos.bed,19239234,OL13_20220512,Output,Output.rep4,norm,hg19
OL13_20220512.hg19.raw.Input.rep1.stranded_pos.bed,11710957,OL13_20220512,Input,Input.rep1,raw,hg19
OL13_20220512.hg19.raw.Input.rep2.stranded_pos.bed,6544172,OL13_20220512,Input,Input.rep2,raw,hg19


In [27]:
# Scratch check: take the part of an example file name before the first "."
# by splitting on literal dots and keeping the first piece.
fpath = "OL45_20220927.hg38.raw.Input.rep4.stranded_pos.bed"
str_split(fpath, "\\.")[[1]][1]