**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_combeffect 
BASE DIRECTORY:     /mount/work 
PATH OF SOURCE:     /mount/work/source 
PATH OF EXECUTABLE: /mount/work/exe 
PATH OF ANNOTATION: /mount/work/annotation 
PATH OF PROJECT:    /mount/project 
PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc 


## Helper function

In [2]:
PREFIX  = "A001_K562_WSTARRseq"

GROUPS  = c("Input", "Output")

SAMPLES = c(
    paste0("Input_rep",  1:4),
    paste0("Output_rep", 1:3))

get_group = function(fpath){
    fname = basename(fpath)
    has_input = str_detect(fname, "input")
    if (has_input){
        return("Input")
    } else {
        return("Output")
    }
}

## Get file paths and set metadata

In [3]:
fpath = "/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep1.masked.dedup.fragments.counts.txt.gz"
fname = basename(fpath)
str_detect(fname, "input")

In [4]:
PREFIX = "A001_K562_WSTARRseq"
FOLDER = "coverage_astarrseq_peak_macs_input"
fglob  = file.path(FD_RES, PREFIX, FOLDER, "*counts.txt.gz")
fpaths = Sys.glob(fglob)
for (fpath in fpaths){
    print(fpath)
    print(get_group(fpath))
    #print(get_sample(fpath))
}

[1] "/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep1.masked.dedup.fragments.counts.txt.gz"
[1] "Input"
[1] "/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep2.masked.dedup.fragments.counts.txt.gz"
[1] "Input"
[1] "/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep3.masked.dedup.fragments.counts.txt.gz"
[1] "Input"
[1] "/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep4.masked.dedup.fragments.counts.txt.gz"
[1] "Input"
[1] "/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-K562-rep1.masked.dedup.fragments.counts.txt.gz"
[1] "Output"
[1] "/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-K562-rep2.masked.dedup.frag

In [5]:
dat_meta = data.frame(
    Sample = SAMPLES,
    Group  = sapply(fpaths, get_group),
    FPath  = fpaths
)
rownames(dat_meta) = SAMPLES
dat_meta

Unnamed: 0_level_0,Sample,Group,FPath
Unnamed: 0_level_1,<chr>,<chr>,<chr>
Input_rep1,Input_rep1,Input,/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep1.masked.dedup.fragments.counts.txt.gz
Input_rep2,Input_rep2,Input,/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep2.masked.dedup.fragments.counts.txt.gz
Input_rep3,Input_rep3,Input,/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep3.masked.dedup.fragments.counts.txt.gz
Input_rep4,Input_rep4,Input,/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-input-K562-rep4.masked.dedup.fragments.counts.txt.gz
Output_rep1,Output_rep1,Output,/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-K562-rep1.masked.dedup.fragments.counts.txt.gz
Output_rep2,Output_rep2,Output,/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-K562-rep2.masked.dedup.fragments.counts.txt.gz
Output_rep3,Output_rep3,Output,/mount/work/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001-K562-rep3.masked.dedup.fragments.counts.txt.gz


## Import data and arrange into count matrix

In [7]:
cnames = c("Chrom", "Start", "End", "Count")
ctypes = c(col_character(), col_integer(), col_integer(), col_integer())

lst = lapply(seq_along(fpaths), function(idx){
    fpath = fpaths[idx]
    sam = SAMPLES[idx]
    dat = read_tsv(fpath, col_names = cnames, col_types = ctypes) %>% 
        dplyr::filter(Count != ".") %>% 
        dplyr::mutate(Count  = as.integer(Count)) %>%
        dplyr::mutate(
            Peak   = paste(Chrom, Start, End, sep="_"),
            Sample = sam)
    return(dat)
})

dat_count = bind_rows(lst) %>% spread(Sample, Count)
head(dat_count)

Chrom,Start,End,Peak,Input_rep1,Input_rep2,Input_rep3,Input_rep4,Output_rep1,Output_rep2,Output_rep3
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,10015,10442,chr1_10015_10442,1,1,1,1,,,
chr1,17237,17772,chr1_17237_17772,5,12,15,17,18.0,23.0,27.0
chr1,136071,137429,chr1_136071_137429,3,4,7,4,12.0,12.0,29.0
chr1,137737,139544,chr1_137737_139544,14,40,41,52,145.0,144.0,217.0
chr1,180982,182087,chr1_180982_182087,8,31,26,28,63.0,57.0,99.0
chr1,183239,184602,chr1_183239_184602,12,40,36,49,71.0,75.0,163.0


**Check**

In [8]:
fpath = fpaths[5]
dat = read_tsv(fpath, col_names = cnames, col_types = ctypes)
head(dat)

Chrom,Start,End,Count
<chr>,<dbl>,<dbl>,<chr>
chr1,10015,10442,.
chr1,14253,14645,.
chr1,16015,16477,20
chr1,17237,17772,18
chr1,28903,29613,5
chr1,30803,31072,.


## Store the results

In [9]:
fdiry = file.path(FD_RES, PREFIX, FOLDER, "summary")

fname = "wgs_count_matrix.tsv"
fpath = file.path(fdiry, fname)
write_tsv(dat_count, fpath)

fname = "wgs_metadata.tsv"
fpath = file.path(fdiry, fname)
write_tsv(dat_meta, fpath)