**Set environment**

In [1]:
### Load the project configuration script; any messages and warnings raised
### while sourcing it are silenced so the cell output stays clean.
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
### Print the environment summary (directories / paths in use).
### NOTE(review): show_env() is presumably defined by the sourced config -- confirm.
show_env()

You are in Singularity: singularity_proj_combeffect 
BASE DIRECTORY:     /data/reddylab/Kuei 
WORK DIRECTORY:     /data/reddylab/Kuei/out 
CODE DIRECTORY:     /data/reddylab/Kuei/code 
PATH OF SOURCE:     /data/reddylab/Kuei/source 
PATH OF EXECUTABLE: /data/reddylab/Kuei/bin 
PATH OF ANNOTATION: /data/reddylab/Kuei/annotation 
PATH OF PROJECT:    /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS:    /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 


**Check data**

In [2]:
### assay identifier and the target regions handled in this notebook
ASSAY   = "A001_K562_WSTARRseq"
REGIONS = c("GATA1", "MYC", "FADS")

### helpers that parse labels out of a coverage file path;
### each one yields the first regex match found in the path (NA if none)

### group label: "Input" or "Output"
get_group = function(fpath){
    str_extract(fpath, pattern = "Input|Output")
}

### sample label: group followed by a replicate tag (rep + one digit) or "mean"
get_sample = function(fpath){
    str_extract(fpath, pattern = "Input.rep\\d|Input.mean|Output.rep\\d|Output.mean")
}

### region label: one of WGS, GATA1, MYC, FADS
get_region = function(fpath){
    str_extract(fpath, pattern = "WGS|GATA1|MYC|FADS")
}

In [3]:
### collect the coverage files of the first region
region = REGIONS[1]
fdiry  = file.path(FD_RES, "results", ASSAY, "coverage")
fname  = paste("", region, "bed.gz", sep="*")
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)

### for every file, show the file name and the labels parsed from its path
for (fpath in fpaths){
    fname  = basename(fpath)
    region = get_region(fpath)
    group  = get_group(fpath)
    sam    = get_sample(fpath)
    for (txt in c(fname, region, group, sam)){
        print(txt)
    }
    cat("++++++++++++++++++++++\n")
}

[1] "A001_K562_WSTARRseq.Input.rep1.GATA1.unstranded.bed.gz"
[1] "GATA1"
[1] "Input"
[1] "Input.rep1"
++++++++++++++++++++++
[1] "A001_K562_WSTARRseq.Input.rep2.GATA1.unstranded.bed.gz"
[1] "GATA1"
[1] "Input"
[1] "Input.rep2"
++++++++++++++++++++++
[1] "A001_K562_WSTARRseq.Input.rep3.GATA1.unstranded.bed.gz"
[1] "GATA1"
[1] "Input"
[1] "Input.rep3"
++++++++++++++++++++++
[1] "A001_K562_WSTARRseq.Input.rep4.GATA1.unstranded.bed.gz"
[1] "GATA1"
[1] "Input"
[1] "Input.rep4"
++++++++++++++++++++++
[1] "A001_K562_WSTARRseq.Output.rep1.GATA1.unstranded.bed.gz"
[1] "GATA1"
[1] "Output"
[1] "Output.rep1"
++++++++++++++++++++++
[1] "A001_K562_WSTARRseq.Output.rep2.GATA1.unstranded.bed.gz"
[1] "GATA1"
[1] "Output"
[1] "Output.rep2"
++++++++++++++++++++++
[1] "A001_K562_WSTARRseq.Output.rep3.GATA1.unstranded.bed.gz"
[1] "GATA1"
[1] "Output"
[1] "Output.rep3"
++++++++++++++++++++++


**RUN**

In [4]:
### locate the library size summary table
fdiry   = file.path(FD_RES, "results", ASSAY, "coverage", "summary")
fname   = "library_size_summary.csv"
fpath   = file.path(fdiry, fname)

### read the table and drop the file name column in one pipeline
dat_lib = fpath %>%
    read_csv(show_col_types = FALSE) %>%
    dplyr::select(-FName)

### display the library sizes
dat_lib

Type,Region,Group,Sample,Size
<chr>,<chr>,<chr>,<chr>,<dbl>
raw,FADS,Input,Input.rep1,1892
raw,GATA1,Input,Input.rep1,19540
raw,MYC,Input,Input.rep1,21892
raw,WGS,Input,Input.rep1,26908970
raw,FADS,Input,Input.rep2,6921
raw,GATA1,Input,Input.rep2,72119
raw,MYC,Input,Input.rep2,81027
raw,WGS,Input,Input.rep2,99899775
raw,FADS,Input,Input.rep3,7348
raw,GATA1,Input,Input.rep3,75963


In [5]:
### column names and readr column types of the coverage bed files
### (Count is read as character because missing values are encoded as ".")
cnames = c("Chrom", "Start", "End", "Count")
ctypes = c("ciic")

### For each region: import the per-sample coverage files, build wide tables of
### raw counts and CPM (one column per sample), summarize Input/Output into
### fold changes, and write the three tables into the coverage summary folder.
for (region in REGIONS){
    ### show progress
    cat("\n+++++++++++++++++++\n")
    cat("Region:", region, "\n")
    flush.console()
    
    ### set file paths
    fdiry  = file.path(FD_RES, "results", ASSAY, "coverage")
    fname  = paste("", region, "bed.gz", sep="*")
    fglob  = file.path(fdiry, fname)
    fpaths = Sys.glob(fglob)
    
    ### import data and add info
    lst = lapply(fpaths, function(fpath){
        ### import data; "." marks a missing count and is treated as zero
        dat = read_tsv(fpath, col_names = cnames, col_types = ctypes)
        dat = dat %>% 
            dplyr::mutate(Count = replace(Count, Count == ".", 0)) %>%
            dplyr::mutate(Count = as.double(Count))

        ### add information
        ### Region is fixed to "WGS" (not parsed from the file name) so that the
        ### join with the library size table below picks the WGS library size.
        ### NOTE(review): presumably intentional normalization by the whole
        ### library rather than by the per-region size -- confirm.
        dat$Region = "WGS"
        dat$Group  = get_group(fpath)
        dat$Sample = get_sample(fpath)

        ### return result dataframe
        return(dat)
    })
    
    ### stack all samples once; reused for both the count and the CPM table
    dat_long = bind_rows(lst)
    
    ### get raw counts
    dat_count = dat_long %>% 
        dplyr::select(Chrom, Start, End, Count, Sample) %>% 
        tidyr::spread(Sample, Count)
    
    cat("\nGet raw counts\n")
    print(dim(dat_count))
    print(head(dat_count))
    flush.console()
    
    ### calculate counts per million
    ### NOTE(review): assumes dat_lib holds exactly one row per
    ### (Region, Group, Sample), e.g. a single Type -- otherwise the join
    ### duplicates rows; verify against the library size table.
    dat_cpm = dat_long %>% 
        dplyr::left_join(dat_lib, by=c("Region", "Group", "Sample")) %>%
        dplyr::mutate(CPM = Count * 1000000 / Size) %>% 
        dplyr::select(Chrom, Start, End, CPM, Sample) %>% 
        tidyr::spread(Sample, CPM)
    
    cat("\nGet CPM\n")
    print(dim(dat_cpm))
    print(head(dat_cpm))
    flush.console()
    
    ### summarize input and output to calculate fold changes
    ### Row means skip NA (a position absent in a sample becomes NA after the
    ### spread). The previous `mean, na.omit=TRUE` was silently ignored by
    ### mean(), so NAs were never removed; `na.rm = TRUE` is the real argument.
    dat   = dat_cpm %>% dplyr::select(Chrom, Start, End)
    x_inp = dat_cpm %>% dplyr::select(starts_with("Input"))  %>% rowMeans(na.rm = TRUE)
    x_out = dat_cpm %>% dplyr::select(starts_with("Output")) %>% rowMeans(na.rm = TRUE)
    x_xfc =      x_out      /      x_inp
    x_lfc = log2(x_out)     - log2(x_inp)
    ### pseudo-count version stays finite when either mean is zero
    x_pfc = log2(x_out + 1) - log2(x_inp + 1)
    
    dat_Log2fc = data.frame(cbind(dat, x_inp, x_out, x_xfc, x_lfc, x_pfc))
    colnames(dat_Log2fc) = c("Chrom", "Start", "End", "Input", "Output", "FC", "Log2FC", "pLog2FC")
    
    cat("\nGet Log2FC\n")
    print(dim(dat_Log2fc))
    print(head(dat_Log2fc))
    flush.console()
    
    ### store the results
    fdiry = file.path(FD_RES, "results", ASSAY, "coverage", "summary")
    fname = paste("track", "raw", "count",  region, "tsv", sep=".")
    fpath = file.path(fdiry, fname)
    write_tsv(dat_count, fpath)
    cat(fpath, "\n")
          
    fname = paste("track", "raw", "cpm",    region, "tsv", sep=".")
    fpath = file.path(fdiry, fname)
    write_tsv(dat_cpm, fpath)
    cat(fpath, "\n")
          
    fname = paste("track", "raw", "Log2FC", region, "tsv", sep=".")
    fpath = file.path(fdiry, fname)
    write_tsv(dat_Log2fc, fpath)
    cat(fpath, "\n")
    flush.console()
}


+++++++++++++++++++
Region: GATA1 

Get raw counts
[1] 2095150      10
[90m# A tibble: 6 × 10[39m
  Chrom    Start     End Input…¹ Input…² Input…³ Input…⁴ Outpu…⁵ Outpu…⁶ Outpu…⁷
  [3m[90m<chr>[39m[23m    [3m[90m<int>[39m[23m   [3m[90m<int>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m
[90m1[39m chrX  47[4m7[24m[4m8[24m[4m5[24m501  4.78[90me[39m7       0       1       5       8       6       4       8
[90m2[39m chrX  47[4m7[24m[4m8[24m[4m5[24m502  4.78[90me[39m7       0       1       5       8       6       4       8
[90m3[39m chrX  47[4m7[24m[4m8[24m[4m5[24m503  4.78[90me[39m7       0       1       5       8       6       4       8
[90m4[39m chrX  47[4m7[24m[4m8[24m[4m5[24m504  4.78[90me[39m7       0       1       5       8       6       4       8
[90m5[39m chrX  47[4m7[24m[4m8