**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_combeffect 
BASE DIRECTORY:     /data/reddylab/Kuei 
WORK DIRECTORY:     /data/reddylab/Kuei/out 
CODE DIRECTORY:     /data/reddylab/Kuei/code 
PATH OF SOURCE:     /data/reddylab/Kuei/source 
PATH OF EXECUTABLE: /data/reddylab/Kuei/bin 
PATH OF ANNOTATION: /data/reddylab/Kuei/annotation 
PATH OF PROJECT:    /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS:    /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 


## Helper function

In [2]:
ASSAY  = "A001_K562_WSTARRseq"

GROUPS  = c("Input", "Output")

SAMPLES = c(
    paste0("Input.rep",  1:4),
    paste0("Output.rep", 1:3))

get_info = function(fpath, strings){
    idx = str_detect(string = fpath, pattern = strings)
    return(strings[idx])
}

get_group  = function(fpath){return(get_info(fpath, GROUPS))}
get_sample = function(fpath){return(get_info(fpath, SAMPLES))}

## Get file paths and set metadata

In [3]:
ASSAY = "A001_K562_WSTARRseq"
FOLDER = "coverage_astarrseq_peak_macs_input"
fglob  = file.path(FD_RES, "results", ASSAY, FOLDER, "*WGS*bed.gz")
fpaths = Sys.glob(fglob)
for (fpath in fpaths){
    print(fpath)
    print(get_group(fpath))
    print(get_sample(fpath))
}

[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep1"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep2"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep3"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep4"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz"
[1] "Output"
[1] "

In [4]:
dat_meta = data.frame(
    Sample = SAMPLES,
    Group  = sapply(fpaths, get_group),
    FPath  = fpaths
)
rownames(dat_meta) = SAMPLES
dat_meta

Unnamed: 0_level_0,Sample,Group,FPath
Unnamed: 0_level_1,<chr>,<chr>,<chr>
Input.rep1,Input.rep1,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz
Input.rep2,Input.rep2,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz
Input.rep3,Input.rep3,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz
Input.rep4,Input.rep4,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz
Output.rep1,Output.rep1,Output,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz
Output.rep2,Output.rep2,Output,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep2.WGS.unstranded.bed.gz
Output.rep3,Output.rep3,Output,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep3.WGS.unstranded.bed.gz


## Import data and arrange into count matrix

In [5]:
cnames = c("Chrom", "Start", "End", "Count")
ctypes = c(col_character(), col_integer(), col_integer(), col_integer())

lst = lapply(seq_along(fpaths), function(idx){
    fpath = fpaths[idx]
    sam = SAMPLES[idx]
    dat = read_tsv(fpath, col_names = cnames, col_types = ctypes) %>% 
        dplyr::filter(Count != ".") %>% 
        dplyr::mutate(Count  = as.integer(Count)) %>%
        dplyr::mutate(
            Peak   = paste(Chrom, Start, End, sep="_"),
            Sample = sam)
    return(dat)
})

dat_count = bind_rows(lst) %>% spread(Sample, Count)
head(dat_count)

Chrom,Start,End,Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,10015,10442,chr1_10015_10442,1,1,1,1,,,
chr1,17237,17772,chr1_17237_17772,5,12,15,17,18.0,23.0,27.0
chr1,136071,137429,chr1_136071_137429,3,4,7,4,12.0,12.0,29.0
chr1,137737,139544,chr1_137737_139544,14,40,41,52,145.0,144.0,217.0
chr1,180982,182087,chr1_180982_182087,8,31,26,28,63.0,57.0,99.0
chr1,183239,184602,chr1_183239_184602,12,40,36,49,71.0,75.0,163.0


In [6]:
dat_count %>% dplyr::filter(is.na(Peak))

Chrom,Start,End,Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>


In [7]:
dat_count %>% dplyr::filter(is.na(Input.rep1))

Chrom,Start,End,Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,16015,16477,chr1_16015_16477,,6,9,12,20,14,33
chr1,115411,115986,chr1_115411_115986,,1,2,,2,2,8
chr1,118518,118743,chr1_118518_118743,,1,2,,,3,3
chr1,202111,202447,chr1_202111_202447,,2,3,3,3,2,3
chr1,811251,811723,chr1_811251_811723,,4,6,6,14,14,19
chr1,892051,892408,chr1_892051_892408,,6,12,10,10,8,14
chr1,2687105,2688564,chr1_2687105_2688564,,5,8,9,13,8,33
chr1,2757629,2757900,chr1_2757629_2757900,,4,3,2,3,2,7
chr1,3643071,3643605,chr1_3643071_3643605,,30,31,37,56,61,95
chr1,7601060,7601336,chr1_7601060_7601336,,11,7,7,14,15,28


In [8]:
dat_count = dat_count %>% replace(is.na(.), 0)
dat_count %>% dplyr::filter(is.na(Input.rep1))

Chrom,Start,End,Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>


## From raw count to CPM and log2FC

**Import library size**

In [9]:
fdiry = file.path(FD_RES, "results", ASSAY, "coverage", "summary")
fname = "library_size_summary.csv"
fpath = file.path(fdiry, fname)
dat_lib = read_csv(fpath, show_col_types = FALSE)
dat_lib = dat_lib %>% dplyr::filter(Region == "WGS")
dat_lib

Type,Region,Group,Sample,Size,FName
<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
raw,WGS,Input,Input.rep1,26908970,A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz
raw,WGS,Input,Input.rep2,99899775,A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz
raw,WGS,Input,Input.rep3,105623984,A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz
raw,WGS,Input,Input.rep4,108635002,A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz
raw,WGS,Output,Output.rep1,160349140,A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz
raw,WGS,Output,Output.rep2,157326312,A001_K562_WSTARRseq.Output.rep2.WGS.unstranded.bed.gz
raw,WGS,Output,Output.rep3,328185275,A001_K562_WSTARRseq.Output.rep3.WGS.unstranded.bed.gz


**Calculate CPM matrix and mean**

In [10]:
dat = dat_count %>% 
    tidyr::gather(Sample, Count, -Chrom, -Start, -End, -Peak) %>%
    dplyr::left_join(dat_lib, by="Sample") %>%
    dplyr::mutate(CPM = Count * 1e6 / Size)

dat_cpm = dat %>% 
   dplyr::select(Chrom, Start, End, Peak, Sample, CPM) %>%
   tidyr::spread(Sample, CPM)

dat_log2fc = dat %>% 
   dplyr::group_by(Chrom, Start, End, Peak, Group) %>%
   dplyr::summarise(CPM = mean(CPM, na.omit=TRUE), .groups="drop") %>%
   tidyr::spread(Group, CPM) %>%
   dplyr::mutate(log2FoldChange = log2(Output) - log2(Input))

In [11]:
head(dat_cpm)

Chrom,Start,End,Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10015,10442,chr1_10015_10442,0.03716233,0.01001003,0.009467547,0.009205136,0.0,0.0,0.0
chr1,17237,17772,chr1_17237_17772,0.18581165,0.12012039,0.1420132,0.156487317,0.112255,0.14619296,0.0822706
chr1,136071,137429,chr1_136071_137429,0.11148699,0.04004013,0.066272827,0.036820545,0.0748367,0.07627459,0.08836472
chr1,137737,139544,chr1_137737_139544,0.52027261,0.4004013,0.388169414,0.478667087,0.9042768,0.91529508,0.66121187
chr1,180982,182087,chr1_180982_182087,0.29729863,0.31031101,0.246156214,0.257743816,0.3928927,0.3623043,0.30165887
chr1,183239,184602,chr1_183239_184602,0.44594795,0.4004013,0.340831681,0.451051679,0.4427838,0.47671619,0.49667067


In [12]:
head(dat_log2fc)

Chrom,Start,End,Peak,Input,Output,log2FoldChange
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
chr1,10015,10442,chr1_10015_10442,0.016461261,0.0,-inf
chr1,14253,14645,chr1_14253_14645,0.0,0.001015686,inf
chr1,16015,16477,chr1_16015_16477,0.063932438,0.104755936,0.712412
chr1,17237,17772,chr1_17237_17772,0.151108138,0.113572871,-0.4119631
chr1,28903,29613,chr1_28903_29613,0.002366887,0.023860007,3.3335319
chr1,101603,101849,chr1_101603_101849,0.004668171,0.005125856,0.1349357


In [13]:
dat_cpm %>% dplyr::filter(is.na(Input.rep1))

Chrom,Start,End,Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>


In [14]:
dat_log2fc %>% dplyr::filter(is.na(Input))

Chrom,Start,End,Peak,Input,Output,log2FoldChange
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>


## Store the results

In [15]:
fdiry = file.path(FD_RES, "results", ASSAY, FOLDER, "summary")

fname = "matrix.raw.count.WGS.tsv"
fpath = file.path(fdiry, fname)
write_tsv(dat_count, fpath)

fname = "matrix.raw.cpm.WGS.tsv"
fpath = file.path(fdiry, fname)
write_tsv(dat_cpm, fpath)

fname = "metadata.raw.WGS.tsv"
fpath = file.path(fdiry, fname)
write_tsv(dat_meta, fpath)

fname = "result.Log2FC.raw.cpm.WGS.tsv"
fpath = file.path(fdiry, fname)
write_tsv(dat_log2fc, fpath)