**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


**Check data**

In [2]:
fdiry = file.path(FD_RES, "results", "A001_K562_WSTARRseq", "coverage_astarrseq_peak_macs_input")
dir(fdiry)

In [3]:
ASSAY  = "A001_K562_WSTARRseq"

GROUPS  = c("Input", "Output")

SAMPLES = c(
    paste0("Input.rep",  1:4),
    paste0("Output.rep", 1:3))

CNAMES = c("Chrom", "Start", "End", "Count")

get_info = function(fpath, strings){
    idx = str_detect(string = fpath, pattern = strings)
    return(strings[idx])
}

get_group  = function(fpath){return(get_info(fpath, GROUPS))}
get_sample = function(fpath){return(get_info(fpath, SAMPLES))}

## Get file paths and set metadata

In [4]:
ASSAY  = "A001_K562_WSTARRseq"
FOLDER = "coverage_astarrseq_peak_macs_input"
fglob  = file.path(FD_RES, "results", ASSAY, FOLDER, "*WGS*bed.gz")
fpaths = Sys.glob(fglob)
for (fpath in fpaths){
    print(fpath)
    print(get_group(fpath))
    print(get_sample(fpath))
}

[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep1"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep2"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep3"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz"
[1] "Input"
[1] "Input.rep4"
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz"
[1] "Output"
[1] "

In [5]:
dat_meta = data.frame(
    Sample = sapply(fpaths, get_sample),
    Group  = sapply(fpaths, get_group),
    FPath  = fpaths,
    row.names = NULL
)
dat_meta

Sample,Group,FPath
<chr>,<chr>,<chr>
Input.rep1,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz
Input.rep2,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz
Input.rep3,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz
Input.rep4,Input,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz
Output.rep1,Output,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz
Output.rep2,Output,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep2.WGS.unstranded.bed.gz
Output.rep3,Output,/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/A001_K562_WSTARRseq.Output.rep3.WGS.unstranded.bed.gz


## Import data

In [6]:
cnames = c("Chrom", "Start", "End", "Count")
ctypes = c(col_character(), col_integer(), col_integer(), col_integer())

lst = lapply(fpaths, function(fpath){
    sam = get_sample(fpath)
    dat = read_tsv(fpath, col_names = cnames, col_types = ctypes) %>% 
        dplyr::filter(Count != ".") %>% 
        dplyr::mutate(Count  = as.integer(Count)) %>%
        dplyr::mutate(
            Peak   = paste0(Chrom, ":", Start, "-", End),
            Sample = sam)
    return(dat)
})

dat_count = bind_rows(lst)
head(dat_count)

Chrom,Start,End,Count,Peak,Sample
<chr>,<dbl>,<dbl>,<int>,<chr>,<chr>
chr1,10015,10442,1,chr1:10015-10442,Input.rep1
chr1,17237,17772,5,chr1:17237-17772,Input.rep1
chr1,136071,137429,3,chr1:136071-137429,Input.rep1
chr1,137737,139544,14,chr1:137737-139544,Input.rep1
chr1,180982,182087,8,chr1:180982-182087,Input.rep1
chr1,183239,184602,12,chr1:183239-184602,Input.rep1


In [7]:
fdiry = file.path(FD_RES, "results", ASSAY, "coverage", "summary")
fname = "library_size_summary.tsv"
fpath = file.path(fdiry, fname)
dat_lib = read_tsv(fpath, show_col_types = FALSE)
dat_lib

FName,Size,Prefix,Sample,Group,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
A001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz,26908970,A001_K562_WSTARRseq,Input.rep1,Input,raw,hg38
A001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz,99899775,A001_K562_WSTARRseq,Input.rep2,Input,raw,hg38
A001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz,105623984,A001_K562_WSTARRseq,Input.rep3,Input,raw,hg38
A001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz,108635002,A001_K562_WSTARRseq,Input.rep4,Input,raw,hg38
A001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz,160349140,A001_K562_WSTARRseq,Output.rep1,Output,raw,hg38
A001_K562_WSTARRseq.Output.rep2.WGS.unstranded.bed.gz,157326312,A001_K562_WSTARRseq,Output.rep2,Output,raw,hg38
A001_K562_WSTARRseq.Output.rep3.WGS.unstranded.bed.gz,328185275,A001_K562_WSTARRseq,Output.rep3,Output,raw,hg38


## Calulate CPM

In [8]:
###
dat = left_join(dat_count, dat_lib, by = "Sample")
dat = dat %>%
    dplyr::mutate(CPM = Count * 10^6 / Size) %>%
    dplyr::select(Chrom, Start, End, Peak, Sample, Count, CPM)

###
dat_count_cpm = dat
print(dim(dat))
head(dat)

[1] 1725002       7


Chrom,Start,End,Peak,Sample,Count,CPM
<chr>,<dbl>,<dbl>,<chr>,<chr>,<int>,<dbl>
chr1,10015,10442,chr1:10015-10442,Input.rep1,1,0.03716233
chr1,17237,17772,chr1:17237-17772,Input.rep1,5,0.18581165
chr1,136071,137429,chr1:136071-137429,Input.rep1,3,0.11148699
chr1,137737,139544,chr1:137737-139544,Input.rep1,14,0.52027261
chr1,180982,182087,chr1:180982-182087,Input.rep1,8,0.29729863
chr1,183239,184602,chr1:183239-184602,Input.rep1,12,0.44594795


## Arrange to matrix

In [9]:
dat = dat_count_cpm
dat = dat %>% 
    dplyr::select(Peak, Sample, Count) %>%
    spread(Sample, Count) %>% 
    replace(is.na(.), 0)

mat_count_raw = dat
print(dim(dat))
head(dat)

[1] 246832      8


Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1:100006256-100006880,17,51,63,41,69,57,136
chr1:100010437-100010915,5,31,29,35,49,39,77
chr1:10002087-10003910,13,68,72,64,85,85,177
chr1:100021298-100021629,0,10,13,9,14,9,19
chr1:100023727-100023976,2,14,14,6,16,17,48
chr1:100027983-100029702,23,75,84,57,103,107,225


In [10]:
dat = dat_count_cpm
dat = dat %>% 
    dplyr::select(Peak, Sample, CPM) %>%
    spread(Sample, CPM) %>% 
    replace(is.na(.), 0)

mat_count_cpm = dat
print(dim(dat))
head(dat)

[1] 246832      8


Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:100006256-100006880,0.6317596,0.5105117,0.5964554,0.37741059,0.43031101,0.3623043,0.41440007
chr1:100010437-100010915,0.18581165,0.310311,0.2745589,0.32217977,0.30558318,0.24789242,0.23462357
chr1:10002087-10003910,0.48311028,0.6806822,0.6816634,0.58912872,0.53009327,0.54027835,0.5393295
chr1:100021298-100021629,0.0,0.1001003,0.1230781,0.08284623,0.08730948,0.05720594,0.05789413
chr1:100023727-100023976,0.07432466,0.1401405,0.1325457,0.05523082,0.09978226,0.10805567,0.14625885
chr1:100027983-100029702,0.85473357,0.7507524,0.7952739,0.52469277,0.64234832,0.6801151,0.68558835


## Calculate log2fc

In [11]:
dat = mat_count_cpm
head(dat)

Peak,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:100006256-100006880,0.6317596,0.5105117,0.5964554,0.37741059,0.43031101,0.3623043,0.41440007
chr1:100010437-100010915,0.18581165,0.310311,0.2745589,0.32217977,0.30558318,0.24789242,0.23462357
chr1:10002087-10003910,0.48311028,0.6806822,0.6816634,0.58912872,0.53009327,0.54027835,0.5393295
chr1:100021298-100021629,0.0,0.1001003,0.1230781,0.08284623,0.08730948,0.05720594,0.05789413
chr1:100023727-100023976,0.07432466,0.1401405,0.1325457,0.05523082,0.09978226,0.10805567,0.14625885
chr1:100027983-100029702,0.85473357,0.7507524,0.7952739,0.52469277,0.64234832,0.6801151,0.68558835


In [12]:
### Row-wise means in dplyr – Jeffrey Girard
### https://jmgirard.com/rowwise-means/

dat = mat_count_cpm
dat = dat %>%
    dplyr::mutate(
        Input  = rowMeans(across(starts_with("Input"))),
        Output = rowMeans(across(starts_with("Output")))
    ) %>%
    dplyr::mutate(
        Log2FC  = log2(Output)   - log2(Input),
        pLog2FC = log2(Output+1) - log2(Input+1)
    ) %>%
    dplyr::select(Peak, Input, Output, Log2FC, pLog2FC)

###
dat_log2fc = dat
print(dim(dat))
head(dat)

[1] 246832      5


Peak,Input,Output,Log2FC,pLog2FC
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:100006256-100006880,0.52903432,0.40233846,-0.39495167,-0.1247862
chr1:100010437-100010915,0.27321532,0.26269972,-0.05662371,-0.01196482
chr1:10002087-10003910,0.60864614,0.53656704,-0.18184528,-0.0661363
chr1:100021298-100021629,0.07650616,0.06746985,-0.18133305,-0.01216126
chr1:100023727-100023976,0.1005604,0.11803226,0.23111898,0.0227235
chr1:100027983-100029702,0.73136318,0.66935059,-0.12782594,-0.05262141


## Save results

In [13]:
ASSAY  = "A001_K562_WSTARRseq"
FOLDER = "coverage_astarrseq_peak_macs_input"

In [14]:
fdiry = file.path(FD_RES, "results", ASSAY, FOLDER, "summary")
fname = "metadata.raw.WGS.tsv"
fpath = file.path(fdiry, fname)
print(fpath)

dat = dat_meta
write_tsv(dat, fpath)

[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/summary/metadata.raw.WGS.tsv"


In [15]:
fdiry = file.path(FD_RES, "results", ASSAY, FOLDER, "summary")
fname = "matrix.raw.count.WGS.tsv"
fpath = file.path(fdiry, fname)
print(fpath)

dat = mat_count_raw
write_tsv(dat, fpath)

[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/summary/matrix.raw.count.WGS.tsv"


In [16]:
fdiry = file.path(FD_RES, "results", ASSAY, FOLDER, "summary")
fname = "matrix.raw.cpm.WGS.tsv"
fpath = file.path(fdiry, fname)
print(fpath)

dat = mat_count_cpm
write_tsv(dat, fpath)

[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/summary/matrix.raw.cpm.WGS.tsv"


In [17]:
fdiry = file.path(FD_RES, "results", ASSAY, FOLDER, "summary")
fname = "result.Log2FC.raw.cpm.WGS.tsv"
fpath = file.path(fdiry, fname)
print(fpath)

dat = dat_log2fc
write_tsv(dat, fpath)

[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage_astarrseq_peak_macs_input/summary/result.Log2FC.raw.cpm.WGS.tsv"
