**Set environment**

In [1]:
# Source the project configuration script. Any messages and warnings emitted
# while it runs are suppressed.
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
# Print the environment summary. NOTE(review): show_env() and the FD_* path
# variables used in later cells are presumably defined by the sourced config
# script -- confirm there.
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


**Check data**

In [2]:
# List the files present in the coverage folder of this assay as a quick
# sanity check before anything is imported.
fdiry = file.path(
    FD_RES,
    "results",
    "Tewhey_K562_TileMPRA",
    "coverage_astarrseq_peak_macs_input")
list.files(fdiry)

In [3]:
# --- Run configuration --------------------------------------------------
# Assay folder and the library prefixes processed by the loop below.
ASSAY   = "Tewhey_K562_TileMPRA"
PREFIXS = c(
    "OL13_20220512",
    "OL43_20221003",
    "OL45_20220927")

# Sub-folder holding the coverage files, and the genome build used in
# the file names.
FOLDER  = "coverage_astarrseq_peak_macs_input"
GENOME  = "hg38"

# Count type used in the file names, and the two sample groups.
TYPE    = "raw"
GROUPS  = c("Input", "Output")

# Sample labels: "Input.rep1" ... "Input.rep6", then "Output.rep1" ... "Output.rep6".
SAMPLES = paste0(
    rep(c("Input", "Output"), each = 6),
    ".rep",
    rep(1:6, times = 2))

# Column names of the headerless coverage tables.
CNAMES = c("Chrom", "Start", "End", "Count")

# Return the candidate labels that occur in a file path.
#
# fpath   : file path (or file name) to inspect.
# strings : character vector of candidate labels.
# Returns the elements of `strings` found in `fpath`; character(0) when
# none is found, and more than one element when several labels match.
get_info = function(fpath, strings){
    # Match the labels literally: as regular expressions the "." in labels
    # such as "Input.rep1" would match any character.
    idx = str_detect(string = fpath, pattern = fixed(strings))
    return(strings[idx])
}

# Group label(s) from GROUPS that occur in the file path.
get_group = function(fpath){
    get_info(fpath, GROUPS)
}

# Sample label(s) from SAMPLES that occur in the file path.
get_sample = function(fpath){
    get_info(fpath, SAMPLES)
}

## Library size

In [4]:
# Import the library-size summary table of this assay; the loop below joins
# it to the counts on Sample / Prefix / Type / Genome to compute CPM.
fdiry = file.path(FD_RES, "results", ASSAY, "coverage", "summary")
fname = "library_size_summary.tsv"
fpath = file.path(fdiry, fname)

dat_lib = read_tsv(file = fpath, show_col_types = FALSE)
dat_lib

FName,Size,Prefix,Sample,Group,Type,Genome
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
OL13_20220512.hg19.norm.Input.rep1.stranded_pos.bed,6418470,OL13_20220512,Input.rep1,Input,norm,hg19
OL13_20220512.hg19.norm.Input.rep2.stranded_pos.bed,6412311,OL13_20220512,Input.rep2,Input,norm,hg19
OL13_20220512.hg19.norm.Input.rep3.stranded_pos.bed,6413568,OL13_20220512,Input.rep3,Input,norm,hg19
OL13_20220512.hg19.norm.Input.rep4.stranded_pos.bed,6410822,OL13_20220512,Input.rep4,Input,norm,hg19
OL13_20220512.hg19.norm.Output.rep1.stranded_pos.bed,24214237,OL13_20220512,Output.rep1,Output,norm,hg19
OL13_20220512.hg19.norm.Output.rep2.stranded_pos.bed,21967607,OL13_20220512,Output.rep2,Output,norm,hg19
OL13_20220512.hg19.norm.Output.rep3.stranded_pos.bed,20773782,OL13_20220512,Output.rep3,Output,norm,hg19
OL13_20220512.hg19.norm.Output.rep4.stranded_pos.bed,19239234,OL13_20220512,Output.rep4,Output,norm,hg19
OL13_20220512.hg19.raw.Input.rep1.stranded_pos.bed,11710957,OL13_20220512,Input.rep1,Input,raw,hg19
OL13_20220512.hg19.raw.Input.rep2.stranded_pos.bed,6544172,OL13_20220512,Input.rep2,Input,raw,hg19


## Loop (Raw)

In [5]:
# For every library prefix: collect its coverage files, import the counts,
# convert them to CPM using the library sizes in `dat_lib`, reshape them into
# Peak x Sample matrices, compute Output-vs-Input log2 fold changes and write
# all tables to the "summary" sub-folder of the coverage folder.
#
# Uses names set by earlier cells: FD_RES, ASSAY, FOLDER, GENOME, TYPE,
# PREFIXS, get_sample(), get_group() and dat_lib.
for (PREFIX in PREFIXS){
    ###
    cat("\n=========================================\n")
    cat("Prefix:", PREFIX, "\n")
    
    # Glob pattern "<PREFIX>*<GENOME>*<TYPE>*bed.gz" inside the coverage folder
    fname  = paste(PREFIX, GENOME, TYPE, "bed.gz", sep="*") 
    fglob  = file.path(FD_RES, "results", ASSAY, FOLDER, fname)
    fpaths = Sys.glob(fglob)
    
    # Fail early with an explicit message; without any file the run would
    # only break later, at the library-size join, with an unrelated error.
    if (length(fpaths) == 0){
        stop("No coverage files match: ", fglob)
    }
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Metadata:", "\n")
    
    # One row per file: sample and group labels parsed from the file path
    dat_meta = data.frame(
        Sample = sapply(fpaths, get_sample),
        Group  = sapply(fpaths, get_group),
        FPath  = fpaths,
        row.names = NULL
    )
    print(dat_meta)
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Import count data:", "\n")
    
    cnames = c("Chrom", "Start", "End", "Count")
    # Column types must be given as a cols() specification: combining the
    # collectors with c() yields an empty list, so readr would guess the
    # types instead. Integer coordinates keep the Peak labels free of
    # scientific notation. Count is read as text because "." entries are
    # filtered out below before the conversion to integer.
    ctypes = cols(
        Chrom = col_character(),
        Start = col_integer(),
        End   = col_integer(),
        Count = col_character())

    lst = lapply(fpaths, function(fpath){
        sam = get_sample(fpath)
        dat = read_tsv(fpath, col_names = cnames, col_types = ctypes) %>% 
            dplyr::filter(Count != ".") %>% 
            dplyr::mutate(Count  = as.integer(Count)) %>%
            dplyr::mutate(
                Peak   = paste0(Chrom, ":", Start, "-", End),
                Sample = sam,
                Prefix = PREFIX,
                Type   = TYPE,
                Genome = GENOME)
        return(dat)
    })
    dat = bind_rows(lst)
    
    dat_count = dat
    cat(dim(dat), "\n\n")
    print(head(dat_count))
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Calulate CPM:", "\n")
    
    dat = left_join(dat_count, dat_lib, by = c("Sample", "Prefix", "Type", "Genome"))
    
    # Rows without a matching library size get an NA CPM, which the matrix
    # step below turns into 0; make that visible instead of silent.
    if (anyNA(dat$Size)){
        warning(
            "Library size not found for some samples of ", PREFIX,
            " (Genome: ", GENOME, ", Type: ", TYPE, "); ",
            "their CPM is NA and is written as 0 in the CPM matrix.")
    }
    
    dat = dat %>%
        dplyr::mutate(CPM = Count * 10^6 / Size) %>%
        dplyr::select(Chrom, Start, End, Peak, Sample, Count, CPM)

    dat_count_cpm = dat
    print(dim(dat))
    print(head(dat))
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Arrange to matrix (Raw):", "\n")    
    # Long -> wide: one row per Peak, one column per Sample; absent cells become 0
    dat = dat_count_cpm
    dat = dat %>% 
        dplyr::select(Peak, Sample, Count) %>%
        spread(Sample, Count) %>% 
        replace(is.na(.), 0)

    mat_count_raw = dat
    print(dim(dat))
    print(head(dat))
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Arrange to matrix (CPM):", "\n")    
    dat = dat_count_cpm
    dat = dat %>% 
        dplyr::select(Peak, Sample, CPM) %>%
        spread(Sample, CPM) %>% 
        replace(is.na(.), 0)

    mat_count_cpm = dat
    print(dim(dat))
    print(head(dat))
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Calculate log2fc:", "\n")    
    # Mean CPM over the replicate columns of each group, then the log2 ratio
    # Output/Input without (Log2FC) and with (pLog2FC) a pseudocount of 1.
    # Log2FC is infinite or NaN where a group mean is 0.
    dat = mat_count_cpm
    dat = dat %>%
        dplyr::mutate(
            Input  = rowMeans(across(starts_with("Input"))),
            Output = rowMeans(across(starts_with("Output")))
        ) %>%
        dplyr::mutate(
            Log2FC  = log2(Output)   - log2(Input),
            pLog2FC = log2(Output+1) - log2(Input+1)
        ) %>%
        dplyr::select(Peak, Input, Output, Log2FC, pLog2FC)

    dat_log2fc = dat
    print(dim(dat))
    print(head(dat))

    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Save results: (Metadata)", "\n")
    
    # All result tables of this prefix go to the same folder; create it if
    # it does not exist yet so that the writes below cannot fail on it.
    fdiry = file.path(FD_RES, "results", ASSAY, FOLDER, "summary")
    dir.create(fdiry, showWarnings = FALSE, recursive = TRUE)
    
    fname = paste("metadata", TYPE, PREFIX, "tsv", sep=".") # e.g. "metadata.raw.OL13_20220512.tsv"
    fpath = file.path(fdiry, fname)
    print(fpath)

    dat = dat_meta
    write_tsv(dat, fpath)
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Save results: (Count)", "\n")
    
    fname = paste("matrix", TYPE, "count", PREFIX, "tsv", sep=".") # e.g. "matrix.raw.count.OL13_20220512.tsv"
    fpath = file.path(fdiry, fname)
    print(fpath)

    dat = mat_count_raw
    write_tsv(dat, fpath)
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Save results: (CPM)", "\n")
    
    fname = paste("matrix", TYPE, "cpm", PREFIX, "tsv", sep=".") # e.g. "matrix.raw.cpm.OL13_20220512.tsv"
    fpath = file.path(fdiry, fname)
    print(fpath)

    dat = mat_count_cpm
    write_tsv(dat, fpath)
    
    ###
    cat("\n+++++++++++++++++++++++++++++++++++++++++\n")
    cat("Save results: (Log2FC)", "\n")
    
    fname = paste("result.Log2FC", TYPE, "cpm", PREFIX, "tsv", sep=".") # e.g. "result.Log2FC.raw.cpm.OL13_20220512.tsv"
    fpath = file.path(fdiry, fname)
    print(fpath)

    dat = dat_log2fc
    write_tsv(dat, fpath)

    # Push the buffered output of this iteration to the notebook
    flush.console()
}


Prefix: OL13_20220512 

+++++++++++++++++++++++++++++++++++++++++
Metadata: 
       Sample  Group
1  Input.rep1  Input
2  Input.rep2  Input
3  Input.rep3  Input
4  Input.rep4  Input
5 Output.rep1 Output
6 Output.rep2 Output
7 Output.rep3 Output
8 Output.rep4 Output
                                                                                                                                                                      FPath
1  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/OL13_20220512.hg38.raw.Input.rep1.stranded_pos.bed.gz
2  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/OL13_20220512.hg38.raw.Input.rep2.stranded_pos.bed.gz
3  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/OL13_20220512.hg38.raw.Input.rep3.stranded_pos.bed.gz
4  /data/reddylab/Kuei/out/proj_combeffect_en

In [6]:
# Rebuild and print the metadata table (sample label, group label, path) for
# the paths currently stored in `fpaths`.
# NOTE(review): `fpaths` is not defined in this cell; it is whatever the
# last iteration of the loop above left behind.
dat_meta = data.frame(
    Sample    = sapply(X = fpaths, FUN = get_sample),
    Group     = sapply(X = fpaths, FUN = get_group),
    FPath     = fpaths,
    row.names = NULL
)
print(dat_meta)

       Sample  Group
1  Input.rep1  Input
2  Input.rep2  Input
3  Input.rep3  Input
4  Input.rep4  Input
5 Output.rep1 Output
6 Output.rep2 Output
7 Output.rep3 Output
8 Output.rep4 Output
                                                                                                                                                                      FPath
1  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/OL45_20220927.hg38.raw.Input.rep1.stranded_pos.bed.gz
2  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/OL45_20220927.hg38.raw.Input.rep2.stranded_pos.bed.gz
3  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/OL45_20220927.hg38.raw.Input.rep3.stranded_pos.bed.gz
4  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/OL45_