**Set environment**

In [1]:
suppressWarnings(suppressMessages(source("../config/config_sing.R")))
show_env()

You are in Singularity: singularity_proj_combeffect 
BASE DIRECTORY:     /mount/work 
PATH OF SOURCE:     /mount/work/source 
PATH OF EXECUTABLE: /mount/work/exe 
PATH OF ANNOTATION: /mount/work/annotation 
PATH OF PROJECT:    /mount/project 
PATH OF RESULTS:    /mount/work/out/proj_combeffect_encode_fcc 


In [13]:
REGIONS = c("GATA1", "MYC")
GROUPS  = c("Input", "Output")

In [2]:
SAMPLES = c(
    paste0("Input_rep",  1:6),
    paste0("Output_rep", 1:4))

In [3]:
get_sample = function(fpath){
    idx = str_detect(string = fpath, pattern = SAMPLES)
    return(SAMPLES[idx])
}

## Library size

In [4]:
fdiry = file.path(FD_RES, "KS91_K562_ASTARRseq", "coverage", "library_size")
fname = "library_size_summary.csv"
fpath = file.path(fdiry, fname)

dat_lib = read_csv(fpath) %>% 
    dplyr::mutate(Size = Count) %>% 
    dplyr::select(Sample, Group, Size)
dat_lib

[1mRows: [22m[34m10[39m [1mColumns: [22m[34m4[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): Fpath, Sample, Group
[32mdbl[39m (1): Count

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Sample,Group,Size
<chr>,<chr>,<dbl>
Input_rep1,Input,348695063
Input_rep2,Input,451369741
Input_rep3,Input,487579055
Input_rep4,Input,456246254
Input_rep5,Input,444268950
Input_rep6,Input,397333562
Output_rep1,Output,44103844
Output_rep2,Output,97471282
Output_rep3,Output,84103298
Output_rep4,Output,183115379


In [29]:
### initialization
prefix = "KS91_K562_ASTARRseq"
cnames = c("Chrom", "Start", "End", "Count")

### loop through regions and groups to summarize counts
for (region in REGIONS){
    for (group in GROUPS){
        
        ### start timer
        cat("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n")
        cat("Region:", region, ";", "Group:", group, "\n\n")
        timer_start = Sys.time()
        
        ### get file directories
        fdiry  = file.path(FD_RES, "KS91_K562_ASTARRseq", "coverage")
        fname  = paste0("*", group, "*", region, "*unstranded*perbase*")
        fglob  = file.path(fdiry, fname)
        fpaths = Sys.glob(fglob)
        
        cat("Start importing data:\n")
        print(fpaths)
        cat("++++++++++++++++++++++++++++++++++++++\n")
        flush.console()
        
        ### import data
        lst = lapply(fpaths, function(fpath){
            sam = get_sample(fpath)
            dat = read_tsv(fpath, col_name = cnames, show_col_types = FALSE)
            dat = dat %>% 
                dplyr::mutate(Sample = sam) %>%
                dplyr::mutate(across(Count, na_if, ".")) %>%
                dplyr::mutate(across(Count, as.integer))
            return(dat)
        })
        dat_cnt = bind_rows(lst)
        
        cat("Imported data:\n")
        cat("    Shape:", dim(dat_cnt), "\n")
        cat("++++++++++++++++++++++++++++++++++++++\n")
        flush.console()
        
        ### summarize
        dat_stats = left_join(dat_cnt, dat_lib, by="Sample") %>% 
            mutate(Cpm = Count * 10^6 / Size)  %>%
            group_by(Chrom, Start, End, Group) %>%
            summarise(
                Cpm_mu = mean(    Cpm,       na.rm = TRUE),
                Cpm_sd = sd(      Cpm,       na.rm = TRUE),
                Cpm_md = median(  Cpm,       na.rm = TRUE),
                Cpm_q1 = quantile(Cpm, 0.25, na.rm = TRUE),
                Cpm_q3 = quantile(Cpm, 0.75, na.rm = TRUE),
                .groups = 'drop')
        
        ### store table
        fdiry = file.path(FD_RES, "KS91_K562_ASTARRseq", "coverage", "summary")
        fname = paste0(prefix, "_", group, ".", region, ".unstreated.perbase.stats.tsv")
        fpath = file.path(fdiry, fname)
        write_tsv(dat_stats, fpath)
        
        cat("Saved data:\n")
        cat("    Shape:", dim(dat_stats), "\n")
        print(head(dat_stats))
        cat("++++++++++++++++++++++++++++++++++++++\n")
        flush.console()
        
        ### end timer
        timer_stop = Sys.time()
        runtime = timer_stop - timer_start
        cat("Runtime: ", runtime, "\n")
        cat("++++++++++++++++++++++++++++++++++++++\n")
        flush.console()
        
    } # end inner loop
} # end outer loop


^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Region: GATA1 ; Group: Input 

Start importing data:
[1] "/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.perbase.tsv.gz"
[2] "/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep2.GATA1.unstranded.perbase.tsv.gz"
[3] "/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep3.GATA1.unstranded.perbase.tsv.gz"
[4] "/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep4.GATA1.unstranded.perbase.tsv.gz"
[5] "/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep5.GATA1.unstranded.perbase.tsv.gz"
[6] "/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep6.GATA1.unstranded.perbase.tsv.gz"
+++++++++++++++++++++++++++++++++

ERROR: Error: Cannot open file for writing:
* '/mount/work/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/summary/KS91_K562_ASTARRseq_Input.GATA1.unstreated.perbase.stats.tsv'


In [None]:
print(1)

## Import data | GATA | Input

In [5]:
fdiry = file.path(FD_RES, "KS91_K562_ASTARRseq", "coverage")
fglob = file.path(fdiry, "*GATA*unstranded*perbase*")
Sys.glob(fglob)

In [6]:
### data directories
fdiry = file.path(FD_RES, "KS91_K562_ASTARRseq", "coverage")
fglob = file.path(fdiry, "*Input*GATA*unstranded*perbase*")
fpaths = Sys.glob(fglob)

### set columns
cnames = c("Chrom", "Start", "End", "Count")

### import data
lst = lapply(fpaths[1:3], function(fpath){
    sam = get_sample(fpath)
    dat = read_tsv(fpath, col_name = cnames, show_col_types = FALSE)
    dat = dat %>% 
        dplyr::mutate(Sample = sam) %>%
        dplyr::mutate(across(Count, na_if, ".")) %>%
        dplyr::mutate(across(Count, as.integer))
    return(dat)
})

dat_cnt = bind_rows(lst)
print(dim(dat_cnt))
head(dat_cnt)

[1] 6000453       5


Chrom,Start,End,Count,Sample
<chr>,<dbl>,<dbl>,<int>,<chr>
chrX,47786500,47786501,,Input_rep1
chrX,47786501,47786502,,Input_rep1
chrX,47786502,47786503,,Input_rep1
chrX,47786503,47786504,,Input_rep1
chrX,47786504,47786505,,Input_rep1
chrX,47786505,47786506,,Input_rep1


In [7]:
dat = left_join(dat_cnt, dat_lib, by="Sample") %>% 
    mutate(Count = as.integer(Count))  %>%
    mutate(Cpm = Count * 10^6 / Size)  %>%
    group_by(Chrom, Start, End, Group) %>%
    summarise(
        Cpm_mu = mean(    Cpm,       na.rm = TRUE),
        Cpm_sd = sd(      Cpm,       na.rm = TRUE),
        Cpm_md = median(  Cpm,       na.rm = TRUE),
        Cpm_q1 = quantile(Cpm, 0.25, na.rm = TRUE),
        Cpm_q3 = quantile(Cpm, 0.75, na.rm = TRUE),
        .groups = 'drop') %>%
    na.omit
head(dat)

Chrom,Start,End,Group,Cpm_mu,Cpm_sd,Cpm_md,Cpm_q1,Cpm_q3
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chrX,47787231,47787232,Input,0.002541657,0.0004612858,0.002541657,0.002378568,0.002704746
chrX,47787232,47787233,Input,0.002541657,0.0004612858,0.002541657,0.002378568,0.002704746
chrX,47787233,47787234,Input,0.002541657,0.0004612858,0.002541657,0.002378568,0.002704746
chrX,47787234,47787235,Input,0.002541657,0.0004612858,0.002541657,0.002378568,0.002704746
chrX,47787235,47787236,Input,0.002541657,0.0004612858,0.002541657,0.002378568,0.002704746
chrX,47787236,47787237,Input,0.002541657,0.0004612858,0.002541657,0.002378568,0.002704746


In [8]:
print(dim(dat))

[1] 1677719       9


## Import data | GATA | Output

## Import data | GATA | Output