**Set environment**

In [1]:
### load the project configuration quietly (messages and warnings are muted)
suppressWarnings(suppressMessages(source("../config/config_sing.R")))

### print the environment / directory settings
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Import data (All ATAC peaks)

In [2]:
### set file path of the union peak table
fdiry = file.path(
    FD_RES,
    "results",
    "region",
    "KS91_K562_ASTARRseq_peak_macs_input"
)
fname = "KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.bed.gz"
fpath = file.path(fdiry, fname)

### read table; column names are supplied here rather than taken from the file
cnames = c("Chrom", "Start", "End")
dat_peak_atac_total = read_tsv(
    file           = fpath,
    col_names      = cnames,
    show_col_types = FALSE
)

### assign and show
dat = dat_peak_atac_total
print(dim(dat))
head(dat)

[1] 247520      3


Chrom,Start,End
<chr>,<dbl>,<dbl>
chr1,10015,10442
chr1,14253,14645
chr1,16015,16477
chr1,17237,17772
chr1,28903,29613
chr1,30803,31072


## Import data (FPKM)

In [3]:
### set assay and folder
ASSAY  = "KS91_K562_ASTARRseq"
FOLDER = "coverage_astarrseq_peak_macs_input"

### set file path of the coverage summary table
fdiry = file.path(FD_RES, "results", ASSAY, FOLDER, "summary")
fname = "result.score.raw.cpm.WGS.tsv"
fpath = file.path(fdiry, fname)

### read table (column names come from the header row of the file)
dat_peak_coverage_astarr = read_tsv(file = fpath, show_col_types = FALSE)

### assign and show
dat = dat_peak_coverage_astarr
print(dim(dat))
head(dat)

[1] 246852     15


Chrom,Start,End,Peak,Length,Input,Output,Log2FC,pLog2FC,Input_Residual,Input_Score,Output_Residual,Output_Score,Input_FPKM,Output_FPKM
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10015,10442,chr1:10015-10442,427,0.4863317,0.09736537,-2.3204599,-0.43771215,0.25149985,0.1362392,-0.1433799,0.07463616,1.1389501,0.2280219
chr1,14253,14645,chr1:14253-14645,392,0.3152472,0.27902967,-0.1760648,-0.0402842,0.19879784,0.135728,0.1693269,0.07589325,0.8042019,0.7118104
chr1,16015,16477,chr1:16015-16477,462,0.4361424,0.15900628,-1.4557156,-0.30931046,0.08292808,0.134604,-0.2127815,0.07435717,0.9440313,0.3441694
chr1,17237,17772,chr1:17237-17772,535,0.8007562,0.22194085,-1.8511878,-0.5594284,0.20062968,0.1357457,-0.4231641,0.07351143,1.4967405,0.4148427
chr1,28903,29613,chr1:28903-29613,710,0.7629703,0.16679463,-2.1935541,-0.59545755,-0.42906879,0.1296375,-1.1335228,0.07065579,1.0746061,0.234922
chr1,30803,31072,chr1:30803-31072,269,0.2774187,0.21678565,-0.3557958,-0.07015643,0.57699942,0.1393966,0.5676036,0.07749432,1.0312963,0.8058946


## Split peaks by quantiles (Test)

In [4]:
### number of quantile bins; break points and labels are both derived from it
### so the two can never get out of step
num_bin = 4

### quantile breaks of the input FPKM (num_bin + 1 break points) and bin labels
dat     = dat_peak_coverage_astarr
vec_num = quantile(dat$Input_FPKM, probs = seq(0, 1, length.out = num_bin + 1))
vec_txt = paste0("ATAC_Q", seq_len(num_bin))

### cut() requires unique breaks; stop here if tied quantiles collapse a bin
stopifnot(!anyDuplicated(vec_num))

print(vec_num)
print(vec_txt)

          0%          25%          50%          75%         100% 
7.240126e-04 6.462627e-01 7.982122e-01 1.224492e+00 3.991072e+01 
[1] "ATAC_Q1" "ATAC_Q2" "ATAC_Q3" "ATAC_Q4"


In [5]:
### label each peak with its FPKM quantile bin, then keep the columns to inspect
dat = dat_peak_coverage_astarr %>%
    dplyr::mutate(
        Assay_Label = cut(
            Input_FPKM,
            breaks         = vec_num,
            labels         = vec_txt,
            include.lowest = TRUE
        )
    ) %>%
    dplyr::select(Peak, Input_FPKM, Assay_Label)
head(dat)

Peak,Input_FPKM,Assay_Label
<chr>,<dbl>,<fct>
chr1:10015-10442,1.1389501,ATAC_Q3
chr1:14253-14645,0.8042019,ATAC_Q3
chr1:16015-16477,0.9440313,ATAC_Q3
chr1:17237-17772,1.4967405,ATAC_Q4
chr1:28903-29613,1.0746061,ATAC_Q3
chr1:30803-31072,1.0312963,ATAC_Q3


In [6]:
### summary statistics of the input FPKM within each quantile label
lst = lapply(
    split(dat$Input_FPKM, dat$Assay_Label),
    summary
)
lst

$ATAC_Q1
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000724 0.557734 0.590694 0.585428 0.618757 0.646262 

$ATAC_Q2
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.6463  0.6759  0.7090  0.7134  0.7485  0.7982 

$ATAC_Q3
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.7982  0.8609  0.9427  0.9641  1.0546  1.2245 

$ATAC_Q4
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.224   1.520   2.185   3.578   4.281  39.911 


## Split peaks by quantiles

In [7]:
### init
dat = dat_peak_coverage_astarr

### label peaks by quantile
### (vec_num / vec_txt are the quantile breaks and labels set in the test section above)
dat = dat %>%
    dplyr::mutate(
        ### Start / End are doubles; "%.0f" keeps whole coordinates in fixed notation,
        ### whereas paste0() would write e.g. 100000 as "1e+05" and corrupt the peak name
        Peak        = sprintf("%s:%.0f-%.0f", Chrom, Start, End),
        Score       = Input_FPKM,
        Assay_Type  = "ATAC",
        Assay_Label = cut(
            Input_FPKM, 
            breaks = vec_num, 
            labels = vec_txt,
            include.lowest=TRUE
        )
    )

### every peak must fall into a bin; an NA label means the breaks do not match the data
stopifnot(!anyNA(dat$Assay_Label))

### arrange table
dat = dat %>% 
    dplyr::select(Chrom, Start, End, Peak, Score, Assay_Type, Assay_Label) %>%
    dplyr::arrange(Chrom, Start, End)

### assign and show
dat_peak_atac_label = dat
print(dim(dat))
head(dat)

[1] 246852      7


Chrom,Start,End,Peak,Score,Assay_Type,Assay_Label
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<fct>
chr1,10015,10442,chr1:10015-10442,1.1389501,ATAC,ATAC_Q3
chr1,14253,14645,chr1:14253-14645,0.8042019,ATAC,ATAC_Q3
chr1,16015,16477,chr1:16015-16477,0.9440313,ATAC,ATAC_Q3
chr1,17237,17772,chr1:17237-17772,1.4967405,ATAC,ATAC_Q4
chr1,28903,29613,chr1:28903-29613,1.0746061,ATAC,ATAC_Q3
chr1,30803,31072,chr1:30803-31072,1.0312963,ATAC,ATAC_Q3


**Results**

In [8]:
### number of peaks in each quantile label
dat = dat_peak_atac_label
table(dat[["Assay_Label"]])


ATAC_Q1 ATAC_Q2 ATAC_Q3 ATAC_Q4 
  61713   61713   61713   61713 

In [9]:
### unique (assay type, assay label) combinations present in the table
dat = dat_peak_atac_label
dat = dplyr::distinct(dat, Assay_Type, Assay_Label)
dat

Assay_Type,Assay_Label
<chr>,<fct>
ATAC,ATAC_Q3
ATAC,ATAC_Q4
ATAC,ATAC_Q2
ATAC,ATAC_Q1


## Save results

In [10]:
### set file paths
fdiry  = file.path(
    FD_RES, 
    "results",
    "region",
    "KS91_K562_ASTARRseq_peak_macs_input",
    "summary"
)
fname = paste("peak", "screened", "atac", "quantile", "tsv", sep = ".")
fpath = file.path(fdiry, fname)
print(fname)

### make sure the output folder exists; write_tsv does not create missing folders
### (no effect and no warning when the folder is already there)
dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)

### save table
dat = dat_peak_atac_label
write_tsv(dat, fpath)

[1] "peak.screened.atac.quantile.tsv"
