**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



In [2]:
txt_fdiry = file.path(FD_RES, "assay_fcc")
vec = dir(txt_fdiry)
for (txt in vec) {cat(txt, "\n")}

CRISPRi_FlowFISH_K562_Riley_JinWoo 
CRISPRi_Growth_K562_Gersbach_JinWoo 
MPRA_Lenti_K562_Nadav_Vikram 
MPRA_Tiling_K562_Tewhey_Hannah 
STARR_ATAC_K562_Reddy_KS274 
STARR_ATAC_K562_Reddy_KS91 
STARR_WHG_K562_Reddy_A001 


In [5]:
VEC_TXT_ASSAY = c(
    "STARR_ATAC_K562_Reddy_KS91",
    #"STARR_ATAC_K562_Reddy_KS274",
    "STARR_WHG_K562_Reddy_A001"
)

In [6]:
get_fpkm = function(mat, vec_num_length){
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)

    return(mat)
}

get_tpm  = function(mat, vec_num_length){
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)
    
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    return(mat)
}

In [8]:
###
vec_txt_assay_name = c("STARR_ATAC_K562_Reddy_KS91")
vec_txt_region_label = c("astarr_macs_input_overlap", "astarr_macs_input_union")

txt_region_folder = "fcc_astarr_macs"

###
for (txt_assay_name in vec_txt_assay_name){
    for (txt_region_label in vec_txt_region_label){
        ### ========================================
        ### write tables
        ### ----------------------------------------
        
        ### show progress
        cat("Assay        ", txt_assay_name,    "\n")
        cat("Region Folder", txt_region_folder, "\n")
        cat("Region Label ", txt_region_label,  "\n")
        flush.console()
        
        ###
        txt_fdiry = file.path(
            FD_RES, 
            "assay_fcc", 
            txt_assay_name, 
            "coverage", 
            txt_region_folder, 
            txt_region_label,
            "overlap_count",
            "summary")
        txt_fname = "data_list_count_column.rds"
        txt_fpath = file.path(txt_fdiry, txt_fname)
    
        ###
        lst = readRDS(txt_fpath)
        dat_cnt = lst$data_cnt
        dat_col = lst$data_col

        ### ========================================
        ### write tables
        ### ----------------------------------------
        ###
        dat = dat_cnt
        dat = dat %>% dplyr::mutate(Length = ChromEnd - ChromStart)
        mat = dat %>% 
           dplyr::select(
               Region, 
               starts_with("Input"), 
               starts_with("Output")) %>% 
           column_to_rownames(var = "Region")
        
        ###
        mat_tpm  = get_tpm(mat, dat$Length)
        mat_fpkm = get_fpkm(mat, dat$Length)
        
        ### convert to tidy data
        lst = list(
            "TPM"  = mat_tpm,
            "FPKM" = mat_fpkm
        )
        lst = lapply(lst, function(mat){
            dat = mat %>% 
                as.data.frame %>% 
                rownames_to_column(var = "Region") %>% 
                tidyr::gather(Sample, Value, -Region)
            return(dat)
        })

        ### ========================================
        ### write tables
        ### ----------------------------------------
        
        ###
        tmp = dat_col %>% dplyr::select(Sample, Group)
        dat = bind_rows(lst, .id = "Method")
        dat = dplyr::left_join(dat, tmp, by="Sample")
        dat = dat %>%
            dplyr::group_by(Region, Method, Group) %>%
            dplyr::summarize(Mean = mean(Value), .groups = "drop") %>%
            dplyr::mutate(Name = paste(Group, Method, sep = "_")) %>%
            dplyr::select(Region, Name, Mean) %>%
            tidyr::spread(Name, Mean)
        
        ###
        tmp = dat_cnt %>% dplyr::select(Chrom, ChromStart, ChromEnd, Region)
        dat = dplyr::left_join(tmp, dat, by="Region")
        
        ### assign and show
        dat_score_mean = dat
        fun_display_table(head(dat))
        flush.console()
        
        ### ========================================
        ### write tables
        ### ----------------------------------------
        txt_fdiry = file.path(
            FD_RES, 
            "assay_fcc", 
            txt_assay_name, 
            "coverage", 
            txt_region_folder, 
            txt_region_label,
            "overlap_count",
            "summary")
        txt_fname = "result.score.mean.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        
        dat = dat_score_mean
        write_tsv(dat, txt_fpath)
    }
}


Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
chr1,10038,10405,chr1:10038-10405,0.0041644,3.940038,0.0007357,0.7181993
chr1,14282,14614,chr1:14282-14614,0.0030033,2.841707,0.0022621,2.2104314
chr1,16025,16338,chr1:16025-16338,0.0040487,3.830812,0.0012867,1.2597204
chr1,17288,17689,chr1:17288-17689,0.0065512,6.198372,0.0017373,1.7059186
chr1,28934,29499,chr1:28934-29499,0.0042953,4.064322,0.0006562,0.6447721
chr1,115429,115969,chr1:115429-115969,0.0159548,15.096518,0.0282829,27.6549997


Assay         STARR_ATAC_K562_Reddy_KS91 
Region Folder fcc_astarr_macs 
Region Label  astarr_macs_input_overlap 


Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
chr1,10015,10442,chr1:10015-10442,0.003083,3.160573,0.0006187,0.6571619
chr1,14253,14645,chr1:14253-14645,0.0021763,2.231167,0.001951,2.0734986
chr1,16015,16477,chr1:16015-16477,0.0025549,2.619283,0.0009085,0.966125
chr1,17237,17772,chr1:17237-17772,0.0040513,4.153273,0.0011264,1.2014016
chr1,28903,29613,chr1:28903-29613,0.0029078,2.981219,0.0006405,0.6827709
chr1,30803,31072,chr1:30803-31072,0.0027935,2.863264,0.0021818,2.3197849


Assay         STARR_ATAC_K562_Reddy_KS91 
Region Folder fcc_astarr_macs 
Region Label  astarr_macs_input_union 


In [5]:
txt_assay = "STARR_ATAC_K562_Reddy_KS91"
txt_fdiry = file.path(
    FD_RES, 
    "assay_fcc", 
    txt_assay, 
    "coverage", 
    "fcc_astarr_macs", 
    "astarr_macs_input_overlap",
    "overlap_count",
    "summary")

txt_fname = "data_list_count_column.rds"
txt_fpath = file.path(txt_fdiry, txt_fname)

lst = readRDS(txt_fpath)
lst_ocr_overlap = lst
names(lst)

In [6]:
txt_assay = "STARR_ATAC_K562_Reddy_KS91"
txt_fdiry = file.path(
    FD_RES, 
    "assay_fcc", 
    txt_assay, 
    "coverage", 
    "fcc_astarr_macs", 
    "astarr_macs_input_union",
    "overlap_count",
    "summary")

txt_fname = "data_list_count_column.rds"
txt_fpath = file.path(txt_fdiry, txt_fname)

lst = readRDS(txt_fpath)
lst_ocr_union = lst
names(lst)

In [10]:
dat_cnt = lst$data_cnt
dat_col = lst$data_col

In [11]:
head(dat_cnt)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,10038,10405,chr1:10038-10405,145,207,244,229,178,206,5,12,4,10
chr1,14282,14614,chr1:14282-14614,106,139,153,137,123,128,8,23,26,41
chr1,16025,16338,chr1:16025-16338,124,189,175,170,178,164,8,7,15,18
chr1,17288,17689,chr1:17288-17689,259,350,400,367,369,331,7,13,23,59
chr1,28934,29499,chr1:28934-29499,258,324,349,324,346,310,8,11,3,26
chr1,115429,115969,chr1:115429-115969,950,1174,1276,1151,1179,1054,234,443,469,752


In [13]:
244	/ 367

In [58]:
get_fpkm = function(mat, vec_num_length){
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)

    return(mat)
}

get_tpm  = function(mat, vec_num_length){
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)
    
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    return(mat)
}



In [60]:
###
dat = dat_cnt
dat = dat %>% dplyr::mutate(Length = ChromEnd - ChromStart)
mat = dat %>% 
   dplyr::select(
       Region, 
       starts_with("Input"), 
       starts_with("Output")) %>% 
   column_to_rownames(var = "Region")

###
mat_tpm  = get_tpm(mat, dat$Length)
mat_fpkm = get_fpkm(mat, dat$Length)

###
lst = list(
    "TPM"  = mat_tpm,
    "FPKM" = mat_fpkm
)

lst = lapply(lst, function(mat){
    dat = mat %>% 
        as.data.frame %>% 
        rownames_to_column(var = "Region") %>% 
        tidyr::gather(Sample, Value, -Region)
    return(dat)
})

###
tmp = dat_col %>% dplyr::select(Sample, Group)
dat = bind_rows(lst, .id = "Method")
dat = dplyr::left_join(dat, tmp, by="Sample")
dat = dat %>%
    dplyr::group_by(Region, Method, Group) %>%
    dplyr::summarize(Mean = mean(Value), .groups = "drop") %>%
    dplyr::mutate(Name = paste(Group, Method, sep = "_")) %>%
    dplyr::select(Region, Name, Mean) %>%
    tidyr::spread(Name, Mean)

###
tmp = dat_cnt %>% dplyr::select(Chrom, ChromStart, ChromEnd, Region)
dat = dplyr::left_join(tmp, dat, by="Region")

### assign and show
dat_score_mean = dat
print(dim(dat))
head(dat)

[1] 150041      8


Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,0.004164383,3.940038,0.0006671239,0.6488611
chr1,14282,14614,chr1:14282-14614,0.003003325,2.841707,0.0020670469,2.0120603
chr1,16025,16338,chr1:16025-16338,0.004048701,3.830812,0.0012104446,1.1818766
chr1,17288,17689,chr1:17288-17689,0.006551198,6.198372,0.0015784974,1.5431749
chr1,28934,29499,chr1:28934-29499,0.004295316,4.064322,0.0005928433,0.5801715
chr1,115429,115969,chr1:115429-115969,0.015954822,15.096518,0.0260260669,25.3605429


In [61]:
txt_assay = "STARR_ATAC_K562_Reddy_KS91"
txt_fdiry = file.path(
    FD_RES, 
    "assay_fcc", 
    txt_assay, 
    "coverage", 
    "fcc_astarr_macs", 
    "astarr_macs_input_overlap",
    "summary")

txt_fname = "result.score.mean.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_score_mean
write_tsv(dat, txt_fpath)