**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
suppressMessages(suppressWarnings(library("DESeq2")))
suppressMessages(suppressWarnings(library("edgeR")))
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



**Set global variable**

In [2]:
txt_fdiry = file.path(FD_RES, "assay_fcc")
vec = dir(txt_fdiry)
for (txt in vec) {cat(txt, "\n")}

CRISPRi_FlowFISH_K562_Riley_JinWoo 
CRISPRi_Growth_K562_Gersbach_JinWoo 
MPRA_Lenti_K562_Nadav_Vikram 
MPRA_Tiling_K562_Tewhey_Hannah 
STARR_ATAC_K562_Reddy_KS274 
STARR_ATAC_K562_Reddy_KS91 
STARR_ATAC_K562_Reddy_KSMerge 
STARR_WHG_K562_Reddy_A001 


In [3]:
VEC_TXT_ASSAY = c(
    "STARR_ATAC_K562_Reddy_KS91",
    "STARR_ATAC_K562_Reddy_KS274",
    "STARR_ATAC_K562_Reddy_KSMerge",
    "STARR_WHG_K562_Reddy_A001"
)

**Helper function**

In [4]:
get_fpkm = function(mat, vec_num_length){
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)

    return(mat)
}

get_tpm  = function(mat, vec_num_length){
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)
    
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    return(mat)
}

## Execute

In [6]:
###
vec_txt_assay  = VEC_TXT_ASSAY
vec_txt_region = c("fcc_astarr_macs_input_overlap") #, "astarr_macs_input_union")
txt_region_folder = "fcc_astarr_macs"

###
for (txt_assay in vec_txt_assay){
    for (txt_region_label in vec_txt_region) {
        
        ### show progress
        cat("Assay        ", txt_assay,         "\n")
        cat("Region Folder", txt_region_folder, "\n")
        cat("Region Label ", txt_region_label,  "\n")
        cat("\n")
        flush.console()

        ### get column and column data
        txt_fdiry = file.path(
            FD_RES, 
            "region_coverage_fcc",
            txt_region_label,
            txt_assay, 
            "overlap_count", 
            "summary")

        ###
        txt_fname = "data.count_column.raw.WGS.rds"
        txt_fpath = file.path(txt_fdiry, txt_fname)

        lst_dat = readRDS(txt_fpath)
        dat_cnt = lst_dat$data_cnt
        dat_col = lst_dat$data_col
        dat_reg = dat_cnt %>% dplyr::select(Chrom, ChromStart, ChromEnd, Region)
        
        ###
        txt_fname = "data.deseq2.rds"
        txt_fpath = file.path(txt_fdiry, txt_fname)

        dds = readRDS(txt_fpath)
        
        ###
        txt_fname = "data.edger.rds"
        txt_fpath = file.path(txt_fdiry, txt_fname)

        lst_dge = readRDS(txt_fpath)

        ### ========================================
        ### Calculate: Screened
        ### ----------------------------------------
        
        lst = lst_dge
        lst = lapply(lst, function(dge){
            idx = filterByExpr(dge)
            dat = data.frame(
                Region = names(idx),
                Screen = idx
            )
            return(dat)
        })
        
        dat = bind_rows(lst, .id = "Set")
        dat = dat %>% tidyr::spread(Set, Screen)
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        
        dat_screen = dat
        
        ### ========================================
        ### Calculate DESeq2 Counts & Log2FC
        ### ----------------------------------------

        ### DESeq2 normalized counts
        mat = counts(dds, normalized=TRUE)
        dat = mat %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")

        mat_dds_count = mat
        dat_dds_count = dat

        ### Deseq2 results
        res = results(dds, contrast = c("Group", "Output", "Input"))
        dat = res %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        
        dat_dds_result = dat

        ### ========================================
        ### Calculate CPM
        ### ----------------------------------------
        
        ### ========================================
        ### Calculate TPM & FPKM
        ### ----------------------------------------

        ### init and get region length
        dat = dat_cnt
        dat = dat %>% dplyr::mutate(Length = ChromEnd - ChromStart)

        ### arrange count into matrix
        mat = dat %>% 
           dplyr::select(
               Region, 
               starts_with("Input"), 
               starts_with("Output")) %>% 
           column_to_rownames(var = "Region")
        
        ### calculate TPM and FPKM
        mat_tpm  = get_tpm(mat, dat$Length)
        mat_fpkm = get_fpkm(mat, dat$Length)

        ### arrange matrix into dataframe
        mat = mat_tpm
        dat = mat %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        dat_tpm = dat

        mat = mat_fpkm
        dat = mat %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        dat_fpkm = dat
        
        ### ========================================
        ### Calculate mean TPM & FPKM
        ### ----------------------------------------
        
        ### convert to tidy data
        lst = list(
            "TPM"   = mat_tpm,
            "FPKM"  = mat_fpkm,
            "DESeq" = mat_dds_count
        )
        lst = lapply(lst, function(mat){
            dat = mat %>% 
                as.data.frame %>% 
                rownames_to_column(var = "Region") %>% 
                tidyr::gather(Sample, Value, -Region)
            return(dat)
        })
        dat = bind_rows(lst, .id = "Method")
        
        ### calculate mean by group (Input & Output)
        tmp = dat_col %>% dplyr::select(Sample, Group)
        dat = dplyr::left_join(dat, tmp, by="Sample")
        dat = dat %>%
            dplyr::group_by(Region, Method, Group) %>%
            dplyr::summarize(Mean = mean(Value), .groups = "drop") %>%
            dplyr::mutate(Name = paste(Group, Method, sep = "_")) %>%
            dplyr::select(Region, Name, Mean) %>%
            tidyr::spread(Name, Mean)
        
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        dat_summary = dat

        ### ========================================
        ### Save results
        ### ----------------------------------------
        
        txt_fname = "matrix.count.TPM.WGS.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_tpm
        write_tsv(dat, txt_fpath)
        
        txt_fname = "matrix.count.FPKM.WGS.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_fpkm
        write_tsv(dat, txt_fpath)
        
        txt_fname = "matrix.count.deseq.WGS.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_dds_count
        write_tsv(dat, txt_fpath)
        
        txt_fname = "result.coverage.Log2FC.deseq.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_dds_result
        write_tsv(dat, txt_fpath)
        
        txt_fname = "result.coverage.summary.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_summary
        write_tsv(dat, txt_fpath)
        
        txt_fname = "result.coverage.screened.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_screen
        write_tsv(dat, txt_fpath)
        
    }
}

Assay         STARR_ATAC_K562_Reddy_KS91 
Region Folder fcc_astarr_macs 
Region Label  fcc_astarr_macs_input_overlap 



“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


Assay         STARR_ATAC_K562_Reddy_KS274 
Region Folder fcc_astarr_macs 
Region Label  fcc_astarr_macs_input_overlap 



“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


Assay         STARR_ATAC_K562_Reddy_KSMerge 
Region Folder fcc_astarr_macs 
Region Label  fcc_astarr_macs_input_overlap 



“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


Assay         STARR_WHG_K562_Reddy_A001 
Region Folder fcc_astarr_macs 
Region Label  fcc_astarr_macs_input_overlap 



“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


## Review

In [56]:
head(dat_dds_count, 3)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,85.29216,94.85917,103.92258,104.02099,82.94011,106.93828,26.91726,34.93514,11.10767,15.78996
chr1,14282,14614,chr1:14282-14614,62.35151,63.6977,65.16457,62.2309,57.31255,66.44709,43.06761,66.95901,72.19984,64.73885
chr1,16025,16338,chr1:16025-16338,72.9395,86.61054,74.53464,77.22082,82.94011,85.13533,43.06761,20.37883,41.65375,28.42194


In [57]:
head(dat_dds_result, 3)

Chrom,ChromStart,ChromEnd,Region,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,66.67233,-2.23485278,0.2790669,-8.0083052,1.163e-15,8.299026e-15
chr1,14282,14614,chr1:14282-14614,62.41696,0.03179054,0.1644506,0.1933136,0.8467134,0.8816469
chr1,16025,16338,chr1:16025-16338,61.29031,-1.33934074,0.222691,-6.0143468,1.806134e-09,8.654528e-09


In [59]:
head(dat_tpm, 3)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,3.445338,3.888148,4.290341,4.273202,3.39739,4.345809,0.8436118,1.17135,0.3693823,0.4884531
chr1,14282,14614,chr1:14282-14614,2.784182,2.886126,2.973866,2.825963,2.595126,2.98498,1.4920749,2.481768,2.654101,2.2137812
chr1,16025,16338,chr1:16025-16338,3.454675,4.162517,3.607961,3.719535,3.983521,4.056664,1.5826481,0.801171,1.6241611,1.0309013


In [60]:
head(dat_fpkm, 3)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,0.003629544,0.004114802,0.004548045,0.004522405,0.003592098,0.004579404,0.0008557153,0.0012171316,0.0003817082,0.000488374
chr1,14282,14614,chr1:14282-14614,0.002933039,0.003054368,0.003152495,0.002990766,0.002743856,0.003145428,0.001513482,0.0025787671,0.0027426652,0.002213423
chr1,16025,16338,chr1:16025-16338,0.00363938,0.004405164,0.003824677,0.003936449,0.004211821,0.004274716,0.0016053547,0.0008324843,0.0016783574,0.001030734


In [58]:
head(dat_summary, 3)

Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,0.004164383,3.940038,0.0007357323,0.7181993
chr1,14282,14614,chr1:14282-14614,0.003003325,2.841707,0.0022620843,2.2104314
chr1,16025,16338,chr1:16025-16338,0.004048701,3.830812,0.0012867327,1.2597204


In [53]:
mat = counts(dds, normalized=TRUE)
dat = mat %>% as.data.frame %>% rownames_to_column("Region")
head(dat)

Unnamed: 0_level_0,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,chr1:10038-10405,85.29216,94.85917,103.92258,104.02099,82.94011,106.93828,26.91726,34.93514,11.107668,15.78996
2,chr1:14282-14614,62.35151,63.6977,65.16457,62.2309,57.31255,66.44709,43.06761,66.95901,72.199841,64.73885
3,chr1:16025-16338,72.9395,86.61054,74.53464,77.22082,82.94011,85.13533,43.06761,20.37883,41.653755,28.42194
4,chr1:17288-17689,152.34944,160.3899,170.36488,166.70613,171.93764,171.82801,37.68416,37.8464,63.86909,93.16079
5,chr1:28934-29499,151.76122,148.47522,148.64336,147.17381,161.22066,160.92653,43.06761,32.02388,8.330751,41.05391
6,chr1:115429-115969,558.81068,537.99354,543.46398,522.83041,549.36173,547.15022,1259.72765,1289.68879,1302.374058,1187.40534


In [40]:
res = results(dds, contrast = c("Group", "Output", "Input"))
dat = as.data.frame(res) %>% rownames_to_column("Region")

dat_log2fc = dat
head(dat, 3)

Unnamed: 0_level_0,Region,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,chr1:10038-10405,66.67233,-2.23485278,0.2790669,-8.0083052,1.163e-15,8.299026e-15
2,chr1:14282-14614,62.41696,0.03179054,0.1644506,0.1933136,0.8467134,0.8816469
3,chr1:16025-16338,61.29031,-1.33934074,0.222691,-6.0143468,1.806134e-09,8.654528e-09


In [39]:
lst = lst_dge
lst = lapply(lst, function(dge){
    idx = filterByExpr(dge)
    dat = data.frame(
        Region = names(idx),
        Screen = idx
    )
    return(dat)
})

dat = bind_rows(lst, .id = "Set")
dat = dat %>% tidyr::spread(Set, Screen)

dat_screen = dat
head(dat, 3)

“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


Unnamed: 0_level_0,Region,Input,Output,Total
Unnamed: 0_level_1,<chr>,<lgl>,<lgl>,<lgl>
1,chr1:100028014-100029653,True,True,True
2,chr1:100036895-100039189,True,True,True
3,chr1:100046252-100046533,True,True,True


In [17]:
dat_cnt = lst$data_cnt
head(dat_cnt)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,10038,10405,chr1:10038-10405,145,207,244,229,178,206,5,12,4,10
chr1,14282,14614,chr1:14282-14614,106,139,153,137,123,128,8,23,26,41
chr1,16025,16338,chr1:16025-16338,124,189,175,170,178,164,8,7,15,18
chr1,17288,17689,chr1:17288-17689,259,350,400,367,369,331,7,13,23,59
chr1,28934,29499,chr1:28934-29499,258,324,349,324,346,310,8,11,3,26
chr1,115429,115969,chr1:115429-115969,950,1174,1276,1151,1179,1054,234,443,469,752


In [16]:
res

log2 fold change (MLE): Group Output vs Input 
Wald test p-value: Group Output vs Input 
DataFrame with 150040 rows and 6 columns
                          baseMean log2FoldChange     lfcSE       stat
                         <numeric>      <numeric> <numeric>  <numeric>
chr1:10038-10405           66.6723     -2.2348528  0.279067  -8.008305
chr1:14282-14614           62.4170      0.0317905  0.164451   0.193314
chr1:16025-16338           61.2903     -1.3393407  0.222691  -6.014347
chr1:17288-17689          122.6136     -1.3305558  0.163003  -8.162766
chr1:28934-29499          104.2677     -2.2809210  0.216204 -10.549835
...                            ...            ...       ...        ...
chrX:156000431-156003126 1495.2194       0.534917  0.039766   13.45164
chrX:156009723-156010194   46.9103      -1.779449  0.284619   -6.25203
chrX:156016432-156016780   53.2759      -4.199531  0.549431   -7.64341
chrX:156024963-156025554  161.3988      -0.343981  0.113474   -3.03136
chrX:156030352-156

## Execute

In [8]:
###
vec_txt_assay_name = c("STARR_ATAC_K562_Reddy_KS91")
vec_txt_region_label = c("astarr_macs_input_overlap", "astarr_macs_input_union")

txt_region_folder = "fcc_astarr_macs"

###
for (txt_assay_name in vec_txt_assay_name){
    for (txt_region_label in vec_txt_region_label){
        ### ========================================
        ### write tables
        ### ----------------------------------------
        
        ### show progress
        cat("Assay        ", txt_assay_name,    "\n")
        cat("Region Folder", txt_region_folder, "\n")
        cat("Region Label ", txt_region_label,  "\n")
        flush.console()
        
        ###
        txt_fdiry = file.path(
            FD_RES, 
            "assay_fcc", 
            txt_assay_name, 
            "coverage", 
            txt_region_folder, 
            txt_region_label,
            "overlap_count",
            "summary")
        txt_fname = "data_list_count_column.rds"
        txt_fpath = file.path(txt_fdiry, txt_fname)
    
        ###
        lst = readRDS(txt_fpath)
        dat_cnt = lst$data_cnt
        dat_col = lst$data_col

        ### ========================================
        ### write tables
        ### ----------------------------------------
        ###
        dat = dat_cnt
        dat = dat %>% dplyr::mutate(Length = ChromEnd - ChromStart)
        mat = dat %>% 
           dplyr::select(
               Region, 
               starts_with("Input"), 
               starts_with("Output")) %>% 
           column_to_rownames(var = "Region")
        
        ###
        mat_tpm  = get_tpm(mat, dat$Length)
        mat_fpkm = get_fpkm(mat, dat$Length)
        
        ### convert to tidy data
        lst = list(
            "TPM"  = mat_tpm,
            "FPKM" = mat_fpkm
        )
        lst = lapply(lst, function(mat){
            dat = mat %>% 
                as.data.frame %>% 
                rownames_to_column(var = "Region") %>% 
                tidyr::gather(Sample, Value, -Region)
            return(dat)
        })

        ### ========================================
        ### write tables
        ### ----------------------------------------
        
        ###
        tmp = dat_col %>% dplyr::select(Sample, Group)
        dat = bind_rows(lst, .id = "Method")
        dat = dplyr::left_join(dat, tmp, by="Sample")
        dat = dat %>%
            dplyr::group_by(Region, Method, Group) %>%
            dplyr::summarize(Mean = mean(Value), .groups = "drop") %>%
            dplyr::mutate(Name = paste(Group, Method, sep = "_")) %>%
            dplyr::select(Region, Name, Mean) %>%
            tidyr::spread(Name, Mean)
        
        ###
        tmp = dat_cnt %>% dplyr::select(Chrom, ChromStart, ChromEnd, Region)
        dat = dplyr::left_join(tmp, dat, by="Region")
        
        ### assign and show
        dat_score_mean = dat
        fun_display_table(head(dat))
        flush.console()
        
        ### ========================================
        ### write tables
        ### ----------------------------------------
        txt_fdiry = file.path(
            FD_RES, 
            "assay_fcc", 
            txt_assay_name, 
            "coverage", 
            txt_region_folder, 
            txt_region_label,
            "overlap_count",
            "summary")
        txt_fname = "result.coverage.TPM.FPKM.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        
        dat = dat_score_mean
        write_tsv(dat, txt_fpath)
    }
}


Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
chr1,10038,10405,chr1:10038-10405,0.0041644,3.940038,0.0007357,0.7181993
chr1,14282,14614,chr1:14282-14614,0.0030033,2.841707,0.0022621,2.2104314
chr1,16025,16338,chr1:16025-16338,0.0040487,3.830812,0.0012867,1.2597204
chr1,17288,17689,chr1:17288-17689,0.0065512,6.198372,0.0017373,1.7059186
chr1,28934,29499,chr1:28934-29499,0.0042953,4.064322,0.0006562,0.6447721
chr1,115429,115969,chr1:115429-115969,0.0159548,15.096518,0.0282829,27.6549997


Assay         STARR_ATAC_K562_Reddy_KS91 
Region Folder fcc_astarr_macs 
Region Label  astarr_macs_input_overlap 


Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
chr1,10015,10442,chr1:10015-10442,0.003083,3.160573,0.0006187,0.6571619
chr1,14253,14645,chr1:14253-14645,0.0021763,2.231167,0.001951,2.0734986
chr1,16015,16477,chr1:16015-16477,0.0025549,2.619283,0.0009085,0.966125
chr1,17237,17772,chr1:17237-17772,0.0040513,4.153273,0.0011264,1.2014016
chr1,28903,29613,chr1:28903-29613,0.0029078,2.981219,0.0006405,0.6827709
chr1,30803,31072,chr1:30803-31072,0.0027935,2.863264,0.0021818,2.3197849


Assay         STARR_ATAC_K562_Reddy_KS91 
Region Folder fcc_astarr_macs 
Region Label  astarr_macs_input_union 


In [5]:
txt_assay = "STARR_ATAC_K562_Reddy_KS91"
txt_fdiry = file.path(
    FD_RES, 
    "assay_fcc", 
    txt_assay, 
    "coverage", 
    "fcc_astarr_macs", 
    "astarr_macs_input_overlap",
    "overlap_count",
    "summary")

txt_fname = "data_list_count_column.rds"
txt_fpath = file.path(txt_fdiry, txt_fname)

lst = readRDS(txt_fpath)
lst_ocr_overlap = lst
names(lst)

In [6]:
txt_assay = "STARR_ATAC_K562_Reddy_KS91"
txt_fdiry = file.path(
    FD_RES, 
    "assay_fcc", 
    txt_assay, 
    "coverage", 
    "fcc_astarr_macs", 
    "astarr_macs_input_union",
    "overlap_count",
    "summary")

txt_fname = "data_list_count_column.rds"
txt_fpath = file.path(txt_fdiry, txt_fname)

lst = readRDS(txt_fpath)
lst_ocr_union = lst
names(lst)

In [10]:
dat_cnt = lst$data_cnt
dat_col = lst$data_col

In [11]:
head(dat_cnt)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Input.rep5,Input.rep6,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,10038,10405,chr1:10038-10405,145,207,244,229,178,206,5,12,4,10
chr1,14282,14614,chr1:14282-14614,106,139,153,137,123,128,8,23,26,41
chr1,16025,16338,chr1:16025-16338,124,189,175,170,178,164,8,7,15,18
chr1,17288,17689,chr1:17288-17689,259,350,400,367,369,331,7,13,23,59
chr1,28934,29499,chr1:28934-29499,258,324,349,324,346,310,8,11,3,26
chr1,115429,115969,chr1:115429-115969,950,1174,1276,1151,1179,1054,234,443,469,752


In [13]:
244	/ 367

In [58]:
get_fpkm = function(mat, vec_num_length){
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)

    return(mat)
}

get_tpm  = function(mat, vec_num_length){
    ### normlaized by length
    vec = vec_num_length
    fun = function(x){x/vec}
    mat = apply(mat, 2, fun)
    
    ### normlaized by size
    fun = function(x){x * 10^6 / sum(x)}
    mat = apply(mat, 2, fun)
    
    return(mat)
}



In [60]:
###
dat = dat_cnt
dat = dat %>% dplyr::mutate(Length = ChromEnd - ChromStart)
mat = dat %>% 
   dplyr::select(
       Region, 
       starts_with("Input"), 
       starts_with("Output")) %>% 
   column_to_rownames(var = "Region")

###
mat_tpm  = get_tpm(mat, dat$Length)
mat_fpkm = get_fpkm(mat, dat$Length)

###
lst = list(
    "TPM"  = mat_tpm,
    "FPKM" = mat_fpkm
)

lst = lapply(lst, function(mat){
    dat = mat %>% 
        as.data.frame %>% 
        rownames_to_column(var = "Region") %>% 
        tidyr::gather(Sample, Value, -Region)
    return(dat)
})

###
tmp = dat_col %>% dplyr::select(Sample, Group)
dat = bind_rows(lst, .id = "Method")
dat = dplyr::left_join(dat, tmp, by="Sample")
dat = dat %>%
    dplyr::group_by(Region, Method, Group) %>%
    dplyr::summarize(Mean = mean(Value), .groups = "drop") %>%
    dplyr::mutate(Name = paste(Group, Method, sep = "_")) %>%
    dplyr::select(Region, Name, Mean) %>%
    tidyr::spread(Name, Mean)

###
tmp = dat_cnt %>% dplyr::select(Chrom, ChromStart, ChromEnd, Region)
dat = dplyr::left_join(tmp, dat, by="Region")

### assign and show
dat_score_mean = dat
print(dim(dat))
head(dat)

[1] 150041      8


Chrom,ChromStart,ChromEnd,Region,Input_FPKM,Input_TPM,Output_FPKM,Output_TPM
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
chr1,10038,10405,chr1:10038-10405,0.004164383,3.940038,0.0006671239,0.6488611
chr1,14282,14614,chr1:14282-14614,0.003003325,2.841707,0.0020670469,2.0120603
chr1,16025,16338,chr1:16025-16338,0.004048701,3.830812,0.0012104446,1.1818766
chr1,17288,17689,chr1:17288-17689,0.006551198,6.198372,0.0015784974,1.5431749
chr1,28934,29499,chr1:28934-29499,0.004295316,4.064322,0.0005928433,0.5801715
chr1,115429,115969,chr1:115429-115969,0.015954822,15.096518,0.0260260669,25.3605429


In [61]:
txt_assay = "STARR_ATAC_K562_Reddy_KS91"
txt_fdiry = file.path(
    FD_RES, 
    "assay_fcc", 
    txt_assay, 
    "coverage", 
    "fcc_astarr_macs", 
    "astarr_macs_input_overlap",
    "summary")

txt_fname = "result.score.mean.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_score_mean
write_tsv(dat, txt_fpath)