**Set environment**

In [1]:
### Load the project configuration script; the FD_* path variables and show_env()
### used below are presumably defined there (NOTE(review): the tidyverse helpers
### used later -- %>%, bind_rows, write_tsv, ... -- are also assumed to come from
### this script; confirm).
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### Load the differential-analysis packages quietly (startup messages suppressed)
suppressMessages(suppressWarnings(library("DESeq2")))
suppressMessages(suppressWarnings(library("edgeR")))
### Print the environment / directory summary
show_env()

You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 



**Set global variables**

In [4]:
### Show every entry found under the FCC assay results folder, one per line
txt_fdiry = file.path(FD_RES, "assay_fcc")
vec = dir(txt_fdiry)
for (txt in vec) {
    cat(txt, "\n")
}

CRISPRi_FlowFISH_K562_Riley_JinWoo 
CRISPRi_Growth_K562_Gersbach_JinWoo 
MPRA_Lenti_K562_Nadav_Vikram 
MPRA_Tiling_K562_Tewhey_Hannah 
STARR_ATAC_K562_Reddy_KS274 
STARR_ATAC_K562_Reddy_KS91 
STARR_ATAC_K562_Reddy_KSMerge 
STARR_WHG_K562_Reddy_A001 


In [5]:
### Alternative assay selection kept for reference (currently disabled):
#VEC_TXT_ASSAY = c(
#    "STARR_ATAC_K562_Reddy_KS91",
#    "STARR_ATAC_K562_Reddy_KS274",
#    "STARR_ATAC_K562_Reddy_KSMerge",
#    "STARR_WHG_K562_Reddy_A001"
#)

### Assay folder processed by this notebook
TXT_ASSAY = "MPRA_Tiling_K562_Tewhey_Hannah"
### File-name prefixes (one per batch) looped over in the Execute section
VEC_TXT_PREFIX = c("OL13", "OL43", "OL45")

**Helper functions**

In [10]:
get_fpkm = function(mat, vec_num_length){
    ### Normalize a count table by library size first, then by region length.
    ###
    ### Arguments:
    ###   mat            numeric matrix / data frame of counts (rows = regions,
    ###                  columns = samples)
    ###   vec_num_length region lengths, one per row of `mat` (a single value
    ###                  is also accepted and applied to every row)
    ### Returns:
    ###   numeric matrix with the same dimensions and dimnames as `mat`
    ###
    ### NOTE(review): lengths are used as given (no division by 1000), so with
    ### lengths in bp the values are per base rather than per kilobase; kept
    ### as-is so previously written tables stay comparable -- confirm intent.

    ### work on a matrix so that a single-row input is not collapsed to a
    ### vector (apply() would do that and break the second step)
    mat = as.matrix(mat)
    
    ### refuse silent recycling of a mismatched length vector
    stopifnot(length(vec_num_length) %in% c(1, nrow(mat)))

    ### normalized by size: scale each column (sample) to counts per million
    mat = sweep(mat * 10^6, 2, colSums(mat), FUN = "/")
    
    ### normalized by length: the vector is recycled down each column,
    ### i.e. row i is divided by vec_num_length[i]
    mat = mat / vec_num_length

    return(mat)
}

get_tpm  = function(mat, vec_num_length){
    ### Normalize a count table by region length first, then by library size,
    ### so that every column (sample) sums to one million.
    ###
    ### Arguments:
    ###   mat            numeric matrix / data frame of counts (rows = regions,
    ###                  columns = samples)
    ###   vec_num_length region lengths, one per row of `mat` (a single value
    ###                  is also accepted and applied to every row)
    ### Returns:
    ###   numeric matrix with the same dimensions and dimnames as `mat`

    ### work on a matrix so that a single-row input is not collapsed to a
    ### vector (apply() would do that and break the second step)
    mat = as.matrix(mat)
    
    ### refuse silent recycling of a mismatched length vector
    stopifnot(length(vec_num_length) %in% c(1, nrow(mat)))

    ### normalized by length: the vector is recycled down each column,
    ### i.e. row i is divided by vec_num_length[i]
    mat = mat / vec_num_length
    
    ### normalized by size: rescale each column to a total of 10^6
    mat = sweep(mat * 10^6, 2, colSums(mat), FUN = "/")
    
    return(mat)
}

## Execute

In [11]:
### Peek at the summary files available for the chosen region set and assay
txt_region = "fcc_astarr_macs_input_overlap"
txt_assay  = TXT_ASSAY

### <results>/region_coverage_fcc/<region>/<assay>/overlap_count/summary
txt_fdiry = file.path(FD_RES, "region_coverage_fcc", txt_region, txt_assay, "overlap_count", "summary")

### one file name per line
for (txt in dir(txt_fdiry)) {
    cat(txt, "\n")
}

data.count_column.norm.OL13.rds 
data.count_column.norm.OL43.rds 
data.count_column.norm.OL45.rds 
data.count_column.raw.OL13.rds 
data.count_column.raw.OL43.rds 
data.count_column.raw.OL45.rds 
data.deseq2.OL13.rds 
data.deseq2.OL43.rds 
data.deseq2.OL45.rds 
data.edger.OL13.rds 
data.edger.OL43.rds 
data.edger.OL45.rds 
matrix.count.norm.OL13.tsv 
matrix.count.norm.OL43.tsv 
matrix.count.norm.OL45.tsv 
matrix.count.raw.OL13.tsv 
matrix.count.raw.OL43.tsv 
matrix.count.raw.OL45.tsv 


In [12]:
### Inputs for the run: assay, batch prefixes, and region sets to process
txt_assay      = TXT_ASSAY
vec_txt_prefix = VEC_TXT_PREFIX
vec_txt_region = c("fcc_astarr_macs_input_overlap") #, "astarr_macs_input_union")

### Collectors for per-prefix results (concatenated in a later cell).
### NOTE(review): entries are keyed by prefix only, so they would be overwritten
### if vec_txt_region ever held more than one region set.
lst_dat_screen     = list()
lst_dat_dds_result = list()
lst_dat_summary    = list()

### execute: for every prefix x region set, read the stored count / DESeq2 / edgeR
### objects, derive the screening flags, normalized counts and summaries, and
### write them next to the inputs
for (txt_prefix in vec_txt_prefix){
    for (txt_region_label in vec_txt_region) {
        
        ### show progress
        cat("Assay: ", txt_assay,         "\n")
        cat("Prefix:", txt_prefix,        "\n")
        cat("Region:", txt_region_label,  "\n")
        cat("\n")
        flush.console()

        ### set file directory (used for both reading inputs and writing outputs)
        txt_fdiry = file.path(
            FD_RES, 
            "region_coverage_fcc",
            txt_region_label,
            txt_assay, 
            "overlap_count", 
            "summary"
        )
                
        ### get raw counts (data_cnt), sample annotation (data_col),
        ### and the region coordinate columns used as the left table of all joins
        txt_fname = paste("data.count_column.raw", txt_prefix, "rds", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)

        lst_dat = readRDS(txt_fpath)
        dat_cnt = lst_dat$data_cnt
        dat_col = lst_dat$data_col
        dat_reg = dat_cnt %>% dplyr::select(Chrom, ChromStart, ChromEnd, Region)
        
        ### get DESeq2 object
        txt_fname = paste("data.deseq2", txt_prefix, "rds", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)

        dds = readRDS(txt_fpath)
        
        ### get edgeR object(s); handled below as a list with one element per "Set"
        txt_fname = paste("data.edger", txt_prefix, "rds", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)

        lst_dge = readRDS(txt_fpath)

        ### ========================================
        ### Calculate: Screened
        ### ----------------------------------------
        
        ### Flag, per element of lst_dge, which regions pass edgeR's filterByExpr.
        ### NOTE(review): filterByExpr is called without group/design; the recorded
        ### warning "All samples appear to belong to the same group." presumably
        ### originates here -- confirm this is the intended filtering.
        lst = lst_dge
        lst = lapply(lst, function(dge){
            idx = filterByExpr(dge)
            dat = data.frame(
                Region = names(idx),
                Screen = idx
            )
            return(dat)
        })
        
        ### one logical column per Set, joined onto the region coordinates
        dat = bind_rows(lst, .id = "Set")
        dat = dat %>% tidyr::spread(Set, Screen)
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        
        dat_screen = dat
        
        ### ========================================
        ### Calculate DESeq2 Counts & Log2FC
        ### ----------------------------------------

        ### DESeq2 normalized counts
        mat = counts(dds, normalized=TRUE)
        dat = mat %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")

        mat_dds_count = mat
        dat_dds_count = dat

        ### DESeq2 results for the contrast Output vs Input of the "Group" factor
        res = results(dds, contrast = c("Group", "Output", "Input"))
        dat = res %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        
        dat_dds_result = dat
        
        ### ========================================
        ### Calculate CPM
        ### ----------------------------------------
        ### (placeholder: no CPM is computed in this cell)
        
        ### ========================================
        ### Calculate TPM & FPKM
        ### ----------------------------------------

        ### init and get region length
        dat = dat_cnt
        dat = dat %>% dplyr::mutate(Length = ChromEnd - ChromStart)

        ### arrange count into matrix (rows = regions, columns = Input*/Output* samples)
        mat = dat %>% 
           dplyr::select(
               Region, 
               starts_with("Input"), 
               starts_with("Output")) %>% 
           column_to_rownames(var = "Region")
        
        ### calculate TPM and FPKM; dat$Length comes from the same table as mat,
        ### so lengths and matrix rows are in the same order
        mat_tpm  = get_tpm(mat, dat$Length)
        mat_fpkm = get_fpkm(mat, dat$Length)
        
        ### arrange matrix into dataframe
        mat = mat_tpm
        dat = mat %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        
        dat_tpm = dat
        
        mat = mat_fpkm
        dat = mat %>% as.data.frame %>% rownames_to_column("Region")
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        
        dat_fpkm = dat
        
        ### ========================================
        ### Calculate mean TPM & FPKM
        ### ----------------------------------------
        
        ### convert to tidy data: one row per (Method, Region, Sample)
        lst = list(
            "TPM"   = mat_tpm,
            "FPKM"  = mat_fpkm,
            "DESeq" = mat_dds_count
        )
        lst = lapply(lst, function(mat){
            dat = mat %>% 
                as.data.frame %>% 
                rownames_to_column(var = "Region") %>% 
                tidyr::gather(Sample, Value, -Region)
            return(dat)
        })
        dat = bind_rows(lst, .id = "Method")
        
        ### calculate mean by group (Input & Output);
        ### result has one column per <Group>_<Method> combination
        tmp = dat_col %>% dplyr::select(Sample, Group)
        dat = dplyr::left_join(dat, tmp, by="Sample")
        dat = dat %>%
            dplyr::group_by(Region, Method, Group) %>%
            dplyr::summarize(Mean = mean(Value), .groups = "drop") %>%
            dplyr::mutate(Name = paste(Group, Method, sep = "_")) %>%
            dplyr::select(Region, Name, Mean) %>%
            tidyr::spread(Name, Mean)
        
        dat = dplyr::left_join(dat_reg, dat, by = "Region")
        dat_summary = dat

        ### ========================================
        ### Save results
        ### ----------------------------------------

        ### collect per-prefix tables for the cross-batch concatenation
        lst_dat_screen[[txt_prefix]]     = dat_screen
        lst_dat_dds_result[[txt_prefix]] = dat_dds_result
        lst_dat_summary[[txt_prefix]]    = dat_summary
        
        ### per-sample TPM table
        txt_fname = paste("matrix.count.TPM", txt_prefix, "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_tpm
        write_tsv(dat, txt_fpath)
        
        ### per-sample FPKM table
        txt_fname = paste("matrix.count.FPKM", txt_prefix, "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_fpkm
        write_tsv(dat, txt_fpath)
        
        ### per-sample DESeq2 normalized counts
        txt_fname = paste("matrix.count.deseq", txt_prefix, "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_dds_count
        write_tsv(dat, txt_fpath)
        
        ### DESeq2 result table (Output vs Input)
        txt_fname = paste("result.coverage.Log2FC.deseq", txt_prefix, "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_dds_result
        write_tsv(dat, txt_fpath)
        
        ### group means of TPM / FPKM / DESeq2 counts
        txt_fname = paste("result.coverage.summary", txt_prefix, "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_summary
        write_tsv(dat, txt_fpath)
        
        ### filterByExpr screening flags
        txt_fname = paste("result.coverage.screened", txt_prefix, "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat = dat_screen
        write_tsv(dat, txt_fpath)
    }
}

Assay:  MPRA_Tiling_K562_Tewhey_Hannah 
Prefix: OL13 
Region: fcc_astarr_macs_input_overlap 



“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


Assay:  MPRA_Tiling_K562_Tewhey_Hannah 
Prefix: OL43 
Region: fcc_astarr_macs_input_overlap 



“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


Assay:  MPRA_Tiling_K562_Tewhey_Hannah 
Prefix: OL45 
Region: fcc_astarr_macs_input_overlap 



“All samples appear to belong to the same group.”
“All samples appear to belong to the same group.”


In [34]:
### concatenate batches: stack the per-prefix tables collected in the loop and
### record the originating prefix in a "Batch" column
lst = lst_dat_screen
dat = bind_rows(lst, .id = "Batch")
dat_screen_merge = dat

lst = lst_dat_dds_result
dat = bind_rows(lst, .id = "Batch")
dat_dds_result_merge = dat

lst = lst_dat_summary
dat = bind_rows(lst, .id = "Batch")
dat_summary_merge = dat

### write the MERGED tables (previously the single-batch objects left over from
### the last loop iteration were written, so the files held one batch only).
### NOTE: txt_fdiry is the summary directory set by the preceding loop cell.

### merged DESeq2 result table (Output vs Input)
txt_fname = paste("result.coverage.Log2FC.deseq", "tsv", sep = ".")
txt_fpath = file.path(txt_fdiry, txt_fname)
dat = dat_dds_result_merge
write_tsv(dat, txt_fpath)

### merged group means of TPM / FPKM / DESeq2 counts
txt_fname = paste("result.coverage.summary", "tsv", sep = ".")
txt_fpath = file.path(txt_fdiry, txt_fname)
dat = dat_summary_merge
write_tsv(dat, txt_fpath)

### merged screening flags
txt_fname = paste("result.coverage.screened", "tsv", sep = ".")
txt_fpath = file.path(txt_fdiry, txt_fname)
dat = dat_screen_merge
write_tsv(dat, txt_fpath)

## Review

In [13]:
### DESeq2 normalized counts; the object is the one left by the final iteration of the loop above
head(dat_dds_count, 3)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11,4551522,4551988,chr11:4551522-4551988,3716.401,3689.427,3781.9,3471.689,3477.805,3062.594,3092.089,3827.598
chr11,4554256,4554817,chr11:4554256-4554817,4330.065,4009.521,4027.212,4348.617,18448.857,22620.272,25658.699,17362.778
chr11,4577505,4578002,chr11:4577505-4578002,4143.71,4283.688,4479.677,4265.496,2262.879,2264.222,2298.794,2077.526


In [14]:
### DESeq2 Output-vs-Input results; the object is the one left by the final iteration of the loop above
head(dat_dds_result, 3)

Chrom,ChromStart,ChromEnd,Region,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11,4551522,4551988,chr11:4551522-4551988,3514.938,-0.1224555,0.06977571,-1.754988,0.07926142,0.08515946
chr11,4554256,4554817,chr11:4554256-4554817,12600.753,2.3306759,0.13529796,17.226245,1.68756e-66,3.5887900000000004e-66
chr11,4577505,4578002,chr11:4577505-4578002,3259.499,-0.9483542,0.04568293,-20.759486,1.006372e-95,2.5563090000000002e-95


In [15]:
### Per-sample TPM; the object is the one left by the final iteration of the loop above
head(dat_tpm, 3)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11,4551522,4551988,chr11:4551522-4551988,1208.863,1205.974,1231.235,1128.986,518.0909,485.576,485.0615,543.7931
chr11,4554256,4554817,chr11:4554256-4554817,1169.962,1088.665,1089.077,1174.686,2282.9334,2979.124,3343.509,2049.0362
chr11,4577505,4578002,chr11:4577505-4578002,1263.785,1312.884,1367.437,1300.608,316.076,336.6017,338.1227,276.7472


In [16]:
### Per-sample FPKM; the object is the one left by the final iteration of the loop above
head(dat_fpkm, 3)

Chrom,ChromStart,ChromEnd,Region,Input.rep1,Input.rep2,Input.rep3,Input.rep4,Output.rep1,Output.rep2,Output.rep3,Output.rep4
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11,4551522,4551988,chr11:4551522-4551988,1.610915,1.608294,1.642112,1.505946,0.6019377,0.5733794,0.5738492,0.6303662
chr11,4554256,4554817,chr11:4554256-4554817,1.559077,1.451851,1.452514,1.566905,2.6523983,3.5178187,3.955519,2.3752475
chr11,4577505,4578002,chr11:4577505-4578002,1.684104,1.75087,1.823766,1.734873,0.3672291,0.3974671,0.4000141,0.320806


In [17]:
### Group means per method; the object is the one left by the final iteration of the loop above
head(dat_summary, 3)

Chrom,ChromStart,ChromEnd,Region,Input_DESeq,Input_FPKM,Input_TPM,Output_DESeq,Output_FPKM,Output_TPM
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11,4551522,4551988,chr11:4551522-4551988,3664.854,1.591817,1193.764,3365.022,0.5948831,508.1304
chr11,4554256,4554817,chr11:4554256-4554817,4178.854,1.507587,1130.598,21022.651,3.1252459,2663.6506
chr11,4577505,4578002,chr11:4577505-4578002,4293.143,1.748403,1311.179,2225.855,0.3713791,316.8869
