**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
suppressMessages(suppressWarnings(library("DESeq2")))
suppressMessages(suppressWarnings(library("edgeR")))
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



## Loop through regions and assays to create count matrix

**Test loop**

In [2]:
txt_region = "fcc_astarr_macs_input_overlap"
txt_fdiry = file.path(
    FD_RES, "region_coverage_fcc",
    txt_region,
    "*",
    "overlap_count",
    "summary"
)
txt_fname = "data.count_column.raw.WGS.rds"
txt_fglob = file.path(txt_fdiry, txt_fname)

vec_txt_fpath = Sys.glob(txt_fglob)
print(vec_txt_fpath)

[1] "/mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_ATAC_K562_Reddy_KS274/overlap_count/summary/data.count_column.raw.WGS.rds"  
[2] "/mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_ATAC_K562_Reddy_KS91/overlap_count/summary/data.count_column.raw.WGS.rds"   
[3] "/mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_ATAC_K562_Reddy_KSMerge/overlap_count/summary/data.count_column.raw.WGS.rds"
[4] "/mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_WHG_K562_Reddy_A001/overlap_count/summary/data.count_column.raw.WGS.rds"    


**Generate effect size**

In [3]:
### init: region folders
txt_fdiry = file.path(FD_RES, "region_coverage_fcc")
vec_txt_region = dir(txt_fdiry)
print(vec_txt_region)
cat("\n")

vec_txt_region = c("astarr_macs_input_overlap", "astarr_macs_input_union")

### init: assay name
vec_txt_assay = c(
    "STARR_ATAC_K562_Reddy_KS91",
    "STARR_ATAC_K562_Reddy_KS274",
    "STARR_ATAC_K562_Reddy_KSMerge",
    "STARR_WHG_K562_Reddy_A001"
)

fun_get_assay = function(txt){
    res = fun_str_map_detect(txt, vec_txt_assay, vec_txt_assay)
    return(res)
}

### loop
for (txt_region in vec_txt_region) {
    
    ### init: get RDS files w/ count matrix and column information
    txt_fdiry = file.path(FD_RES, "region_coverage_fcc", txt_region, "STARR*", "overlap_count", "summary")
    txt_fname = "data.count_column.raw.WGS.rds"
    txt_fglob = file.path(txt_fdiry, txt_fname)
    
    vec_txt_fpath_inp = Sys.glob(txt_fglob)
    
    ### loop
    for (txt_fpath_inp in vec_txt_fpath_inp){

        ### read object
        lst_dat = readRDS(txt_fpath_inp)
        dat_cnt = lst_dat$data_cnt
        dat_col = lst_dat$data_col
        
        ### arrange column data
        dat_col = dat_col  %>% 
            dplyr::mutate(Group = factor(Group, levels = c("Input", "Output"))) %>%
            column_to_rownames(var = "Sample")
        
        ### arrange count matrix
        mat_tot = dat_cnt %>%
            dplyr::select(-Chrom, -ChromStart, -ChromEnd) %>%
            column_to_rownames(var = "Region")
        mat_inp = mat_tot %>% dplyr::select(starts_with("Input"))
        mat_out = mat_tot %>% dplyr::select(starts_with("Output"))

        ### create list of DGE objects for total, input, and output
        lst = list()
        lst[["Total"]]  = DGEList(counts=mat_tot, group = dat_col$Group)
        lst[["Input"]]  = DGEList(counts=mat_inp, group = rep("Input",  ncol(mat_inp)))
        lst[["Output"]] = DGEList(counts=mat_out, group = rep("Output", ncol(mat_out)))
        lst_edger = lst

        ### filter counts using filterByExpr from edgeR
        dge = lst_edger[["Total"]]
        idx = filterByExpr(dge)

        ### Check: index matched when filtering
        x = names(idx)
        y = row.names(mat_tot)
        if (!all(x == y)) {stop("Error index")}

        ### if passed: filter matrix
        mat_cnt = mat_tot[idx,]

        ### show progress:
        txt_fpath = txt_fpath_inp
        txt_fname = basename(txt_fpath)
        txt_assay = fun_get_assay(txt_fpath)
        
        cat("\n====================\n")
        cat("Region:", txt_region, "\n")
        cat("Assay: ", txt_assay,  "\n")
        cat("FPath: ", txt_fpath,  "\n")
        cat("FName: ", txt_fname,  "\n")
        cat("Import Counts:\n")
        cat("#Rows (Before filter):", nrow(dat_cnt), "\n")
        cat("#Rows (After  filter):", nrow(mat_cnt), "\n")
        cat("\n")
        flush.console()

        ### create DESeq2 object
        if (str_detect(txt_assay, "KSMerge")){
            cat("Apply DESeq2: ~Group+Prefix", "\n")
            dds = DESeqDataSetFromMatrix(
                countData = mat_cnt, 
                colData   = dat_col, 
                design    = ~Group+Prefix)
        } else {
            cat("Apply DESeq2: ~Group", "\n")
            dds = DESeqDataSetFromMatrix(
                countData = mat_cnt, 
                colData   = dat_col, 
                design    = ~Group)
        }

        ### ensure Input group is set as reference
        dds$Group <- relevel(dds$Group, ref = "Input")

        ### run deseq analysis
        dds = DESeq(dds)

        ### save deseq object
        txt_fdiry = dirname(txt_fpath_inp)
        txt_fname = "data.deseq2.rds"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        
        obj = dds
        saveRDS(obj, txt_fpath)

        ### save DGEList object
        txt_fdiry = dirname(txt_fpath_inp)
        txt_fname = "data.edger.rds"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        
        obj = lst_edger
        saveRDS(obj, txt_fpath)
    }
}

[1] "atac_ENCFF333TAT"              "atac_ENCFF558BLC"             
[3] "atac_ENCFF925CYR"              "atac_ENCFF948AFM"             
[5] "dnase_ENCFF185XRG"             "dnase_ENCFF274YGF"            
[7] "fcc_astarr_macs_input_overlap" "fcc_astarr_macs_input_union"  


Region: fcc_astarr_macs_input_overlap 
Assay:  STARR_ATAC_K562_Reddy_KS274 
FPath:  /mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_ATAC_K562_Reddy_KS274/overlap_count/summary/data.count_column.raw.WGS.rds 
FName:  data.count_column.raw.WGS.rds 
Import Counts:
#Rows (Before filter): 150042 
#Rows (After  filter): 150041 

Apply DESeq2: ~Group 


estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing




Region: fcc_astarr_macs_input_overlap 
Assay:  STARR_ATAC_K562_Reddy_KS91 
FPath:  /mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_ATAC_K562_Reddy_KS91/overlap_count/summary/data.count_column.raw.WGS.rds 
FName:  data.count_column.raw.WGS.rds 
Import Counts:
#Rows (Before filter): 150041 
#Rows (After  filter): 150040 

Apply DESeq2: ~Group 


estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing




Region: fcc_astarr_macs_input_overlap 
Assay:  STARR_ATAC_K562_Reddy_KSMerge 
FPath:  /mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_ATAC_K562_Reddy_KSMerge/overlap_count/summary/data.count_column.raw.WGS.rds 
FName:  data.count_column.raw.WGS.rds 
Import Counts:
#Rows (Before filter): 150042 
#Rows (After  filter): 150040 

Apply DESeq2: ~Group+Prefix 


“some variables in design formula are characters, converting to factors”
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing




Region: fcc_astarr_macs_input_overlap 
Assay:  STARR_WHG_K562_Reddy_A001 
FPath:  /mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/STARR_WHG_K562_Reddy_A001/overlap_count/summary/data.count_column.raw.WGS.rds 
FName:  data.count_column.raw.WGS.rds 
Import Counts:
#Rows (Before filter): 150030 
#Rows (After  filter): 146480 

Apply DESeq2: ~Group 


estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

