**Set environment**

In [1]:
### load the project configuration quietly
### NOTE(review): FD_* paths and helpers such as show_env() are not defined in
### this notebook; presumably they come from this sourced script -- confirm
suppressMessages(suppressWarnings(
    source("../run_config_project_sing.R")
))

### attach the analysis packages without startup messages / warnings
invisible(lapply(
    c("DESeq2", "edgeR"),
    function(txt_pkg) {
        suppressMessages(suppressWarnings(
            library(txt_pkg, character.only = TRUE)
        ))
    }
))

### print the directory layout of the current environment
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 



## Loop through regions and assays to build DESeq2 and edgeR objects from the count matrices

**Test loop**

In [2]:
### pick one region / assay pair to check that the glob pattern finds the inputs
txt_region = "fcc_astarr_macs_input_overlap"
txt_assay  = "MPRA_Tiling_K562_Tewhey_Hannah"

### summary folder holding the overlap counts of this pair
txt_fdiry = file.path(FD_RES, "region_coverage_fcc", txt_region, txt_assay, "overlap_count", "summary")

### glob pattern for the raw count RDS files
txt_fname = "data*raw*rds"
txt_fglob = file.path(txt_fdiry, txt_fname)

### list every file matched by the pattern
vec_txt_fpath = Sys.glob(txt_fglob)
print(vec_txt_fpath)

[1] "/mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/MPRA_Tiling_K562_Tewhey_Hannah/overlap_count/summary/data.count_column.raw.OL13.rds"
[2] "/mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/MPRA_Tiling_K562_Tewhey_Hannah/overlap_count/summary/data.count_column.raw.OL43.rds"
[3] "/mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/MPRA_Tiling_K562_Tewhey_Hannah/overlap_count/summary/data.count_column.raw.OL45.rds"


**Generate effect size**

In [None]:
### init: region folders
### (all region folders are listed for reference; only the one set below is processed)
txt_fdiry = file.path(FD_RES, "region_coverage_fcc")
vec_txt_region = dir(txt_fdiry)
print(vec_txt_region)
cat("\n")

vec_txt_region = "fcc_astarr_macs_input_overlap"

### init: assay name
txt_assay  = "MPRA_Tiling_K562_Tewhey_Hannah"

### init: assay prefix
vec_txt_prefix = c("OL13", "OL43", "OL45")

### helper: map a file name to its library prefix
### NOTE(review): fun_str_map_detect is not defined in this notebook (presumably
### provided by the sourced project config); it is assumed to return the entry of
### vec_txt_prefix that is detected in `txt` -- confirm against its definition
fun_get_prefix = function(txt){
    res = fun_str_map_detect(txt, vec_txt_prefix, vec_txt_prefix)
    return(res)
}

### loop: for each region, read every raw count RDS, fit DESeq2 (~Group),
### and save the DESeq2 object and the edgeR DGEList objects next to the input
for (txt_region in vec_txt_region) {

    ### init: get RDS files w/ count matrix and column information
    txt_fdiry = file.path(FD_RES, "region_coverage_fcc", txt_region, txt_assay, "overlap_count", "summary")
    txt_fname = "data*count_column*raw*rds"
    txt_fglob = file.path(txt_fdiry, txt_fname)

    vec_txt_fpath_inp = Sys.glob(txt_fglob)

    ### guard: report a region without inputs instead of silently skipping it
    if (length(vec_txt_fpath_inp) == 0) {
        warning("No input RDS files matched: ", txt_fglob, call. = FALSE)
        next
    }

    ### loop
    for (txt_fpath_inp in vec_txt_fpath_inp){

        ### read object
        lst_dat = readRDS(txt_fpath_inp)
        dat_cnt = lst_dat$data_cnt
        dat_col = lst_dat$data_col

        ### arrange column data (Input listed first so it is the reference level)
        dat_col = dat_col  %>%
            dplyr::mutate(Group = factor(Group, levels = c("Input", "Output"))) %>%
            column_to_rownames(var = "Sample")

        ### arrange count matrix (rows = regions, columns = samples)
        mat_tot = dat_cnt %>%
            dplyr::select(-Chrom, -ChromStart, -ChromEnd) %>%
            column_to_rownames(var = "Region")
        mat_inp = mat_tot %>% dplyr::select(starts_with("Input"))
        mat_out = mat_tot %>% dplyr::select(starts_with("Output"))

        ### align column data to the count columns: dat_col$Group is used below as
        ### the per-column group vector, so its row order must follow the columns
        vec_txt_sample = colnames(mat_tot)
        if (setequal(rownames(dat_col), vec_txt_sample)) {
            dat_col = dat_col[vec_txt_sample, , drop = FALSE]
        } else {
            warning(
                "Sample names of data_col do not match the count columns; ",
                "assuming positional order: ", txt_fpath_inp,
                call. = FALSE
            )
        }

        ### create list of DGE objects for total, input, and output
        lst_edger = list(
            Total  = DGEList(counts = mat_tot, group = dat_col$Group),
            Input  = DGEList(counts = mat_inp, group = rep("Input",  ncol(mat_inp))),
            Output = DGEList(counts = mat_out, group = rep("Output", ncol(mat_out)))
        )

        ### filter counts using filterByExpr from edgeR
        dge = lst_edger[["Total"]]
        idx = filterByExpr(dge)

        ### Check: index matched when filtering
        if (!all(names(idx) == row.names(mat_tot))) {stop("Error index")}

        ### if passed: filter matrix
        mat_cnt = mat_tot[idx,]

        ### show progress:
        txt_fpath  = txt_fpath_inp
        txt_fname  = basename(txt_fpath)
        txt_prefix = fun_get_prefix(txt_fname)

        cat("\n====================\n")
        cat("Region:", txt_region, "\n")
        cat("Assay: ", txt_assay,  "\n")
        cat("Prefix:", txt_prefix, "\n")
        cat("FPath: ", txt_fpath,  "\n")
        cat("FName: ", txt_fname,  "\n")
        cat("Import Counts:\n")
        cat("#Rows (Before filter):", nrow(dat_cnt), "\n")
        cat("#Rows (After  filter):", nrow(mat_cnt), "\n")
        cat("\n")
        flush.console()

        ### guard: the prefix becomes part of the output file names, so stop
        ### before the model fit if it could not be resolved to a single value
        if (length(txt_prefix) != 1 || is.na(txt_prefix)) {
            stop("Cannot determine assay prefix from file name: ", txt_fname)
        }

        ### create dds object
        cat("Apply DESeq2: ~Group", "\n")
        dds = DESeqDataSetFromMatrix(
            countData = mat_cnt,
            colData   = dat_col,
            design    = ~Group)

        ### ensure Input group is set as reference
        dds$Group <- relevel(dds$Group, ref = "Input")

        ### run deseq analysis
        dds = DESeq(dds)

        ### save deseq object (same folder as the input file)
        txt_fdiry = dirname(txt_fpath_inp)
        txt_fname = paste("data", "deseq2", txt_prefix, "rds", sep=".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        saveRDS(dds, txt_fpath)

        ### save DGEList object (same folder as the input file)
        txt_fname = paste("data", "edger", txt_prefix, "rds", sep=".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        saveRDS(lst_edger, txt_fpath)
    }
}

[1] "atac_ENCFF333TAT"              "atac_ENCFF558BLC"             
[3] "atac_ENCFF925CYR"              "atac_ENCFF948AFM"             
[5] "dnase_ENCFF185XRG"             "dnase_ENCFF274YGF"            
[7] "fcc_astarr_macs_input_overlap" "fcc_astarr_macs_input_union"  


Region: fcc_astarr_macs_input_overlap 
Assay:  MPRA_Tiling_K562_Tewhey_Hannah 
Prefix: OL13 
FPath:  /mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/MPRA_Tiling_K562_Tewhey_Hannah/overlap_count/summary/data.count_column.raw.OL13.rds 
FName:  data.count_column.raw.OL13.rds 
Import Counts:
#Rows (Before filter): 18 
#Rows (After  filter): 18 

Apply DESeq2: ~Group 


estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.

final dispersion estimates

fitting model and testing




Region: fcc_astarr_macs_input_overlap 
Assay:  MPRA_Tiling_K562_Tewhey_Hannah 
Prefix: OL43 
FPath:  /mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/MPRA_Tiling_K562_Tewhey_Hannah/overlap_count/summary/data.count_column.raw.OL43.rds 
FName:  data.count_column.raw.OL43.rds 
Import Counts:
#Rows (Before filter): 307 
#Rows (After  filter): 307 

Apply DESeq2: ~Group 


estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing




Region: fcc_astarr_macs_input_overlap 
Assay:  MPRA_Tiling_K562_Tewhey_Hannah 
Prefix: OL45 
FPath:  /mount/repo/Proj_ENCODE_FCC/results/region_coverage_fcc/fcc_astarr_macs_input_overlap/MPRA_Tiling_K562_Tewhey_Hannah/overlap_count/summary/data.count_column.raw.OL45.rds 
FName:  data.count_column.raw.OL45.rds 
Import Counts:
#Rows (Before filter): 823 
#Rows (After  filter): 823 

Apply DESeq2: ~Group 


estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.

final dispersion estimates

fitting model and testing



In [64]:
### read a previously generated DESeq2 Log2FC table for OL13 to compare against
### NOTE(review): hard-coded absolute path outside the project folders (FD_*);
### it only resolves where /data/reddylab is accessible -- consider copying the
### file under the project references and building the path with file.path()
txt_fpath = "/data/reddylab/Kuei/work/proj_encode_fcc/results_v4/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/summary/result.Log2FC.raw.deseq.OL13_20220512.tsv"

### show_col_types = FALSE keeps the column-specification message out of the output
dat = read_tsv(txt_fpath, show_col_types = FALSE)
head(dat)

[1mRows: [22m[34m22[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m─────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): Peak
[32mdbl[39m (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Peak,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11:61792068-61793464,260242.12,1.6665654,0.03671583,45.390919,0.0,0.0
chr11:61800085-61801113,216437.58,1.8554038,0.04389284,42.271214,0.0,0.0
chr11:61806630-61807154,49327.33,-1.3562271,0.07733082,-17.537989,7.34817e-69,1.347165e-68
chr11:61814735-61817343,682819.52,2.5310145,0.03600593,70.294376,0.0,0.0
chr11:61822094-61822443,20363.1,-1.7731459,0.16511332,-10.738963,6.679004000000001e-27,8.163227000000001e-27
chr11:61825795-61826306,29539.4,-0.3921375,0.05359585,-7.316565,2.543988e-13,2.66513e-13


In [67]:
### load the DESeq2 object saved for OL13 rather than using `dds` left in the
### kernel: once the loop above has finished, `dds` holds the last prefix
### processed (OL45), which would not match the OL13 regions shown here
txt_fpath_dds = file.path(
    FD_RES, "region_coverage_fcc",
    "fcc_astarr_macs_input_overlap",
    "MPRA_Tiling_K562_Tewhey_Hannah",
    "overlap_count",
    "summary",
    "data.deseq2.OL13.rds"
)
dds_ol13 = readRDS(txt_fpath_dds)

### default results(): last level of Group (Output) vs. the reference (Input)
dat = as.data.frame(results(dds_ol13))
head(dat)

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr11:61792068-61793464,260242.12,1.6665705,0.04117109,40.479145,0.0,0.0
chr11:61800085-61801113,216437.58,1.8554055,0.0497289,37.310405,1.112851e-304,6.120682e-304
chr11:61806630-61807154,49327.33,-1.3562455,0.08825841,-15.366758,2.735191e-53,4.628784e-53
chr11:61814735-61817343,682819.52,2.5310135,0.03452518,73.309201,0.0,0.0
chr11:61822094-61822443,20363.1,-1.7731145,0.14597377,-12.146802,5.965339e-34,7.71985e-34
chr11:61825795-61826306,29539.4,-0.3921839,0.07941651,-4.938318,7.879932e-07,7.879932e-07
