**Set environment**

In [1]:
### Load the project configuration script quietly.
### NOTE(review): show_env(), the FD_* path variables and the tidyverse helpers
### (read_tsv, %>%, column_to_rownames, ...) used later are not defined in this
### notebook, so presumably they are provided by this script -- TODO confirm.
suppressMessages(suppressWarnings(source("../config/config_sing.R")))
### Attach DESeq2 with its startup messages and warnings suppressed
suppressMessages(suppressWarnings(library("DESeq2")))
### Print the environment / directory summary
show_env()

You are in Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out 
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC 
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc 
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log 


## Import count matrix and metadata to set up DESeq2

In [2]:
### Assay and analysis folder whose summary tables are used in this notebook
PREFIX   = "Tewhey_K562_TileMPRA"
FOLDER   = "coverage_astarrseq_peak_macs_input"

### Show every file currently present in the summary directory
fdiry = file.path(FD_RES, "results", PREFIX, FOLDER, "summary")
print(list.files(fdiry))

 [1] "matrix.raw.count.OL13_20220512.tsv"       
 [2] "matrix.raw.count.OL43_20221003.tsv"       
 [3] "matrix.raw.count.OL45_20220927.tsv"       
 [4] "matrix.raw.cpm.OL13_20220512.tsv"         
 [5] "matrix.raw.cpm.OL43_20221003.tsv"         
 [6] "matrix.raw.cpm.OL45_20220927.tsv"         
 [7] "metadata.raw.OL13_20220512.tsv"           
 [8] "metadata.raw.OL43_20221003.tsv"           
 [9] "metadata.raw.OL45_20220927.tsv"           
[10] "result.Log2FC.raw.cpm.Merge.tsv"          
[11] "result.Log2FC.raw.cpm.OL13_20220512.tsv"  
[12] "result.Log2FC.raw.cpm.OL43_20221003.tsv"  
[13] "result.Log2FC.raw.cpm.OL45_20220927.tsv"  
[14] "result.Log2FC.raw.deseq.Merge.tsv"        
[15] "result.Log2FC.raw.deseq.OL13_20220512.tsv"
[16] "result.Log2FC.raw.deseq.OL43_20221003.tsv"
[17] "result.Log2FC.raw.deseq.OL45_20220927.tsv"


In [3]:
PREFIX   = "Tewhey_K562_TileMPRA"
FOLDER   = "coverage_astarrseq_peak_macs_input"
DATASETS = c("OL13", "OL43", "OL45")

### Sanity check: for each dataset, print the names of the count-matrix and
### metadata files in the summary directory that match the wildcard pattern
fdiry = file.path(FD_RES, "results", PREFIX, FOLDER, "summary")
for (DATASET in DATASETS){
    cat(DATASET, "\n")
    
    ### the lookup is the same for both file types; only the leading stem differs
    for (STEM in c("matrix.raw.count", "metadata.raw")){
        fname = paste(STEM, DATASET, "tsv", sep = "*")
        fglob = file.path(fdiry, fname)
        fpath = Sys.glob(fglob)
        fname = basename(fpath)
        print(fname)
    }
}

OL13 
[1] "matrix.raw.count.OL13_20220512.tsv"
[1] "metadata.raw.OL13_20220512.tsv"
OL43 
[1] "matrix.raw.count.OL43_20221003.tsv"
[1] "metadata.raw.OL43_20221003.tsv"
OL45 
[1] "matrix.raw.count.OL45_20220927.tsv"
[1] "metadata.raw.OL45_20220927.tsv"


In [4]:
PREFIX   = "Tewhey_K562_TileMPRA"
FOLDER   = "coverage_astarrseq_peak_macs_input"
DATASETS = c(
    "OL13_20220512", 
    "OL43_20221003", 
    "OL45_20220927")

### Rebuild the summary directory from the PREFIX/FOLDER set in this cell, so the
### cell does not silently rely on `fdiry` left over from an earlier cell
fdiry = file.path(FD_RES, "results", PREFIX, FOLDER, "summary")

### For each dataset: read the raw count matrix and the sample metadata,
### align them, and build a DESeq2 dataset with the design ~condition.
### Returns a list of DESeqDataSet objects (named by dataset below).
lst = lapply(DATASETS, function(DATASET){
    
    ### Resolve "<stem>*<DATASET>*tsv" inside fdiry to exactly one file;
    ### stop with a clear message when zero or several files match
    find_one = function(stem){
        fname = paste(stem, DATASET, "tsv", sep = "*")
        fglob = file.path(fdiry, fname)
        fpath = Sys.glob(fglob)
        if (length(fpath) != 1){
            stop(
                "Expected exactly one file matching '", fglob, 
                "' but found ", length(fpath))
        }
        return(fpath)
    }
    
    ### show progress
    cat("\n=============================\n")
    cat(DATASET, "\n")
    
    ### Import count matrix
    dat_count = read_tsv(find_one("matrix.raw.count"), show_col_types = FALSE)
    
    cat("\n+++++++++++++++++++++++++++++\n")
    cat("Count matrix", "\n")
    print(head(dat_count))
    
    ### Import metadata
    dat_meta = read_tsv(find_one("metadata.raw"), show_col_types = FALSE)
    
    cat("\n+++++++++++++++++++++++++++++\n")
    cat("Metadata", "\n")
    print(dat_meta)
    
    ### Arrange count matrix and metadata
    ### colData: one row per sample (row names = Sample), column `condition` = Group
    dat_col = dat_meta  %>% 
        dplyr::select(Sample, Group) %>% 
        dplyr::rename(condition = Group) %>%
        column_to_rownames(var = "Sample")

    ### countData: one row per peak (row names = Peak), one column per sample
    dat_cnt = dat_count %>% 
        column_to_rownames(var = "Peak")

    ### missing counts are treated as zero reads
    dat_cnt[is.na(dat_cnt)] = 0
    
    ### checking
    ### identical() is used for the order check because `==` recycles (with a
    ### warning) when the two name vectors differ in length
    samples_meta = rownames(dat_col)
    samples_cnt  = colnames(dat_cnt)
    cat("\n+++++++++++++++++++++++++++++\n")
    cat("Check 01:", all(samples_meta %in% samples_cnt), "\n")
    cat("Check 02:", identical(samples_meta, samples_cnt), "\n")
    
    ### enforce the checks instead of only printing them: both tables must
    ### describe the same set of samples ...
    if (!setequal(samples_meta, samples_cnt)){
        samples_diff = union(
            setdiff(samples_meta, samples_cnt), 
            setdiff(samples_cnt,  samples_meta))
        stop(
            "Samples differ between metadata and count matrix for ", DATASET, 
            ": ", paste(samples_diff, collapse = ", "))
    }
    
    ### ... and the count columns are put in the same order as the metadata
    ### rows, which DESeq2 requires
    dat_cnt = dat_cnt[, samples_meta, drop = FALSE]
    
    ### create DESeq2 object
    dds = DESeqDataSetFromMatrix(
        countData = dat_cnt, 
        colData   = dat_col, 
        design    = ~condition)
    return(dds)
})

names(lst) = DATASETS
lst_dds_setup = lst


OL13_20220512 

+++++++++++++++++++++++++++++
Count matrix 
[90m# A tibble: 6 × 9[39m
  Peak       Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2
  [3m[90m<chr>[39m[23m           [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m       [3m[90m<dbl>[39m[23m       [3m[90m<dbl>[39m[23m
[90m1[39m chr11:617…     [4m1[24m[4m2[24m[4m3[24m964      [4m6[24m[4m9[24m072     [4m1[24m[4m1[24m[4m3[24m487      [4m7[24m[4m1[24m526      [4m4[24m[4m6[24m[4m1[24m111      [4m4[24m[4m6[24m[4m1[24m000
[90m2[39m chr11:618…      [4m9[24m[4m3[24m429      [4m5[24m[4m3[24m063      [4m8[24m[4m3[24m490      [4m5[24m[4m3[24m637      [4m4[24m[4m2[24m[4m4[24m590      [4m3[24m[4m9[24m[4m8[24m992
[90m3[39m chr11:618…      [4m7[24m[4m3[24m252      [4m3[24m[4m9[24m188      [4m6[24m[4m3[24m205      [4m4[24m[4m0[24m131       [4m2[24m[4m9[24

converting counts to integer mode

“some variables in design formula are characters, converting to factors”



OL43_20221003 

+++++++++++++++++++++++++++++
Count matrix 
[90m# A tibble: 6 × 12[39m
  Peak         Input.rep1 Input.rep2 Input.rep3 Input.rep4 Input.rep5 Input.rep6
  [3m[90m<chr>[39m[23m             [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m
[90m1[39m chr8:126778…      [4m1[24m[4m0[24m372      [4m1[24m[4m1[24m304      [4m1[24m[4m0[24m609      [4m1[24m[4m1[24m042       [4m5[24m318       [4m5[24m475
[90m2[39m chr8:126782…       [4m3[24m076       [4m3[24m477       [4m3[24m122       [4m3[24m211       [4m1[24m563       [4m1[24m614
[90m3[39m chr8:126804…      [4m1[24m[4m6[24m184      [4m1[24m[4m8[24m791      [4m1[24m[4m7[24m453      [4m1[24m[4m7[24m754       [4m8[24m529       [4m8[24m533
[90m4[39m chr8:126817…      [4m1[24m[4m0[24m013      [4m1[24m[4m1[24m400      [4m1[24m[4m0[24m

converting counts to integer mode

“some variables in design formula are characters, converting to factors”



OL45_20220927 

+++++++++++++++++++++++++++++
Count matrix 
[90m# A tibble: 6 × 9[39m
  Peak       Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2
  [3m[90m<chr>[39m[23m           [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m       [3m[90m<dbl>[39m[23m       [3m[90m<dbl>[39m[23m
[90m1[39m chr11:328…       [4m5[24m796       [4m5[24m349       [4m5[24m093       [4m5[24m236        [4m4[24m190        [4m3[24m932
[90m2[39m chr11:328…       [4m4[24m925       [4m4[24m182       [4m4[24m315       [4m4[24m301        [4m5[24m557        [4m3[24m395
[90m3[39m chr11:328…       [4m8[24m452       [4m7[24m263       [4m7[24m369       [4m7[24m154       [4m1[24m[4m2[24m228       [4m1[24m[4m0[24m838
[90m4[39m chr11:328…      [4m1[24m[4m3[24m661      [4m1[24m[4m1[24m814      [4m1[24m[4m1[24m553      [4m1[24m[4m1[24m485       [4m3[24m[4m1[2

converting counts to integer mode

“some variables in design formula are characters, converting to factors”


## Preprocess

**Pre-filtering**

In [5]:
### Pre-filter every dataset and fix the reference level of the design factor
lst = lapply(lst_dds_setup, function(dds_obj){
    ### keep only peaks whose total read count over all samples is at least 10
    cat("Before filter:", nrow(dds_obj), "\n")
    keep    = rowSums(counts(dds_obj)) >= 10
    dds_obj = dds_obj[keep, ]
    cat("After  filter:", nrow(dds_obj), "\n")

    ### make "Input" the reference level of `condition`
    dds_obj$condition = relevel(dds_obj$condition, ref = "Input")
    dds_obj
})

lst_dds_filtered = lst

Before filter: 22 
After  filter: 22 
Before filter: 394 
After  filter: 394 
Before filter: 1306 
After  filter: 1306 


In [6]:
### Show the dataset names of the filtered DESeq2 objects. The named result is
### used instead of the scratch variable `lst`, which every cell reassigns, so
### the output does not depend on which cell was executed last.
names(lst_dds_filtered)

## Run DESeq2

In [7]:
### Run the DESeq2 pipeline on each filtered dataset, using the local
### regression fit for the dispersion trend
lst = lapply(lst_dds_filtered, function(dds_obj){
    DESeq(dds_obj, fitType = 'local')
})

lst_dds_run = lst

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



## Get results

In [8]:
### Extract the DESeq2 result table of each dataset as a plain data frame,
### moving the peak identifiers from the row names into a `Peak` column
lst = lapply(lst_dds_run, function(dds_obj){
    tbl = as.data.frame(results(dds_obj))
    rownames_to_column(tbl, var = "Peak")
})

lst_res = lst
print(names(lst))

[1] "OL13_20220512" "OL43_20221003" "OL45_20220927"


## Save results

In [9]:
lst = lst_res

### Write one TSV per dataset into the summary directory
for (idx in names(lst)){
    ### banner with the dataset currently being written
    cat("\n+++++++++++++++++++++++++++++\n")
    cat(idx, "\n")
    
    ### output path: <summary dir>/result.Log2FC.raw.deseq.<dataset>.tsv
    fdiry = file.path(FD_RES, "results", PREFIX, FOLDER, "summary")
    fname = paste(c("result.Log2FC.raw.deseq", idx, "tsv"), collapse = ".")
    fpath = file.path(fdiry, fname)
    print(fpath)
    
    ### result table of this dataset
    res = lst[[idx]]
    write_tsv(res, fpath)
}


+++++++++++++++++++++++++++++
OL13_20220512 
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/summary/result.Log2FC.raw.deseq.OL13_20220512.tsv"

+++++++++++++++++++++++++++++
OL43_20221003 
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/summary/result.Log2FC.raw.deseq.OL43_20221003.tsv"

+++++++++++++++++++++++++++++
OL45_20220927 
[1] "/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage_astarrseq_peak_macs_input/summary/result.Log2FC.raw.deseq.OL45_20220927.tsv"
