# Motif marginal effect in the upstream of PER1 region

**Set environment**

In [1]:
suppressMessages(suppressWarnings(source("config_sing.R")))
print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


**Check input file**

In [2]:
fdiry = file.path(FD_RES, "annotation_fragment")
fname = "target_PER1.bed.gz"
fglob = file.path(fdiry, "*", fname)
system(paste("ls -lh", fglob), intern = TRUE)

**Check environment**

In [3]:
detectCores()

In [4]:
mem_used()

76.9 MB

## Import annotated fragment

In [5]:
###################################################
# Import annotated fragments
###################################################

### set column names and types
ctypes = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())
cnames = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

### set samples
SAMPLES = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

### import bed files for each sample 
fdiry = file.path(FD_RES, "annotation_fragment")
fname = "target_PER1.bed.gz"

lst_dat = lapply(SAMPLES, function(sam){
    ### set path
    fpath = file.path(fdiry, sam, fname)
    print(fpath); flush.console()
    
    ### import data
    dat = read_tsv(fpath, col_types=ctypes, col_names=cnames) %>% mutate(Sample = sam)
    return(dat)
})

### arrange data
dat_ann_frag = bind_rows(lst_dat)

### check environment (before release variables)
print(mem_used())
lst_dat = NULL

### check environment (after release variables)
print(mem_used())

[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input1/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input2/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input3/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input4/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input5/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input1_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input2_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input3_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input4_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input5_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/TFX2_DMSO/target_PER1.bed.gz

In [6]:
head(dat_ann_frag)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>
chr17,8148117,8149012,1,chr17,8148107,8148124,KLF/SP/2,9.0318,7,Input1
chr17,8148117,8149012,1,chr17,8148109,8148121,INSM1,6.1647,4,Input1
chr17,8148117,8149012,1,chr17,8148109,8148129,GC-tract,8.3277,12,Input1
chr17,8148117,8149012,1,chr17,8148123,8148133,GLI,7.4318,10,Input1
chr17,8148117,8149012,1,chr17,8148124,8148139,NR/17,7.8649,15,Input1
chr17,8148117,8149012,1,chr17,8148126,8148137,KLF/SP/1,11.3678,11,Input1


## Import library size

In [7]:
###################################################
# Import library size
###################################################

### Helper function to get
get_group = function(idn_sample){
    idn = idn_sample
    
    idn = str_replace(
        string = idn, 
        pattern = "Input[0-9]", 
        replacement = "Input")
    
    idn = str_remove(
        string = idn, 
        pattern = "_20x")
    
    idn = str_replace(
        string = idn, 
        pattern = "TFX[0-9]_", 
        replacement="TFX_")
    return(idn)
}

### set path
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### import library size
ctypes = c(col_integer(), col_character())
cnames = c("Size", "FPath")
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames)

### remove the total size
dat_lib = dat_lib %>% dplyr::filter(FPath != "total")

### summarize info from the file path
### stackoverflow: Extract only folder name right before filename from full path
dat_lib = dat_lib %>% 
    #mutate(Sample = tools::file_path_sans_ext(basename(FPath))) %>%
    mutate(Sample = basename(dirname(FPath))) %>%
    mutate(Group = get_group(Sample))
dat_lib = dat_lib %>% dplyr::select(Size, Sample, Group)
dat_lib

Size,Sample,Group
<dbl>,<chr>,<chr>
371718546,Input1_20x,Input
18666630,Input1,Input
347635732,Input2_20x,Input
20167924,Input2,Input
349994051,Input3_20x,Input
23280988,Input3,Input
413508358,Input4_20x,Input
19003938,Input4,Input
341110487,Input5_20x,Input
15325016,Input5,Input


## Preprocess
* Filter: at least 10 fragments for a motif
* Filter: fully cover the motif

In [8]:
###################################################
# Preprocess
###################################################
cat("+++++ Preprocess +++++\n")

### Filter: fully cover the motif
dat = dat_ann_frag
dat = dat %>% 
    mutate(Length_MTF = End_MTF - Start_MTF) %>%
    mutate(Length_Dif = Length_MTF - Overlap)

cat("Filtering: fully cover the motif\n")
cat("    Before Filter:", "#Motif =", length(unique(dat$Motif)), "#Annot =", nrow(dat), "\n")
dat = dat %>% dplyr::filter(Length_Dif == 0)
cat("    After  Filter:", "#Motif =", length(unique(dat$Motif)), "#Annot =", nrow(dat), "\n")

### grouped by motif cluster and 
### split the annotated fragments into list
lst    = dat %>% group_by(Motif) %>% group_split
motifs = lapply(lst, function(x){unique(x$Motif)}) %>% unlist
names(lst) = motifs

### get the list
lst_frag = lapply(lst, function(dat){
    tmp = dat %>% 
        group_by(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Motif, Sample) %>%
        summarize(N_Motif = n(), .groups = 'drop')
    return(tmp)
})

### filter out motifs that have almost no/low fragment in total
THRESHOLD = 10
cat("Filtering: filter out motifs with low fragments\n")
cat("    Threshold:", THRESHOLD, "\n")
cat("    Before Filter:", "#Motif =", length(lst_frag), "\n")

lst = lst_frag
cnt = lapply(lst, function(dat){sum(dat$Count_Frag)})
lst = lst[cnt > 10]
lst_frag = lst

cat("    After  Filter:", "#Motif =", length(lst_frag), "\n")

+++++ Preprocess +++++
Filtering: fully cover the motif
    Before Filter: #Motif = 240 #Annot = 5970091 
    After  Filter: #Motif = 240 #Annot = 5817178 
Filtering: filter out motifs with low fragments
    Threshold: 10 
    Before Filter: #Motif = 240 
    After  Filter: #Motif = 240 


## Run Analysis

In [9]:
### arguments
TARGET = "target_PER1"
IS_INPUT20X = TRUE
THRESHOLD = 10
FDIRY  = "test_marginal_pool"
#FDIRY  = "test_marginal_split"

### set samples and path
SAMPLES_TOT = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES_INP = c(
    paste0("Input", 1:5),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES_INP20X = c(
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

if (IS_INPUT20X) {
    SAMPLES = SAMPLES_INP20X
    FDIRY   = paste0(FDIRY, "_input20x")
} else {
    SAMPLES = SAMPLES_INP
}
FD_OUT = file.path(FD_RES, "model_linear", FDIRY, TARGET)
dir.create(FD_OUT, recursive = TRUE)

### set motifs
fdiry  = file.path(FD_RES, "annotation_fragment", SAMPLES[1], TARGET)
fname  = "*_merge.bed.gz"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)

### set column names and types
CTYPES = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())
CNAMES = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

### START message
cat("Target:          ", TARGET, "\n")
cat("Output Directory:", FD_OUT, "\n")
cat("Threshold:       ", THRESHOLD, "\n")

Target:           target_PER1 
Output Directory: /home/mount/work/out/proj_combeffect/model_linear/test_marginal_pool_input20x/target_PER1 
Threshold:        10 


**RUN**

In [10]:
head(names(lst_frag))

In [14]:
### start
#registerDoParallel(10)
timer_start = Sys.time()

lst_tmp = head(lst_frag, 5)
lst_tmp = foreach(idn = names(lst_tmp), .combine = append) %do% {
    
    ### extract
    dat = lst_tmp[[idn]]
    
    ### get fragments
    dat = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct()

    ### count
    dat = dat %>% group_by(Sample) %>% summarise(Value = sum(Count_Frag))
    
    ### choose
    dat = dat %>% dplyr::filter(Sample %in% SAMPLES)
    
    ### normalize counts by library size
    dat = dat %>% left_join(dat_lib, by="Sample")
    dat = dat %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size)) %>%
        mutate(X = Group)

    ### create design matrix
    idxs  = c("Input", "TFX_DMSO", "TFX_Dex")
    dat$X = factor(dat$X, levels=idxs)
    X = model.matrix(~X, dat)
    X[,"XTFX_DMSO"] = X[,"XTFX_DMSO"] + X[,"XTFX_Dex"]
    y = dat$Norm_Value

    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    #res = summary(fit)
    
    ### reduce the memory size
    #res = stripGlmLR(res)
    
    ### arrange
    lst = list()
    lst$fit = fit
    lst$cnt = dat
    lst$X   = X
    lst$y   = y
    
    ### store the results
    fdiry = FD_OUT
    fname = str_replace(idn, pattern = "/", replacement = "_")
    fname = paste0(fname, ".RDS")
    fpath = file.path(fdiry, fname)
    saveRDS(lst, fpath)
    idn
}

### print end message
timer = Sys.time()
cat("Done!\n")
print(timer - timer_start)

Done!
Time difference of 0.2221425 secs


## Compare old input and new input (deeper)

In [None]:
### start
#registerDoParallel(10)
timer_start = Sys.time()

###
lst_tmp     = lst_frag #head(lst_frag, 10)
lst_res_old = foreach(idn = names(lst_tmp), .combine = append) %do% {
    ### set environment
    require(tidyverse)
    
    ### extract
    dat = lst_tmp[[idn]]
    
    ### get fragments
    dat = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct()

    ### count
    dat = dat %>% group_by(Sample) %>% summarise(Value = sum(Count_Frag))
    
    ### choose
    dat = dat %>% dplyr::filter(Sample %in% SAMPLES1)
    
    ### normalize counts by library size
    dat = dat %>% left_join(dat_lib, by="Sample")
    dat = dat %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size)) %>%
        mutate(X = Group)

    ### create design matrix
    idxs  = c("Input", "TFX_DMSO", "TFX_Dex")
    dat$X = factor(dat$X, levels=idxs)
    X = model.matrix(~X, dat)
    X[,"XTFX_DMSO"] = X[,"XTFX_DMSO"] + X[,"XTFX_Dex"]
    y = dat$Norm_Value

    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    res = summary(fit)
    
    ### reduce the memory size
    res = stripGlmLR(res)
    
    ### arrange
    lst = list()
    lst[[idn]] = res
    #lst[[idn]] = list()
    #lst[[idn]]$res = res
    #lst[[idn]]$cnt = dat
    #lst[[idn]]$X   = X
    #lst[[idn]]$y   = y
    lst
}

### print end message
timer = Sys.time()
cat("Done!\n")
print(timer - timer_start)

In [None]:
### start
#registerDoParallel(10)
timer_start = Sys.time()

###
lst_tmp     = lst_frag #head(lst_frag, 10)
lst_res_new = foreach(idn = names(lst_tmp), .combine = append) %do% {
    ### set environment
    require(tidyverse)
    
    ### extract
    dat = lst_tmp[[idn]]
    
    ### get fragments
    dat = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct()

    ### count
    dat = dat %>% group_by(Sample) %>% summarise(Value = sum(Count_Frag))
    
    ### choose
    dat = dat %>% dplyr::filter(Sample %in% SAMPLES2)
    
    ### normalize counts by library size
    dat = dat %>% left_join(dat_lib, by="Sample")
    dat = dat %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size)) %>%
        mutate(X = Group)

    ### create design matrix
    idxs  = c("Input", "TFX_DMSO", "TFX_Dex")
    dat$X = factor(dat$X, levels=idxs)
    X = model.matrix(~X, dat)
    X[,"XTFX_DMSO"] = X[,"XTFX_DMSO"] + X[,"XTFX_Dex"]
    y = dat$Norm_Value

    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    res = summary(fit)
    
    ### reduce the memory size
    res = stripGlmLR(res)
    
    ### arrange
    lst = list()
    lst[[idn]] = res
    #lst[[idn]] = list()
    #lst[[idn]]$res = res
    #lst[[idn]]$cnt = dat
    #lst[[idn]]$X   = X
    #lst[[idn]]$y   = y
    lst
}

### print end message
timer = Sys.time()
cat("Done!\n")
print(timer - timer_start)

## Visualize

In [None]:
fun = function(motifs){
    res = sapply(motifs, function(motif){
        if (motif == "NR/20"){ return("red") }
        if (motif == "AP1/1"){ return("blue") }
        return("grey50")
    })
    return(res)
}

In [None]:
lst_dat = list(lst_res_old, lst_res_new)
lst_dat = lapply(lst_dat, function(lst_res){
    ###
    lst = lapply(names(lst_res), function(mtf){
        res = lst_res[[mtf]]
        dat = as.data.frame(coef(res))
        dat = rownames_to_column(dat, var="X")
        dat$Motif = mtf
        return(dat)
    })
    
    ###
    dat = bind_rows(lst)
    dat = dat %>% dplyr::filter(str_detect(X, "Intercept", negate = TRUE))
    dat$Color = fun(dat$Motif)
    dat$X = factor(dat$X, levels=c("XXTFX_DMSO", "XXTFX_Dex"))
    return(dat)
})

In [None]:
lst_dat = lapply(lst_dat, function(dat){
    dat$adj_pval = p.adjust(dat$`Pr(>|t|)`, method = "BH")
    return(dat)
})

In [None]:
head(lst_dat[[1]])

In [None]:
lst_gpt = lapply(lst_dat, function(dat){
    gpt = ggplot(dat, aes(x=Estimate, y=-log10(`Pr(>|t|)`))) + 
        geom_point(size=0.7, color=dat$Color) + 
        geom_hline(yintercept = 2, color="red", alpha=0.5) +
        labs(x="Estimate", y="-log10(adj. p-value (BH))") +
        theme_bw() + 
        facet_wrap(~X)
    return(gpt)
})
lst_gpt[[1]] = lst_gpt[[1]] + ggtitle("Old Input")
lst_gpt[[2]] = lst_gpt[[2]] + ggtitle("New Input (Deeper)")

In [None]:
options(repr.plot.height=7, repr.plot.width=8)
grid.arrange(grobs = lst_gpt, ncol = 1)

In [1]:
options(repr.plot.height=7, repr.plot.width=8)
grid.arrange(grobs = lst_gpt, ncol = 1)

ERROR: Error in grid.arrange(grobs = lst_gpt, ncol = 1): could not find function "grid.arrange"
