# Motif marginal effect in the region upstream of PER1

**Set environment**

In [1]:
# Source the project configuration script; messages and warnings raised
# while sourcing are silenced.
suppressMessages(suppressWarnings(source("config_sing.R")))
# Print the work and result directories used by the cells below
# (FD_WORK / FD_RES are presumably defined by config_sing.R -- confirm).
print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


**Check input file**

In [2]:
### wildcard pattern for the merged motif annotation files of sample Input1
fname = "*_merge.bed.gz"
fdiry = file.path(FD_RES, "annotation_fragment", "Input1", "target_PER1")
fglob = file.path(fdiry, fname)

### list the matching files via the shell; the wildcard is expanded by the shell
### and the listing is captured as a character vector (intern = TRUE)
system(paste0("ls -lh ", fglob), intern = TRUE)

**Check environment**

In [2]:
# Report the number of CPU cores detected on this machine.
detectCores()

In [3]:
# Report the memory currently in use by this R session
# (mem_used() is presumably provided by pryr or lobstr via the config -- confirm).
mem_used()

76.8 MB

## Import library size

In [4]:
###################################################
# Import library size
###################################################

### Helper function to get the group label of a sample from its sample id
###   - the replicate number after "Input" is dropped   (Input3     -> Input)
###   - the "_20x" tag is dropped                       (Input3_20x -> Input)
###   - the replicate number after "TFX" is dropped     (TFX2_Dex   -> TFX_Dex)
### Only the first match of each pattern is replaced.
###
### idn_sample: character vector of sample ids
### returns:    character vector of group labels, same length as idn_sample
get_group = function(idn_sample){
    idn = idn_sample
    
    ### "[0-9]+" (instead of a single digit) so that replicate numbers
    ### with more than one digit, e.g. Input10, are collapsed as well
    idn = sub(
        pattern     = "Input[0-9]+", 
        replacement = "Input", 
        x           = idn)
    
    ### "_20x" is a literal tag, so match it as a fixed string
    idn = sub(
        pattern     = "_20x", 
        replacement = "", 
        x           = idn, 
        fixed       = TRUE)
    
    idn = sub(
        pattern     = "TFX[0-9]+_", 
        replacement = "TFX_", 
        x           = idn)
    return(idn)
}

### set path
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### import library size (headerless, tab-separated: size <TAB> file path)
### NOTE: the column spec must be built with cols(); c(col_integer(), col_character())
### collapses the (empty-list) collector objects into list(), so readr silently
### ignores the spec and guesses the types instead.
### Size is read as double: this is the type the table had so far and it cannot
### overflow the 32-bit integer range.
cnames = c("Size", "FPath")
ctypes = cols(Size = col_double(), FPath = col_character())
dat_lib = read_tsv(fpath, col_types = ctypes, col_names = cnames)

### remove the total size
dat_lib = dat_lib %>% dplyr::filter(FPath != "total")

### summarize info from the file path
### the sample name is the folder right before the file name in the path
### stackoverflow: Extract only folder name right before filename from full path
dat_lib = dat_lib %>% 
    #mutate(Sample = tools::file_path_sans_ext(basename(FPath))) %>%
    mutate(Sample = basename(dirname(FPath))) %>%
    mutate(Group = get_group(Sample)) %>%
    dplyr::select(Size, Sample, Group)
dat_lib

Size,Sample,Group
<dbl>,<chr>,<chr>
371718546,Input1_20x,Input
18666630,Input1,Input
347635732,Input2_20x,Input
20167924,Input2,Input
349994051,Input3_20x,Input
23280988,Input3,Input
413508358,Input4_20x,Input
19003938,Input4,Input
341110487,Input5_20x,Input
15325016,Input5,Input


## Run linear model on each motif

In [22]:
### arguments
TARGET = "target_PER1"   # target region; sub-folder name of the input and output paths
IS_INPUT20X = TRUE       # TRUE: use the Input*_20x samples; FALSE: use the Input* samples
THRESHOLD = 10           # motifs with a total fragment count <= THRESHOLD are skipped
#FDIRY  = "test_marginal_pool"
FDIRY  = "test_marginal_split"

### set samples and path
SAMPLES_TOT = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES_INP = c(
    paste0("Input", 1:5),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES_INP20X = c(
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

### choose the sample set; the 20x run gets its own output folder
if (IS_INPUT20X) {
    SAMPLES = SAMPLES_INP20X
    FDIRY   = paste0(FDIRY, "_input20x")
} else {
    SAMPLES = SAMPLES_INP
}
FD_OUT = file.path(FD_RES, "model_linear", FDIRY, TARGET)
### showWarnings = FALSE: re-running the cell must not warn when the folder exists
dir.create(FD_OUT, recursive = TRUE, showWarnings = FALSE)

### set motifs
### the motif file names are taken from the folder of the first sample
fdiry  = file.path(FD_RES, "annotation_fragment", SAMPLES[1], TARGET)
fname  = "*_merge.bed.gz"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)

### set column names and types
### NOTE: the spec must be built with cols(); c(col_character(), col_integer(), ...)
### collapses the (empty-list) collector objects into list(), so readr silently
### ignores the spec and guesses every column type instead.
CTYPES = cols(
    Chrom_Frag = col_character(), 
    Start_Frag = col_integer(), 
    End_Frag   = col_integer(), 
    Count_Frag = col_integer(),
    Chrom_MTF  = col_character(), 
    Start_MTF  = col_integer(), 
    End_MTF    = col_integer(),
    Motif      = col_character(), 
    Score      = col_double(),  
    Overlap    = col_integer())
CNAMES = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

### START message
cat("Target:          ", TARGET, "\n")
cat("Output Directory:", FD_OUT, "\n")
cat("Threshold:       ", THRESHOLD, "\n")

Target:           target_PER1 
Output Directory: /home/mount/work/out/proj_combeffect/model_linear/test_marginal_split_input20x/target_PER1 
Threshold:        10 


In [24]:
### Fit one linear model per motif (at most the first five motifs) and store
### each result as <motif>.RDS in FD_OUT.
### Each iteration returns a status string: "<motif> Done", "<motif> Empty"
### or "<motif> Low_Coverage"; lst_res collects these strings.
### head() instead of MOTIFS[1:5]: indexing past the end would yield NA file names
lst_res = foreach(fname = head(MOTIFS, 5)) %do% {
    ### motif name = file name without the suffix
    ### fixed(): the suffix is a literal string, its dots are not regex wildcards
    mtf = str_remove_all(fname, pattern = fixed("_merge.bed.gz"))
    msg = paste(mtf, "Start")
    print(msg); flush.console()
    
    ### import the annotation of this motif for every sample
    fdiry  = file.path(FD_RES, "annotation_fragment")
    lst_dat = lapply(SAMPLES, function(sam){
        ### set path
        fpath = file.path(fdiry, sam, TARGET, fname)    
        #print(fpath); flush.console()
        
        ### import data
        dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)
        if (nrow(dat) == 0){
            ### NULL entries are dropped by bind_rows below
            return(NULL)
        } else {
            ### Length_Dif == 0 <=> the overlap spans the whole motif
            dat = dat %>% 
                mutate(Sample = sam) %>%
                mutate(Length_MTF = End_MTF - Start_MTF) %>%
                mutate(Length_Dif = Length_MTF - Overlap)
            return(dat)
        }
    })
    
    ### arrange data
    dat = bind_rows(lst_dat)
    
    ###################################################
    # Preprocess
    ###################################################
    #cat("+++++ Preprocess +++++\n")
    
    ### Filter out empty data
    if(nrow(dat) == 0){
        msg = paste(mtf, "Empty")
        print(msg)
        return(msg)
    }
    
    ### Filter: fully cover the motif
    dat = dat %>% dplyr::filter(Length_Dif == 0)
    
    ### Filter: No/Low coverage
    ### NOTE(review): the count is summed over all rows, i.e. before the
    ### distinct() below; a fragment listed in several rows is counted once
    ### per row here -- confirm this is the intended threshold
    cnt = sum(dat$Count_Frag)
    if(cnt <= THRESHOLD){
        msg = paste(mtf, "Low_Coverage")
        print(msg)
        return(msg)
    }
    
    ###################################################
    # Create Count Table
    ###################################################
    #cat("+++++ Create Count Table +++++\n")
    
    ### get fragments (each fragment once per sample)
    dat = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct()

    ### count
    dat = dat %>% group_by(Sample) %>% summarise(Value = sum(Count_Frag))
    
    ### normalize counts by library size
    dat = dat %>% left_join(dat_lib, by="Sample")
    dat = dat %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size)) %>%
        mutate(X = Group)
    
    ###################################################
    # Analyze w/ Linear Model
    ###################################################
    #cat("+++++ Analyze +++++\n")
    
    ### create design matrix
    idxs  = c("Input", "TFX_DMSO", "TFX_Dex")
    dat$X = factor(dat$X, levels=idxs)
    
    ### a sample without library size / with an unknown group ends up as NA here;
    ### model.matrix would drop its row and lm would then fail with an opaque
    ### "variable lengths differ" error, so report the samples explicitly
    if (anyNA(dat$X)) {
        idn = paste(dat$Sample[is.na(dat$X)], collapse = ", ")
        stop(mtf, ": no valid group (", paste(idxs, collapse = "/"), 
             ") for sample(s): ", idn, call. = FALSE)
    }
    
    X = model.matrix(~X, dat)
    y = dat$Norm_Value

    ### setup design matrix
    ### the TFX_DMSO column is set to 1 for both DMSO and Dex samples, so that
    ### the TFX_Dex coefficient is the difference Dex - DMSO (not Dex - Input)
    X[,"XTFX_DMSO"] = X[,"XTFX_DMSO"] + X[,"XTFX_Dex"]
    
    ### fit model and get the summary
    ### "+ 0": X already contains the intercept column
    fit = lm(y ~ X + 0)
    
    ### arrange
    lst = list()
    lst$fit = fit
    lst$cnt = dat
    lst$X   = X
    lst$y   = y
    
    ### store the results
    fdiry = FD_OUT
    fname = paste0(mtf, ".RDS") # str_replace(mtf, pattern = "/", replacement = "_")
    fpath = file.path(fdiry, fname)
    
    #dir.create(fdiry)
    saveRDS(lst, fpath)
    
    ###
    msg = paste(mtf, "Done")
    print(msg)
    return(msg)
}

[1] "AHR Start"
[1] "AHR Done"
[1] "AIRE Start"
[1] "AIRE Done"
[1] "AP1_1 Start"
[1] "AP1_1 Done"
[1] "AP1_2 Start"
[1] "AP1_2 Done"
[1] "ARI5A Start"
[1] "ARI5A Empty"
