In [2]:
suppressMessages(suppressWarnings(source("config_sing.R")))
print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


## Import library size

In [3]:
###################################################
# Import library size
###################################################

### Helper function to get
get_group = function(idn_sample){
    idn = idn_sample
    
    idn = str_replace(
        string = idn, 
        pattern = "Input[0-9]", 
        replacement = "Input")
    
    idn = str_remove(
        string = idn, 
        pattern = "_20x")
    
    idn = str_replace(
        string = idn, 
        pattern = "TFX[0-9]_", 
        replacement="TFX_")
    return(idn)
}

### set path
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### import library size
ctypes = c(col_integer(), col_character())
cnames = c("Size", "FPath")
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames)

### remove the total size
dat_lib = dat_lib %>% dplyr::filter(FPath != "total")

### summarize info from the file path
### stackoverflow: Extract only folder name right before filename from full path
dat_lib = dat_lib %>% 
    #mutate(Sample = tools::file_path_sans_ext(basename(FPath))) %>%
    mutate(Sample = basename(dirname(FPath))) %>%
    mutate(Group = get_group(Sample))
dat_lib = dat_lib %>% dplyr::select(Size, Sample, Group)
dat_lib

Size,Sample,Group
<dbl>,<chr>,<chr>
371718546,Input1_20x,Input
18666630,Input1,Input
347635732,Input2_20x,Input
20167924,Input2,Input
349994051,Input3_20x,Input
23280988,Input3,Input
413508358,Input4_20x,Input
19003938,Input4,Input
341110487,Input5_20x,Input
15325016,Input5,Input


In [4]:
### arguments
TARGET = "target_PER1"
IS_INPUT20X = TRUE
THRESHOLD = 10
#FDIRY  = "test_marginal_pool"
FDIRY  = "test_interactive_split"

### set samples and path
SAMPLES_TOT = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES_INP = c(
    paste0("Input", 1:5),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES_INP20X = c(
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

if (IS_INPUT20X) {
    SAMPLES = SAMPLES_INP20X
    FDIRY   = paste0(FDIRY, "_input20x")
} else {
    SAMPLES = SAMPLES_INP
}
FD_OUT = file.path(FD_RES, "model_linear", FDIRY, TARGET)
dir.create(FD_OUT, recursive = TRUE, showWarnings = FALSE)

### set motifs
fdiry  = file.path(FD_RES, "annotation_fragment", SAMPLES[1], TARGET)
fname  = "*_merge.bed.gz"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)

### set column names and types
CTYPES = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())
CNAMES = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

### START message
cat("Target:          ", TARGET, "\n")
cat("Output Directory:", FD_OUT, "\n")
cat("Threshold:       ", THRESHOLD, "\n")

Target:           target_PER1 
Output Directory: /home/mount/work/out/proj_combeffect/model_linear/test_interactive_split_input20x/target_PER1 
Threshold:        10 


In [5]:
### combination of motifs
dat_comb = t(combn(MOTIFS[3:5], 2))
print(head(dat_comb))
print("++++++++++++++++++++++++++++++")
### convert motif pairs into a list
lst_motif_pair = split(dat_comb, seq(nrow(dat_comb)))
print(lst_motif_pair[[1]])

     [,1]                 [,2]                
[1,] "AP1_1_merge.bed.gz" "AP1_2_merge.bed.gz"
[2,] "AP1_1_merge.bed.gz" "ARI5A_merge.bed.gz"
[3,] "AP1_2_merge.bed.gz" "ARI5A_merge.bed.gz"
[1] "++++++++++++++++++++++++++++++"
[1] "AP1_1_merge.bed.gz" "AP1_2_merge.bed.gz"


In [26]:
head(lst_dat[[1]])

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample,Length_MTF,Length_Dif
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
chr17,8148003,8148983,3,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0
chr17,8148004,8148925,1,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0
chr17,8148004,8148962,1,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0
chr17,8148004,8148963,1,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0
chr17,8148005,8149014,1,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0
chr17,8148005,8149015,1,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0


## Test a pair

In [6]:
motif_pair = lst_motif_pair[[1]]
motif_pair

In [7]:
###
lst_dat = lapply(motif_pair, function(fname){
    lst = lapply(SAMPLES, function(sam){

        ### set path
        fdiry  = file.path(FD_RES, "annotation_fragment")
        fpath = file.path(fdiry, sam, TARGET, fname)    

        ### import data
        dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)
        if (nrow(dat) == 0){
            return(NULL)
        } else {
            dat = dat %>% 
                mutate(Sample = sam) %>%
                mutate(Length_MTF = End_MTF - Start_MTF) %>%
                mutate(Length_Dif = Length_MTF - Overlap)
            return(dat)
        }
    })

    ### arrange data
    dat = bind_rows(lst)
    return(dat)
})

In [9]:
head(lst_dat[[1]], 3)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample,Length_MTF,Length_Dif
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
chr17,8148003,8148983,3,chr17,8148425,8148433,AP1/1,6.7321,8,Input1_20x,8,0
chr17,8148003,8148983,3,chr17,8148917,8148925,AP1/1,8.0514,8,Input1_20x,8,0
chr17,8148004,8148925,1,chr17,8148425,8148433,AP1/1,6.7321,8,Input1_20x,8,0


In [10]:
head(lst_dat[[2]], 3)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample,Length_MTF,Length_Dif
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
chr17,8148003,8148983,3,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0
chr17,8148004,8148925,1,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0
chr17,8148004,8148962,1,chr17,8148257,8148268,AP1/2,8.9298,11,Input1_20x,11,0


In [12]:
for (idx in seq_along(lst_dat)) {
    dat = lst_dat
    
    print(head(dat))
}

[90m# A tibble: 6 × 13[39m
  Chrom_Frag Start_Frag End_Frag Count_Frag Chrom_MTF Start_MTF End_MTF Motif
  [3m[90m<chr>[39m[23m           [3m[90m<dbl>[39m[23m    [3m[90m<dbl>[39m[23m      [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m         [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m
[90m1[39m chr17         8[4m1[24m[4m4[24m[4m8[24m003  8[4m1[24m[4m4[24m[4m8[24m983          3 chr17       8[4m1[24m[4m4[24m[4m8[24m425 8[4m1[24m[4m4[24m[4m8[24m433 AP1/1
[90m2[39m chr17         8[4m1[24m[4m4[24m[4m8[24m003  8[4m1[24m[4m4[24m[4m8[24m983          3 chr17       8[4m1[24m[4m4[24m[4m8[24m917 8[4m1[24m[4m4[24m[4m8[24m925 AP1/1
[90m3[39m chr17         8[4m1[24m[4m4[24m[4m8[24m004  8[4m1[24m[4m4[24m[4m8[24m925          1 chr17       8[4m1[24m[4m4[24m[4m8[24m425 8[4m1[24m[4m4[24m[4m8[24m433 AP1/1
[90m4[39m chr17         8[4m1[24m[4m4[24m[4m8[24m004  8[4m1[24m[

In [None]:
for ()

    ### Filter out empty data
    if(nrow(dat) == 0){
        msg = paste(mtf, "Skip Empty")
        cat(msg, "\n"); flush.console()
        return(msg)
    }
    
    ### Filter: No/Low coverage
    cnt = sum(dat$Count_Frag)
    if(cnt <= THRESHOLD_COVER){
        msg = paste(mtf, "Skip Low_Coverage")
        cat(msg, "\n"); flush.console()
        return(msg)
    }
    
    ### Filter: fully cover the motif
    num1 = nrow(dat)
    dat = dat %>% dplyr::filter(Length_Dif == 0)
    num2 = nrow(dat)
    msg = paste(mtf, "Filter_overlap", num1, num2)
    cat(msg, "\n"); flush.console()
    
    if(nrow(dat) == 0){
        msg = paste(mtf, "Filter_overlap Empty")
        cat(msg, "\n"); flush.console()
        return(msg)
    }
    
    ### Filter: fully cover the motif
    num1 = nrow(dat)
    dat = dat %>% dplyr::filter(Score >= THRESHOLD_MOTIF)
    num2 = nrow(dat)
    msg = paste(mtf, "Filter_score", num1, num2)
    cat(msg, "\n"); flush.console()
    
    if(nrow(dat) == 0){
        msg = paste(mtf, "Filter_score Empty")
        cat(msg, "\n"); flush.console()
        return(msg)
    }

In [11]:
THRESHOLD_COVER = 10
### Filter: No/Low coverage
lapply(lst_dat, function(dat){
    cnt = sum(dat$Count_Frag)
    if(cnt <= THRESHOLD_COVER){
        msg = paste(mtf, "Skip Low_Coverage")
        cat(msg, "\n"); flush.console()
        return(msg)
    }
})
    

## Loop

In [27]:
lst_res = foreach(motif_pair = lst_motif_pair) %do% {
    
    ###
    lst_dat = lapply(motif_pair, function(fname){
        lst = lapply(SAMPLES, function(sam){

            ### set path
            fdiry  = file.path(FD_RES, "annotation_fragment")
            fpath = file.path(fdiry, sam, TARGET, fname)    

            ### import data
            dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)
            if (nrow(dat) == 0){
                return(NULL)
            } else {
                dat = dat %>% 
                    mutate(Sample = sam) %>%
                    mutate(Length_MTF = End_MTF - Start_MTF) %>%
                    mutate(Length_Dif = Length_MTF - Overlap)
                return(dat)
            }
        })

        ### arrange data
        dat = bind_rows(lst)
        return(dat)
    })
    
    ###
    tmp = lapply(lst_dat, function(dat){nrow(dat) == 0})
    is_any_empty = Reduce(`|`, tmp)
    if(is_any_empty){
        msg = paste(motif_pair, collapse = " ")
        msg = paste(msg, "Empty")
        print(msg); flush.console()
        return(msg)
    } else {
        df1  = lst_dat[[1]]
        df2  = lst_dat[[2]]
        mtf1 = unique(df1$Motif)
        mtf2 = unique(df2$Motif)
        lst_dat = NULL
    }
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    
    ### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
    
    ### count
    dat = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
    
    ### normalize counts by library size
    dat = dat %>% left_join(dat_lib, by="Sample")
    dat = dat %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size))
    
    ### choose which input
    dat = dat %>% dplyr::filter(Sample %in% SAMPLES) 
    
    ### create design matrix
    idx11 = paste("TFX_DMSO", mtf1,       sep="_")
    idx12 = paste("TFX_DMSO", mtf2,       sep="_")
    idx13 = paste("TFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("TFX_Dex",  mtf1,       sep="_")
    idx22 = paste("TFX_Dex",  mtf2,       sep="_")
    idx23 = paste("TFX_Dex",  mtf1, mtf2, sep="_")
    idxs  = c("Input", idx11, idx12, idx21, idx22, idx13, idx23)
    dat$X = factor(dat$X, levels=idxs)
    X = model.matrix(~X, dat)
    y = dat$Norm_Value
    
    ### setup design matrix
    idx11 = paste("XTFX_DMSO", mtf1,       sep="_")
    idx12 = paste("XTFX_DMSO", mtf2,       sep="_")
    idx13 = paste("XTFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("XTFX_Dex",  mtf1,       sep="_")
    idx22 = paste("XTFX_Dex",  mtf2,       sep="_")
    idx23 = paste("XTFX_Dex",  mtf1, mtf2, sep="_")
    X[,idx11] = X[,idx11] + X[,idx13] + X[,idx21] + X[,idx23]
    X[,idx12] = X[,idx12] + X[,idx13] + X[,idx22] + X[,idx23]
    X[,idx21] = X[,idx21] + X[,idx23]
    X[,idx22] = X[,idx22] + X[,idx23]
    X[,idx13] = X[,idx13] + X[,idx23]
    
    ### fit model and reduce the memory size
    fit = lm(y ~ X + 0)
    
    ### arrange the results
    lst = list()
    lst$fit = fit
    lst$cnt = dat
    lst$X   = X
    lst$y   = y
    
    ### store the results
    #mtf1  = str_replace_all(mtf1, pattern = "/", replacement = "_")
    #mtf2  = str_replace_all(mtf2, pattern = "/", replacement = "_")
    mtf1 = str_remove(string=motif_pair[1], pattern="_merge.bed.gz")
    mtf2 = str_remove(string=motif_pair[2], pattern="_merge.bed.gz")
    fdiry = FD_OUT
    fname = paste0(mtf1, "_", mtf2, ".RDS")
    fpath = file.path(fdiry, fname)
    dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)
    saveRDS(lst, fpath)
    
    ###
    msg = paste(motif_pair, collapse = " ")
    msg = paste(msg, "Done")
    print(msg); flush.console()
    return(msg)
}

[1] "AP1_1_merge.bed.gz AP1_2_merge.bed.gz Done"
[1] "AP1_1_merge.bed.gz ARI5A_merge.bed.gz Empty"
[1] "AP1_2_merge.bed.gz ARI5A_merge.bed.gz Empty"


In [8]:
SAMPLES

In [None]:
### start
#registerDoParallel(10)
timer_start = Sys.time()

### loop through each pair of motifs
### estimate interaction effect of each motif pair
lst_res = lapply(lst_motif_pair[1:5], function(x){

    ### extract fragments for each motif
    fname1 = x[1]
    fname2 = x[2]
    mtf1 = str_remove(x[1], pattern = "_merge.bed.gz")
    mtf2 = str_remove(x[2], pattern = "_merge.bed.gz")
    
    ### import data
    fdiry = file.path(FD_RES, "annotation_fragment")
    fpath = file.path(fdiry, fname1)
    dat   = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)
    if (nrow(dat) == 0){
        return(NULL)
    } else {
            dat = dat %>% 
                mutate(Sample = sam) %>%
                mutate(Length_MTF = End_MTF - Start_MTF) %>%
                mutate(Length_Dif = Length_MTF - Overlap)
            return(dat)
        }
    
    lst_dat = lapply(SAMPLES, function(sam){
        ### set path
        fpath = file.path(fdiry, sam, TARGET, fname)    
        #print(fpath); flush.console()
        
        ### import data
        dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)
        if (nrow(dat) == 0){
            return(NULL)
        } else {
            dat = dat %>% 
                mutate(Sample = sam) %>%
                mutate(Length_MTF = End_MTF - Start_MTF) %>%
                mutate(Length_Dif = Length_MTF - Overlap)
            return(dat)
        }
    })
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    
    ### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
    
    ### summarize into counts
    ### normalize counts by library size
    tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
    tmp = tmp %>% left_join(dat_lib, by="Sample")
    tmp = tmp %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size))
    
    ### choose
    tmp = tmp %>% dplyr::filter(Sample %in% SAMPLES1)
    #print(head(tmp))
    
    ### create design matrix
    ### annotate fragments based on motif annotation
    idx11 = paste("TFX_DMSO", mtf1,       sep="_")
    idx12 = paste("TFX_DMSO", mtf2,       sep="_")
    idx13 = paste("TFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("TFX_Dex",  mtf1,       sep="_")
    idx22 = paste("TFX_Dex",  mtf2,       sep="_")
    idx23 = paste("TFX_Dex",  mtf1, mtf2, sep="_")
    idxs  = c("Input", idx11, idx12, idx21, idx22, idx13, idx23)
    tmp$X = factor(tmp$X, levels=idxs)
    X = model.matrix(~X, tmp)
    y = tmp$Norm_Value
    
    ### arrange design matrix
    idx11 = paste("XTFX_DMSO", mtf1,       sep="_")
    idx12 = paste("XTFX_DMSO", mtf2,       sep="_")
    idx13 = paste("XTFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("XTFX_Dex",  mtf1,       sep="_")
    idx22 = paste("XTFX_Dex",  mtf2,       sep="_")
    idx23 = paste("XTFX_Dex",  mtf1, mtf2, sep="_")
    X[,idx11] = X[,idx11] + X[,idx13] + X[,idx21] + X[,idx23]
    X[,idx12] = X[,idx12] + X[,idx13] + X[,idx22] + X[,idx23]
    X[,idx21] = X[,idx21] + X[,idx23]
    X[,idx22] = X[,idx22] + X[,idx23]
    X[,idx13] = X[,idx13] + X[,idx23]
    
    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    #res = summary(fit)
    
    ### reduce the memory size
    #res = stripGlmLR(res)
    lst = list(fit=fit, X=X, y=y, data=tmp)
    return(lst)
})

### print end message
timer = Sys.time()
cat("Done!\n")
print(timer - timer_start)