In [1]:
suppressMessages(suppressWarnings(source("config_sing.R")))

print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


In [2]:
fdiry = file.path(FD_RES, "annotation_fragment")
fname = "target_PER1.bed.gz"
fglob = file.path(fdiry, "*", fname)
system(paste("ls -lh", fglob), intern = TRUE)

In [3]:
mem_used()

76.4 MB

## Import annotated fragments

In [4]:
### set samples
SAMPLES = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))
SAMPLES

In [5]:
###################################################
# Import annotated fragments
###################################################

### set column names and types
ctypes = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())
cnames = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

### set samples
SAMPLES = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

### import bed files for each sample 
fdiry = file.path(FD_RES, "annotation_fragment")
fname = "target_PER1.bed.gz"

lst_dat = lapply(SAMPLES, function(sam){
    ### set path
    fpath = file.path(fdiry, sam, fname)
    print(fpath); flush.console()
    
    ### import data
    dat = read_tsv(fpath, col_types=ctypes, col_names=cnames) %>% mutate(Sample = sam)
    return(dat)
})

### arrange data
dat_ann_frag = bind_rows(lst_dat)
lst_dat = NULL

[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input1/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input2/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input3/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input4/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input5/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input1_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input2_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input3_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input4_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input5_20x/target_PER1.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/TFX2_DMSO/target_PER1.bed.gz

In [6]:
mem_used()

606 MB

In [7]:
head(dat_ann_frag)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>
chr17,8148117,8149012,1,chr17,8148107,8148124,KLF/SP/2,9.0318,7,Input1
chr17,8148117,8149012,1,chr17,8148109,8148121,INSM1,6.1647,4,Input1
chr17,8148117,8149012,1,chr17,8148109,8148129,GC-tract,8.3277,12,Input1
chr17,8148117,8149012,1,chr17,8148123,8148133,GLI,7.4318,10,Input1
chr17,8148117,8149012,1,chr17,8148124,8148139,NR/17,7.8649,15,Input1
chr17,8148117,8149012,1,chr17,8148126,8148137,KLF/SP/1,11.3678,11,Input1


In [8]:
range(dat_ann_frag$Overlap)

## Import library size

In [22]:
###################################################
# Import library size
###################################################

### Helper function to get
get_sample = function(idn_sample){
    idn = idn_sample
    idn = str_replace(
        string = idn, 
        pattern = "Input[0-9]", 
        replacement = "Input")
    idn = str_replace(
        string = idn, 
        pattern = "TFX[0-9]_", 
        replacement="TFX_")
    return(idn)
}

### set path
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### import library size
ctypes = c(col_integer(), col_character())
cnames = c("Size", "FPath")
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames)

### remove the total size
dat_lib = dat_lib %>% dplyr::filter(FPath != "total")

### summarize info from the file path
### stackoverflow: Extract only folder name right before filename from full path
dat_lib = dat_lib %>% 
    #mutate(Sample = tools::file_path_sans_ext(basename(FPath))) %>%
    mutate(Sample = basename(dirname(FPath))) %>%
    mutate(Group = get_sample(Sample))
dat_lib = dat_lib %>% dplyr::select(Size, Sample, Group)
dat_lib

Size,Sample,Group
<dbl>,<chr>,<chr>
371718546,Input1_20x,Input_20x
18666630,Input1,Input
347635732,Input2_20x,Input_20x
20167924,Input2,Input
349994051,Input3_20x,Input_20x
23280988,Input3,Input
413508358,Input4_20x,Input_20x
19003938,Input4,Input
341110487,Input5_20x,Input_20x
15325016,Input5,Input


## Preprocess

In [15]:
###################################################
# Preprocess
###################################################
cat("+++++ Preprocess +++++\n")

### grouped by motif cluster and 
### split the annotated fragments into list
dat    = dat_ann_frag
lst    = dat %>% group_by(Motif) %>% group_split
motifs = lapply(lst, function(dat){unique(dat$Motif)}) %>% unlist
names(lst) = motifs

### get the list
lst_frag = lapply(lst, function(dat){
    tmp = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Motif, Sample) %>%
        distinct()
    return(tmp)
})

### filter out motifs that have almost no fragment in total
cat("+++++ Preprocess: filter motifs +++++\n")
cat("Before filteration: #Motifs =", length(lst_frag), "\n")

THRESHOLD = 10
lst = lst_frag
cnt = lapply(lst, function(dat){sum(dat$Count_Frag)})
lst = lst[cnt > 10]
lst_frag = lst

cat("Threshold =", THRESHOLD, "\n")
cat("After filteration: #Motifs =", length(lst_frag), "\n")

+++++ Preprocess +++++
+++++ Preprocess: filter motifs +++++
Before filteration: #Motifs = 240 
Threshold = 10 
After filteration: #Motifs = 240 


## Set all pairs of motifs

In [16]:
lst_motif_pair = list("AP1/1|NR/20" = c("AP1/1", "NR/20"))
lst_motif_pair

In [17]:
### Helper function
### https://win-vector.com/2014/05/30/trimming-the-fat-from-glm-models-in-r/
stripGlmLR = function(cm) {
  cm$y = c()
  cm$model = c()
  
  cm$residuals = c()
  cm$fitted.values = c()
  cm$effects = c()
  cm$qr$qr = c()  
  cm$linear.predictors = c()
  cm$weights = c()
  cm$prior.weights = c()
  cm$data = c()

  cm$family$variance = c()
  cm$family$dev.resids = c()
  cm$family$aic = c()
  cm$family$validmu = c()
  cm$family$simulate = c()
  attr(cm$terms,".Environment") = c()
  attr(cm$formula,".Environment") = c()
  
  return(cm)
}

In [18]:
###################################################
# Get interactive effects
###################################################
cat("+++++ Get interactive effects +++++\n")

+++++ Get interactive effects +++++


In [24]:
x = lst_motif_pair[[1]]
x

In [25]:
### extract fragments for each motif
    mtf1 = x[1]
    mtf2 = x[2]
    df1 = lst_frag[[mtf1]]
    df2 = lst_frag[[mtf2]]
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))


In [30]:
print(unique(dat$Sample))
cat("+++++++++++++++++++++++++++++\n")
print(unique(dat$Group))
cat("+++++++++++++++++++++++++++++\n")
head(dat)

 [1] "Input1"     "Input2"     "Input3"     "Input4"     "Input5"    
 [6] "Input1_20x" "Input2_20x" "Input3_20x" "Input4_20x" "Input5_20x"
[11] "TFX2_DMSO"  "TFX3_DMSO"  "TFX4_DMSO"  "TFX5_DMSO"  "TFX2_Dex"  
[16] "TFX3_Dex"   "TFX4_Dex"   "TFX5_Dex"  
+++++++++++++++++++++++++++++
[1] "Input"     "Input_20x" "TFX_DMSO"  "TFX_Dex"  
+++++++++++++++++++++++++++++


Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Sample,Motif.x,Motif.y,Motif,Group,X
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chr17,8148117,8149012,1,Input1,AP1/1,,AP1/1,Input,Input
chr17,8148122,8149107,1,Input1,AP1/1,,AP1/1,Input,Input
chr17,8148178,8149194,1,Input1,AP1/1,,AP1/1,Input,Input
chr17,8148188,8149154,1,Input1,AP1/1,,AP1/1,Input,Input
chr17,8148190,8149151,1,Input1,AP1/1,,AP1/1,Input,Input
chr17,8148220,8149108,1,Input1,AP1/1,,AP1/1,Input,Input


In [38]:
tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
### summarize into counts
    ### normalize counts by library size
    tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
    tmp = tmp %>% left_join(dat_lib, by="Sample")
    tmp = tmp %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size))
tmp

Sample,X,Value,Size,Group,Norm_Value,Lognorm_Value
<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
Input1,Input,138,18666630,Input,7.392872e-06,-5.131187
Input1_20x,Input,3483,371718546,Input_20x,9.369993e-06,-5.028261
Input2,Input,144,20167924,Input,7.140051e-06,-5.146299
Input2_20x,Input,3279,347635732,Input_20x,9.432287e-06,-5.025383
Input3,Input,200,23280988,Input,8.590701e-06,-5.065971
Input3_20x,Input,3391,349994051,Input_20x,9.688736e-06,-5.013733
Input4,Input,133,19003938,Input,6.998549e-06,-5.154992
Input4_20x,Input,3823,413508358,Input_20x,9.245279e-06,-5.03408
Input5,Input,97,15325016,Input,6.32952e-06,-5.198629
Input5_20x,Input,3174,341110487,Input_20x,9.304903e-06,-5.031288


In [34]:
### set samples
SAMPLES0 = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES1 = c(
    paste0("Input", 1:5),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

SAMPLES2 = c(
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

In [33]:
SAMPLES1

In [39]:
tmp %>% dplyr::filter(Sample %in% SAMPLES2)

Sample,X,Value,Size,Group,Norm_Value,Lognorm_Value
<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
Input1_20x,Input,3483,371718546,Input_20x,9.369993e-06,-5.028261
Input2_20x,Input,3279,347635732,Input_20x,9.432287e-06,-5.025383
Input3_20x,Input,3391,349994051,Input_20x,9.688736e-06,-5.013733
Input4_20x,Input,3823,413508358,Input_20x,9.245279e-06,-5.03408
Input5_20x,Input,3174,341110487,Input_20x,9.304903e-06,-5.031288
TFX2_Dex,TFX_Dex_AP1/1,150,45413539,TFX_Dex,3.30298e-06,-5.481094
TFX2_Dex,TFX_Dex_AP1/1_NR/20,857,45413539,TFX_Dex,1.887102e-05,-4.724205
TFX2_Dex,TFX_Dex_NR/20,1271,45413539,TFX_Dex,2.798725e-05,-4.55304
TFX2_DMSO,TFX_DMSO_AP1/1,191,43844606,TFX_DMSO,4.356294e-06,-5.360883
TFX2_DMSO,TFX_DMSO_AP1/1_NR/20,196,43844606,TFX_DMSO,4.470333e-06,-5.34966


In [19]:
### loop through each pair of motifs
### estimate interaction effect of each motif pair
lst_res = lapply(lst_motif_pair, function(x){

    ### extract fragments for each motif
    mtf1 = x[1]
    mtf2 = x[2]
    df1 = lst_frag[[mtf1]]
    df2 = lst_frag[[mtf2]]
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    
    ### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
    
    ### summarize into counts
    ### normalize counts by library size
    tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
    tmp = tmp %>% left_join(dat_lib, by="Sample")
    tmp = tmp %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size))
    
    ### create design matrix
    ### annotate fragments based on motif annotation
    idx11 = paste("TFX_DMSO", mtf1,       sep="_")
    idx12 = paste("TFX_DMSO", mtf2,       sep="_")
    idx13 = paste("TFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("TFX_Dex",  mtf1,       sep="_")
    idx22 = paste("TFX_Dex",  mtf2,       sep="_")
    idx23 = paste("TFX_Dex",  mtf1, mtf2, sep="_")
    idxs  = c("Input", idx11, idx12, idx21, idx22, idx13, idx23)
    tmp$X = factor(tmp$X, levels=idxs)
    X = model.matrix(~X, tmp)
    y = tmp$Norm_Value
    
    ### arrange design matrix
    idx11 = paste("XTFX_DMSO", mtf1,       sep="_")
    idx12 = paste("XTFX_DMSO", mtf2,       sep="_")
    idx13 = paste("XTFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("XTFX_Dex",  mtf1,       sep="_")
    idx22 = paste("XTFX_Dex",  mtf2,       sep="_")
    idx23 = paste("XTFX_Dex",  mtf1, mtf2, sep="_")
    X[,idx11] = X[,idx11] + X[,idx13] + X[,idx21] + X[,idx23]
    X[,idx12] = X[,idx12] + X[,idx13] + X[,idx22] + X[,idx23]
    X[,idx21] = X[,idx21] + X[,idx23]
    X[,idx22] = X[,idx22] + X[,idx23]
    X[,idx13] = X[,idx13] + X[,idx23]
    
    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    #res = summary(fit)
    
    ### reduce the memory size
    #res = stripGlmLR(res)
    lst = list(fit=fit, X=X, y=y, data=tmp)
    return(lst)
})

In [21]:
lst = lst_res[[1]]
print(lst$fit)


Call:
lm(formula = y ~ X + 0)

Coefficients:
          X(Intercept)        XXTFX_DMSO_AP1/1        XXTFX_DMSO_NR/20  
             3.841e-06               5.778e-07               3.354e-06  
       XXTFX_Dex_AP1/1         XXTFX_Dex_NR/20  XXTFX_DMSO_AP1/1_NR/20  
            -1.073e-06               2.381e-05              -2.942e-06  
 XXTFX_Dex_AP1/1_NR/20  
            -6.264e-06  



In [None]:
### loop through each pair of motifs
### estimate interaction effect of each motif pair
lst_res = lapply(lst_motif_pair, function(x){

    ### extract fragments for each motif
    mtf1 = x[1]
    mtf2 = x[2]
    df1 = lst_frag[[mtf1]]
    df2 = lst_frag[[mtf2]]
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    
    ### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
    
    ### annotate fragments based on motif annotation
    idx11 = paste("TFX_DMSO", mtf1,       sep="_")
    idx12 = paste("TFX_DMSO", mtf2,       sep="_")
    idx13 = paste("TFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("TFX_Dex",  mtf1,       sep="_")
    idx22 = paste("TFX_Dex",  mtf2,       sep="_")
    idx23 = paste("TFX_Dex",  mtf1, mtf2, sep="_")
    idxs  = c("Input", idx11, idx12, idx21, idx22, idx13, idx23)
    tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
    
    ### normalize counts by library size
    tmp = tmp %>% left_join(dat_lib, by="Sample")
    tmp = tmp %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size))
    tmp$X = factor(tmp$X, levels=idxs)
    X = model.matrix(~X, tmp)
    y = tmp$Norm_Value
    
    ### create design matrix
    idx11 = paste("XTFX_DMSO", mtf1,       sep="_")
    idx12 = paste("XTFX_DMSO", mtf2,       sep="_")
    idx13 = paste("XTFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("XTFX_Dex",  mtf1,       sep="_")
    idx22 = paste("XTFX_Dex",  mtf2,       sep="_")
    idx23 = paste("XTFX_Dex",  mtf1, mtf2, sep="_")
    X[,idx11] = X[,idx11] + X[,idx13] + X[,idx21] + X[,idx23]
    X[,idx12] = X[,idx12] + X[,idx13] + X[,idx22] + X[,idx23]
    X[,idx21] = X[,idx21] + X[,idx23]
    X[,idx22] = X[,idx22] + X[,idx23]
    X[,idx13] = X[,idx13] + X[,idx23]
    
    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    #res = summary(fit)
    
    ### reduce the memory size
    #res = stripGlmLR(res)
    lst = list(fit=fit, X=X, y=y, data=tmp)
    return(lst)
})