In [1]:
suppressMessages(suppressWarnings(source("config_sing.R")))

print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


In [2]:
dir(FD_RES)

In [5]:
fdiry = file.path(FD_RES, "annotation_fragment", "filter_motif_score095")
dir(fdiry)

In [9]:
fdiry = file.path(FD_RES, "annotation_fragment", "filter_motif_score095", "Input1")
dir(fdiry)

## Import bed files

In [12]:
CHROM = "chr17"

In [13]:
###################################################
# Import annotated fragments
###################################################

### set column names and types
ctypes = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())
cnames = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

### set samples
SAMPLES = c(
    paste0("Input", 1:5),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

### import bed files for each sample 
fdiry = file.path(FD_RES, "annotation_fragment", "filter_motif_score095")
fname = paste0(CHROM, ".bed", ".gz") 

lst_dat = lapply(SAMPLES, function(sam){
    ### set path
    fpath = file.path(fdiry, sam, fname)
    print(fpath)
    
    ### import data
    dat = read_tsv(fpath, col_types=ctypes, col_names=cnames) %>% mutate(Sample = sam)
    return(dat)
})

### arrange data
dat_ann_frag = bind_rows(lst_dat)
lst_dat = NULL

[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/Input1/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/Input2/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/Input3/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/Input4/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/Input5/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX2_DMSO/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX3_DMSO/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX4_DMSO/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX5_DMSO/chr17.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/a

## Import library depth

In [6]:
### helper function to get the group of sample
get_sample = function(idn_sample){
    idn = idn_sample
    idn = str_replace(
        string = idn, 
        pattern = "Input[0-9]", 
        replacement = "Input")
    idn = str_replace(
        string = idn, 
        pattern = "TFX[0-9]_", 
        replacement="TFX_")
    return(idn)
}

### set path
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### import data
ctypes = c(col_integer(), col_character())
cnames = c("Size", "Fpath")
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames)
dat_lib = dat_lib %>% 
    mutate(Sample = tools::file_path_sans_ext(basename(Fpath))) %>%
    mutate(Group = get_sample(Sample))
dat_lib = dat_lib %>% dplyr::select(Size, Sample, Group)
head(dat_lib, 10)

Size,Sample,Group
<dbl>,<chr>,<chr>
18666630,Input1,Input
20167924,Input2,Input
23280988,Input3,Input
19003938,Input4,Input
15325016,Input5,Input
48376253,TFX2_AZD2906,TFX_AZD2906
52542517,TFX2_AZD9567,TFX_AZD9567
43646484,TFX2_CORT108297,TFX_CORT108297
41732268,TFX2_CpdA,TFX_CpdA
43844606,TFX2_DMSO,TFX_DMSO


## Preprocess

In [14]:
###################################################
# Preprocess
###################################################
cat("+++++ Preprocess +++++\n")

### grouped by motif cluster and 
### split the annotated fragments into list
dat    = dat_ann_frag
lst    = dat %>% group_by(Motif) %>% group_split
motifs = lapply(lst, function(dat){unique(dat$Motif)}) %>% unlist
names(lst) = motifs

### get the list
lst_frag = lapply(lst, function(dat){
    tmp = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Motif, Sample) %>%
        distinct()
    return(tmp)
})

### filter out motifs that have almost no fragment in total
cat("+++++ Preprocess: filter motifs +++++\n")
cat("Before filteration: #Motifs =", length(lst_frag), "\n")

THRESHOLD = 10
lst = lst_frag
cnt = lapply(lst, function(dat){sum(dat$Count_Frag)})
lst = lst[cnt > 10]
lst_frag = lst

cat("Threshold =", THRESHOLD, "\n")
cat("After filteration: #Motifs =", length(lst_frag), "\n")

+++++ Preprocess +++++
+++++ Preprocess: filter motifs +++++
Before filteration: #Motifs = 242 
Threshold = 10 
After filteration: #Motifs = 242 


## Set all pairs of motifs

In [21]:
lst_motif_pair = list("AP1/1|NR/20" = c("AP1/1", "NR/20"))
lst_motif_pair

## Get interactive effects

In [17]:
###################################################
# Get interactive effects
###################################################
cat("+++++ Get interactive effects +++++\n")

+++++ Get interactive effects +++++


```
### Helper function
### https://win-vector.com/2014/05/30/trimming-the-fat-from-glm-models-in-r/
stripGlmLR = function(cm) {
  cm$y = c()
  cm$model = c()
  
  cm$residuals = c()
  cm$fitted.values = c()
  cm$effects = c()
  cm$qr$qr = c()  
  cm$linear.predictors = c()
  cm$weights = c()
  cm$prior.weights = c()
  cm$data = c()

  cm$family$variance = c()
  cm$family$dev.resids = c()
  cm$family$aic = c()
  cm$family$validmu = c()
  cm$family$simulate = c()
  attr(cm$terms,".Environment") = c()
  attr(cm$formula,".Environment") = c()
  
  return(cm)
}

```

In [31]:
### loop through each pair of motifs
### estimate interaction effect of each motif pair
lst_res = lapply(lst_motif_pair, function(x){

    ### extract fragments for each motif
    mtf1 = x[1]
    mtf2 = x[2]
    df1 = lst_frag[[mtf1]]
    df2 = lst_frag[[mtf2]]
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    
    ### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
    
    ### annotate fragments based on motif annotation
    idx11 = paste("TFX_DMSO", mtf1,       sep="_")
    idx12 = paste("TFX_DMSO", mtf2,       sep="_")
    idx13 = paste("TFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("TFX_Dex",  mtf1,       sep="_")
    idx22 = paste("TFX_Dex",  mtf2,       sep="_")
    idx23 = paste("TFX_Dex",  mtf1, mtf2, sep="_")
    idxs  = c("Input", idx11, idx12, idx21, idx22, idx13, idx23)
    tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
    
    ### normalize counts by library size
    tmp = tmp %>% left_join(dat_lib, by="Sample")
    tmp = tmp %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size))
    tmp$X = factor(tmp$X, levels=idxs)
    X = model.matrix(~X, tmp)
    y = tmp$Norm_Value
    
    ### create design matrix
    idx11 = paste("XTFX_DMSO", mtf1,       sep="_")
    idx12 = paste("XTFX_DMSO", mtf2,       sep="_")
    idx13 = paste("XTFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("XTFX_Dex",  mtf1,       sep="_")
    idx22 = paste("XTFX_Dex",  mtf2,       sep="_")
    idx23 = paste("XTFX_Dex",  mtf1, mtf2, sep="_")
    X[,idx11] = X[,idx11] + X[,idx13] + X[,idx21] + X[,idx23]
    X[,idx12] = X[,idx12] + X[,idx13] + X[,idx22] + X[,idx23]
    X[,idx21] = X[,idx21] + X[,idx23]
    X[,idx22] = X[,idx22] + X[,idx23]
    X[,idx13] = X[,idx13] + X[,idx23]
    
    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    #res = summary(fit)
    
    ### reduce the memory size
    #res = stripGlmLR(res)
    lst = list(fit=fit, X=X, y=y, data=tmp)
    return(lst)
})

In [32]:
lst = lst_res[[1]]
dat = lst$data
fit = lst$fit
X   = lst$X
y   = lst$y
print(fit)


Call:
lm(formula = y ~ X + 0)

Coefficients:
          X(Intercept)        XXTFX_DMSO_AP1/1        XXTFX_DMSO_NR/20  
             0.0012337               0.0005389              -0.0007765  
       XXTFX_Dex_AP1/1         XXTFX_Dex_NR/20  XXTFX_DMSO_AP1/1_NR/20  
             0.0001037               0.0007412              -0.0009681  
 XXTFX_Dex_AP1/1_NR/20  
            -0.0007509  



In [35]:
dat = dat %>% dplyr::select(Sample, Group, Size, X, Value, Norm_Value, Lognorm_Value)
dat

Sample,Group,Size,X,Value,Norm_Value,Lognorm_Value
<chr>,<chr>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>
Input1,Input,18666630,Input,22921,0.001227913,-2.910832
Input2,Input,20167924,Input,25078,0.00124346,-2.905368
Input3,Input,23280988,Input,29145,0.00125188,-2.902437
Input4,Input,19003938,Input,23532,0.00123827,-2.907185
Input5,Input,15325016,Input,18495,0.00120685,-2.918347
TFX2_Dex,TFX_Dex,45413539,TFX_Dex_AP1/1,83606,0.001840993,-2.734948
TFX2_Dex,TFX_Dex,45413539,TFX_Dex_AP1/1_NR/20,4835,0.000106466,-3.972789
TFX2_Dex,TFX_Dex,45413539,TFX_Dex_NR/20,49704,0.001094475,-2.960794
TFX2_DMSO,TFX_DMSO,43844606,TFX_DMSO_AP1/1,77196,0.001760673,-2.754321
TFX2_DMSO,TFX_DMSO,43844606,TFX_DMSO_AP1/1_NR/20,1116,2.545353e-05,-4.594252


In [57]:
mat = cbind(y)
mat = cbind(mat, X)
mat = as.data.frame(mat)
colnames(mat)[1] = "response"
colnames(mat)[2] = "Intercept"
mat

Unnamed: 0_level_0,response,Intercept,XTFX_DMSO_AP1/1,XTFX_DMSO_NR/20,XTFX_Dex_AP1/1,XTFX_Dex_NR/20,XTFX_DMSO_AP1/1_NR/20,XTFX_Dex_AP1/1_NR/20
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.001227913,1,0,0,0,0,0,0
2,0.00124346,1,0,0,0,0,0,0
3,0.00125188,1,0,0,0,0,0,0
4,0.00123827,1,0,0,0,0,0,0
5,0.00120685,1,0,0,0,0,0,0
6,0.001840993,1,1,0,1,0,0,0
7,0.000106466,1,1,1,1,1,1,1
8,0.001094475,1,0,1,0,1,0,0
9,0.001760673,1,1,0,0,0,0,0
10,2.545353e-05,1,1,1,0,0,1,0


**Export**

In [52]:
fdiry = file.path(FD_RES, "model_linear")
dir(fdiry)

In [53]:
fdiry = file.path(FD_RES, "model_linear", "example")
dir.create(fdiry)

“'/home/mount/work/out/proj_combeffect/model_linear/example' already exists”


In [54]:
fname = "chr17_count_table.tsv"
fpath = file.path(fdiry, fname)
write_tsv(dat, fpath)

In [58]:
fname = "chr17_design_matrix.tsv"
fpath = file.path(fdiry, fname)
write_tsv(mat, fpath)

In [61]:
tmp = lm(y~X + 0)
print(tmp)


Call:
lm(formula = y ~ X + 0)

Coefficients:
          X(Intercept)        XXTFX_DMSO_AP1/1        XXTFX_DMSO_NR/20  
             0.0012337               0.0005389              -0.0007765  
       XXTFX_Dex_AP1/1         XXTFX_Dex_NR/20  XXTFX_DMSO_AP1/1_NR/20  
             0.0001037               0.0007412              -0.0009681  
 XXTFX_Dex_AP1/1_NR/20  
            -0.0007509  



In [62]:
print(fit)


Call:
lm(formula = y ~ X + 0)

Coefficients:
          X(Intercept)        XXTFX_DMSO_AP1/1        XXTFX_DMSO_NR/20  
             0.0012337               0.0005389              -0.0007765  
       XXTFX_Dex_AP1/1         XXTFX_Dex_NR/20  XXTFX_DMSO_AP1/1_NR/20  
             0.0001037               0.0007412              -0.0009681  
 XXTFX_Dex_AP1/1_NR/20  
            -0.0007509  

