In [1]:
source("/home/mount/project/config_sing.R")

── [1mAttaching packages[22m ─────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.3     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [2]:
dir(FD_RES)

## Import library size

In [3]:
get_sample = function(idn_sample){
    idn = idn_sample
    idn = str_replace(
        string = idn, 
        pattern = "Input[0-9]", 
        replacement = "Input")
    idn = str_replace(
        string = idn, 
        pattern = "TFX[0-9]_", 
        replacement="TFX_")
    return(idn)
}

In [4]:
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

ctypes = c(col_integer(), col_character())
cnames = c("Size", "Fpath")
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames)
dat_lib = dat_lib %>% 
    mutate(Sample = tools::file_path_sans_ext(basename(Fpath))) %>%
    mutate(Group = get_sample(Sample))
dat_lib = dat_lib %>% dplyr::select(Size, Sample, Group)
head(dat_lib, 10)

Size,Sample,Group
<dbl>,<chr>,<chr>
18666630,Input1,Input
20167924,Input2,Input
23280988,Input3,Input
19003938,Input4,Input
15325016,Input5,Input
48376253,TFX2_AZD2906,TFX_AZD2906
52542517,TFX2_AZD9567,TFX_AZD9567
43646484,TFX2_CORT108297,TFX_CORT108297
41732268,TFX2_CpdA,TFX_CpdA
43844606,TFX2_DMSO,TFX_DMSO


## Import data

In [5]:
### import
ctypes = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer(),
           col_character())
cnames = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap", 
           "Sample")

In [6]:
### set samples
Samples=c(
    paste0("Input", 1:5),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex")
)

In [7]:
fdiry = file.path(FD_RES, "annotation_fragment")
fname = "motif_annotation_chr21_095.bed"
fpath = file.path(fdiry, fname)

dat_chr21_95 = read_tsv(fpath, col_types=ctypes)
head(dat_chr21_95, 2)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>
chr21,5032566,5033537,1,chr21,5032581,5032600,ZFN121,19.3355,19,Input1
chr21,5032566,5033537,1,chr21,5032583,5032600,PAX_2,17.0397,17,Input1


In [8]:
dim(dat_chr21_95)

## Preprocess

In [9]:
dat = dat_chr21_95
lst = dat %>% group_by(Motif) %>% group_split

In [10]:
print(head(names(lst)))
print(length(lst))

NULL
[1] 241


In [11]:
motifs = lapply(lst, function(dat){unique(dat$Motif)}) %>% unlist
print(head(motifs))
print(length(motifs))

[1] "AIRE"   "AP1_1"  "AP1_2"  "BATF"   "BCL6_1" "BCL6_2"
[1] 241


In [12]:
names(lst) = motifs

In [13]:
lapply(lst, nrow) %>% head

In [14]:
lst_frag = lapply(lst, function(dat){
    tmp = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Motif, Sample) %>%
        distinct()
    return(tmp)
})

In [15]:
lapply(lst_frag, nrow) %>% head

## Test pick two motifs

In [25]:
mtf1 = motifs[1]
mtf2 = motifs[2]

df1 = lst_frag[[mtf1]]
df2 = lst_frag[[mtf2]]

print(c(mtf1, mtf2))

   AIRE   AP1_1 
 "AIRE" "AP1_1" 


In [26]:
dat1 = bind_rows(df1, df2) %>% 
    dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
    distinct

dat2 = df1 %>% 
    dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)

dat3 = df2 %>% 
    dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)

In [27]:
dat = dat1 %>%
    full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
    full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
    mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
    mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
    mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
    mutate(X     = paste(Group, Motif, sep="_")) %>%
    mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
head(dat)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Sample,Motif.x,Motif.y,Motif,Group,X
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chr21,13342921,13343862,1,Input1,AIRE,,AIRE,Input,Input
chr21,13342976,13343941,1,Input1,AIRE,,AIRE,Input,Input
chr21,13343241,13344304,1,Input1,AIRE,,AIRE,Input,Input
chr21,13343259,13344220,1,Input1,AIRE,,AIRE,Input,Input
chr21,13343566,13344485,1,Input1,AIRE,,AIRE,Input,Input
chr21,13343570,13344556,1,Input1,AIRE,,AIRE,Input,Input


In [51]:
dat_comb = t(combn(names(lst_frag), 2))
#dat_comb = head(dat_comb, 3)
print(dim(dat_comb))
head(dat_comb)

[1] 28920     2


0,1
AIRE,AP1_1
AIRE,AP1_2
AIRE,BATF
AIRE,BCL6_1
AIRE,BCL6_2
AIRE,CCAAT_CEBP


In [52]:
lst_motif_pair = split(dat_comb, seq(nrow(dat_comb)))
names(lst_motif_pair) = lapply(
    lst_motif_pair, function(x){
        mtf1 = x[1]
        mtf2 = x[2]
        return(paste(mtf1, mtf2, sep="|"))
    } # end fun
) # end lapply

head(lst_motif_pair, 3)

In [53]:
which(motifs == "NR_20")

In [54]:
which(motifs == "AP1_1")

In [55]:
### https://win-vector.com/2014/05/30/trimming-the-fat-from-glm-models-in-r/
stripGlmLR = function(cm) {
  cm$y = c()
  cm$model = c()
  
  cm$residuals = c()
  cm$fitted.values = c()
  cm$effects = c()
  cm$qr$qr = c()  
  cm$linear.predictors = c()
  cm$weights = c()
  cm$prior.weights = c()
  cm$data = c()

  
  cm$family$variance = c()
  cm$family$dev.resids = c()
  cm$family$aic = c()
  cm$family$validmu = c()
  cm$family$simulate = c()
  attr(cm$terms,".Environment") = c()
  attr(cm$formula,".Environment") = c()
  
  cm
}

In [56]:
lst_res = lapply(lst_motif_pair, function(x){
    ### extract fragments for each motif
    mtf1 = x[1]
    mtf2 = x[2]
    df1 = lst_frag[[mtf1]]
    df2 = lst_frag[[mtf2]]
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    
    ### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
    
    ### annotate fragments based on motif annotation
    idx11 = paste("TFX_DMSO", mtf1,       sep="_")
    idx12 = paste("TFX_DMSO", mtf2,       sep="_")
    idx13 = paste("TFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("TFX_Dex",  mtf1,       sep="_")
    idx22 = paste("TFX_Dex",  mtf2,       sep="_")
    idx23 = paste("TFX_Dex",  mtf1, mtf2, sep="_")
    idxs  = c("Input", idx11, idx12, idx21, idx22, idx13, idx23)
    tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
    
    ### normalize counts by library size
    tmp = tmp %>% left_join(dat_lib, by="Sample")
    tmp = tmp %>%
        mutate(Norm_Value    = Value / Size) %>% 
        mutate(Lognorm_Value = log10(Value) - log10(Size))
    tmp$X = factor(tmp$X, levels=idxs)
    X = model.matrix(~X, tmp)
    y = tmp$Norm_Value
    
    ### create design matrix
    idx11 = paste("XTFX_DMSO", mtf1,       sep="_")
    idx12 = paste("XTFX_DMSO", mtf2,       sep="_")
    idx13 = paste("XTFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("XTFX_Dex",  mtf1,       sep="_")
    idx22 = paste("XTFX_Dex",  mtf2,       sep="_")
    idx23 = paste("XTFX_Dex",  mtf1, mtf2, sep="_")
    X[,idx11] = X[,idx11] + X[,idx13] + X[,idx21] + X[,idx23]
    X[,idx12] = X[,idx12] + X[,idx13] + X[,idx22] + X[,idx23]
    X[,idx21] = X[,idx21] + X[,idx23]
    X[,idx22] = X[,idx22] + X[,idx23]
    X[,idx13] = X[,idx13] + X[,idx23]
    
    ### fit model and get the summary
    fit = lm(y ~ X + 0)
    res = summary(fit)
    res = stripGlmLR(res)
    
    ### reduce the memory size
    return(res)
})

In [57]:
as.matrix(lapply(lst_res[[1]], function(x) length(serialize(x,NULL)))) 

0,1
call,168
terms,722
coefficients,576
aliased,261
sigma,39
df,43
r.squared,39
adj.r.squared,39
fstatistic,127
cov.unscaled,848


In [34]:
as.matrix(lapply(lst_res[[1]], function(x) length(serialize(x,NULL)))) 

0,1
call,168
terms,774
residuals,441
coefficients,576
aliased,261
sigma,39
df,43
r.squared,39
adj.r.squared,39
fstatistic,127


In [35]:
object.size(lst_res)

38120 bytes

In [None]:
rm(list  = ls(envir = attr(fit$terms, ".Environment")), 
       envir = attr(fit$terms, ".Environment")) 
    #res = summary(fit)
    #return(res)

In [None]:
print(length(lst_res))
print(head(names(lst_res)))
cat("++++++++++++++++++++++++++++++++\n")
tmp = lst_res[[1]]
print(tmp)
cat("++++++++++++++++++++++++++++++++\n")
print(coef(tmp))

In [58]:
print(length(lst_res))
print(head(names(lst_res)))
cat("++++++++++++++++++++++++++++++++\n")
tmp = lst_res[[1]]
print(tmp)
cat("++++++++++++++++++++++++++++++++\n")
print(coef(tmp))

[1] 28920
[1] "AIRE|AP1_1"      "AIRE|AP1_2"      "AIRE|BATF"       "AIRE|BCL6_1"    
[5] "AIRE|BCL6_2"     "AIRE|CCAAT_CEBP"
++++++++++++++++++++++++++++++++

Call:
lm(formula = y ~ X + 0)

Residuals:
   Min     1Q Median     3Q    Max 
    NA     NA     NA     NA     NA 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
X(Intercept)           3.119e-04  5.783e-06  53.933  < 2e-16 ***
XXTFX_DMSO_AIRE       -3.084e-04  8.674e-06 -35.553  < 2e-16 ***
XXTFX_DMSO_AP1_1       2.468e-04  8.674e-06  28.449 8.86e-16 ***
XXTFX_Dex_AIRE        -1.502e-07  9.143e-06  -0.016  0.98708    
XXTFX_Dex_AP1_1        8.113e-05  9.143e-06   8.873 8.66e-08 ***
XXTFX_DMSO_AIRE_AP1_1 -2.501e-04  1.686e-05 -14.832 3.70e-11 ***
XXTFX_Dex_AIRE_AP1_1  -8.095e-05  2.045e-05  -3.960  0.00101 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.293e-05 on 17 degrees of freedom
Multiple R-squared:  0.9992,	Adjusted R-squared:  0.9988 
F-sta

In [59]:
FD_RES

In [60]:
dir(FD_RES)

In [61]:
fdiry = file.path(FD_RES, "model_linear")
fname = "res_interactive_chr21.rds"
fpath = file.path(fdiry, fname)
saveRDS(lst_res, fpath)