In [1]:
suppressMessages(suppressWarnings(source("config_sing.R")))
print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


In [2]:
### set global variables
TARGET = "target_PER1"
THRESHOLD_COVER = 10
THRESHOLD_MOTIF =  0
#THRESHOLD_MOTIF = 10.81

SAMPLES = c(
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

### set motifs
fdiry  = file.path(FD_RES, "annotation_fragment", SAMPLES[1], TARGET)
fname  = "*_merge.bed.gz"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)

### set column names and types
CTYPES = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())
CNAMES = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

In [3]:
head(MOTIFS)

In [4]:
### combination of motifs
dat_comb = t(combn(MOTIFS[1:7], 2))
lst_motif_pair = split(dat_comb, seq(nrow(dat_comb)))

In [45]:
motif_pair = lst_motif_pair[[3]]
print(motif_pair)

[1] "AHR_merge.bed.gz"   "AP1_2_merge.bed.gz"


In [46]:
motif1 = motif_pair[1]
motif2 = motif_pair[2]

### start message
mtfs = sapply(motif_pair, function(fname){
    mtf = str_remove_all(fname, pattern = "_merge.bed.gz")
    return(mtf)
})

msg_mtf = paste(mtfs, collapse=" ")
msg_mtf = paste(msg_mtf, "|")
msg     = paste(msg_mtf, "Start")
msg

In [47]:
SAMPLES[1]

In [48]:
for (idx in seq_along(SAMPLES)){
    sam = SAMPLES[idx]
    print(c(idx, sam))
}

[1] "1"          "Input1_20x"
[1] "2"          "Input2_20x"
[1] "3"          "Input3_20x"
[1] "4"          "Input4_20x"
[1] "5"          "Input5_20x"
[1] "6"         "TFX2_DMSO"
[1] "7"         "TFX3_DMSO"
[1] "8"         "TFX4_DMSO"
[1] "9"         "TFX5_DMSO"
[1] "10"       "TFX2_Dex"
[1] "11"       "TFX3_Dex"
[1] "12"       "TFX4_Dex"
[1] "13"       "TFX5_Dex"


In [49]:
sam

In [50]:
THRESHOLD_MOTIF=0

In [51]:
mtf = msg_mtf

In [52]:
### import annotated fragments for each motif
lst_dat = lapply(motif_pair, function(motif){
    #lst = lapply(SAMPLES, function(sam){

        ###################################################
        # Import fragment annotation
        ###################################################

        ### SET: file path of annotated fragment
        fdiry = file.path(FD_RES, "annotation_fragment", sam, TARGET)
        fname = motif
        fpath = file.path(fdiry, fname)    

        ### PRINT: ready to import
        msg = paste(mtf, sam, "Import", fpath)
        cat(msg, "\n"); flush.console()

        ### import data
        dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)

        ### HANDLE EXCEPTION: empty data
        if (nrow(dat) == 0){
            msg = paste(mtf, sam, "Skip Import_Empty")
            cat(msg, "\n"); flush.console()
            next
        }

        ###################################################
        # Preprocess
        ###################################################

        ### FILTER:
        ###     filter out annotation not fully cover motif
        ###     filter out motif score lower than threshold
        num1 = nrow(dat)    
        dat = dat %>% 
            mutate(Sample = sam) %>%
            mutate(Length_MTF = End_MTF - Start_MTF)  %>%
            mutate(Length_Dif = Length_MTF - Overlap) %>% 
            dplyr::filter(Length_Dif == 0) %>%
            dplyr::filter(Score >= THRESHOLD_MOTIF)
        num2 = nrow(dat)

        ### PRINT: result of filtering
        msg = paste(num1, num2, sep="-")
        msg = paste(mtf, sam, "Filter", msg)
        cat(msg, "\n"); flush.console()

        ### HANDLE EXCEPTION: empty data after filteration
        if(nrow(dat) == 0){
            msg = paste(mtf, sam, "Skip Filter_Empty")
            cat(msg, "\n"); flush.console()
            next
        }

    ### arrange data
    #dat = bind_rows(lst)
    return(dat)
})
    

AHR AP1_2 | TFX5_Dex Import /home/mount/work/out/proj_combeffect/annotation_fragment/TFX5_Dex/target_PER1/AHR_merge.bed.gz 
AHR AP1_2 | TFX5_Dex Filter 767-766 
AHR AP1_2 | TFX5_Dex Import /home/mount/work/out/proj_combeffect/annotation_fragment/TFX5_Dex/target_PER1/AP1_2_merge.bed.gz 
AHR AP1_2 | TFX5_Dex Filter 1423-1396 


In [53]:
lst_dat[[1]] %>% head

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample,Length_MTF,Length_Dif
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
chr17,8150381,8151534,2,chr17,8151284,8151290,AHR,7.9331,6,TFX5_Dex,6,0
chr17,8150387,8151372,2,chr17,8151284,8151290,AHR,7.9331,6,TFX5_Dex,6,0
chr17,8150387,8151373,1,chr17,8151284,8151290,AHR,7.9331,6,TFX5_Dex,6,0
chr17,8150388,8151373,1,chr17,8151284,8151290,AHR,7.9331,6,TFX5_Dex,6,0
chr17,8150404,8151422,1,chr17,8151284,8151290,AHR,7.9331,6,TFX5_Dex,6,0
chr17,8150416,8151427,1,chr17,8151284,8151290,AHR,7.9331,6,TFX5_Dex,6,0


In [54]:
lst_dat[[2]] %>% head

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample,Length_MTF,Length_Dif
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
chr17,8148018,8148882,1,chr17,8148257,8148268,AP1/2,8.9298,11,TFX5_Dex,11,0
chr17,8148019,8148882,1,chr17,8148257,8148268,AP1/2,8.9298,11,TFX5_Dex,11,0
chr17,8148056,8149063,1,chr17,8148257,8148268,AP1/2,8.9298,11,TFX5_Dex,11,0
chr17,8148195,8149069,1,chr17,8148257,8148268,AP1/2,8.9298,11,TFX5_Dex,11,0
chr17,8148196,8149069,1,chr17,8148257,8148268,AP1/2,8.9298,11,TFX5_Dex,11,0
chr17,8148197,8149068,1,chr17,8148257,8148268,AP1/2,8.9298,11,TFX5_Dex,11,0


In [55]:
### arrange data after preprocessing
    df1  = lst_dat[[1]]
    df2  = lst_dat[[2]]
    mtf1 = unique(df1$Motif)
    mtf2 = unique(df2$Motif)
    #lst_dat = NULL

In [56]:
###################################################
    # Create Count Table
    ###################################################
    #cat("+++++ Create Count Table +++++\n")
    
    ### extract fragments
    dat1 = bind_rows(df1, df2) %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct
    dat2 = df1 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    dat3 = df2 %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif)
    
    ### match fragments for the motif pair
    dat = dat1 %>%
        full_join(dat2, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        full_join(dat3, by = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")) %>%
        mutate(Motif = paste(Motif.x, Motif.y, sep = "_")) %>%
        mutate(Motif = str_remove(string=Motif, pattern="_NA|NA_")) %>% 
        mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
        mutate(X     = paste(Group, Motif, sep="_")) %>%
        mutate(X     = ifelse(str_detect(X, "Input"), "Input", X))
    
    

In [58]:
table(dat$Motif)


      AHR AHR_AP1/2     AP1/2 
      636       140      1256 

In [59]:
tmp = dat %>% 
        group_by(Sample, X) %>% 
        summarise(Value = sum(Count_Frag), .groups = 'drop')
tmp

Sample,X,Value
<chr>,<chr>,<dbl>
TFX5_Dex,TFX_Dex_AHR,883
TFX5_Dex,TFX_Dex_AHR_AP1/2,190
TFX5_Dex,TFX_Dex_AP1/2,1926


## linear model

In [2]:
suppressMessages(suppressWarnings(source("config_sing.R")))
print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


In [4]:
###################################################
# Import library size
###################################################
cat("\n++++++++++ Import library size ++++++++++\n")

### Helper function to get
get_group = function(idn_sample){
    idn = idn_sample
    
    idn = str_replace(
        string = idn, 
        pattern = "Input[0-9]", 
        replacement = "Input")
    
    idn = str_remove(
        string = idn, 
        pattern = "_20x")
    
    idn = str_replace(
        string = idn, 
        pattern = "TFX[0-9]_", 
        replacement="TFX_")
    return(idn)
}

### set path
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### import library size
ctypes = c(col_integer(), col_character())
cnames = c("Size", "FPath")
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames)

### remove the total size
dat_lib = dat_lib %>% dplyr::filter(FPath != "total")

### summarize info from the file path
### stackoverflow: Extract only folder name right before filename from full path
dat_lib = dat_lib %>% 
    mutate(Sample = basename(dirname(FPath))) %>%
    mutate(Group = get_group(Sample))
dat_lib = dat_lib %>% dplyr::select(Size, Sample, Group)


++++++++++ Import library size ++++++++++


In [6]:
TARGET = "target_PER1"
FDIRY  = "interactive_filter00"

fdiry  = file.path(FD_RES, "model_linear", FDIRY, TARGET)
fname  = "count_*"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)
head(MOTIFS)

In [9]:
fdiry  = file.path(FD_RES, "model_linear", FDIRY, TARGET)
fname  = MOTIFS[3]
fpath  = file.path(fdiry, fname)

dat = read_tsv(fpath)
dat

[1m[1mRows: [1m[22m[34m[34m54[34m[39m [1m[1mColumns: [1m[22m[34m[34m5[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): Sample, Motif.x, Motif.y, X
[32mdbl[39m (1): Value


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



Sample,Motif.x,Motif.y,X,Value
<chr>,<chr>,<chr>,<chr>,<dbl>
Input1,AHR,AP1/2,Input,28
Input1,AHR,,Input,64
Input1,,AP1/2,Input,122
Input2,AHR,AP1/2,Input,26
Input2,AHR,,Input,73
Input2,,AP1/2,Input,134
Input3,AHR,AP1/2,Input,43
Input3,AHR,,Input,111
Input3,,AP1/2,Input,175
Input4,AHR,AP1/2,Input,33


In [10]:
mtf1 = na.omit(unique(dat$Motif.x))
mtf2 = na.omit(unique(dat$Motif.y))
print(c(mtf1, mtf2))

[1] "AHR"   "AP1/2"


In [11]:
### create design matrix
idx11 = paste("TFX_DMSO", mtf1,       sep="_")
idx12 = paste("TFX_DMSO", mtf2,       sep="_")
idx13 = paste("TFX_DMSO", mtf1, mtf2, sep="_")
idx21 = paste("TFX_Dex",  mtf1,       sep="_")
idx22 = paste("TFX_Dex",  mtf2,       sep="_")
idx23 = paste("TFX_Dex",  mtf1, mtf2, sep="_")
idxs  = c("Input", idx11, idx12, idx21, idx22, idx13, idx23)
dat$X = factor(dat$X, levels=idxs)
X = model.matrix(~X, dat)
X

Unnamed: 0,(Intercept),XTFX_DMSO_AHR,XTFX_DMSO_AP1/2,XTFX_Dex_AHR,XTFX_Dex_AP1/2,XTFX_DMSO_AHR_AP1/2,XTFX_Dex_AHR_AP1/2
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0
5,1,0,0,0,0,0,0
6,1,0,0,0,0,0,0
7,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0
9,1,0,0,0,0,0,0
10,1,0,0,0,0,0,0


In [12]:
### setup design matrix
    idx11 = paste("XTFX_DMSO", mtf1,       sep="_")
    idx12 = paste("XTFX_DMSO", mtf2,       sep="_")
    idx13 = paste("XTFX_DMSO", mtf1, mtf2, sep="_")
    idx21 = paste("XTFX_Dex",  mtf1,       sep="_")
    idx22 = paste("XTFX_Dex",  mtf2,       sep="_")
    idx23 = paste("XTFX_Dex",  mtf1, mtf2, sep="_")
    X[,idx11] = X[,idx11] + X[,idx13] + X[,idx21] + X[,idx23]
    X[,idx12] = X[,idx12] + X[,idx13] + X[,idx22] + X[,idx23]
    X[,idx21] = X[,idx21] + X[,idx23]
    X[,idx22] = X[,idx22] + X[,idx23]
    X[,idx13] = X[,idx13] + X[,idx23]
X

Unnamed: 0,(Intercept),XTFX_DMSO_AHR,XTFX_DMSO_AP1/2,XTFX_Dex_AHR,XTFX_Dex_AP1/2,XTFX_DMSO_AHR_AP1/2,XTFX_Dex_AHR_AP1/2
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0
5,1,0,0,0,0,0,0
6,1,0,0,0,0,0,0
7,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0
9,1,0,0,0,0,0,0
10,1,0,0,0,0,0,0


## test linear model using the output from python script

In [15]:
fpath = "/home/mount/work/out/proj_combeffect/model_linear/example_interactive/target_PER1/count_AHR_AIRE.tsv"
dat = read_tsv(fpath, na = "nan")
dat

[1m[1mRows: [1m[22m[34m[34m34[34m[39m [1m[1mColumns: [1m[22m[34m[34m5[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): Sample, Motif_x, Motif_y, X
[32mdbl[39m (1): Value


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



Sample,Motif_x,Motif_y,X,Value
<chr>,<chr>,<chr>,<chr>,<dbl>
Input1,AHR,,Input,76
Input1,,AIRE,Input,13
Input2,AHR,,Input,86
Input2,,AIRE,Input,14
Input3,AHR,,Input,136
Input3,,AIRE,Input,23
Input4,AHR,,Input,90
Input4,,AIRE,Input,15
Input5,AHR,,Input,57
Input5,,AIRE,Input,9
