In [1]:
suppressMessages(suppressWarnings(source("config_sing.R")))
print(FD_WORK)
print(FD_RES)

[1] "/home/mount/work"
[1] "/home/mount/work/out/proj_combeffect"


In [15]:
### set global variables
TARGET = "target_PER1"
THRESHOLD_COVER = 10
THRESHOLD_MOTIF =  0
#THRESHOLD_MOTIF = 10.81

SAMPLES = c(
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

### set motifs
fdiry  = file.path(FD_RES, "annotation_fragment", SAMPLES[1], TARGET)
fname  = "*_merge.bed.gz"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)

### set column names and types
CTYPES = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())
CNAMES = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

In [28]:
sam   = SAMPLES[1]
fname = MOTIFS[1]

fdiry = file.path(FD_RES, "annotation_fragment")
fpath = file.path(fdiry, sam, TARGET, fname)
print(fpath)

[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/Input1_20x/target_PER1/AHR_merge.bed.gz"


In [45]:
#lst_dat = lapply(SAMPLES, function(sam){
### set path
fpath = file.path(fdiry, sam, TARGET, fname)    
msg = paste(mtf, "Import", fpath)
cat(msg, "\n"); flush.console()

### import data
dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)

if (nrow(dat) == 0){
    return(NULL)
} else {
    ###
    num1 = nrow(dat)    
    dat = dat %>% 
        mutate(Sample = sam) %>%
        mutate(Length_MTF = End_MTF - Start_MTF)  %>%
        mutate(Length_Dif = Length_MTF - Overlap) %>% 
        dplyr::filter(Length_Dif == 0) %>%
        dplyr::filter(Score >= THRESHOLD_MOTIF)
    num2 = nrow(dat)

    ###
    msg = paste(num1, num2, sep="-")
    msg = paste(mtf, "Filter", sam, msg)
    cat(msg, "\n"); flush.console()
    #return(dat)
}
#})
head(dat)

AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input1_20x/target_PER1/AHR_merge.bed.gz 
AHR Filter Input1_20x 1453-1450 


Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Chrom_MTF,Start_MTF,End_MTF,Motif,Score,Overlap,Sample,Length_MTF,Length_Dif
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
chr17,8150289,8151402,1,chr17,8151284,8151290,AHR,7.9331,6,Input1_20x,6,0
chr17,8150291,8151379,1,chr17,8151284,8151290,AHR,7.9331,6,Input1_20x,6,0
chr17,8150308,8151314,1,chr17,8151284,8151290,AHR,7.9331,6,Input1_20x,6,0
chr17,8150320,8151356,3,chr17,8151284,8151290,AHR,7.9331,6,Input1_20x,6,0
chr17,8150321,8151356,2,chr17,8151284,8151290,AHR,7.9331,6,Input1_20x,6,0
chr17,8150329,8151343,2,chr17,8151284,8151290,AHR,7.9331,6,Input1_20x,6,0


In [46]:
### get fragments
dat = dat %>% 
    dplyr::group_by(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Motif, Sample) %>%
    summarize(N_Motif = n(), .groups = 'drop') 

head(dat)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Motif,Sample,N_Motif
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<int>
chr17,8150289,8151402,1,AHR,Input1_20x,1
chr17,8150291,8151379,1,AHR,Input1_20x,1
chr17,8150308,8151314,1,AHR,Input1_20x,1
chr17,8150320,8151356,3,AHR,Input1_20x,1
chr17,8150321,8151356,2,AHR,Input1_20x,1
chr17,8150329,8151343,2,AHR,Input1_20x,1


In [47]:
table(dat$N_Motif)


  1   2   3 
534 119 226 

In [48]:
### get count for each sample, number of the motif within a fragment
dat = dat %>% group_by(Sample, Motif, N_Motif) %>% summarise(Value = sum(Count_Frag))
dat

`summarise()` has grouped output by 'Sample', 'Motif'. You can override using the `.groups` argument.



Sample,Motif,N_Motif,Value
<chr>,<chr>,<int>,<dbl>
Input1_20x,AHR,1,744
Input1_20x,AHR,2,208
Input1_20x,AHR,3,352


In [39]:
### get fragments
dat = dat %>% 
    dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
    distinct()

head(dat)

Chrom_Frag,Start_Frag,End_Frag,Count_Frag,Sample
<chr>,<dbl>,<dbl>,<dbl>,<chr>
chr17,8150289,8151402,1,Input1_20x
chr17,8150291,8151379,1,Input1_20x
chr17,8150308,8151314,1,Input1_20x
chr17,8150320,8151356,3,Input1_20x
chr17,8150321,8151356,2,Input1_20x
chr17,8150329,8151343,2,Input1_20x


In [37]:
### loop through each motif to get the marginal effect
lst_res = foreach(fname = MOTIFS[1]) %do% {
    
    ### start message and get the name of motif
    mtf = str_remove_all(fname, pattern = "_merge.bed.gz")
    msg = paste(mtf, "Start")
    cat(msg, "\n"); flush.console()
    
    ### import fragment annotation
    fdiry  = file.path(FD_RES, "annotation_fragment")
    lst_dat = lapply(SAMPLES, function(sam){
        ### set path
        fpath = file.path(fdiry, sam, TARGET, fname)    
        msg = paste(mtf, "Import", fpath)
        cat(msg, "\n"); flush.console()
        
        ### import data
        dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)
        if (nrow(dat) == 0){
            return(NULL)
        } else {
            ###
            num1 = nrow(dat)    
            dat = dat %>% 
                mutate(Sample = sam) %>%
                mutate(Length_MTF = End_MTF - Start_MTF)  %>%
                mutate(Length_Dif = Length_MTF - Overlap) %>% 
                dplyr::filter(Length_Dif == 0) %>%
                dplyr::filter(Score >= THRESHOLD_MOTIF)
            num2 = nrow(dat)
            
            ###
            msg = paste(num1, num2, sep="-")
            msg = paste(mtf, "Filter", sam, msg)
            cat(msg, "\n"); flush.console()
            return(dat)
        }
    })
    
    ### arrange data
    #dat = bind_rows(lst_dat)
    msg = paste(mtf, "Total", nrow(dat))
    cat(msg, "\n"); flush.console()
    
    ###################################################
    # Preprocess
    ###################################################
    cat("+++++ Preprocess +++++\n")
    
    ### Filter out empty data
    #if(nrow(dat) == 0){
    #    msg = paste(mtf, "Skip Empty")
    #    cat(msg, "\n"); flush.console()
    #    return(msg)
    #}
    
    ### Filter: fully cover the motif and motif score
    #num1 = nrow(dat)
    #dat = dat %>% 
    #    dplyr::filter(Length_Dif == 0) %>%
    #    dplyr::filter(Score >= THRESHOLD_MOTIF)
    #num2 = nrow(dat)
    #msg = paste(num1, num2, sep="-")
    #msg = paste(mtf, "Filter", msg)
    #cat(msg, "\n"); flush.console()
    
    ### Filter out empty data    
    #if(nrow(dat) == 0){
    #    msg = paste(mtf, "Filter Empty")
    #    cat(msg, "\n"); flush.console()
    #    return(msg)
    #}
    
    ### Filter: No/Low coverage
    #cnt = sum(dat$Count_Frag)
    #if(cnt <= THRESHOLD_COVER){
    #    msg = paste(mtf, "Filter Low_Coverage")
    #    cat(msg, "\n"); flush.console()
    #    return(msg)
    #}
    
    ###################################################
    # Create Count Table
    ###################################################
    cat("+++++ Create Count Table +++++\n")
    
    ### get fragments
    dat = dat %>% 
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct()

    ### get count for each sample
    dat = dat %>% group_by(Sample) %>% summarise(Value = sum(Count_Frag))
    
    ### normalize counts by library size
    #dat = dat %>% left_join(dat_lib, by="Sample")
    #dat = dat %>%
    #    mutate(Norm_Value    = Value / Size) %>% 
    #    mutate(Lognorm_Value = log10(Value) - log10(Size)) %>%
    #    mutate(X = Group)
    print(head(dat)); flush.console()
    return(mtf)
}

AHR Start 
AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input1_20x/target_PER1/AHR_merge.bed.gz 
AHR Filter Input1_20x 1453-1450 
AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input2_20x/target_PER1/AHR_merge.bed.gz 
AHR Filter Input2_20x 1486-1481 
AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input3_20x/target_PER1/AHR_merge.bed.gz 
AHR Filter Input3_20x 1421-1417 
AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input4_20x/target_PER1/AHR_merge.bed.gz 
AHR Filter Input4_20x 1509-1505 
AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input5_20x/target_PER1/AHR_merge.bed.gz 
AHR Filter Input5_20x 1359-1353 
AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/TFX2_DMSO/target_PER1/AHR_merge.bed.gz 
AHR Filter TFX2_DMSO 563-563 
AHR Import /home/mount/work/out/proj_combeffect/annotation_fragment/TFX3_DMSO/target_PER1/AHR_merge.bed.gz 
AHR Filter TFX3_DMSO 389-389 
AHR

## linear model

In [49]:
###################################################
# Import library size
###################################################
cat("\n++++++++++ Import library size ++++++++++\n")

### Helper function to get
get_group = function(idn_sample){
    idn = idn_sample
    
    idn = str_replace(
        string = idn, 
        pattern = "Input[0-9]", 
        replacement = "Input")
    
    idn = str_remove(
        string = idn, 
        pattern = "_20x")
    
    idn = str_replace(
        string = idn, 
        pattern = "TFX[0-9]_", 
        replacement="TFX_")
    return(idn)
}

### set path
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### import library size
ctypes = c(col_integer(), col_character())
cnames = c("Size", "FPath")
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames)

### remove the total size
dat_lib = dat_lib %>% dplyr::filter(FPath != "total")

### summarize info from the file path
### stackoverflow: Extract only folder name right before filename from full path
dat_lib = dat_lib %>% 
    mutate(Sample = basename(dirname(FPath))) %>%
    mutate(Group = get_group(Sample))
dat_lib = dat_lib %>% dplyr::select(Size, Sample, Group)


++++++++++ Import library size ++++++++++


In [67]:
TARGET = "target_PER1"

fdiry  = file.path(FD_RES, "model_linear", FDIRY, TARGET)
fname  = "count_*"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)
head(MOTIFS)

In [52]:
TARGET = "target_PER1"
FDIRY  = "marginal_filter00_input20x"
fdiry  = file.path(FD_RES, "model_linear", FDIRY, TARGET)
fname  = "count_AHR.tsv"
fpath  = file.path(fdiry, fname)

dat = read_tsv(fpath)
dat

[1m[1mRows: [1m[22m[34m[34m39[34m[39m [1m[1mColumns: [1m[22m[34m[34m4[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (2): Sample, Motif
[32mdbl[39m (2): N_Motif, Value


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



Sample,Motif,N_Motif,Value
<chr>,<chr>,<dbl>,<dbl>
Input1_20x,AHR,1,744
Input1_20x,AHR,2,208
Input1_20x,AHR,3,352
Input2_20x,AHR,1,728
Input2_20x,AHR,2,215
Input2_20x,AHR,3,322
Input3_20x,AHR,1,682
Input3_20x,AHR,2,249
Input3_20x,AHR,3,321
Input4_20x,AHR,1,803


In [55]:
dat = dat %>% group_by(Sample) %>% summarize(Value = sum(Value))
### normalize counts by library size
dat = dat %>% left_join(dat_lib, by="Sample")
dat = dat %>%
    mutate(Norm_Value    = Value / Size) %>% 
    mutate(Lognorm_Value = log2(Value) - log2(Size)) %>%
    mutate(X = Group)
dat

Sample,Value,Size,Group,Norm_Value,Lognorm_Value,X
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>
Input1_20x,1304,371718546,Input,3.508031e-06,-5.454937,Input
Input2_20x,1265,347635732,Input,3.638866e-06,-5.439034,Input
Input3_20x,1252,349994051,Input,3.577204e-06,-5.446456,Input
Input4_20x,1402,413508358,Input,3.3905e-06,-5.469736,Input
Input5_20x,1225,341110487,Input,3.591212e-06,-5.444759,Input
TFX2_Dex,792,45413539,TFX_Dex,1.743973e-05,-4.75846,TFX_Dex
TFX2_DMSO,386,43844606,TFX_DMSO,8.80382e-06,-5.055329,TFX_DMSO
TFX3_Dex,409,26400671,TFX_Dex,1.549203e-05,-4.809892,TFX_Dex
TFX3_DMSO,322,26819569,TFX_DMSO,1.200616e-05,-4.920596,TFX_DMSO
TFX4_Dex,612,34590086,TFX_Dex,1.769293e-05,-4.7522,TFX_Dex


In [64]:
### create design matrix
idxs  = c("Input", "TFX_DMSO", "TFX_Dex")
dat$X = factor(dat$X, levels=idxs)
X = model.matrix(~X, dat)
X[,"XTFX_DMSO"] = X[,"XTFX_DMSO"] + X[,"XTFX_Dex"]

y = dat$Norm_Value    
fit = lm(y ~ X + 0)

In [65]:
names(fit)

In [62]:
fit$x

Unnamed: 0,X(Intercept),XXTFX_DMSO,XXTFX_Dex
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0
6,1,1,1
7,1,1,0
8,1,1,1
9,1,1,0
10,1,1,1


In [56]:
 ###################################################
    # Analyze w/ Linear Model
    ###################################################
    #cat("+++++ Analyze w/ Linear Model +++++\n")
    
    ### create design matrix
    idxs  = c("Input", "TFX_DMSO", "TFX_Dex")
    dat$X = factor(dat$X, levels=idxs)
    X = model.matrix(~X, dat)

    ### setup design matrix
    X[,"XTFX_DMSO"] = X[,"XTFX_DMSO"] + X[,"XTFX_Dex"]
    
    ### fit model and get the summary
    
    y = dat$Norm_Value    
    fit = lm(y ~ X + 0)
        
    y = dat$Lognorm_Value
    fit_log = lm(y ~ X + 0)
    

    ### arrange
    lst = list()
    lst$cnt     = dat
    lst$fit     = fit
    lst$fit_log = fit
    lst$X       = X

    ### store the results
    fdiry = FD_OUT
    fname = paste0("lm_", mtf, ".RDS") # str_replace(mtf, pattern = "/", replacement = "_")
    fpath = file.path(fdiry, fname)
    saveRDS(lst, fpath)

ERROR: Error in parse(text = x, srcfile = src): <text>:19:5: unexpected '}'
18:     y = dat$Norm_Value
19:     }
        ^
