In [3]:
### load the project configuration for this session; with -v the script
### printed the server name and the directory layout shown in the output below
source ../config_duke.sh -v

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_STARRseq/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect



In [71]:
cat > motif_marginal_count.R << 'EOF'

###################################################
# Set environment
###################################################
cat("\n++++++++++ Set environment  ++++++++++\n")

### load the project configuration (relative path: the script must be run
### from the directory that contains config_sing.R)
### NOTE(review): FD_RES and the readr/dplyr/stringr functions used below are
### not defined in this script; presumably config_sing.R provides them
#source("/home/mount/project/config_sing.R")
source("config_sing.R")

###################################################
# Set global variables
###################################################
cat("\n++++++++++ Set global variables ++++++++++\n")

### Get arguments
### Usage: Rscript motif_marginal_count.R <TARGET> <FDIRY> <N_CORE> <THRESHOLD_MOTIF>
ARGS = commandArgs(trailingOnly=TRUE)
if (length(ARGS) < 4){
    stop("Usage: Rscript motif_marginal_count.R <TARGET> <FDIRY> <N_CORE> <THRESHOLD_MOTIF>")
}
TARGET          = as.character(ARGS[1])                   # which chromosome or region to run
FDIRY           = as.character(ARGS[2])                   # the name of the output folder
N_CORE          = suppressWarnings(as.integer(ARGS[3]))   # number of cores to register during the parallelization
THRESHOLD_MOTIF = suppressWarnings(as.numeric(ARGS[4]))   # threshold for the motif score filtering

### fail early on mis-ordered or non-numeric arguments; otherwise the NA
### values would silently end up in the output path and in the score filter
if (is.na(N_CORE) || N_CORE < 1){
    stop("N_CORE (3rd argument) must be a positive integer, got: ", ARGS[3])
}
if (is.na(THRESHOLD_MOTIF)){
    stop("THRESHOLD_MOTIF (4th argument) must be numeric, got: ", ARGS[4])
}

### set global variables
### samples to process: 5 inputs, 5 deep-sequenced (20x) inputs,
### and replicates 2-5 of the DMSO and Dex conditions
SAMPLES = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

### output directory: <FD_RES>/model_linear/<FDIRY>/<TARGET>
FD_OUT = file.path(FD_RES, "model_linear", FDIRY, TARGET)
dir.create(FD_OUT, recursive = TRUE, showWarnings = FALSE)

### set motifs
### the motif list is discovered from the annotation files of the first sample
fdiry  = file.path(FD_RES, "annotation_fragment", SAMPLES[1], TARGET)
fname  = "*_merge.bed.gz"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)

### a run without any motif file would finish "successfully" with no output
if (length(MOTIFS) == 0){
    stop("No motif annotation files found: ", fglob)
}

### set column names and types
### NOTE: the collectors must be wrapped in cols(); combining them with c()
### collapses them into an empty list, so readr would guess every column type
CNAMES = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")
CTYPES = cols(
    Chrom_Frag = col_character(),
    Start_Frag = col_integer(),
    End_Frag   = col_integer(),
    Count_Frag = col_integer(),
    Chrom_MTF  = col_character(),
    Start_MTF  = col_integer(),
    End_MTF    = col_integer(),
    Motif      = col_character(),
    Score      = col_double(),
    Overlap    = col_integer())

### print start message
cat("Target:           ", TARGET,          "\n")
cat("Output Directory: ", FD_OUT,          "\n")
cat("#Cores Resgister: ", N_CORE,          "\n")
cat("Threshold (Motif):", THRESHOLD_MOTIF, "\n")

###################################################
# Set motif list
###################################################

### helper function
### Split a vector into contiguous chunks of near-equal size.
###   x: vector to split (may be empty)
###   n: requested number of chunks (single positive number)
### Returns a list of chunks in the original order. At most `n` chunks are
### returned: an empty `x` gives an empty list, and `n` is capped at
### length(x) so that chunks are always indexed consecutively from 1.
fun_chunk = function(x, n){
    ### EXCEPTION: invalid number of chunks
    if (length(n) != 1 || is.na(n) || n < 1){
        stop("fun_chunk: n must be a single positive number")
    }

    ### EXCEPTION: nothing to split (cut() fails on an empty index)
    if (length(x) == 0){
        return(list())
    }

    ### never ask for more chunks than there are elements; otherwise cut()
    ### leaves gaps in the chunk labels and split() returns fewer,
    ### non-consecutively named chunks
    n = min(as.integer(n), length(x))

    if (n == 1){
        ### EXCEPTION: split to only one chunk (cut() needs at least 2 intervals)
        lst = list(x)
    } else {
        ### split a vector into several chunks
        lst = split(x, cut(seq_along(x), n, labels = FALSE))
    }
    return(lst)
}

### split motifs into several chunks for parallel programming
#lst_motifs = fun_chunk(MOTIFS, N_CORE)

###################################################
# Get motif count table
###################################################
cat("\n++++++++++ Get motif count table ++++++++++\n")

### PRINT: start message
timer_start = Sys.time()

### NOTE: the loop runs serially; N_CORE is not used by the loop itself

### loop through each motif to get the marginal effect
### for every motif, each sample is imported, filtered and summarized, and
### the per-sample counts are written to one table: FD_OUT/count_<motif>.tsv
for (motif in MOTIFS){

    ### INIT: get the name of motif
    ### example: motif = "AHR_merge.bed.gz" -> mtf = "AHR"
    ### (anchored pattern with escaped dots: only the literal suffix is removed)
    mtf = sub("_merge\\.bed\\.gz$", "", motif)

    ### SET: file path for output count table (the same for all samples)
    fpath_out  = file.path(FD_OUT, paste0("count_", mtf, ".tsv"))
    is_created = FALSE

    ### remove a table left by a previous run; otherwise, when every sample
    ### is skipped below, the stale counts would look like results of this run
    if (file.exists(fpath_out)){
        file.remove(fpath_out)
    }

    ### PRINT: start message
    msg = paste(mtf, "Start")
    cat(msg, "\n"); flush.console()

    ### for each motif, import fragment annotation, preprocess, then summarize
    for (sam in SAMPLES){

        ###################################################
        # Import fragment annotation
        ###################################################

        ### SET: file path of annotated fragment
        fpath_in = file.path(FD_RES, "annotation_fragment", sam, TARGET, motif)

        ### PRINT: ready to import
        msg = paste(mtf, sam, "Import", fpath_in)
        cat(msg, "\n"); flush.console()

        ### HANDLE EXCEPTION: missing file
        ### (MOTIFS is listed from the first sample only, so another sample
        ###  may lack the file; skip it instead of aborting the whole run)
        if (!file.exists(fpath_in)){
            msg = paste(mtf, sam, "Skip Import_Missing")
            cat(msg, "\n"); flush.console()
            next
        }

        ### import data
        dat = read_tsv(fpath_in, col_types=CTYPES, col_names=CNAMES)

        ### HANDLE EXCEPTION: empty data
        if (nrow(dat) == 0){
            msg = paste(mtf, sam, "Skip Import_Empty")
            cat(msg, "\n"); flush.console()
            next
        }

        ###################################################
        # Preprocess
        ###################################################

        ### FILTER:
        ###     filter out annotation not fully cover motif
        ###     (overlap must equal the motif length)
        ###     filter out motif score lower than threshold
        num1 = nrow(dat)
        dat = dat %>%
            mutate(
                Sample     = sam,
                Length_MTF = End_MTF - Start_MTF,
                Length_Dif = Length_MTF - Overlap) %>%
            dplyr::filter(Length_Dif == 0) %>%
            dplyr::filter(Score >= THRESHOLD_MOTIF)
        num2 = nrow(dat)

        ### PRINT: result of filtering (rows before-after)
        msg = paste(num1, num2, sep="-")
        msg = paste(mtf, sam, "Filter", msg)
        cat(msg, "\n"); flush.console()

        ### HANDLE EXCEPTION: empty data after filtering
        if (nrow(dat) == 0){
            msg = paste(mtf, sam, "Skip Filter_Empty")
            cat(msg, "\n"); flush.console()
            next
        }

        ###################################################
        # Summarize
        ###################################################

        ### summarize annotated fragments:
        ### one row per fragment with the number of motif hits it contains
        dat = dat %>%
            group_by(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Motif, Sample) %>%
            summarize(N_Motif = n(), .groups = 'drop')

        ### get count for each sample, number of the motif within a fragment:
        ### sum the fragment counts over fragments with the same N_Motif
        dat = dat %>%
            group_by(Sample, Motif, N_Motif) %>%
            summarise(Value = sum(Count_Frag), .groups = 'drop')

        ### PRINT: total number of count
        msg = paste(mtf, sam, "Count", sum(dat$Value))
        cat(msg, "\n"); flush.console()

        ###################################################
        # Store output
        ###################################################

        ### PRINT: file path for output count
        action = if (is_created) "Store_Append" else "Store_Create"
        msg = paste(mtf, sam, action, fpath_out)
        cat(msg, "\n"); flush.console()

        ### store results
        ### the first sample written creates the table with a header;
        ### later samples are appended without repeating the header
        write.table(
            dat,
            file      = fpath_out,
            append    = is_created,
            quote     = FALSE,
            sep       = "\t",
            row.names = FALSE,
            col.names = !is_created)

        ### update flag
        is_created = TRUE

    } # end inner for loop (SAMPLES)
} # end outer for loop (MOTIFS)

### PRINT: end message
msg = "Done!"
cat(msg, "\n"); flush.console()

timer = Sys.time()
cat("Timer of the loop:\n"); flush.console()
print(timer - timer_start)

EOF

In [72]:
### show the Slurm partition (NODE, passed to `sbatch -p`) and the log
### directory (FD_LOG, used for the `-o` log files) of the sbatch cells
echo ${NODE}
echo ${FD_LOG}

all
/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/log


## Test region: upstream of PER1

In [None]:
${FD_LOG}/linear_model_marginal_count_per1_filter00.txt 
Timer of the loop:
Time difference of 3.560382 mins

Done!
Run Time: 3 minutes and 37 seconds


Timer of the loop:
Time difference of 1.032754 mins

Done!
Run Time: 1 minutes and 6 seconds

In [73]:
### Submit the marginal-count job for the PER1 test region without
### motif-score filtering (threshold 0); the job log is written under FD_LOG
sbatch -p "${NODE}" \
    --mem=8G \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --job-name='Marginal count PER1 f00' \
    -o "${FD_LOG}/linear_model_marginal_count_per1_filter00.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### positional arguments of motif_marginal_count.R, in this order
TARGET=target_PER1
FDIRY=marginal_filter00
NCORE=10
THRESHOLD_MOTIF=0

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember its exit status
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_marginal_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
timer=$(date +%s)
runtime=$((timer - timer_start))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed! Exit status: ${status}"
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate a failure so that Slurm does not record the job as completed
exit "${status}"

EOF

Submitted batch job 26624739


In [45]:
### Submit the marginal-count job for the PER1 test region with a motif-score
### threshold of 10.81; the job log is written under FD_LOG
### NOTE(review): "filter95" suggests 10.81 is a 95th-percentile cutoff — confirm
sbatch -p "${NODE}" \
    --mem=8G \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --job-name='Marginal count PER1 f95' \
    -o "${FD_LOG}/linear_model_marginal_count_per1_filter95.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### positional arguments of motif_marginal_count.R, in this order
TARGET=target_PER1
FDIRY=marginal_filter95
NCORE=10
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember its exit status
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_marginal_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
timer=$(date +%s)
runtime=$((timer - timer_start))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed! Exit status: ${status}"
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate a failure so that Slurm does not record the job as completed
exit "${status}"

EOF

Submitted batch job 26613305


In [36]:
### Submit the marginal-count job for chrY with a motif-score threshold of
### 10.81; the job log is written under FD_LOG
sbatch -p "${NODE}" \
    --mem=20G \
    --tasks-per-node=1 \
    --cpus-per-task=21 \
    --job-name='Marginal count chrY f95' \
    -o "${FD_LOG}/linear_model_marginal_count_input20x_chrY_filter95.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### positional arguments of motif_marginal_count.R, in this order
### NOTE: the script reads exactly four arguments (TARGET, FDIRY, N_CORE,
### THRESHOLD_MOTIF). The former IS_INPUT20X flag is no longer passed: as a
### second argument it shifted the others (FDIRY=TRUE, N_CORE=NA, threshold=20)
TARGET=chrY
FDIRY=marginal_filter95
NCORE=20
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember its exit status
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_marginal_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
timer=$(date +%s)
runtime=$((timer - timer_start))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed! Exit status: ${status}"
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate a failure so that Slurm does not record the job as completed
exit "${status}"

EOF

Submitted batch job 26611423


In [38]:
### Submit the marginal-count job for chr22 with a motif-score threshold of
### 10.81; the job log is written under FD_LOG
sbatch -p "${NODE}" \
    --mem=30G \
    --tasks-per-node=1 \
    --cpus-per-task=12 \
    --job-name='Marginal count chr22 f95' \
    -o "${FD_LOG}/linear_model_marginal_count_input20x_chr22_filter95.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### positional arguments of motif_marginal_count.R, in this order
### NOTE: the script reads exactly four arguments (TARGET, FDIRY, N_CORE,
### THRESHOLD_MOTIF). The former IS_INPUT20X flag is no longer passed: as a
### second argument it shifted the others (FDIRY=TRUE, N_CORE=NA, threshold=5)
TARGET=chr22
FDIRY=marginal_filter95
NCORE=5
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember its exit status
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_marginal_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
timer=$(date +%s)
runtime=$((timer - timer_start))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed! Exit status: ${status}"
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate a failure so that Slurm does not record the job as completed
exit "${status}"

EOF

Submitted batch job 26611661


In [74]:
### Submit the marginal-count job for chrX with a motif-score threshold of
### 10.81; the job log is written under FD_LOG
sbatch -p "${NODE}" \
    --mem=50G \
    --tasks-per-node=1 \
    --cpus-per-task=8 \
    --job-name='Marginal count chrX f95' \
    -o "${FD_LOG}/linear_model_marginal_count_chrX_filter95.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### positional arguments of motif_marginal_count.R, in this order
TARGET=chrX
FDIRY=marginal_filter95
NCORE=5
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember its exit status
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_marginal_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
timer=$(date +%s)
runtime=$((timer - timer_start))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed! Exit status: ${status}"
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate a failure so that Slurm does not record the job as completed
exit "${status}"

EOF

Submitted batch job 26624740


In [61]:
### Submit the marginal-count job for chr17 with a motif-score threshold of
### 10.81; the job log is written under FD_LOG
sbatch -p "${NODE}" \
    --mem=30G \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --job-name='Marginal count chr17 f95' \
    -o "${FD_LOG}/linear_model_marginal_count_chr17_filter95.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### positional arguments of motif_marginal_count.R, in this order
TARGET=chr17
FDIRY=marginal_filter95
NCORE=10
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember its exit status
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_marginal_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
timer=$(date +%s)
runtime=$((timer - timer_start))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed! Exit status: ${status}"
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate a failure so that Slurm does not record the job as completed
exit "${status}"

EOF

Submitted batch job 26613366


In [62]:
### Submit the marginal-count job for chr1 with a motif-score threshold of
### 10.81; the job log is written under FD_LOG
sbatch -p "${NODE}" \
    --mem=30G \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --job-name='Marginal count chr1 f95' \
    -o "${FD_LOG}/linear_model_marginal_count_chr1_filter95.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### positional arguments of motif_marginal_count.R, in this order
TARGET=chr1
FDIRY=marginal_filter95
NCORE=10
THRESHOLD_MOTIF=10.81

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember its exit status
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_marginal_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
timer=$(date +%s)
runtime=$((timer - timer_start))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed! Exit status: ${status}"
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate a failure so that Slurm does not record the job as completed
exit "${status}"

EOF

Submitted batch job 26613367


In [48]:
### peek at the last 20 lines of the chr17 job log to check its progress
tail -n 20 ${FD_LOG}/linear_model_marginal_count_chr17_filter95.txt

SPI Input1 Count 23085 
SPI Input1 Store_Create /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr17/count_SPI.tsv 
SPI Input2 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input2/chr17/SPI_merge.bed.gz 
PRDM14 Input1_20x Count 76661 
PRDM14 Input1_20x Store_Append /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr17/count_PRDM14.tsv 
PRDM14 Input2_20x Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input2_20x/chr17/PRDM14_merge.bed.gz 
HEN1 Input1_20x Count 613448 
HEN1 Input1_20x Store_Append /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr17/count_HEN1.tsv 
HEN1 Input2_20x Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input2_20x/chr17/HEN1_merge.bed.gz 
SPI Input2 Filter 706143-25276 
PRDM14 Input2_20x Filter 1078474-51366 
SPI Input2 Count 24927 
SPI Input2 Store_Append /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr17/count_SPI.tsv 
SPI Input

In [56]:
### list the chr1 log lines that mention motif ZFN121
grep ZFN121 < ${FD_LOG}/linear_model_marginal_count_chr1_filter95.txt

[01;31m[KZFN121[m[K Start 
[01;31m[KZFN121[m[K Input1 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input1/chr1/[01;31m[KZFN121[m[K_merge.bed.gz 
[01;31m[KZFN121[m[K Input1 Filter 1872982-562607 
[01;31m[KZFN121[m[K Input1 Count 370078 
[01;31m[KZFN121[m[K Input1 Store_Create /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr1/count_[01;31m[KZFN121[m[K.tsv 
[01;31m[KZFN121[m[K Input2 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input2/chr1/[01;31m[KZFN121[m[K_merge.bed.gz 
[01;31m[KZFN121[m[K Input2 Filter 2027808-607774 
[01;31m[KZFN121[m[K Input2 Count 401774 
[01;31m[KZFN121[m[K Input2 Store_Append /home/mount/work/out/proj_combeffect/model_linear/marginal_filter95/chr1/count_[01;31m[KZFN121[m[K.tsv 
[01;31m[KZFN121[m[K Input3 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input3/chr1/[01;31m[KZFN121[m[K_merge.bed.gz 
[01;31m[KZFN121[m[K Input3 F

In [49]:
### list the chr17 log lines that mention motif HAND1
grep HAND1 < ${FD_LOG}/linear_model_marginal_count_chr17_filter95.txt

[01;31m[KHAND1[m[K Start 
[01;31m[KHAND1[m[K Input1 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input1/chr17/[01;31m[KHAND1[m[K_merge.bed.gz 
[01;31m[KHAND1[m[K Input1 Filter 172168-0 
[01;31m[KHAND1[m[K Input1 Skip Filter_Empty 
[01;31m[KHAND1[m[K Input2 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input2/chr17/[01;31m[KHAND1[m[K_merge.bed.gz 
[01;31m[KHAND1[m[K Input2 Filter 186982-0 
[01;31m[KHAND1[m[K Input2 Skip Filter_Empty 
[01;31m[KHAND1[m[K Input3 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input3/chr17/[01;31m[KHAND1[m[K_merge.bed.gz 
[01;31m[KHAND1[m[K Input3 Filter 216938-0 
[01;31m[KHAND1[m[K Input3 Skip Filter_Empty 
[01;31m[KHAND1[m[K Input4 Import /home/mount/work/out/proj_combeffect/annotation_fragment/Input4/chr17/[01;31m[KHAND1[m[K_merge.bed.gz 
[01;31m[KHAND1[m[K Input4 Filter 174906-0 
[01;31m[KHAND1[m[K Input4 Skip Filter_Empty 
[01;31m[