In [3]:
source ../config_duke.sh -v

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_STARRseq/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect



In [16]:
cat > motif_interactive_count.R << 'EOF'

###################################################
# Set environment
###################################################
cat("\n++++++++++ Set environment  ++++++++++\n")

### load the project configuration
### NOTE(review): FD_RES and the readr/dplyr/stringr functions used in this
### script are not defined here; presumably config_sing.R provides them -- confirm
#source("/home/mount/project/config_sing.R")
source("config_sing.R")

###################################################
# Set global variables
###################################################
cat("\n++++++++++ Set global variables ++++++++++\n")

### Get arguments from the command line
ARGS = commandArgs(trailingOnly=TRUE)
if (length(ARGS) < 4){
    stop("Usage: Rscript motif_interactive_count.R <TARGET> <FDIRY> <N_CORE> <THRESHOLD_MOTIF>")
}
TARGET          = as.character(ARGS[1])  # which chromosome or region to run
FDIRY           = as.character(ARGS[2])  # the name of the output folder
N_CORE          = as.integer(ARGS[3])    # number of cores to register during the parallelization
THRESHOLD_MOTIF = as.numeric(ARGS[4])    # threshold for the motif score filtration

### HANDLE EXCEPTION: non-numeric arguments are converted to NA; an NA
### threshold would make the later `Score >= THRESHOLD_MOTIF` filter drop
### every row without any error, so fail early instead
if (is.na(N_CORE) || N_CORE < 1){
    stop("N_CORE must be a positive integer, got: ", ARGS[3])
}
if (is.na(THRESHOLD_MOTIF)){
    stop("THRESHOLD_MOTIF must be numeric, got: ", ARGS[4])
}

### set global variables
SAMPLES = c(
    paste0("Input", 1:5),
    paste0("Input", 1:5, "_20x"),
    paste0("TFX",   2:5, "_DMSO"),
    paste0("TFX",   2:5, "_Dex"))

FD_OUT = file.path(FD_RES, "model_linear", FDIRY, TARGET)
dir.create(FD_OUT, recursive = TRUE, showWarnings = FALSE)

### set motifs
### the motif list is taken from the files found for the first sample
fdiry  = file.path(FD_RES, "annotation_fragment", SAMPLES[1], TARGET)
fname  = "*_merge.bed.gz"
fglob  = file.path(fdiry, fname)
fpaths = Sys.glob(fglob)
MOTIFS = basename(fpaths)

### set column names and types
### NOTE: the collectors must be wrapped in list(), not c(); c() flattens the
### collector objects (each is an empty list with a class attribute), so the
### specification was lost and readr fell back to guessing the column types
CTYPES = list(col_character(), col_integer(), col_integer(), col_integer(),
              col_character(), col_integer(), col_integer(),
              col_character(), col_double(),  col_integer())
CNAMES = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")

### print start message
cat("Target:           ", TARGET,          "\n")
cat("Output Directory: ", FD_OUT,          "\n")
cat("#Cores Resgister: ", N_CORE,          "\n")
cat("Threshold (Motif):", THRESHOLD_MOTIF, "\n")

###################################################
# Set motif list
###################################################

### helper function
### Split a vector (or list) `x` into `n` contiguous chunks of similar size.
###   x: vector or list to split
###   n: requested number of chunks (single number >= 1)
### Returns a list of chunks. When `n` is 1 or `x` is empty, the result is
### list(x). Fewer than `n` chunks are returned when `x` has fewer than `n`
### elements, because empty groups are not produced by split().
fun_chunk = function(x, n){
    ### HANDLE EXCEPTION: invalid number of chunks
    if (length(n) != 1 || is.na(n) || n < 1){
        stop("fun_chunk: `n` must be a single number >= 1")
    }

    ### EXCEPTION: split to only one chunk, or nothing to split
    ### (cut() needs at least two intervals and a non-empty input)
    if (n == 1 || length(x) == 0){
        return(list(x))
    }

    ### split a vector into several chunks by cutting its index range
    lst = split(x, cut(seq_along(x), n, labels = FALSE))
    return(lst)
}

### split motifs into several chunks for parallel programming
#lst_motifs = fun_chunk(MOTIFS, N_CORE)

### HANDLE EXCEPTION: a pair needs at least two motif files; without this
### check combn() stops with an error that does not mention the cause
if (length(MOTIFS) < 2){
    stop("Found ", length(MOTIFS), " motif file(s) for the first sample; ",
         "at least two are required to form motif pairs")
}

### combination of motifs
### each row of dat_comb is one unordered pair of motif file names;
### the list holds one character vector of length two per pair
dat_comb = t(combn(MOTIFS, 2))
lst_motif_pairs = split(dat_comb, seq_len(nrow(dat_comb)))
#lst_motif_chunk = fun_chunk(lst_motif_pairs, N_CORE)

###################################################
# Get motif count table
###################################################
cat("\n++++++++++ Get motif count table ++++++++++\n")

### PRINT: start message
timer_start = Sys.time()

#registerDoParallel(cores=N_CORE)
#cl <- parallel::makeCluster(N_CORE)
#doParallel::registerDoParallel(cl)
#cl <- makeForkCluster(N_CORE)
#registerDoParallel(cl)

### loop through each motif to get the marginal effect
#foreach(index = 1:N_CORE) %do% {
    
    ### init
    #lst_motif_pair = lst_motif_chunk[[idx]]

### loop through each motif pair to estimate motif interaction
### For every pair one table count_<motif1>_<motif2>.tsv is written to FD_OUT;
### the rows of each sample are added as soon as that sample is processed.
for (motif_pair in lst_motif_pairs){

    ### init
    is_created = FALSE  # TRUE once the output file (with header) exists for this pair
    mtfs = str_remove_all(motif_pair, pattern = fixed("_merge.bed.gz"))
    mtfs = paste(mtfs, collapse="_")
    msg  = paste(mtfs, "Start")
    cat(msg, "\n"); flush.console()

    ### SET: file path for output count table (one file per motif pair)
    fpath_out = file.path(FD_OUT, paste0("count_", mtfs, ".tsv"))

    ### columns that identify one fragment within one sample
    cnames_frag = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag", "Sample")

    ### process each sample
    for (sam in SAMPLES){

        ### import annotated fragments for each motif
        ### each element is a filtered data frame, or NULL if nothing is left
        lst_dat = lapply(motif_pair, function(motif){

            ###################################################
            # Import fragment annotation
            ###################################################

            ### SET: file path of annotated fragment
            fdiry = file.path(FD_RES, "annotation_fragment", sam, TARGET)
            fname = motif
            fpath = file.path(fdiry, fname)

            ### HANDLE EXCEPTION: the motif list comes from the first sample
            ### only, so a file may be absent for other samples
            if (!file.exists(fpath)){
                msg = paste(mtfs, sam, "Skip Import_Missing", fpath)
                cat(msg, "\n"); flush.console()
                return(NULL)
            }

            ### PRINT: ready to import
            msg = paste(mtfs, sam, "Import", fpath)
            cat(msg, "\n"); flush.console()

            ### import data
            dat = read_tsv(fpath, col_types=CTYPES, col_names=CNAMES)

            ### HANDLE EXCEPTION: empty data
            if (nrow(dat) == 0){
                msg = paste(mtfs, sam, "Skip Import_Empty")
                cat(msg, "\n"); flush.console()
                return(NULL)
            }

            ###################################################
            # Preprocess
            ###################################################

            ### FILTER:
            ###     filter out annotation not fully cover motif
            ###     filter out motif score lower than threshold
            num1 = nrow(dat)
            dat = dat %>%
                mutate(Sample = sam) %>%
                mutate(Length_MTF = End_MTF - Start_MTF)  %>%
                mutate(Length_Dif = Length_MTF - Overlap) %>%
                dplyr::filter(Length_Dif == 0) %>%
                dplyr::filter(Score >= THRESHOLD_MOTIF)
            num2 = nrow(dat)

            ### PRINT: result of filtering (rows before - rows after)
            msg = paste(num1, num2, sep="-")
            msg = paste(mtfs, sam, "Filter", msg)
            cat(msg, "\n"); flush.console()

            ### HANDLE EXCEPTION: empty data after filtration
            if (nrow(dat) == 0){
                msg = paste(mtfs, sam, "Skip Filter_Empty")
                cat(msg, "\n"); flush.console()
                return(NULL)
            }

            return(dat)
        }) # end lapply

        ### arrange data after preprocessing
        df1  = lst_dat[[1]]
        df2  = lst_dat[[2]]
        lst_dat = NULL

        ### HANDLE EXCEPTION: skip if one data is empty
        if (is.null(df1) || is.null(df2)){next}

        ###################################################
        # Create Count Table
        ###################################################
        cat("+++++ Create Count Table +++++\n")

        ### extract fragments
        ### dat1: every fragment annotated with at least one of the two motifs
        ### dat2/dat3: fragment-to-motif membership, one row per fragment.
        ### distinct() is required here: a fragment with several hits of the
        ### same motif would otherwise be duplicated by the joins and its
        ### Count_Frag summed more than once.
        dat1 = bind_rows(df1, df2) %>%
            dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
            distinct()
        dat2 = df1 %>%
            dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif) %>%
            distinct()
        dat3 = df2 %>%
            dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample, Motif) %>%
            distinct()

        ### match fragments for the motif pair
        ### Motif.x / Motif.y are NA when the fragment lacks the first / second
        ### motif. The combined label is built from the NA tests directly
        ### rather than by pasting and stripping "NA" with a regex, which
        ### could damage motif names that contain "NA_" or "_NA".
        dat = dat1 %>%
            full_join(dat2, by = cnames_frag) %>%
            full_join(dat3, by = cnames_frag) %>%
            mutate(Motif = ifelse(
                is.na(Motif.x), Motif.y,
                ifelse(is.na(Motif.y), Motif.x, paste(Motif.x, Motif.y, sep = "_")))) %>%
            mutate(Group = str_remove(string = Sample, pattern = "[0-9]")) %>%
            mutate(X     = ifelse(
                str_detect(Group, "Input"), "Input", paste(Group, Motif, sep="_")))

        ### get count for each sample
        dat = dat %>%
            group_by(Sample, Motif.x, Motif.y, X) %>%
            summarise(Value = sum(Count_Frag), .groups = 'drop')

        ###################################################
        # Store output
        ###################################################

        ### store results
        ### create the table (with header) for the first sample that yields
        ### data; otherwise append the counts to the existing file
        action = ifelse(is_created, "Store_Append", "Store_Create")
        msg = paste(mtfs, sam, action, fpath_out)
        cat(msg, "\n"); flush.console()

        write.table(
            dat,
            file      = fpath_out,
            append    = is_created,
            quote     = FALSE,
            sep       = "\t",
            row.names = FALSE,
            col.names = !is_created)

        ### update flag
        is_created = TRUE

    } # end inner for loop (SAMPLES)
} # end outer for loop (lst_motif_pair)
    
    
EOF

## Test region: upstream of PER1

In [17]:
### submit the motif-pair count job for the PER1 test region
### the job output is written to a log file under ${FD_LOG}
sbatch -p "${NODE}" \
    --mem=8G \
    --tasks-per-node=1 \
    --cpus-per-task=16 \
    --job-name='Interactive count PER1 f00' \
    -o "${FD_LOG}/linear_model_interactive_count_per1_filter00.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
### NOTE(review): FD_PRJ and displaytime are not defined in this script;
### presumably config_duke.sh provides them -- confirm
source ../config_duke.sh -v
#CHROMS=($(seq 1 22) X Y)
#CHROM=chr${CHROMS[${SLURM_ARRAY_TASK_ID}]}
#CHROM=chrY

### arguments passed to motif_interactive_count.R
TARGET=target_PER1
FDIRY=interactive_filter00
NCORE=10
THRESHOLD_MOTIF=0

### print start message
### (the array index is empty unless the job is submitted as an array job)
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model and remember whether it succeeded
srun "${FD_PRJ}/sing_proj_combeffect.sh" Rscript motif_interactive_count.R \
    "${TARGET}" "${FDIRY}" "${NCORE}" "${THRESHOLD_MOTIF}"
status=$?
echo

### print end message
### report a failure instead of always printing 'Done!'
timer=$(date +%s)
runtime=$(( timer - timer_start ))
if [ "${status}" -eq 0 ]; then
    echo 'Done!'
else
    echo "Failed with exit code ${status}" >&2
fi
echo "Run Time: $(displaytime ${runtime})"

### propagate the exit code so Slurm marks a failed run as FAILED
exit "${status}"

EOF

Submitted batch job 26624731


In [30]:
ls ${FD_RES}/model_linear/tmp_old_v3/interactive_filter00/target_PER1/count_AHR_AIRE.tsv

/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/model_linear/tmp_old_v3/interactive_filter00/target_PER1/count_AHR_AIRE.tsv


In [31]:
cat ${FD_RES}/model_linear/tmp_old_v3/interactive_filter00/target_PER1/count_AHR_AIRE.tsv

Sample	Motif.x	Motif.y	X	Value
Input1	AHR	NA	Input	76
Input1	NA	AIRE	Input	13
Input2	AHR	NA	Input	86
Input2	NA	AIRE	Input	14
Input3	AHR	NA	Input	136
Input3	NA	AIRE	Input	23
Input4	AHR	NA	Input	90
Input4	NA	AIRE	Input	15
Input5	AHR	NA	Input	57
Input5	NA	AIRE	Input	9
Input1_20x	AHR	NA	Input	2216
Input1_20x	NA	AIRE	Input	385
Input2_20x	AHR	NA	Input	2124
Input2_20x	NA	AIRE	Input	327
Input3_20x	AHR	NA	Input	2143
Input3_20x	NA	AIRE	Input	357
Input4_20x	AHR	NA	Input	2332
Input4_20x	NA	AIRE	Input	367
Input5_20x	AHR	NA	Input	2036
Input5_20x	NA	AIRE	Input	322
TFX2_DMSO	AHR	NA	TFX_DMSO_AHR	713
TFX2_DMSO	NA	AIRE	TFX_DMSO_AIRE	5
TFX4_DMSO	AHR	NA	TFX_DMSO_AHR	573
TFX4_DMSO	NA	AIRE	TFX_DMSO_AIRE	2
TFX5_DMSO	AHR	NA	TFX_DMSO_AHR	581
TFX5_DMSO	NA	AIRE	TFX_DMSO_AIRE	25
TFX2_Dex	AHR	NA	TFX_Dex_AHR	1004
TFX2_Dex	NA	AIRE	TFX_Dex_AIRE	8
TFX3_Dex	AHR	NA	TFX_Dex_AHR	524
TFX3_Dex	NA	AIRE	TFX_Dex_AIRE	5
TFX4_Dex	AHR	NA	TFX_Dex_AHR	774
TFX4_Dex	NA	AIRE	TFX_Dex_AIRE	6
TFX5_Dex	AHR	NA	TFX_Dex_AHR	1057
TFX5_Dex	NA	A

In [20]:
cat ${FD_RES}/model_linear/interactive_filter00/target_PER1/count_AP1_1_AP1_2.tsv

Sample	Motif.x	Motif.y	X	Value
Input1	AP1/1	AP1/2	Input	157
Input1	AP1/1	NA	Input	30
Input1	NA	AP1/2	Input	50
Input2	AP1/1	AP1/2	Input	148
Input2	AP1/1	NA	Input	27
Input2	NA	AP1/2	Input	45
Input3	AP1/1	AP1/2	Input	174
Input3	AP1/1	NA	Input	42
Input3	NA	AP1/2	Input	82
Input4	AP1/1	AP1/2	Input	162
Input4	AP1/1	NA	Input	24
Input4	NA	AP1/2	Input	38
Input5	AP1/1	AP1/2	Input	118
Input5	AP1/1	NA	Input	18
Input5	NA	AP1/2	Input	32
Input1_20x	AP1/1	AP1/2	Input	3744
Input1_20x	AP1/1	NA	Input	686
Input1_20x	NA	AP1/2	Input	1230
Input2_20x	AP1/1	AP1/2	Input	3472
Input2_20x	AP1/1	NA	Input	601
Input2_20x	NA	AP1/2	Input	1176
Input3_20x	AP1/1	AP1/2	Input	3615
Input3_20x	AP1/1	NA	Input	629
Input3_20x	NA	AP1/2	Input	1192
Input4_20x	AP1/1	AP1/2	Input	4349
Input4_20x	AP1/1	NA	Input	713
Input4_20x	NA	AP1/2	Input	1333
Input5_20x	AP1/1	AP1/2	Input	3529
Input5_20x	AP1/1	NA	Input	580
Input5_20x	NA	AP1/2	Input	1135
TFX2_DMSO	AP1/1	AP1/2	TFX_DMSO_AP1/1_AP1/2	597
TFX2_DMSO	AP1/1	NA	TFX_DMSO_AP1/1	87
TFX2_DMSO	NA	AP

In [9]:
cat ${FD_LOG}/linear_model_interactive_count_per1_filter00.txt

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_STARRseq/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect

Slurm Array Index: 
Time Stamp:         10-22-21+14:40:49


++++++++++ Set environment  ++++++++++
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   2.0.1     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘pryr’

The following objects are masked from ‘package:purrr’:

    compose, partial


