In [1]:
source ../config_duke.sh -v

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_STARRseq/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect



In [18]:
cat > motif_marginal.R << 'EOF'
### set environment
cat("\n++++++++++ Set environment  ++++++++++\n")
#source("/home/mount/project/config_sing.R")
source("config_sing.R")

cat("\n++++++++++ Set global variables ++++++++++\n")
### Get arguments from the command line:
###   ARGS[1] CHROM       chromosome name (e.g. chrY)
###   ARGS[2] IS_INPUT20X TRUE/FALSE; whether the 20x input libraries are modeled
###   ARGS[3] FDIRY       name of the output folder under "model_linear"
###   ARGS[4] N_CORE      number of cores registered for the parallel loop
ARGS        = commandArgs(trailingOnly=TRUE)
CHROM       = ARGS[1]
IS_INPUT20X = as.logical(ARGS[2])
FDIRY       = ARGS[3]
N_CORE      = suppressWarnings(as.integer(ARGS[4]))
### minimum total fragment count reported by the low-fragment motif filter
THRESHOLD   = 10

### a missing or non-numeric 4th argument gives NA, which cannot be
### registered as a core count; fall back to one core instead
if (is.na(N_CORE) || N_CORE < 1) {
    cat("N_CORE is missing or invalid; falling back to 1 core\n")
    N_CORE = 1L
}

### sample identifiers, one vector per library type
ids_input    = paste0("Input", 1:5)
ids_input20x = paste0("Input", 1:5, "_20x")
ids_dmso     = paste0("TFX",   2:5, "_DMSO")
ids_dex      = paste0("TFX",   2:5, "_Dex")

### all samples (used when importing the annotated fragments)
SAMPLES_TOT    = c(ids_input, ids_input20x, ids_dmso, ids_dex)
### samples modeled with the regular input libraries
SAMPLES_INP    = c(ids_input, ids_dmso, ids_dex)
### samples modeled with the 20x input libraries
SAMPLES_INP20X = c(ids_input20x, ids_dmso, ids_dex)

### choose the sample set for modeling;
### runs with the 20x input write to a folder with the "_input20x" suffix
SAMPLES = if (IS_INPUT20X) SAMPLES_INP20X else SAMPLES_INP
if (IS_INPUT20X) {
    FDIRY = paste0(FDIRY, "_", "input20x")
}
FD_OUT = file.path(FD_RES, "model_linear", FDIRY, CHROM)

### print start message
cat("Chromosome:      ", CHROM,       "\n")
cat("Is Input20x used?", IS_INPUT20X, "\n")
cat("Output Directory:", FD_OUT,      "\n")
cat("#Cores Resgister:", N_CORE,      "\n")

###################################################
### Import library size
###################################################
cat("\n++++++++++ Import library size ++++++++++\n")

### Helper function to get the group label of each sample:
### the replicate digit and the "_20x" suffix are dropped
###   e.g. "Input1_20x" -> "Input", "TFX2_Dex" -> "TFX_Dex"
###   idn_sample: character vector of sample names
###   return:     character vector of group labels (same length)
get_group = function(idn_sample){
    grp = idn_sample %>%
        str_replace(pattern = "Input[0-9]", replacement = "Input") %>%
        str_remove(pattern = "_20x") %>%
        str_replace(pattern = "TFX[0-9]_", replacement = "TFX_")
    return(grp)
}

### locate the library size table
fdiry = file.path(FD_RES, "source")
fname = "library_size.txt"
fpath = file.path(fdiry, fname)

### column layout: library size, then the path of the counted file
### NOTE(review): c() over collector objects presumably drops their class,
### so readr may fall back to guessing the column types here; a list(...)
### or cols(...) specification would make the types explicit -- confirm
cnames = c("Size", "FPath")
ctypes = c(col_integer(), col_character())

### import the table, drop the "total" row, and derive sample info:
###   Sample = name of the folder right before the file name
###   Group  = sample name without replicate digit / "_20x" suffix
dat_lib = read_tsv(fpath, col_types=ctypes, col_names = cnames) %>%
    dplyr::filter(FPath != "total") %>%
    mutate(
        Sample = basename(dirname(FPath)),
        Group  = get_group(Sample)) %>%
    dplyr::select(Size, Sample, Group)
print(dat_lib)

###################################################
### Import annotated fragments
###################################################
cat("\n++++++++++ Import annotated fragments ++++++++++\n")

### column names and types of the annotated fragment bed files
### NOTE(review): c() over collector objects presumably drops their class,
### so readr may guess the column types instead -- confirm
cnames = c("Chrom_Frag", "Start_Frag", "End_Frag", "Count_Frag",
           "Chrom_MTF",  "Start_MTF",  "End_MTF",
           "Motif", "Score", "Overlap")
ctypes = c(col_character(), col_integer(), col_integer(), col_integer(),
           col_character(), col_integer(), col_integer(),
           col_character(), col_double(),  col_integer())

### each sample has one bed file per chromosome
fdiry = file.path(FD_RES, "annotation_fragment", "filter_motif_score095")
fname = paste0(CHROM, ".bed.gz")

### read the bed file of one sample and tag the rows with the sample name
import_sample = function(sam){
    fpath = file.path(fdiry, sam, fname)
    print(fpath); flush.console()

    dat = read_tsv(fpath, col_types=ctypes, col_names=cnames)
    dat = dat %>% mutate(Sample = sam)
    return(dat)
}

### import all samples and stack them into one table
lst_dat      = lapply(SAMPLES_TOT, import_sample)
dat_ann_frag = bind_rows(lst_dat)
lst_dat      = NULL # release memory

### check environment
cat("Current memory used after import data:\n")
mem_used()

###################################################
### Preprocess
###################################################
cat("\n++++++++++ Preprocess ++++++++++\n")

### Length_Dif is the part of the motif that is NOT overlapped by the fragment
dat = dat_ann_frag %>%
    mutate(
        Length_MTF = End_MTF - Start_MTF,
        Length_Dif = Length_MTF - Overlap)
dat_ann_frag = NULL # release memory

### Filter: keep annotations where the overlap equals the motif length
cat("Filtering: fully cover the motif\n")
cat("    Before Filter:", "#Motif =", length(unique(dat$Motif)), "#Annot =", nrow(dat), "\n")
dat = dat %>% dplyr::filter(Length_Dif == 0)
cat("    After  Filter:", "#Motif =", length(unique(dat$Motif)), "#Annot =", nrow(dat), "\n")

### split the annotations into one table per motif, named by the motif
lst    = dat %>% group_by(Motif) %>% group_split()
motifs = unlist(lapply(lst, function(grp){unique(grp$Motif)}))
names(lst) = motifs
dat = NULL # release memory

### for each motif: one row per fragment and sample,
### with N_Motif = number of annotation rows of that fragment
lst_frag = lapply(lst, function(dat_motif){
    dat_motif %>%
        group_by(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Motif, Sample) %>%
        summarize(N_Motif = n(), .groups = 'drop')
})

### filter out motifs that have almost no/low fragment in total:
### a motif is kept only if its summed Count_Frag is above THRESHOLD
cat("Filtering: filter out motifs with no/low fragments\n")
cat("    Threshold:", THRESHOLD, "\n")
cat("    Before Filter:", "#Motif =", length(lst_frag), "\n")

### note: reassigning lst also drops the reference to the per-motif
### annotation tables created earlier
lst = lst_frag
### total fragment count per motif (one number per list element)
cnt = vapply(lst, function(dat){sum(dat$Count_Frag)}, numeric(1))
### use the same THRESHOLD that is reported in the log above
lst = lst[cnt > THRESHOLD]
lst_frag = lst

cat("    After  Filter:", "#Motif =", length(lst_frag), "\n")

### check environment
cat("Current memory used after import data:\n")
mem_used()

###################################################
### Set up linear model
###################################################
cat("\n++++++++++ Linear Model ++++++++++\n")

### Helper function: remove the bulky components of a fitted model object
### (or of its summary) so that the stored object stays small
### https://win-vector.com/2014/05/30/trimming-the-fat-from-glm-models-in-r/
###   cm:     list-like model object
###   return: the same object without the components listed below
stripGlmLR = function(cm) {
  ### top-level components
  drop_fields = c(
    "y", "model",
    "residuals", "fitted.values", "effects",
    "linear.predictors", "weights", "prior.weights", "data")
  for (field in drop_fields) {
    cm[[field]] = NULL
  }

  ### nested components (kept as explicit assignments)
  cm$qr$qr = NULL

  cm$family$variance   = NULL
  cm$family$dev.resids = NULL
  cm$family$aic        = NULL
  cm$family$validmu    = NULL
  cm$family$simulate   = NULL

  ### drop the environments attached to terms and formula
  attr(cm$terms,   ".Environment") = NULL
  attr(cm$formula, ".Environment") = NULL

  return(cm)
}

### start timer
timer_start = Sys.time()

### run linear model: one task per motif, results are written to FD_OUT
registerDoParallel(N_CORE)
lst_tmp = lst_frag #head(lst_frag, 10)
lst_tmp = foreach(idn = names(lst_tmp)) %dopar% {

    ### unique fragments of this motif -> total fragment count per sample,
    ### restricted to the samples chosen for modeling
    dat = lst_tmp[[idn]] %>%
        dplyr::select(Chrom_Frag, Start_Frag, End_Frag, Count_Frag, Sample) %>%
        distinct() %>%
        group_by(Sample) %>%
        summarise(Value = sum(Count_Frag)) %>%
        dplyr::filter(Sample %in% SAMPLES)

    ### normalize counts by library size;
    ### X is the sample group as a factor with Input as the first level
    idxs = c("Input", "TFX_DMSO", "TFX_Dex")
    dat  = dat %>%
        left_join(dat_lib, by="Sample") %>%
        mutate(
            Norm_Value    = Value / Size,
            Lognorm_Value = log10(Value) - log10(Size),
            X             = factor(Group, levels=idxs))

    ### design matrix: intercept + one indicator column per non-Input group
    X = model.matrix(~X, dat)
    y = dat$Norm_Value

    ### add the Dex indicator to the DMSO column,
    ### so the DMSO column is 1 for both TFX_DMSO and TFX_Dex samples
    X[,"XTFX_DMSO"] = X[,"XTFX_DMSO"] + X[,"XTFX_Dex"]

    ### fit model (X already holds the intercept column, hence "+ 0")
    ### and keep only the stripped summary to reduce the memory size
    fit = lm(y ~ X + 0)
    res = stripGlmLR(summary(fit))

    ### arrange results
    lst = list(res = res, cnt = dat, X = X, y = y)

    ### output file: "/" in the motif name is replaced to get a valid file name
    mtf   = str_replace_all(idn, pattern = "/", replacement = "_")
    fdiry = FD_OUT
    fname = paste0("motif_", mtf, ".RDS")
    fpath = file.path(fdiry, fname)

    ### store the results
    dir.create(fdiry, recursive = TRUE, showWarnings = FALSE)
    saveRDS(lst, fpath)
}

### print end message
timer = Sys.time()
cat("Timer of the loop:\n")
print(timer - timer_start)

EOF

In [19]:
sinfo

PARTITION   AVAIL  TIMELIMIT  NODES  STATE NODELIST
all*           up   infinite      1   comp x1-01-2
all*           up   infinite     31    mix dl-01,x1-01-4,x1-02-[1-3],x1-03-[3-4],x2-02-3,x2-03-[1-2,4],x2-04-[1-4],x2-05-[2-4],x2-06-[2-4],x2-07-[1-2],x3-02-[3-4],x3-03-[1,4],x3-04-3,x3-05-[2-4]
all*           up   infinite     26   idle x1-01-3,x1-02-4,x1-03-[1-2],x2-02-[2,4],x2-03-3,x2-05-1,x2-07-[3-4],x2-08-[1-4],x3-01-[1-4],x3-02-[1-2],x3-03-[2-3],x3-04-[1-2,4],x3-05-1
interactive    up 1-00:00:00      1    mix x2-02-1
interactive    up 1-00:00:00      1   idle x2-06-1
jupyterhub     up 1-00:00:00      1   idle x1-01-1


In [16]:
echo ${NODE}

all


## Test: chromosome Y

In [108]:
### Test run for chrY: submit a single (non-array) job;
### the job output is written to the log file under ${FD_LOG}
sbatch -p ${NODE} \
    --mem=8G \
    --tasks-per-node=1 \
    --cpus-per-task=10 \
    --job-name='Marginal Effect' \
    -o ${FD_LOG}/linear_model_marginal_input20x_chrY.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### this is not an array job, so SLURM_ARRAY_TASK_ID is not set;
### the chromosome is fixed instead of looked up by array index
CHROM=chrY

IS_INPUT20X=TRUE
FDIRY=marginal_filter
### number of cores for the R script (its 4th argument);
### taken from --cpus-per-task, 1 if the variable is not set
NCORE=${SLURM_CPUS_PER_TASK:-1}

### print start message
timer_start=`date +%s`
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID:-NA}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### run the model
srun ${FD_PRJ}/sing_proj_combeffect.sh Rscript motif_marginal.R ${CHROM} ${IS_INPUT20X} ${FDIRY} ${NCORE}
echo

### print end message
timer=`date +%s`
runtime=$(echo "${timer} - ${timer_start}" | bc -l)
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 26490502


## RUN: chromosome 20, 21, 22, X

In [8]:
tail ${FD_LOG}/test_memory_chrom_70G.*.txt

==> /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/log/test_memory_chrom_70G.19.txt <==
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX5_DMSO/chr20.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX2_Dex/chr20.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX3_Dex/chr20.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX4_Dex/chr20.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX5_Dex/chr20.bed.gz"
Current memory used after import data:
28.7 GB

Done!
Run Time: 5 minutes and 37 seconds

==> /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/log/test_memory_chrom_70G.20.txt <==
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TFX5_DMSO/chr21.bed.gz"
[1] "/home/mount/work/out/proj_combeffect/annotation_fragment/filter_motif_score095/TF

In [17]:
### Chromosome 20, 21, 22, X
### submit one array task per chromosome;
### each task writes its own log file (%a = array index)
sbatch --partition=${NODE} \
    --array=19-22 \
    --mem=90G \
    --tasks-per-node=1 \
    --cpus-per-task=21 \
    --job-name='Marginal Effect' \
    --output=${FD_LOG}/linear_model_marginal_input20x.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh -v

### map the array index to a chromosome
### (0-based index: 19 -> chr20, 20 -> chr21, 21 -> chr22, 22 -> chrX)
CHROMS=({1..22} X Y)
CHROM="chr${CHROMS[${SLURM_ARRAY_TASK_ID}]}"

### arguments of the R script
IS_INPUT20X=TRUE
FDIRY=marginal_filter
NCORE=20

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%y-%m-%d-%T")
echo

### run the model
srun ${FD_PRJ}/sing_proj_combeffect.sh Rscript motif_marginal.R ${CHROM} ${IS_INPUT20X} ${FDIRY} ${NCORE}
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 26542120
