**Set environment**

In [1]:
# Load the project configuration into this notebook session. The paths are
# relative, so the notebook has to be run from a sibling directory of `config/`.
source ../config/config_duke.sh
# NOTE(review): judging by its name this script loads the bedtools module — confirm.
source ../config/config_load_module_bedtools.sh
# Print the resolved FD_* directories so the environment is recorded with the notebook.
# NOTE(review): show_env is not defined here — presumably provided by config_duke.sh; confirm.
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
SING DIRECTORY (FD_SING): /data/reddylab/Kuei/singularity
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



**Check data**

In [2]:
# Select the assay and list every normalized coverage track stored for it.
ASSAY="A001_K562_WSTARRseq"
ls "${FD_RES}/results/${ASSAY}/coverage/track_normalized"

[0m[38;5;9mA001_K562_WSTARRseq.Input.mean.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep1.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep2.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep3.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep4.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.mean.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep1.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep2.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep3.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.pLog2FC.mean.SUBSET.unstranded.bed.gz[0m


In [5]:
# List the per-replicate SUBSET tracks of one sample group, printing only the
# file names (no directory part). The glob is expanded by the shell itself
# instead of parsing `ls` output, so names are never word-split.
ASSAY=A001_K562_WSTARRseq
GROUP=Input
for FP in "${FD_RES}/results/${ASSAY}/coverage/track_normalized"/*"${GROUP}"*rep*SUBSET*bed.gz; do
    # an unmatched glob stays as the literal pattern; skip it rather than print it
    if [ -e "${FP}" ]; then
        basename "${FP}"
    fi
done

A001_K562_WSTARRseq.Input.rep1.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Input.rep2.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Input.rep3.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Input.rep4.SUBSET.unstranded.bed.gz


## RUN

In [6]:
# Submit a two-task Slurm array job: task 0 averages the Input replicates,
# task 1 the Output replicates (index into SAMPLE_GROUPS below). The script is
# passed on stdin; the quoted 'EOF' keeps every $... unexpanded until the job runs.
sbatch -p ${NODE} \
    --exclude=dl-01 \
    --cpus-per-task 8 \
    --mem 20G \
    --output ${FD_LOG}/coverage_perbase_mean_subset_WSTARR.%a.txt \
    --array 0-1 \
    <<'EOF'
#!/bin/bash
### Average the normalized coverage tracks of all replicates of one sample
### group and write the result next to the inputs, named like the rep1 track
### with "rep1" replaced by "mean".

### set directories & global variables
source ../config/config_duke.sh
ASSAY=A001_K562_WSTARRseq
SAMPLE_GROUPS=("Input" "Output")

### print start message
timer_start=$(date +%s)
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### set input and output files
FD_INP=${FD_RES}/results/${ASSAY}/coverage/track_normalized
FD_OUT=${FD_RES}/results/${ASSAY}/coverage/track_normalized

### pick the sample group of this array task
### (an index outside SAMPLE_GROUPS would leave GROUP empty, and the globs
###  below would then match the replicates of every group at once)
TASK_ID=${SLURM_ARRAY_TASK_ID:-}
if [[ ! "${TASK_ID}" =~ ^[0-9]+$ ]] || (( TASK_ID >= ${#SAMPLE_GROUPS[@]} )); then
    echo "ERROR: array index '${TASK_ID}' does not select a sample group" >&2
    exit 1
fi
GROUP=${SAMPLE_GROUPS[${TASK_ID}]}

### collect the replicate tracks with shell globs
### (nullglob: a pattern without matches expands to an empty array)
shopt -s nullglob
FP_INPS=("${FD_INP}"/*"${GROUP}"*rep*SUBSET*bed.gz)
FP_REP1=("${FD_INP}"/*"${GROUP}"*rep1*SUBSET*bed.gz)
shopt -u nullglob
if (( ${#FP_INPS[@]} == 0 || ${#FP_REP1[@]} == 0 )); then
    echo "ERROR: no replicate tracks (or no rep1 track) for group '${GROUP}' in ${FD_INP}" >&2
    exit 1
fi

### derive the output name from the rep1 track
FN_INP=$(basename "${FP_REP1[0]}")
FN_OUT=${FN_INP/rep1/mean}
FP_OUT=${FD_OUT}/${FN_OUT}
FP_TMP=${FP_OUT}.tmp

### show I/O file
echo ++++++++++++++++++++++++++++++++++++++
for FP_REP in "${FP_INPS[@]}"; do
    echo "Input file: " "${FP_REP}"
    echo
    echo "show first few lines of file"
    zcat "${FP_REP}" | head -10
    echo
done

### execute
### unionbedg puts one value column per input file after chrom/start/end;
### awk averages columns 4..NF. The result goes to a temporary file first and
### is renamed only on success, so a failed run never leaves a truncated
### track under the final name. pipefail makes a bedtools/awk failure visible
### even though gzip, the last command of the pipeline, succeeds.
set -o pipefail
if ! bedtools unionbedg -i "${FP_INPS[@]}" \
    | awk '{
        sum = 0;
        for (col = 4; col <= NF; col++) {
           sum += $col;
        }
        mean = sum / (NF-4+1);
        print $1 "\t" $2 "\t" $3 "\t" mean;
      }' \
    | gzip -c \
    > "${FP_TMP}"
then
    echo "ERROR: failed to compute the mean track for group '${GROUP}'" >&2
    rm -f "${FP_TMP}"
    exit 1
fi
set +o pipefail
mv -f "${FP_TMP}" "${FP_OUT}" || exit 1

### show I/O file
echo ++++++++++++++++++++++++++++++++++++++
echo "Output file: " "${FP_OUT}"
echo
echo "show first few lines of file"
zcat "${FP_OUT}" | head -10
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF

Submitted batch job 29575255


## Check

In [7]:
# Show the log written by array task 0 (the Input group).
cat "${FD_LOG}/coverage_perbase_mean_subset_WSTARR.0.txt"

Hostname:           x2-07-3.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         03-16-23+09:56:32

++++++++++++++++++++++++++++++++++++++
Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage/track_normalized/A001_K562_WSTARRseq.Input.rep1.SUBSET.unstranded.bed.gz

show first few lines of file
chr11	4000000	4000001	0.185812
chr11	4000001	4000002	0.185812
chr11	4000002	4000003	0.185812
chr11	4000003	4000004	0.185812
chr11	4000004	4000005	0.185812
chr11	4000005	4000006	0.185812
chr11	4000006	4000007	0.185812
chr11	4000007	4000008	0.185812
chr11	4000008	4000009	0.185812
chr11	4000009	4000010	0.185812

Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage/track_normalized/A001_K562_WSTARRseq.Input.rep2.SUBSET.unstranded.bed.gz

show first few lines of file
chr11	4000000	4000001	0.14014
chr11	4000001	4000002	0.14014
chr11	4000002	4000003	0.14014
chr11	4000003	4000004	0.13013
chr11	4000004	400

In [8]:
# Show the log written by array task 1 (the Output group).
cat "${FD_LOG}/coverage_perbase_mean_subset_WSTARR.1.txt"

Hostname:           x3-04-2.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         03-16-23+09:56:32

++++++++++++++++++++++++++++++++++++++
Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage/track_normalized/A001_K562_WSTARRseq.Output.rep1.SUBSET.unstranded.bed.gz

show first few lines of file
chr11	4000000	4000001	0.330529
chr11	4000001	4000002	0.324292
chr11	4000002	4000003	0.324292
chr11	4000003	4000004	0.324292
chr11	4000004	4000005	0.324292
chr11	4000005	4000006	0.324292
chr11	4000006	4000007	0.324292
chr11	4000007	4000008	0.318056
chr11	4000008	4000009	0.318056
chr11	4000009	4000010	0.318056

Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/coverage/track_normalized/A001_K562_WSTARRseq.Output.rep2.SUBSET.unstranded.bed.gz

show first few lines of file
chr11	4000000	4000001	0.254249
chr11	4000001	4000002	0.254249
chr11	4000002	4000003	0.254249
chr11	4000003	4000004	0.266961
chr11	40000