**Set environment**

In [1]:
# Load the project configuration and the bedtools module setup into this
# shell session, then print the resolved directory variables (FD_BASE,
# FD_WORK, FD_CODE, FD_SING, FD_PRJ, FD_RES, FD_LOG).
# Paths are relative, so the cell must be run from the notebook's directory.
# NOTE(review): show_env is not defined in this notebook; presumably it is
# provided by one of the sourced config scripts — confirm.
source ../config/config_duke.sh
source ../config/config_load_module_bedtools.sh
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
SING DIRECTORY (FD_SING): /data/reddylab/Kuei/singularity
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



**Check data**

In [2]:
# List the file names (without directory) of every replicate coverage track
# for the chosen assay / library prefix / sample group.
# The shell expands the glob, ls prints one path per line, and xargs hands
# each path to basename one at a time (-n 1).
# These are the same files the averaging job picks up with its "rep*" pattern.
ASSAY=Tewhey_K562_TileMPRA
PREFIX=OL43
GROUP=Input
ls ${FD_RES}/results/${ASSAY}/coverage/track_normalized/${PREFIX}*hg38*raw*${GROUP}*rep*bed.gz \
| xargs -n 1 basename

OL43_20221003.hg38.raw.Input.rep1.stranded_pos.bed.gz
OL43_20221003.hg38.raw.Input.rep2.stranded_pos.bed.gz
OL43_20221003.hg38.raw.Input.rep3.stranded_pos.bed.gz
OL43_20221003.hg38.raw.Input.rep4.stranded_pos.bed.gz
OL43_20221003.hg38.raw.Input.rep5.stranded_pos.bed.gz
OL43_20221003.hg38.raw.Input.rep6.stranded_pos.bed.gz


In [3]:
# Same listing as the previous cell, but restricted to replicate 1 ("rep1").
# The averaging job uses this single file's name as the template for its
# output name (it substitutes "rep1" with "mean"), so exactly one match is
# expected here.
ASSAY=Tewhey_K562_TileMPRA
PREFIX=OL43
GROUP=Input
ls ${FD_RES}/results/${ASSAY}/coverage/track_normalized/${PREFIX}*hg38*raw*${GROUP}*rep1*bed.gz \
| xargs -n 1 basename

OL43_20221003.hg38.raw.Input.rep1.stranded_pos.bed.gz


## RUN

In [4]:
# Submit a two-task Slurm array job: task 0 handles the "Input" group and
# task 1 the "Output" group (see SAMPLE_GROUPS inside the script).
# Each task merges all replicate bedGraph tracks of its group with
# `bedtools unionbedg` and writes the per-interval mean across replicates to
# <rep1 file name with "rep1" replaced by "mean"> in the same directory.
# One log per task is written to ${FD_LOG}; %a expands to the array index.
# NOTE(review): NODE is not set in this notebook — presumably exported by
# config_duke.sh; confirm. bedtools must be reachable inside the job (the
# script below checks this and aborts otherwise).
sbatch -p ${NODE} \
    --exclude=dl-01 \
    --cpus-per-task 8 \
    --mem 20G \
    --output ${FD_LOG}/coverage_perbase_mean_TMPRA_OL43.%a.txt \
    --array 0-1 \
    <<'EOF'
#!/bin/bash
### set directories & global variables
### (relative path: relies on the job starting in the submission directory,
###  which is sbatch's default working directory)
source ../config/config_duke.sh
ASSAY=Tewhey_K562_TileMPRA
PREFIX=OL43
SAMPLE_GROUPS=("Input" "Output")

### print start message
timer_start=$(date +%s)
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### sanity checks: fail early instead of producing an empty track
: "${SLURM_ARRAY_TASK_ID:?this script must run as a Slurm array task}"
if ! command -v bedtools > /dev/null 2>&1; then
    echo "ERROR: bedtools not found in PATH" >&2
    exit 1
fi

### set input and output files
FD_INP=${FD_RES}/results/${ASSAY}/coverage/track_normalized
FD_OUT=${FD_RES}/results/${ASSAY}/coverage/track_normalized

GROUP=${SAMPLE_GROUPS[${SLURM_ARRAY_TASK_ID}]}
if [[ -z "${GROUP}" ]]; then
    # an empty GROUP would make the globs below match every sample group
    echo "ERROR: no sample group for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi

### collect files with shell globs (no parsing of ls output);
### nullglob turns "no match" into an empty array rather than a literal pattern
shopt -s nullglob
FP_INPS=("${FD_INP}/${PREFIX}"*hg38*raw*"${GROUP}"*rep*bed.gz)
FP_REP1=("${FD_INP}/${PREFIX}"*hg38*raw*"${GROUP}"*rep1*bed.gz)
shopt -u nullglob

if [[ ${#FP_INPS[@]} -eq 0 ]]; then
    echo "ERROR: no replicate tracks found for group ${GROUP} in ${FD_INP}" >&2
    exit 1
fi
if [[ ${#FP_REP1[@]} -ne 1 ]]; then
    # "rep1" would also match rep10, rep11, ...; the output name needs one template
    echo "ERROR: expected exactly one rep1 track, found ${#FP_REP1[@]}" >&2
    exit 1
fi

### output name: rep1 file name with "rep1" replaced by "mean"
FP_INP=${FP_REP1[0]}
FN_INP=$(basename "${FP_INP}")
FN_OUT=${FN_INP/rep1/mean}
FP_OUT=${FD_OUT}/${FN_OUT}

### show I/O file
### (separate loop variable so the rep1 template FP_INP is not overwritten)
echo ++++++++++++++++++++++++++++++++++++++
for FP_REP in "${FP_INPS[@]}"; do
    echo "Input file: " ${FP_REP}
    echo
    echo "show first few lines of file"
    zcat "${FP_REP}" | head -10
    echo
done

### execute
### unionbedg emits chrom, start, end followed by one value column per input
### file; the mean is taken over columns 4..NF.
### The pipeline runs in a subshell with pipefail so that a failure of
### bedtools or awk is not masked by gzip's exit status, and the result is
### written to a temporary file that is renamed only on success.
FP_TMP=${FP_OUT}.tmp.${SLURM_JOB_ID:-$$}
if ! (
    set -o pipefail
    bedtools unionbedg -i "${FP_INPS[@]}" \
    | awk '{
        total = 0;
        n_rep = NF - 3;
        for (col = 4; col <= NF; col++) {
            total += $col;
        }
        mean = total / n_rep;
        print $1 "\t" $2 "\t" $3 "\t" mean;
      }' \
    | gzip -c \
    > "${FP_TMP}"
); then
    echo "ERROR: failed to compute mean track for group ${GROUP}" >&2
    rm -f "${FP_TMP}"
    exit 1
fi
mv -f "${FP_TMP}" "${FP_OUT}"

### show I/O file
echo ++++++++++++++++++++++++++++++++++++++
echo "Output file: " ${FP_OUT}
echo
echo "show first few lines of file"
zcat "${FP_OUT}" | head -10
echo

### print end message
### NOTE(review): displaytime is not defined here; presumably it comes from
### config_duke.sh — confirm.
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF

Submitted batch job 29574760


## Check

In [5]:
# Print the log of array task 0 (the job's --output pattern ends in
# ".%a.txt", so ".0.txt" belongs to array index 0, the Input group).
cat ${FD_LOG}/coverage_perbase_mean_TMPRA_OL43.0.txt

Hostname:           x2-08-4.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         03-15-23+15:16:25

++++++++++++++++++++++++++++++++++++++
Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL43_20221003.hg38.raw.Input.rep1.stranded_pos.bed.gz

show first few lines of file
chr8	126735901	126735902	17.5495
chr8	126735902	126735903	17.5495
chr8	126735903	126735904	17.5495
chr8	126735904	126735905	17.5495
chr8	126735905	126735906	17.5495
chr8	126735906	126735907	17.5495
chr8	126735907	126735908	17.5495
chr8	126735908	126735909	17.5495
chr8	126735909	126735910	17.5495
chr8	126735910	126735911	17.5495

Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL43_20221003.hg38.raw.Input.rep2.stranded_pos.bed.gz

show first few lines of file
chr8	126735901	126735902	18.4034
chr8	126735902	126735903	18.4034
chr8	126735903	126735904	18.4034
chr8	126735904	12673

In [6]:
# Print the log of array task 1 (the job's --output pattern ends in
# ".%a.txt", so ".1.txt" belongs to array index 1, the Output group).
cat ${FD_LOG}/coverage_perbase_mean_TMPRA_OL43.1.txt

Hostname:           x3-01-1.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         03-15-23+15:16:25

++++++++++++++++++++++++++++++++++++++
Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL43_20221003.hg38.raw.Output.rep1.stranded_pos.bed.gz

show first few lines of file
chr8	126735901	126735902	62.9679
chr8	126735902	126735903	62.9679
chr8	126735903	126735904	62.9679
chr8	126735904	126735905	62.9679
chr8	126735905	126735906	62.9679
chr8	126735906	126735907	62.9679
chr8	126735907	126735908	62.9679
chr8	126735908	126735909	62.9679
chr8	126735909	126735910	62.9679
chr8	126735910	126735911	62.9679

Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL43_20221003.hg38.raw.Output.rep2.stranded_pos.bed.gz

show first few lines of file
chr8	126735901	126735902	59.7356
chr8	126735902	126735903	59.7356
chr8	126735903	126735904	59.7356
chr8	126735904	126