**Set environment**

In [1]:
# Load the project configuration for the Duke cluster; presumably this is
# what defines the FD_* directory variables and the show_env helper used
# below -- confirm in config_duke.sh.
source ../config/config_duke.sh
# Load bedtools into this session (judging by the script name; the script
# itself is not shown here).
source ../config/config_load_module_bedtools.sh
# Print the resolved server name and FD_* paths as a sanity check.
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
SING DIRECTORY (FD_SING): /data/reddylab/Kuei/singularity
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



**Check data**

In [2]:
# Sanity check: list the file names (directory part stripped) of every
# raw hg38 replicate track of the Input group for library OL45.
ASSAY="Tewhey_K562_TileMPRA"
PREFIX="OL45"
GROUP="Input"
ls ${FD_RES}/results/${ASSAY}/coverage/track_normalized/${PREFIX}*hg38*raw*${GROUP}*rep*bed.gz |
    xargs -n 1 basename

OL45_20220927.hg38.raw.Input.rep1.stranded_pos.bed.gz
OL45_20220927.hg38.raw.Input.rep2.stranded_pos.bed.gz
OL45_20220927.hg38.raw.Input.rep3.stranded_pos.bed.gz
OL45_20220927.hg38.raw.Input.rep4.stranded_pos.bed.gz


In [3]:
# Sanity check: the rep1 pattern should resolve to exactly one file, because
# the job below derives the output file name from it (rep1 -> mean).
ASSAY="Tewhey_K562_TileMPRA"
PREFIX="OL45"
GROUP="Input"
ls ${FD_RES}/results/${ASSAY}/coverage/track_normalized/${PREFIX}*hg38*raw*${GROUP}*rep1*bed.gz |
    xargs -n 1 basename

OL45_20220927.hg38.raw.Input.rep1.stranded_pos.bed.gz


## RUN

In [4]:
# Submit a 2-task Slurm array job (task 0 -> "Input", task 1 -> "Output").
# Each task merges all replicate bedGraph tracks of its sample group with
# `bedtools unionbedg` and writes the per-interval mean across replicates to
# <rep1 file name with "rep1" replaced by "mean"> in the same directory.
# The heredoc delimiter is quoted, so nothing inside the script is expanded
# at submission time.
# NOTE(review): the work is a bedtools | awk | gzip pipeline; 8 CPUs per task
# is probably more than it can use -- confirm before changing the request.
sbatch -p ${NODE} \
    --exclude=dl-01 \
    --cpus-per-task 8 \
    --mem 20G \
    --output ${FD_LOG}/coverage_perbase_mean_TMPRA_OL45.%a.txt \
    --array 0-1 \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh
# Do not rely on bedtools being inherited from the submitting shell: load the
# module here when it is not already on PATH.
if ! command -v bedtools > /dev/null 2>&1; then
    source ../config/config_load_module_bedtools.sh
fi
ASSAY=Tewhey_K562_TileMPRA
PREFIX=OL45
SAMPLE_GROUPS=("Input" "Output")

### print start message
timer_start=$(date +%s)
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### set input and output files
FD_INP=${FD_RES}/results/${ASSAY}/coverage/track_normalized
FD_OUT=${FD_RES}/results/${ASSAY}/coverage/track_normalized

# The array index selects the sample group; abort if it is missing or
# does not map to a group.
: "${SLURM_ARRAY_TASK_ID:?this script must run as a Slurm array task}"
GROUP=${SAMPLE_GROUPS[${SLURM_ARRAY_TASK_ID}]}
if [[ -z "${GROUP}" ]]; then
    echo "ERROR: no sample group for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi

# Collect the inputs with globs instead of parsing `ls`; nullglob makes a
# pattern without matches expand to an empty array so it can be detected.
shopt -s nullglob
FP_INPS=("${FD_INP}/${PREFIX}"*hg38*raw*"${GROUP}"*rep*bed.gz)
FP_REP1=("${FD_INP}/${PREFIX}"*hg38*raw*"${GROUP}"*rep1*bed.gz)
shopt -u nullglob

if (( ${#FP_INPS[@]} == 0 )); then
    echo "ERROR: no replicate files found for group ${GROUP} in ${FD_INP}" >&2
    exit 1
fi
# The output name is derived from the rep1 file, so it must be unambiguous.
if (( ${#FP_REP1[@]} != 1 )); then
    echo "ERROR: expected exactly one rep1 file for group ${GROUP}," \
         "found ${#FP_REP1[@]}" >&2
    exit 1
fi

FP_INP=${FP_REP1[0]}
FN_INP=$(basename "${FP_INP}")
FN_OUT=${FN_INP/rep1/mean}
FP_OUT=${FD_OUT}/${FN_OUT}
# Write to a temporary name first; it does not end in "bed.gz", so it can
# never be picked up by the input glob of a concurrently running task.
FP_TMP=${FP_OUT}.tmp.${SLURM_ARRAY_TASK_ID}

### show I/O file
echo ++++++++++++++++++++++++++++++++++++++
# A dedicated loop variable keeps FP_INP (the rep1 template) intact.
for FP_REP in "${FP_INPS[@]}"; do
    echo "Input file: " "${FP_REP}"
    echo
    echo "show first few lines of file"
    zcat "${FP_REP}" | head -10
    echo
done

### execute
# Columns 1-3 of unionbedg output are chrom/start/end; columns 4..NF hold one
# value per input file, so the mean is the sum divided by (NF - 3).
# pipefail is enabled only here (not for the `zcat | head` previews, where
# head closing the pipe early would otherwise count as a failure).
set -o pipefail
bedtools unionbedg -i "${FP_INPS[@]}" \
| awk '{
    sum = 0;
    for (col = 4; col <= NF; col++) {
       sum += $col;
    }
    mean = sum / (NF - 3);
    print $1 "\t" $2 "\t" $3 "\t" mean;
  }' \
| gzip -c \
> "${FP_TMP}"
status=$?
set +o pipefail

if (( status != 0 )); then
    echo "ERROR: bedtools/awk/gzip pipeline failed with status ${status}" >&2
    rm -f "${FP_TMP}"
    exit "${status}"
fi
# Only publish the result once the whole pipeline has succeeded.
mv -f "${FP_TMP}" "${FP_OUT}"

### show I/O file
echo ++++++++++++++++++++++++++++++++++++++
echo "Output file: " "${FP_OUT}"
echo
echo "show first few lines of file"
zcat "${FP_OUT}" | head -10
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF

Submitted batch job 29574762


## Check

In [5]:
# Show the log of array task 0 (index 0 of SAMPLE_GROUPS, i.e. the "Input" group).
cat ${FD_LOG}/coverage_perbase_mean_TMPRA_OL45.0.txt

Hostname:           x3-01-2.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         03-15-23+15:17:25

++++++++++++++++++++++++++++++++++++++
Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL45_20220927.hg38.raw.Input.rep1.stranded_pos.bed.gz

show first few lines of file
chr11	4505501	4505502	14.0224
chr11	4505502	4505503	14.0224
chr11	4505503	4505504	14.0224
chr11	4505504	4505505	14.0224
chr11	4505505	4505506	14.0224
chr11	4505506	4505507	14.0224
chr11	4505507	4505508	14.0224
chr11	4505508	4505509	14.0224
chr11	4505509	4505510	14.0224
chr11	4505510	4505511	14.0224

Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL45_20220927.hg38.raw.Input.rep2.stranded_pos.bed.gz

show first few lines of file
chr11	4505501	4505502	13.5239
chr11	4505502	4505503	13.5239
chr11	4505503	4505504	13.5239
chr11	4505504	4505505	13.5239
chr11	4505505	4505506	13.5239

In [6]:
# Show the log of array task 1 (index 1 of SAMPLE_GROUPS, i.e. the "Output" group).
cat ${FD_LOG}/coverage_perbase_mean_TMPRA_OL45.1.txt

Hostname:           x3-01-2.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         03-15-23+15:17:25

++++++++++++++++++++++++++++++++++++++
Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL45_20220927.hg38.raw.Output.rep1.stranded_pos.bed.gz

show first few lines of file
chr11	4505501	4505502	8.39743
chr11	4505502	4505503	8.39743
chr11	4505503	4505504	8.39743
chr11	4505504	4505505	8.39743
chr11	4505505	4505506	8.39743
chr11	4505506	4505507	8.39743
chr11	4505507	4505508	8.39743
chr11	4505508	4505509	8.39743
chr11	4505509	4505510	8.39743
chr11	4505510	4505511	8.39743

Input file:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/Tewhey_K562_TileMPRA/coverage/track_normalized/OL45_20220927.hg38.raw.Output.rep2.stranded_pos.bed.gz

show first few lines of file
chr11	4505501	4505502	10.3593
chr11	4505502	4505503	10.3593
chr11	4505503	4505504	10.3593
chr11	4505504	4505505	10.3593
chr11	4505505	4505506	10.35