## Calculate per-base coverage for input and output

**Set environment**

In [1]:
# Load the project configuration first, then the bedtools module setup, and
# finally print the resulting directory variables.
# NOTE(review): show_env is not defined in this notebook; presumably it comes
# from one of the two sourced files -- confirm.
. ../config/config_duke.sh
. ../config/config_load_module_bedtools.sh
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
SING DIRECTORY (FD_SING): /data/reddylab/Kuei/singularity
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



**Check data**

In [2]:
# Point FD_INP at the fragment-count directory of the assay (below FD_RES) and
# list everything in it.
ASSAY="A001_K562_WSTARRseq"
FD_INP="${FD_RES}/results/${ASSAY}/fragment_count"
ls ${FD_INP}

[0m[38;5;9mA001_K562_WSTARRseq.Input.rep1.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep1.WGS.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep2.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep2.WGS.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep3.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep3.WGS.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep4.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Input.rep4.WGS.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep1.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep1.WGS.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep2.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep2.WGS.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep3.SUBSET.unstranded.bed.gz[0m
[38;5;9mA001_K562_WSTARRseq.Output.rep3.WGS.unstranded.bed.gz[0m


In [3]:
# Collect the SUBSET fragment files of the assay, report how many there are and
# print their base names. The RUN cell rebuilds the same list inside the job and
# indexes it with SLURM_ARRAY_TASK_ID, so the count printed here should agree
# with the --array range used there.
ASSAY=A001_K562_WSTARRseq
FD_INP=${FD_RES}/results/${ASSAY}/fragment_count

# Expand the glob straight into the array instead of word-splitting `ls` output:
# this is immune to ls aliases/colour options and to unusual characters in paths.
FP_INPS=("${FD_INP}"/*SUBSET*)
# Without nullglob an unmatched pattern is left as one literal word; treat that
# case as "no files" so the total below is an honest 0.
[[ -e "${FP_INPS[0]}" ]] || FP_INPS=()

echo "Total:  ${#FP_INPS[@]}"
# basename -a fails with "missing operand" on an empty list, so only call it
# when there is something to print.
if (( ${#FP_INPS[@]} > 0 )); then
    basename -a "${FP_INPS[@]}"
fi

Total:  7
A001_K562_WSTARRseq.Input.rep1.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Input.rep2.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Input.rep3.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Input.rep4.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Output.rep1.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Output.rep2.SUBSET.unstranded.bed.gz
A001_K562_WSTARRseq.Output.rep3.SUBSET.unstranded.bed.gz


## RUN

In [4]:
# Submit a Slurm array job with one task per SUBSET fragment file. Each task
# maps its fragment file onto the per-base region file with `bedtools map -o sum`
# and writes the gzip-compressed result to coverage/track_counts under the same
# file name as its input.
# - ${NODE} and ${FD_LOG} are expanded here, at submission time; the heredoc is
#   quoted ('EOF'), so every variable inside the script is expanded by the job.
# - --array 0-6 is hard-coded; it must match the number of *SUBSET* files (the
#   script aborts a task whose index has no matching file).
# NOTE(review): the script sources only config_duke.sh; bedtools is presumably
# available because sbatch forwards the submitting shell's environment -- confirm.
sbatch -p ${NODE} \
    --exclude=dl-01 \
    --cpus-per-task 8 \
    --mem 8G \
    --output ${FD_LOG}/coverage_perbase_subset_WSTARR.%a.txt \
    --array 0-6 \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh
ASSAY=A001_K562_WSTARRseq

### print start message
timer_start=$(date +%s)
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### set input and output files
FD_INP=${FD_RES}/results/${ASSAY}/fragment_count
FD_OUT=${FD_RES}/results/${ASSAY}/coverage/track_counts
FD_REG=${FD_RES}/results/region
FP_REG=${FD_REG}/region_screened_selected_perbase.bed.gz

### expand the glob directly (no parsing of `ls` output) and pick this task's file
FP_INPS=("${FD_INP}"/*SUBSET*)
FP_INP=${FP_INPS[${SLURM_ARRAY_TASK_ID}]}

### stop early if the array index has no matching file (index out of range, or
### the glob matched nothing and was left as a literal pattern)
if [[ -z "${FP_INP}" || ! -f "${FP_INP}" ]]; then
    echo "ERROR: no input file for array index ${SLURM_ARRAY_TASK_ID} in ${FD_INP}" >&2
    exit 1
fi
if [[ ! -f "${FP_REG}" ]]; then
    echo "ERROR: region file not found: ${FP_REG}" >&2
    exit 1
fi

FN_INP=$(basename "${FP_INP}")
FN_OUT=${FN_INP}
FP_OUT=${FD_OUT}/${FN_OUT}

### make sure the output directory exists; otherwise the redirection below fails
mkdir -p "${FD_OUT}"

### show I/O file
echo ++++++++++++++++++++++++++++++++++++++
echo "Input: " "${FP_REG}"
echo
echo "show first few lines of file"
zcat "${FP_REG}" | head -5
echo
echo ++++++++++++++++++++++++++++++++++++++
echo "Input: " "${FP_INP}"
echo
echo "show first few lines of file"
zcat "${FP_INP}" | head -5
echo

### execute
### No -c is given, so bedtools map sums its default column of the -b file
### (column 5 per the bedtools documentation) over the records overlapping each
### per-base interval of the -a file.
### NOTE(review): bedtools map expects both files sorted by chrom/start -- confirm upstream.
bedtools map \
    -a "${FP_REG}" \
    -b "${FP_INP}" \
    -o sum \
| gzip -c \
> "${FP_OUT}"
### capture both exit codes right away (the next command overwrites PIPESTATUS);
### without this check a bedtools failure is masked by gzip's success and leaves
### a truncated but valid-looking .gz behind
rc_pipe=("${PIPESTATUS[@]}")
if [[ ${rc_pipe[0]} -ne 0 || ${rc_pipe[1]} -ne 0 ]]; then
    echo "ERROR: bedtools map exited ${rc_pipe[0]}, gzip exited ${rc_pipe[1]}; removing ${FP_OUT}" >&2
    rm -f "${FP_OUT}"
    exit 1
fi

### show I/O file
echo ++++++++++++++++++++++++++++++++++++++
echo "Output: " "${FP_OUT}"
echo
echo "show first few lines of file"
zcat "${FP_OUT}" | head -5
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF

Submitted batch job 29574558


## Check

In [5]:
# Print the log of array task 0 (the %a in the sbatch --output pattern is
# replaced by the array index).
cat ${FD_LOG}/coverage_perbase_subset_WSTARR.0.txt

Hostname:           x3-05-4.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         03-15-23+13:38:55

++++++++++++++++++++++++++++++++++++++
Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/region_screened_selected_perbase.bed.gz

show first few lines of file
chr11	4000000	4000001
chr11	4000001	4000002
chr11	4000002	4000003
chr11	4000003	4000004
chr11	4000004	4000005

++++++++++++++++++++++++++++++++++++++
Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/fragment_count/A001_K562_WSTARRseq.Input.rep1.SUBSET.unstranded.bed.gz

show first few lines of file
chr11	3999435	4000079	chr11_3999435_4000079	1	.	chr11	4000000	6600000	chr11:4091884-6505900	79
chr11	3999566	4000061	chr11_3999566_4000061	1	.	chr11	4000000	6600000	chr11:4091884-6505900	61
chr11	3999631	4000124	chr11_3999631_4000124	1	.	chr11	4000000	6600000	chr11:4091884-6505900	124
chr11	3999851	4000369	chr11_3999851_4000369	1	.	chr11	4000000	6600000	chr11:4091884-6505

In [6]:
# Print the log of array task 6, the last index of the submitted --array 0-6 range.
cat ${FD_LOG}/coverage_perbase_subset_WSTARR.6.txt

Hostname:           x3-05-1.genome.duke.edu
Slurm Array Index:  6
Time Stamp:         03-15-23+13:38:55

++++++++++++++++++++++++++++++++++++++
Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/region_screened_selected_perbase.bed.gz

show first few lines of file
chr11	4000000	4000001
chr11	4000001	4000002
chr11	4000002	4000003
chr11	4000003	4000004
chr11	4000004	4000005

++++++++++++++++++++++++++++++++++++++
Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/A001_K562_WSTARRseq/fragment_count/A001_K562_WSTARRseq.Output.rep3.SUBSET.unstranded.bed.gz

show first few lines of file
chr11	3999381	4000166	chr11_3999381_4000166	1	.	chr11	4000000	6600000	chr11:4091884-6505900	166
chr11	3999386	4000021	chr11_3999386_4000021	1	.	chr11	4000000	6600000	chr11:4091884-6505900	21
chr11	3999392	4000028	chr11_3999392_4000028	1	.	chr11	4000000	6600000	chr11:4091884-6505900	28
chr11	3999435	4000079	chr11_3999435_4000079	1	.	chr11	4000000	6600000	chr11:4091884-650