**Set environment**

In [1]:
# Load the project configuration into this notebook session. With -v it prints
# the server name and the directory layout shown in the cell output.
source ../config_duke.sh -v
# Presumably makes bedtools available in this session (inferred from the file
# name; the script itself is not visible here) -- TODO confirm.
source ../config_load_module_bedtools.sh

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_ENCODE_FCC/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc



## Test

In [2]:
# Dry run: print, for every bin size, the size itself, the bin label and the
# BED file name that the coverage job will look for.
SIZES=(1 100 200 500 1000)
PREFIX="region_GATA1"
# Quote the array expansion so every element stays exactly one word.
for SIZE in "${SIZES[@]}"; do
    FNAME="bin${SIZE}"
    FN_BIN="${PREFIX}_${FNAME}.bed"
    echo "${SIZE}"
    echo "${FNAME}"
    echo "${FN_BIN}"
done

1
bin1
region_GATA1_bin1.bed
100
bin100
region_GATA1_bin100.bed
200
bin200
region_GATA1_bin200.bed
500
bin500
region_GATA1_bin500.bed
1000
bin1000
region_GATA1_bin1000.bed


## Run

In [5]:
# Submit one Slurm job that intersects every GATA1 bin file with the tiled-MPRA
# fragment file. The quoted 'EOF' delimiter keeps the script literal, so the
# variables inside it are expanded when the job runs, not in this session.
sbatch -p "${NODE}" \
    --mem 8G \
    -o "${FD_LOG}/TMPRA_coverage_gata1_bin.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh

### make a bedtools failure visible even though gzip is the last pipeline stage
set -o pipefail

SIZES=(1 100 200 500 1000)
PREFIX="region_GATA1"

FD_FRG="${FD_RES}/Tewhey_K562_TileMPRA/fragment"
FD_COV="${FD_RES}/Tewhey_K562_TileMPRA/coverage"

### the fragment file is the same for every bin size
FP_FRG="${FD_FRG}/Tile_K562_20210130.GATA1.unstranded.bed"

### print start message
timer_start=$(date +%s)
### SLURM_ARRAY_TASK_ID is only set for array jobs; it is left unquoted so an
### empty value does not add an extra blank argument to the log line
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### loop through different bin sizes; remember failures but keep going
status=0
for SIZE in "${SIZES[@]}"; do
    ### set input and output
    FNAME="bin${SIZE}"
    FN_BIN="${PREFIX}_${FNAME}.bed"

    FP_BIN="${FD_COV}/${FN_BIN}"
    FP_OUT="${FD_COV}/Tile_K562_20210130.GATA1.unstranded.${FNAME}.bed.gz"

    ### print input file message
    echo +++++++++++++++++++++++++++++++++++++++
    echo "INPUT 1: ${FP_BIN}"
    head -5 "${FP_BIN}"
    echo
    echo "INPUT 2: ${FP_FRG}"
    head -5 "${FP_FRG}"
    echo

    ### execute
    ### -f/-F 0.9 with -e: keep a pair when the overlap covers at least 90% of
    ### the bin OR at least 90% of the fragment; -wo appends the overlap in bp
    if ! bedtools intersect \
        -a "${FP_BIN}" \
        -b "${FP_FRG}" \
        -f 0.9 \
        -F 0.9 \
        -e \
        -wo \
        | gzip -c \
        > "${FP_OUT}"; then
        echo "ERROR: bedtools intersect failed for ${FP_BIN}" >&2
        status=1
        continue
    fi

    ### print output file message
    echo "OUTPUT: ${FP_OUT}"
    zcat "${FP_OUT}" | head -5
    echo
done

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
### displaytime is not defined in this script; presumably provided by config_duke.sh
echo "Run Time: $(displaytime "${runtime}")"

### non-zero when any bin size failed, so Slurm records the job as failed
exit "${status}"
EOF

Submitted batch job 28159074


In [6]:
# Show the log written by the all-bin-sizes coverage job (path quoted so a
# directory name containing whitespace cannot split it).
cat "${FD_LOG}/TMPRA_coverage_gata1_bin.txt"

Slurm Array Index: 
Time Stamp:         04-20-22+13:06:52

+++++++++++++++++++++++++++++++++++++++
INPUT 1: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/region_GATA1_bin1.bed
chrX	47786401	47786402
chrX	47786402	47786403
chrX	47786403	47786404
chrX	47786404	47786405
chrX	47786405	47786406

INPUT 2: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_20210130.GATA1.unstranded.bed
chrX	47786401	47786600
chrX	47786451	47786650
chrX	47786501	47786700
chrX	47786551	47786750
chrX	47786601	47786800

OUTPUT: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/Tile_K562_20210130.GATA1.unstranded.bin1.bed.gz
chrX	47786401	47786402	chrX	47786401	47786600	1
chrX	47786402	47786403	chrX	47786401	47786600	1
chrX	47786403	47786404	chrX	47786401	47786600	1
chrX	47786404	47786405	chrX	47786401	47786600	1
chrX	47786405	47786406	chrX	47786401	47786600	1

++++++++++++++++++++++

In [14]:
# Submit a Slurm job that intersects the 100 bp GATA1 bins with the tiled-MPRA
# fragment file. The quoted 'EOF' delimiter keeps the script literal, so the
# variables inside it are expanded when the job runs, not in this session.
# NOTE(review): this log path is also used by the all-bin-sizes job in this
# notebook; by default sbatch truncates the file, so the later job wins.
sbatch -p "${NODE}" \
    --mem 8G \
    -o "${FD_LOG}/TMPRA_coverage_gata1_bin.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh

### make a bedtools failure visible even though gzip is the last pipeline stage
set -o pipefail

### set input and output
FD_FRG="${FD_RES}/Tewhey_K562_TileMPRA/fragment"
FD_COV="${FD_RES}/Tewhey_K562_TileMPRA/coverage"

FP_BIN="${FD_COV}/region_GATA1_bin100.bed"
FP_FRG="${FD_FRG}/Tile_K562_20210130.GATA1.unstranded.bed"
FP_OUT="${FD_COV}/Tile_K562_20210130.GATA1.unstranded.bin100.bed.gz"

### print start message
timer_start=$(date +%s)
### SLURM_ARRAY_TASK_ID is only set for array jobs; it is left unquoted so an
### empty value does not add an extra blank argument to the log line
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### execute
### -f 1.0: keep a pair only when the whole bin lies inside the fragment;
### -wo appends the overlap in bp as the last column
if ! bedtools intersect \
    -a "${FP_BIN}" \
    -b "${FP_FRG}" \
    -f 1.0 \
    -wo \
    | gzip -c \
    > "${FP_OUT}"; then
    echo "ERROR: bedtools intersect failed for ${FP_BIN}" >&2
    exit 1
fi

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
### displaytime is not defined in this script; presumably provided by config_duke.sh
echo "Run Time: $(displaytime "${runtime}")"

EOF

Submitted batch job 28145990


In [11]:
# Show the log written by the bin100 coverage job (path quoted so a directory
# name containing whitespace cannot split it).
cat "${FD_LOG}/TMPRA_coverage_gata1_bin.txt"

Slurm Array Index: 
Time Stamp:         04-19-22+14:16:25


Done!
Run Time: 1 seconds


In [15]:
# Preview the first 10 records of the bin100 intersection output.
FD_COV="${FD_RES}/Tewhey_K562_TileMPRA/coverage"
# head stops reading after 10 lines, so the decompressor may report
# "Broken pipe" once the preview is printed; the preview itself is complete.
zcat "${FD_COV}/Tile_K562_20210130.GATA1.unstranded.bin100.bed.gz" | head -10

chrX	47786500	47786600	chrX	47786401	47786600	100
chrX	47786500	47786600	chrX	47786451	47786650	100
chrX	47786600	47786700	chrX	47786501	47786700	100
chrX	47786600	47786700	chrX	47786551	47786750	100
chrX	47786700	47786800	chrX	47786601	47786800	100
chrX	47786700	47786800	chrX	47786651	47786850	100
chrX	47786800	47786900	chrX	47786701	47786900	100
chrX	47786800	47786900	chrX	47786751	47786950	100
chrX	47786900	47787000	chrX	47786801	47787000	100
chrX	47786900	47787000	chrX	47786851	47787050	100

gzip: stdout: Broken pipe


In [23]:
# Submit a Slurm job that intersects the 500 bp GATA1 bins with the tiled-MPRA
# fragment file. The quoted 'EOF' delimiter keeps the script literal, so the
# variables inside it are expanded when the job runs, not in this session.
sbatch -p "${NODE}" \
    --mem 8G \
    -o "${FD_LOG}/TMPRA_coverage_gata1_bin500.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh

### make a bedtools failure visible even though gzip is the last pipeline stage
set -o pipefail

### set input and output
FD_FRG="${FD_RES}/Tewhey_K562_TileMPRA/fragment"
FD_COV="${FD_RES}/Tewhey_K562_TileMPRA/coverage"

FP_BIN="${FD_COV}/region_GATA1_bin500.bed"
FP_FRG="${FD_FRG}/Tile_K562_20210130.GATA1.unstranded.bed"
FP_OUT="${FD_COV}/Tile_K562_20210130.GATA1.unstranded.bin500.bed.gz"

### print start message
timer_start=$(date +%s)
### SLURM_ARRAY_TASK_ID is only set for array jobs; it is left unquoted so an
### empty value does not add an extra blank argument to the log line
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### print file message
echo "INPUT 1: ${FP_BIN}"
head -5 "${FP_BIN}"
echo
echo "INPUT 2: ${FP_FRG}"
head -5 "${FP_FRG}"
echo

### execute
### -f/-F 0.9 with -e: keep a pair when the overlap covers at least 90% of
### the bin OR at least 90% of the fragment; -wo appends the overlap in bp
if ! bedtools intersect \
    -a "${FP_BIN}" \
    -b "${FP_FRG}" \
    -f 0.9 \
    -F 0.9 \
    -e \
    -wo \
    | gzip -c \
    > "${FP_OUT}"; then
    echo "ERROR: bedtools intersect failed for ${FP_BIN}" >&2
    exit 1
fi

### print file message
echo "OUTPUT:  ${FP_OUT}"
zcat "${FP_OUT}" | head -5
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
### displaytime is not defined in this script; presumably provided by config_duke.sh
echo "Run Time: $(displaytime "${runtime}")"

EOF

Submitted batch job 28149743


In [24]:
# Show the log written by the bin500 coverage job (path quoted so a directory
# name containing whitespace cannot split it).
cat "${FD_LOG}/TMPRA_coverage_gata1_bin500.txt"

Slurm Array Index: 
Time Stamp:         04-19-22+20:10:33

INPUT 1: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/region_GATA1_bin500.bed
chrX	47786400	47786900
chrX	47786900	47787400
chrX	47787400	47787900
chrX	47787900	47788400
chrX	47788400	47788900

INPUT 2: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_20210130.GATA1.unstranded.bed
chrX	47786401	47786600
chrX	47786451	47786650
chrX	47786501	47786700
chrX	47786551	47786750
chrX	47786601	47786800

OUTPUT:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/Tile_K562_20210130.GATA1.unstranded.bin500.bed.gz
chrX	47786400	47786900	chrX	47786401	47786600	199
chrX	47786400	47786900	chrX	47786451	47786650	199
chrX	47786400	47786900	chrX	47786501	47786700	199
chrX	47786400	47786900	chrX	47786551	47786750	199
chrX	47786400	47786900	chrX	47786601	47786800	199


Done!
Run Time: 0 seconds


In [19]:
# Submit a Slurm job that intersects the 1000 bp GATA1 bins with the tiled-MPRA
# fragment file. The quoted 'EOF' delimiter keeps the script literal, so the
# variables inside it are expanded when the job runs, not in this session.
sbatch -p "${NODE}" \
    --mem 8G \
    -o "${FD_LOG}/TMPRA_coverage_gata1_bin1000.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh

### make a bedtools failure visible even though gzip is the last pipeline stage
set -o pipefail

### set input and output
FD_FRG="${FD_RES}/Tewhey_K562_TileMPRA/fragment"
FD_COV="${FD_RES}/Tewhey_K562_TileMPRA/coverage"

FP_BIN="${FD_COV}/region_GATA1_bin1000.bed"
FP_FRG="${FD_FRG}/Tile_K562_20210130.GATA1.unstranded.bed"
FP_OUT="${FD_COV}/Tile_K562_20210130.GATA1.unstranded.bin1000.bed.gz"

### print start message
timer_start=$(date +%s)
### SLURM_ARRAY_TASK_ID is only set for array jobs; it is left unquoted so an
### empty value does not add an extra blank argument to the log line
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### print file message
echo "INPUT 1: ${FP_BIN}"
head -5 "${FP_BIN}"
echo
echo "INPUT 2: ${FP_FRG}"
head -5 "${FP_FRG}"
echo

### execute
### -f/-F 1.0 with -e: keep a pair when the bin lies entirely inside the
### fragment OR the fragment lies entirely inside the bin; -wo appends the
### overlap in bp as the last column
if ! bedtools intersect \
    -a "${FP_BIN}" \
    -b "${FP_FRG}" \
    -f 1.0 \
    -F 1.0 \
    -e \
    -wo \
    | gzip -c \
    > "${FP_OUT}"; then
    echo "ERROR: bedtools intersect failed for ${FP_BIN}" >&2
    exit 1
fi

### print file message
echo "OUTPUT:  ${FP_OUT}"
zcat "${FP_OUT}" | head -5
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
### displaytime is not defined in this script; presumably provided by config_duke.sh
echo "Run Time: $(displaytime "${runtime}")"

EOF

Submitted batch job 28148982


In [20]:
# Show the log written by the bin1000 coverage job (path quoted so a directory
# name containing whitespace cannot split it).
cat "${FD_LOG}/TMPRA_coverage_gata1_bin1000.txt"

Slurm Array Index: 
Time Stamp:         04-19-22+19:42:32

INPUT 1: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/region_GATA1_bin1000.bed
chrX	47786400	47787400
chrX	47787400	47788400
chrX	47788400	47789400
chrX	47789400	47790400
chrX	47790400	47791400

INPUT 2: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_20210130.GATA1.unstranded.bed
chrX	47786401	47786600
chrX	47786451	47786650
chrX	47786501	47786700
chrX	47786551	47786750
chrX	47786601	47786800

OUTPUT:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/coverage/Tile_K562_20210130.GATA1.unstranded.bin1000.bed.gz
chrX	47786400	47787400	chrX	47786401	47786600	199
chrX	47786400	47787400	chrX	47786451	47786650	199
chrX	47786400	47787400	chrX	47786501	47786700	199
chrX	47786400	47787400	chrX	47786551	47786750	199
chrX	47786400	47787400	chrX	47786601	47786800	199


Done!
Run Time: 0 seconds
