**Set environment**

In [1]:
# Load the project configuration. Judging by the output printed below, the -v flag
# echoes the server name and the base/source/exe/annotation/project/results paths.
# NOTE(review): presumably this also defines FD_RES, FD_LOG and NODE used by later cells — confirm.
source ../config_duke.sh -v
# NOTE(review): presumably makes the bedtools executable available in this session — confirm.
source ../config_load_module_bedtools.sh

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_ENCODE_FCC/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc



## Test

In [2]:
# List the contents of the fragment directory.
# The path is quoted so it is passed to ls as a single argument even if FD_RES
# contains whitespace or glob characters.
ls "${FD_RES}/KS91_K562_ASTARRseq/fragment"

KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep2.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep3.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep4.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep5.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep6.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Output_rep1.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep2.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep3.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed
library_size.tsv
library_size.txt


In [3]:
# Preview how each fragment file name maps to the PREFIX used for output naming.
# The glob is expanded straight into the array instead of parsing `ls` output,
# so paths containing whitespace stay intact.
FP_BEDS=("${FD_RES}"/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq*)
for FP_BED in "${FP_BEDS[@]}"; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "${FP_BED}" ]] || continue
    FN_BED=$(basename "${FP_BED}")
    # Strip only the last extension; earlier dots in the file name are kept.
    PREFIX="${FN_BED%.*}"
    #echo ${FP_BED}
    echo "${FN_BED}"
    echo "${PREFIX}"
done

KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups
KS91_K562_hg38_ASTARRseq_Input_rep2.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep2.GATA1.unstranded.dedups
KS91_K562_hg38_ASTARRseq_Input_rep3.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep3.GATA1.unstranded.dedups
KS91_K562_hg38_ASTARRseq_Input_rep4.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep4.GATA1.unstranded.dedups
KS91_K562_hg38_ASTARRseq_Input_rep5.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep5.GATA1.unstranded.dedups
KS91_K562_hg38_ASTARRseq_Input_rep6.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep6.GATA1.unstranded.dedups
KS91_K562_hg38_ASTARRseq_Output_rep1.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep1.GATA1.unstranded
KS91_K562_hg38_ASTARRseq_Output_rep2.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep2.GATA1.unstranded
KS91_K562_hg38_ASTARRseq_Output_

In [4]:
# Preview the bin label and bin file name derived from every bin size.
SIZES=(1 100 200 500 1000)
PREFIX="region_GATA1"
for SIZE in "${SIZES[@]}"; do
    FNAME="bin${SIZE}"
    FN_BIN="${PREFIX}_${FNAME}.bed"
    # One value per line: bin size, bin label, bin file name.
    printf '%s\n' "${SIZE}" "${FNAME}" "${FN_BIN}"
done

1
bin1
region_GATA1_bin1.bed
100
bin100
region_GATA1_bin100.bed
200
bin200
region_GATA1_bin200.bed
500
bin500
region_GATA1_bin500.bed
1000
bin1000
region_GATA1_bin1000.bed


In [5]:
# Submit a Slurm array (indices 0-9, one fragment file per task). For every bin
# size in SIZES each task counts how many fragments overlap each bin of the
# region and writes a gzipped table: chrom, start, end, count.
# The heredoc delimiter is quoted, so nothing inside is expanded at submit time.
sbatch -p "${NODE}" \
    --mem 8G \
    --array 0-9 \
    -o "${FD_LOG}/ASTARR_coverage_gata1_bin.%a.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh

SIZES=(100 200 500 1000)
REGION="region_GATA1"

### set input and output
### (glob straight into the array instead of parsing `ls` output)
FP_FRGS=("${FD_RES}"/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq*)
FP_FRG=${FP_FRGS[${SLURM_ARRAY_TASK_ID}]}

### stop early when this array index has no fragment file; otherwise the
### pipeline below would still write an (empty) output file
if [[ ! -f "${FP_FRG}" ]]; then
    echo "ERROR: no fragment file for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi

FN_FRG=$(basename "${FP_FRG}")
### strip only the last extension from the file name
PREFIX="${FN_FRG%.*}"

FD_COV=${FD_RES}/KS91_K562_ASTARRseq/coverage

### print start message
### (timer_start is needed by the run-time report at the end of the job)
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### loop through different bin sizes
for SIZE in "${SIZES[@]}"; do
    ### set input and output
    FNAME="bin${SIZE}"
    FN_BIN="${REGION}_${FNAME}.bed"
    
    FP_BIN=${FD_COV}/${FN_BIN}
    FP_OUT=${FD_COV}/${PREFIX}.${FNAME}.bed.gz
    
    ### print input file message
    echo +++++++++++++++++++++++++++++++++++++++
    echo "INPUT 1: ${FP_BIN}" 
    head -5 "${FP_BIN}"
    echo
    echo "INPUT 2: ${FP_FRG}"
    head -5 "${FP_FRG}"
    echo
    
    ### execute
    ### -f 0.9 / -F 0.9 with -e: keep a bin-fragment pair when the overlap
    ###     covers at least 90% of the bin OR at least 90% of the fragment
    ### -wo: one line per overlapping pair; only the bin coordinates are kept
    ### sort | uniq -c: number of pairs per bin; the second awk moves that
    ###     count from the first to the last column (tab separated)
    bedtools intersect \
        -a "${FP_BIN}" \
        -b "${FP_FRG}" \
        -f 0.9 \
        -F 0.9 \
        -e   \
        -wo |\
        awk -v OFS='\t' '{print $1, $2, $3}' |\
        sort    |\
        uniq -c |\
        awk -v OFS='\t' '{print $2, $3, $4, $1}' |\
        sort -k 1,1 -k2,2n |\
        gzip -c \
        > "${FP_OUT}"

    ### print output file message
    echo "OUTPUT: ${FP_OUT}"
    zcat "${FP_OUT}" | head -5
    echo
done

### print end message
### NOTE(review): displaytime is not defined in this script; presumably it comes
### from config_duke.sh — confirm
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 28159204


In [6]:
# Show the log of array task 0 (the %a in the sbatch -o pattern is the array index).
# Quoted so the path is passed as one argument even if FD_LOG contains whitespace.
cat "${FD_LOG}/ASTARR_coverage_gata1_bin.0.txt"

+++++++++++++++++++++++++++++++++++++++
INPUT 1: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/region_GATA1_bin100.bed
chrX	47786401	47786501
chrX	47786501	47786601
chrX	47786601	47786701
chrX	47786701	47786801
chrX	47786801	47786901

INPUT 2: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
chrX	47787165	47787363	chrX_47787165_47787363
chrX	47787557	47787773	chrX_47787557_47787773
chrX	47787593	47787783	chrX_47787593_47787783
chrX	47787598	47787772	chrX_47787598_47787772
chrX	47787661	47788058	chrX_47787661_47788058

OUTPUT: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bin100.bed.gz
chrX	47787201	47787301	1
chrX	47787601	47787701	3
chrX	47787701	47787801	1
chrX	47787801	47787901	1
chrX	47787901	47788001	1

++++++++++++++++++++++++++++++++

In [40]:
# Submit a Slurm array (indices 0-9, one fragment file per task) that counts, for
# each 100-bp bin of the region, the fragments overlapping it, and writes a
# gzipped table: chrom, start, end, count.
# The heredoc delimiter is quoted, so nothing inside is expanded at submit time.
sbatch -p "${NODE}" \
    --mem 8G \
    --array 0-9 \
    -o "${FD_LOG}/ASTARR_coverage_gata1_bin.%a.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh

### set input and output
### (glob straight into the array instead of parsing `ls` output)
FP_BEDS=("${FD_RES}"/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq*)
FP_BED=${FP_BEDS[${SLURM_ARRAY_TASK_ID}]}

### stop early when this array index has no fragment file; otherwise the
### pipeline below would still write an (empty) output file
if [[ ! -f "${FP_BED}" ]]; then
    echo "ERROR: no fragment file for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi

FN_BED=$(basename "${FP_BED}")
### strip only the last extension from the file name
PREFIX="${FN_BED%.*}"

FD_COV=${FD_RES}/KS91_K562_ASTARRseq/coverage
FP_BIN=${FD_COV}/region_GATA1_bin100.bed
FP_OUT=${FD_COV}/${PREFIX}.bin100.bed.gz

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### print file message
echo "INPUT 1: ${FP_BIN}" 
head -5 "${FP_BIN}"
echo
echo "INPUT 2: ${FP_BED}"
head -5 "${FP_BED}"
echo

### execute
### -f 1.0: keep a bin-fragment pair only when the overlap covers the whole bin
### -wo: one line per overlapping pair; only the bin coordinates are kept
### sort | uniq -c: number of pairs per bin; the second awk moves that count
###     from the first to the last column (tab separated)
bedtools intersect \
    -a "${FP_BIN}" \
    -b "${FP_BED}" \
    -f 1.0 \
    -wo |\
    awk -v OFS='\t' '{print $1, $2, $3}' |\
    sort |\
    uniq -c |\
    awk -v OFS='\t' '{print $2, $3, $4, $1}' |\
    sort -k 1,1 -k2,2n |\
    gzip -c \
    > "${FP_OUT}"

### print file message
echo "OUTPUT:  ${FP_OUT}"
zcat "${FP_OUT}" | head -5
echo

### print end message
### NOTE(review): displaytime is not defined in this script; presumably it comes
### from config_duke.sh — confirm
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 28146058


In [41]:
# Show the log of array task 0 (the %a in the sbatch -o pattern is the array index).
# Quoted so the path is passed as one argument even if FD_LOG contains whitespace.
cat "${FD_LOG}/ASTARR_coverage_gata1_bin.0.txt"

Slurm Array Index:  0
Time Stamp:         04-19-22+16:05:56

INPUT 1: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/region_GATA1_bin100.bed
chrX	47786400	47786500
chrX	47786500	47786600
chrX	47786600	47786700
chrX	47786700	47786800
chrX	47786800	47786900

INPUT 2: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
chrX	47787165	47787363	chrX_47787165_47787363
chrX	47787557	47787773	chrX_47787557_47787773
chrX	47787593	47787783	chrX_47787593_47787783
chrX	47787598	47787772	chrX_47787598_47787772
chrX	47787661	47788058	chrX_47787661_47788058

OUTPUT:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bin100.bed.gz
chrX	47787200	47787300	1
chrX	47787600	47787700	3
chrX	47787700	47787800	1
chrX	47787800	47787900	1
chrX	47787900	47788000	1


Done!
Run

In [23]:
# Show the log of array task 0 (the %a in the sbatch -o pattern is the array index).
# Quoted so the path is passed as one argument even if FD_LOG contains whitespace.
cat "${FD_LOG}/ASTARR_coverage_gata1_bin.0.txt"

Slurm Array Index:  0
Time Stamp:         04-19-22+15:45:26

INPUT 1: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/region_GATA1_bin100.bed
chrX	47786400	47786500
chrX	47786500	47786600
chrX	47786600	47786700
chrX	47786700	47786800
chrX	47786800	47786900

INPUT 2: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
chrX	47787165	47787363	chrX_47787165_47787363
chrX	47787557	47787773	chrX_47787557_47787773
chrX	47787593	47787783	chrX_47787593_47787783
chrX	47787598	47787772	chrX_47787598_47787772
chrX	47787661	47788058	chrX_47787661_47788058

OUTPUT:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bin100.bed.gz
chrX	47787200	47787300	chrX	47787165	47787363	chrX_47787165_47787363	100
chrX	47787600	47787700	chrX	47787557	47787773	chrX_47787557_477

In [45]:
# Submit a Slurm array (indices 0-9, one fragment file per task) that counts, for
# each 1000-bp bin of the region, the fragments overlapping it, and writes a
# gzipped table: chrom, start, end, count.
# The heredoc delimiter is quoted, so nothing inside is expanded at submit time.
sbatch -p "${NODE}" \
    --mem 8G \
    --array 0-9 \
    -o "${FD_LOG}/ASTARR_coverage_gata1_bin1000.%a.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh

### set input and output
### (glob straight into the array instead of parsing `ls` output)
FP_BEDS=("${FD_RES}"/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq*)
FP_BED=${FP_BEDS[${SLURM_ARRAY_TASK_ID}]}

### stop early when this array index has no fragment file; otherwise the
### pipeline below would still write an (empty) output file
if [[ ! -f "${FP_BED}" ]]; then
    echo "ERROR: no fragment file for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi

FN_BED=$(basename "${FP_BED}")
### strip only the last extension from the file name
PREFIX="${FN_BED%.*}"

FD_COV=${FD_RES}/KS91_K562_ASTARRseq/coverage
FP_BIN=${FD_COV}/region_GATA1_bin1000.bed
FP_OUT=${FD_COV}/${PREFIX}.bin1000.bed.gz

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### print file message
echo "INPUT 1: ${FP_BIN}" 
head -5 "${FP_BIN}"
echo
echo "INPUT 2: ${FP_BED}"
head -5 "${FP_BED}"
echo

### execute
### -f 1.0 / -F 1.0 with -e: keep a bin-fragment pair when the overlap covers
###     the whole bin OR the whole fragment
### -wo: one line per overlapping pair; only the bin coordinates are kept
### sort | uniq -c: number of pairs per bin; the second awk moves that count
###     from the first to the last column (tab separated)
bedtools intersect \
    -a "${FP_BIN}" \
    -b "${FP_BED}" \
    -f 1.0 \
    -F 1.0 \
    -e \
    -wo |\
    awk -v OFS='\t' '{print $1, $2, $3}' |\
    sort |\
    uniq -c |\
    awk -v OFS='\t' '{print $2, $3, $4, $1}' |\
    sort -k 1,1 -k2,2n |\
    gzip -c \
    > "${FP_OUT}"

### print file message
echo "OUTPUT:  ${FP_OUT}"
zcat "${FP_OUT}" | head -5
echo

### print end message
### NOTE(review): displaytime is not defined in this script; presumably it comes
### from config_duke.sh — confirm
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 28148983


In [46]:
# Show the log of array task 6 (the %a in the sbatch -o pattern is the array index).
# Quoted so the path is passed as one argument even if FD_LOG contains whitespace.
cat "${FD_LOG}/ASTARR_coverage_gata1_bin1000.6.txt"

Slurm Array Index:  6
Time Stamp:         04-19-22+19:42:32

INPUT 1: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/region_GATA1_bin1000.bed
chrX	47786400	47787400
chrX	47787400	47788400
chrX	47788400	47789400
chrX	47789400	47790400
chrX	47790400	47791400

INPUT 2: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Output_rep1.GATA1.unstranded.bed
chrX	47787189	47787363	chrX_47787189_47787363
chrX	47787189	47787363	chrX_47787189_47787363
chrX	47787189	47787363	chrX_47787189_47787363
chrX	47787189	47787363	chrX_47787189_47787363
chrX	47787189	47787363	chrX_47787189_47787363

OUTPUT:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg38_ASTARRseq_Output_rep1.GATA1.unstranded.bin1000.bed.gz
chrX	47786400	47787400	6
chrX	47788400	47789400	115
chrX	47792400	47793400	2
chrX	47794400	47795400	38
chrX	47795400	47796400	24


Done!
Run Time: