**Set environment**

In [1]:
source ../config/config_duke.sh
source ../config/config_load_module_bedtools.sh
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_ENCODE_FCC/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc



## Count total fragments for each region

**TEST**

In [2]:
FD_BED=${FD_RES}/A001_K562_WSTARRseq/fragment_count
FD_COV=${FD_RES}/A001_K562_WSTARRseq/coverage

REP=rep1
REGION=GATA1
FP_CNTS=($(ls ${FD_BED}/*${REP}*${REGION}*.unstranded.bed))
for FP_INP in ${FP_CNTS[@]}; do
    FN_INP=$(basename ${FP_INP})
    FN_OUT=${FN_INP%.*}.total_count.tsv
    echo "FN_INP:" ${FN_INP}
    echo "FN_OUT:" ${FN_OUT}
done

FN_INP: A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed
FN_OUT: A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.total_count.tsv
FN_INP: A001_K562_WSTARRseq_Output_rep1.GATA1.unstranded.bed
FN_OUT: A001_K562_WSTARRseq_Output_rep1.GATA1.unstranded.total_count.tsv


**RUN: GATA1**

In [3]:
sbatch -p ${NODE} \
    --mem 20G \
    --array 1-6 \
    -o ${FD_LOG}/coverage_total_count_WSTARR_GATA1.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh
REGION=GATA1

### set input and output directory
FD_CNT=${FD_RES}/A001_K562_WSTARRseq/fragment_count
FD_COV=${FD_RES}/A001_K562_WSTARRseq/coverage
REP=rep${SLURM_ARRAY_TASK_ID}

### print start message
timer_start=`date +%s`
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

FP_CNTS=($(ls ${FD_CNT}/*${REP}*${REGION}*.unstranded.bed))
for FP_INP in ${FP_CNTS[@]}; do

    ### set input and output file
    FN_INP=$(basename ${FP_INP})
    FN_OUT=${FN_INP%.*}.total_count.tsv
    FP_OUT=${FD_COV}/${FN_OUT}
    echo ++++++++++++++++++++++++++++++++++++++
    echo
    
    ### show input file
    echo "Input: " ${FP_INP}
    echo
    echo "show first few lines of input"
    cat ${FP_INP} | head -5
    echo
    echo "show last few lines of input"
    cat ${FP_INP} | tail -n 5
    echo
    
    ### execute
    bedtools map \
        -a ${FD_COV}/region_${REGION}.bed \
        -b ${FP_INP} \
        -o sum > ${FP_OUT}
    
    ### show output file
    echo "Output: " ${FP_OUT}
    echo
    cat ${FP_OUT}
    echo
done

### print end message
timer=`date +%s`
runtime=$(echo "${timer} - ${timer_start}" | bc -l)
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF

Submitted batch job 29149297


**RUN: MYC**

In [4]:
sbatch -p ${NODE} \
    --mem 20G \
    --array 1-6 \
    -o ${FD_LOG}/coverage_total_count_WSTARR_MYC.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh
REGION=MYC

### set input and output directory
FD_CNT=${FD_RES}/A001_K562_WSTARRseq/fragment_count
FD_COV=${FD_RES}/A001_K562_WSTARRseq/coverage
REP=rep${SLURM_ARRAY_TASK_ID}

### print start message
timer_start=`date +%s`
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

FP_CNTS=($(ls ${FD_CNT}/*${REP}*${REGION}*.unstranded.bed))
for FP_INP in ${FP_CNTS[@]}; do

    ### set input and output file
    FN_INP=$(basename ${FP_INP})
    FN_OUT=${FN_INP%.*}.total_count.tsv
    FP_OUT=${FD_COV}/${FN_OUT}
    echo ++++++++++++++++++++++++++++++++++++++
    echo
    
    ### show input file
    echo "Input: " ${FP_INP}
    echo
    echo "show first few lines of input"
    cat ${FP_INP} | head -5
    echo
    echo "show last few lines of input"
    cat ${FP_INP} | tail -n 5
    echo
    
    ### execute
    bedtools map \
        -a ${FD_COV}/region_${REGION}.bed \
        -b ${FP_INP} \
        -o sum > ${FP_OUT}
    
    ### show output file
    echo "Output: " ${FP_OUT}
    echo
    cat ${FP_OUT}
    echo
done

### print end message
timer=`date +%s`
runtime=$(echo "${timer} - ${timer_start}" | bc -l)
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF


Submitted batch job 29149298


**CHECK**

In [5]:
ls -l ${FD_LOG}/coverage_total_count_WSTARR_GATA1.1.txt
ls -l ${FD_LOG}/coverage_total_count_WSTARR_MYC.1.txt

-rw-rw-r-- 1 kk319 reddylab 2035 Oct 12 15:08 /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/coverage_total_count_WSTARR_GATA1.1.txt
-rw-rw-r-- 1 kk319 reddylab 2095 Oct 12 15:08 /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/coverage_total_count_WSTARR_MYC.1.txt


In [7]:
cat ${FD_LOG}/coverage_total_count_WSTARR_GATA1.1.txt

Hostname:           x1-01-2.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         10-12-22+15:08:17

++++++++++++++++++++++++++++++++++++++

Input:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/fragment_count/A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed

show first few lines of input
chrX	47787533	47787772	chrX_47787533_47787772	1	.
chrX	47787569	47787735	chrX_47787569_47787735	1	.
chrX	47787714	47788125	chrX_47787714_47788125	1	.
chrX	47787767	47788062	chrX_47787767_47788062	1	.
chrX	47788341	47788759	chrX_47788341_47788759	1	.

show last few lines of input
chrX	49785558	49785892	chrX_49785558_49785892	1	.
chrX	49785590	49786084	chrX_49785590_49786084	1	.
chrX	49785687	49785991	chrX_49785687_49785991	1	.
chrX	49785714	49786172	chrX_49785714_49786172	1	.
chrX	49786298	49786464	chrX_49786298_49786464	1	.

Output:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage/A001_K562_WSTARRseq_Input_rep1.GATA1.uns

In [8]:
cat ${FD_LOG}/coverage_total_count_WSTARR_MYC.1.txt

Hostname:           x2-04-4
Slurm Array Index:  1
Time Stamp:         10-12-22+15:08:17

++++++++++++++++++++++++++++++++++++++

Input:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/fragment_count/A001_K562_WSTARRseq_Input_rep1.MYC.unstranded.bed

show first few lines of input
chr8	126736069	126736518	chr8_126736069_126736518	1	.
chr8	126736096	126736559	chr8_126736096_126736559	1	.
chr8	126736430	126736781	chr8_126736430_126736781	1	.
chr8	126736624	126736744	chr8_126736624_126736744	1	.
chr8	126737079	126737259	chr8_126737079_126737259	1	.

show last few lines of input
chr8	128735526	128735941	chr8_128735526_128735941	1	.
chr8	128735556	128735920	chr8_128735556_128735920	1	.
chr8	128735558	128735985	chr8_128735558_128735985	1	.
chr8	128735657	128736014	chr8_128735657_128736014	1	.
chr8	128735698	128735849	chr8_128735698_128735849	1	.

Output:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/A001_K562_WSTARRseq/coverage/A001_K562_WSTARRse

## Calculate coverage perbase for input and output

**TEST**

In [2]:
FD_BED=${FD_RES}/A001_K562_WSTARRseq/fragment_count
FD_COV=${FD_RES}/A001_K562_WSTARRseq/coverage

REP=rep1
REGION=
FP_BEDS=($(ls ${FD_BED}/*${REP}*bed))
for FP_BED in ${FP_BEDS[@]}; do
    FN_BED=$(basename ${FP_BED})
    FN_OUT=${FN_BED%.*}.perbase.tsv.gz
    echo ${FN_BED}
    echo ${FN_OUT}
done

A001_K562_WSTARRseq_Input_rep1.GATA1.stranded_neg.bed
A001_K562_WSTARRseq_Input_rep1.GATA1.stranded_neg.perbase.tsv.gz
A001_K562_WSTARRseq_Input_rep1.GATA1.stranded_pos.bed
A001_K562_WSTARRseq_Input_rep1.GATA1.stranded_pos.perbase.tsv.gz
A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.bed
A001_K562_WSTARRseq_Input_rep1.GATA1.unstranded.perbase.tsv.gz
A001_K562_WSTARRseq_Input_rep1.MYC.stranded_neg.bed
A001_K562_WSTARRseq_Input_rep1.MYC.stranded_neg.perbase.tsv.gz
A001_K562_WSTARRseq_Input_rep1.MYC.stranded_pos.bed
A001_K562_WSTARRseq_Input_rep1.MYC.stranded_pos.perbase.tsv.gz
A001_K562_WSTARRseq_Input_rep1.MYC.unstranded.bed
A001_K562_WSTARRseq_Input_rep1.MYC.unstranded.perbase.tsv.gz
A001_K562_WSTARRseq_Output_rep1.GATA1.stranded_neg.bed
A001_K562_WSTARRseq_Output_rep1.GATA1.stranded_neg.perbase.tsv.gz
A001_K562_WSTARRseq_Output_rep1.GATA1.stranded_pos.bed
A001_K562_WSTARRseq_Output_rep1.GATA1.stranded_pos.perbase.tsv.gz
A001_K562_WSTARRseq_Output_rep1.GATA1.unstranded.bed
A001_K562_W

**RUN:GATA1**

In [5]:
sbatch -p ${NODE} \
    --mem 20G \
    --array 1-4 \
    -o ${FD_LOG}/coverage_perbase_WSTARR_GATA1.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh
REGION=GATA1

### set input and output directory
FD_CNT=${FD_RES}/A001_K562_WSTARRseq/fragment_count
FD_COV=${FD_RES}/A001_K562_WSTARRseq/coverage
REP=rep${SLURM_ARRAY_TASK_ID}

### print start message
timer_start=`date +%s`
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

FP_CNTS=($(ls ${FD_CNT}/*${REP}*${REGION}*bed))
for FP_INP in ${FP_CNTS[@]}; do

    ### set input and output file
    FN_INP=$(basename ${FP_INP})
    FN_OUT=${FN_INP%.*}.perbase.tsv.gz
    FP_OUT=${FD_COV}/${FN_OUT}
    echo ++++++++++++++++++++++++++++++++++++++
    echo
    
    ### show input file
    echo "Input: " ${FP_INP}
    echo
    echo "show first few lines of input"
    cat ${FP_INP} | head -5
    echo
    echo "show last few lines of input"
    cat ${FP_INP} | tail -n 5
    echo
    
    ### execute
    bedtools map \
        -a ${FD_COV}/region_${REGION}_perbase.bed \
        -b ${FP_INP} \
        -o sum |\
        gzip -c > ${FP_OUT}
    
    ### show output file
    echo "Output: " ${FP_OUT}
    echo
    echo "show first few lines of output:"
    zcat ${FP_OUT} | head -5
    echo
    echo "show last few lines of output"
    zcat ${FP_OUT} | tail -n 5
    echo
done

### print end message
timer=`date +%s`
runtime=$(echo "${timer} - ${timer_start}" | bc -l)
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF

Submitted batch job 29155997


**RUN:MYC**

In [6]:
sbatch -p ${NODE} \
    --mem 20G \
    --array 1-4 \
    -o ${FD_LOG}/coverage_perbase_WSTARR_MYC.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh
REGION=MYC

### set input and output directory
FD_CNT=${FD_RES}/A001_K562_WSTARRseq/fragment_count
FD_COV=${FD_RES}/A001_K562_WSTARRseq/coverage
REP=rep${SLURM_ARRAY_TASK_ID}

### print start message
timer_start=`date +%s`
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

FP_CNTS=($(ls ${FD_CNT}/*${REP}*${REGION}*bed))
for FP_INP in ${FP_CNTS[@]}; do

    ### set input and output file
    FN_INP=$(basename ${FP_INP})
    FN_OUT=${FN_INP%.*}.perbase.tsv.gz
    FP_OUT=${FD_COV}/${FN_OUT}
    echo ++++++++++++++++++++++++++++++++++++++
    echo
    
    ### show input file
    echo "Input: " ${FP_INP}
    echo
    echo "show first few lines of input"
    cat ${FP_INP} | head -5
    echo
    echo "show last few lines of input"
    cat ${FP_INP} | tail -n 5
    echo
    
    ### execute
    bedtools map \
        -a ${FD_COV}/region_${REGION}_perbase.bed \
        -b ${FP_INP} \
        -o sum |\
        gzip -c > ${FP_OUT}
    
    ### show output file
    echo "Output: " ${FP_OUT}
    echo
    echo "show first few lines of output:"
    zcat ${FP_OUT} | head -5
    echo
    echo "show last few lines of output"
    zcat ${FP_OUT} | tail -n 5
    echo
done

### print end message
timer=`date +%s`
runtime=$(echo "${timer} - ${timer_start}" | bc -l)
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF

Submitted batch job 29155998


**CHECK**

In [11]:
ls -l ${FD_LOG}/coverage_perbase_ASTARR_GATA1.1.txt
ls -l ${FD_LOG}/coverage_perbase_ASTARR_MYC.1.txt

-rw-rw-r-- 1 kk319 reddylab 2156 Oct 12 14:18 /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/coverage_perbase_ASTARR_GATA1.1.txt
-rw-rw-r-- 1 kk319 reddylab 2250 Oct 12 14:18 /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/coverage_perbase_ASTARR_MYC.1.txt


In [12]:
cat ${FD_LOG}/coverage_perbase_ASTARR_GATA1.1.txt

Hostname:           x1-01-2.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         10-12-22+14:18:17

++++++++++++++++++++++++++++++++++++++

Input:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment_count/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.stranded_neg.bed

show first few lines of input
chrX	47787557	47787773	chrX_47787557_47787773_-	1	-
chrX	47787593	47787783	chrX_47787593_47787783_-	1	-
chrX	47787598	47787772	chrX_47787598_47787772_-	1	-
chrX	47787743	47787849	chrX_47787743_47787849_-	1	-
chrX	47788008	47788157	chrX_47788008_47788157_-	1	-

show last few lines of input
chrX	49786139	49786285	chrX_49786139_49786285_-	1	-
chrX	49786329	49786456	chrX_49786329_49786456_-	1	-
chrX	49786338	49786527	chrX_49786338_49786527_-	1	-
chrX	49786423	49786588	chrX_49786423_49786588_-	1	-
chrX	49786428	49786620	chrX_49786428_49786620_-	1	-

Output:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/coverage/KS91_K562_hg3

In [13]:
cat ${FD_LOG}/coverage_perbase_ASTARR_MYC.1.txt

Hostname:           x3-03-2.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         10-12-22+14:18:17

++++++++++++++++++++++++++++++++++++++

Input:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91_K562_ASTARRseq/fragment_count/KS91_K562_hg38_ASTARRseq_Input_rep1.MYC.stranded_neg.bed

show first few lines of input
chr8	126736091	126736259	chr8_126736091_126736259_-	1	-
chr8	126736091	126736273	chr8_126736091_126736273_-	1	-
chr8	126736094	126736199	chr8_126736094_126736199_-	1	-
chr8	126736118	126736221	chr8_126736118_126736221_-	1	-
chr8	126736120	126736273	chr8_126736120_126736273_-	1	-

show last few lines of input
chr8	128735657	128735838	chr8_128735657_128735838_-	1	-
chr8	128735669	128735923	chr8_128735669_128735923_-	1	-
chr8	128735723	128735879	chr8_128735723_128735879_-	1	-
chr8	128735821	128736138	chr8_128735821_128736138_-	1	-
chr8	128735934	128736149	chr8_128735934_128736149_-	1	-

Output:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/KS91