**Set environment**

In [1]:
### Source the Duke-server config first, then the bedtools module loader
### (both paths are relative to the notebook's working directory).
### show_env is not defined in this notebook -- presumably it comes from
### config_duke.sh and prints the directory layout; confirm.
source ../config/config_duke.sh
source ../config/config_load_module_bedtools.sh
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_ENCODE_FCC/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc



## Calculate per-base coverage for input and output

**TEST**

In [2]:
### Dry run: for every region, list the fragment-count files that match and the
### per-base coverage file name that would be derived from each of them.
### Nothing is computed or written; the cell only echoes names.

### global variables
### (FD_RES is not set in this cell -- presumably exported by ../config/config_duke.sh; confirm)
FD_CNT=${FD_RES}/Tewhey_K562_TileMPRA/fragment_count
FD_COV=${FD_RES}/Tewhey_K562_TileMPRA/coverage

### column numbers and their labels; IDX picks which pair to preview
IDX_COLS=(8 9)
IDN_COLS=(Input Output)
IDX=1

### the selected column/label do not depend on the region, so resolve them once
IDX_COL=${IDX_COLS[${IDX}]}
IDN_COL=${IDN_COLS[${IDX}]}

### loop through each region
REGIONS=(GATA1 MYC FADS)
for REGION in "${REGIONS[@]}"; do

echo "Region: " "${REGION}"

### expand the glob directly instead of parsing `ls` output, so that paths are
### never re-split on whitespace
FP_CNTS=("${FD_CNT}"/*"${REGION}"*.round.bed)
for FP_INP in "${FP_CNTS[@]}"; do
    ### an unmatched glob is left as the literal pattern; report it and move on
    if [[ ! -e "${FP_INP}" ]]; then
        echo "WARNING: no file matches ${FP_INP}" >&2
        continue
    fi
    FN_INP=$(basename "${FP_INP}")
    ### the glob guarantees the ".round.bed" suffix that is stripped here
    FN_OUT=${FN_INP%.round.bed}.${IDN_COL}.perbase.tsv.gz
    echo "FN_INP:" "${FN_INP}"
    echo "FN_OUT:" "${FN_OUT}"
done
echo

done

Region:  GATA1
FN_INP: Tile_K562_hg38_20210130.GATA1.stranded_pos.round.bed
FN_OUT: Tile_K562_hg38_20210130.GATA1.stranded_pos.Output.perbase.tsv.gz

Region:  MYC
FN_INP: Tile_K562_hg38_20210130.MYC.stranded_pos.round.bed
FN_OUT: Tile_K562_hg38_20210130.MYC.stranded_pos.Output.perbase.tsv.gz

Region:  FADS
FN_INP: Tile_K562_hg38_20200905.FADS.stranded_neg.round.bed
FN_OUT: Tile_K562_hg38_20200905.FADS.stranded_neg.Output.perbase.tsv.gz
FN_INP: Tile_K562_hg38_20200905.FADS.stranded_pos.round.bed
FN_OUT: Tile_K562_hg38_20200905.FADS.stranded_pos.Output.perbase.tsv.gz



**RUN:GATA1**

In [3]:
### Submit a 2-task Slurm array that computes per-base coverage for GATA1.
### The array index selects the column: task 0 -> column 8 ("Input"),
### task 1 -> column 9 ("Output"). %a in the log name is the array index,
### so each task writes its own log file.
### The heredoc delimiter is quoted, so every ${...} inside the script is
### expanded when the job runs, not in this notebook session.
sbatch -p "${NODE}" \
    --mem 20G \
    --array 0-1 \
    -o "${FD_LOG}/coverage_perbase_TMPRA_GATA1.%a.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
### (FD_RES and displaytime are not defined here -- presumably provided by config_duke.sh; confirm)
source ../config/config_duke.sh
REGION=GATA1

FD_CNT=${FD_RES}/Tewhey_K562_TileMPRA/fragment_count
FD_COV=${FD_RES}/Tewhey_K562_TileMPRA/coverage

### columns of the fragment-count BED to summarise, and the label used in the output name
IDX_COLS=(8 9)
IDN_COLS=(Input Output)

### set input and output: the array task index selects the column/label pair
IDX_COL=${IDX_COLS[${SLURM_ARRAY_TASK_ID}]}
IDN_COL=${IDN_COLS[${SLURM_ARRAY_TASK_ID}]}

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo


### loop through files
### (glob expanded directly rather than parsing `ls`, so paths are never re-split)
N_FAIL=0
FP_CNTS=("${FD_CNT}"/*"${REGION}"*.round.bed)
for FP_INP in "${FP_CNTS[@]}"; do
    ### an unmatched glob is left as the literal pattern: nothing to process
    if [[ ! -e "${FP_INP}" ]]; then
        echo "ERROR: no fragment-count file matches ${FP_INP}" >&2
        N_FAIL=$((N_FAIL + 1))
        continue
    fi

    FN_INP=$(basename "${FP_INP}")
    ### the glob guarantees the ".round.bed" suffix that is stripped here
    FN_OUT=${FN_INP%.round.bed}.${IDN_COL}.perbase.tsv.gz
    FP_OUT=${FD_COV}/${FN_OUT}
    echo ++++++++++++++++++++++++++++++++++++++
    echo

    ### show input file
    echo "Input: " "${FP_INP}"
    echo
    echo "show first few lines of input"
    head -n 5 "${FP_INP}"
    echo
    echo "show last few lines of input"
    tail -n 5 "${FP_INP}"
    echo

    ### execute: for each interval of the per-base BED, take the mean of
    ### column ${IDX_COL} over the overlapping fragments
    bedtools map \
        -a "${FD_COV}/region_${REGION}_perbase.bed" \
        -b "${FP_INP}" \
        -o mean \
        -c "${IDX_COL}" |\
        gzip -c > "${FP_OUT}"
    ### capture both exit codes immediately: without this a bedtools failure is
    ### masked by gzip succeeding and an empty .gz is left behind
    STATUS=("${PIPESTATUS[@]}")
    if (( STATUS[0] != 0 || STATUS[1] != 0 )); then
        echo "ERROR: bedtools map | gzip failed (exit codes: ${STATUS[*]}) for ${FP_INP}" >&2
        rm -f "${FP_OUT}"
        N_FAIL=$((N_FAIL + 1))
        continue
    fi

    ### show output file
    echo "Output: " "${FP_OUT}"
    echo
    echo "show first few lines of output:"
    zcat "${FP_OUT}" | head -n 5
    echo
    echo "show last few lines of output"
    zcat "${FP_OUT}" | tail -n 5
    echo

done

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
if (( N_FAIL == 0 )); then
    echo 'Done!'
else
    echo "Finished with ${N_FAIL} failed file(s)" >&2
fi
echo "Run Time: $(displaytime "${runtime}")"

### a non-zero exit status lets Slurm record the array task as failed
(( N_FAIL == 0 )) || exit 1

EOF

Submitted batch job 29149293


**RUN:MYC**

In [4]:
### Submit a 2-task Slurm array that computes per-base coverage for MYC.
### The array index selects the column: task 0 -> column 8 ("Input"),
### task 1 -> column 9 ("Output"). %a in the log name is the array index,
### so each task writes its own log file.
### The heredoc delimiter is quoted, so every ${...} inside the script is
### expanded when the job runs, not in this notebook session.
sbatch -p "${NODE}" \
    --mem 20G \
    --array 0-1 \
    -o "${FD_LOG}/coverage_perbase_TMPRA_MYC.%a.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
### (FD_RES and displaytime are not defined here -- presumably provided by config_duke.sh; confirm)
source ../config/config_duke.sh
REGION=MYC

FD_CNT=${FD_RES}/Tewhey_K562_TileMPRA/fragment_count
FD_COV=${FD_RES}/Tewhey_K562_TileMPRA/coverage

### columns of the fragment-count BED to summarise, and the label used in the output name
IDX_COLS=(8 9)
IDN_COLS=(Input Output)

### set input and output: the array task index selects the column/label pair
IDX_COL=${IDX_COLS[${SLURM_ARRAY_TASK_ID}]}
IDN_COL=${IDN_COLS[${SLURM_ARRAY_TASK_ID}]}

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo


### loop through files
### (glob expanded directly rather than parsing `ls`, so paths are never re-split)
N_FAIL=0
FP_CNTS=("${FD_CNT}"/*"${REGION}"*.round.bed)
for FP_INP in "${FP_CNTS[@]}"; do
    ### an unmatched glob is left as the literal pattern: nothing to process
    if [[ ! -e "${FP_INP}" ]]; then
        echo "ERROR: no fragment-count file matches ${FP_INP}" >&2
        N_FAIL=$((N_FAIL + 1))
        continue
    fi

    FN_INP=$(basename "${FP_INP}")
    ### the glob guarantees the ".round.bed" suffix that is stripped here
    FN_OUT=${FN_INP%.round.bed}.${IDN_COL}.perbase.tsv.gz
    FP_OUT=${FD_COV}/${FN_OUT}
    echo ++++++++++++++++++++++++++++++++++++++
    echo

    ### show input file
    echo "Input: " "${FP_INP}"
    echo
    echo "show first few lines of input"
    head -n 5 "${FP_INP}"
    echo
    echo "show last few lines of input"
    tail -n 5 "${FP_INP}"
    echo

    ### execute: for each interval of the per-base BED, take the mean of
    ### column ${IDX_COL} over the overlapping fragments
    bedtools map \
        -a "${FD_COV}/region_${REGION}_perbase.bed" \
        -b "${FP_INP}" \
        -o mean \
        -c "${IDX_COL}" |\
        gzip -c > "${FP_OUT}"
    ### capture both exit codes immediately: without this a bedtools failure is
    ### masked by gzip succeeding and an empty .gz is left behind
    STATUS=("${PIPESTATUS[@]}")
    if (( STATUS[0] != 0 || STATUS[1] != 0 )); then
        echo "ERROR: bedtools map | gzip failed (exit codes: ${STATUS[*]}) for ${FP_INP}" >&2
        rm -f "${FP_OUT}"
        N_FAIL=$((N_FAIL + 1))
        continue
    fi

    ### show output file
    echo "Output: " "${FP_OUT}"
    echo
    echo "show first few lines of output:"
    zcat "${FP_OUT}" | head -n 5
    echo
    echo "show last few lines of output"
    zcat "${FP_OUT}" | tail -n 5
    echo

done

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
if (( N_FAIL == 0 )); then
    echo 'Done!'
else
    echo "Finished with ${N_FAIL} failed file(s)" >&2
fi
echo "Run Time: $(displaytime "${runtime}")"

### a non-zero exit status lets Slurm record the array task as failed
(( N_FAIL == 0 )) || exit 1

EOF

Submitted batch job 29149294


**CHECK**

In [5]:
### confirm that the log of array task 1 exists for each submitted region
### (in the RUN cells, index 1 selects IDX_COLS[1]=9 / IDN_COLS[1]=Output)
ls -l ${FD_LOG}/coverage_perbase_TMPRA_GATA1.1.txt
ls -l ${FD_LOG}/coverage_perbase_TMPRA_MYC.1.txt

-rw-rw-r-- 1 kk319 reddylab 1629 Oct 12 14:54 /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/coverage_perbase_TMPRA_GATA1.1.txt
-rw-rw-r-- 1 kk319 reddylab 1676 Oct 12 14:54 /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log/coverage_perbase_TMPRA_MYC.1.txt


In [6]:
### print the log written by array task 1 of the GATA1 job
cat ${FD_LOG}/coverage_perbase_TMPRA_GATA1.1.txt

Hostname:           x1-01-3.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         10-12-22+14:54:47

++++++++++++++++++++++++++++++++++++++

Input:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_count/Tile_K562_hg38_20210130.GATA1.stranded_pos.round.bed

show first few lines of input
chrX	47786401	47786600	X:47786401-47786600	.	+	2.306	2117.787	10474.869
chrX	47786451	47786650	X:47786451-47786650	.	+	0.783	1386.007	2387.119
chrX	47786501	47786700	X:47786501-47786700	.	+	-0.129	1955.626	1788.236
chrX	47786551	47786750	X:47786551-47786750	.	+	0.588	2388.997	3591.264
chrX	47786601	47786800	X:47786601-47786800	.	+	0.067	2102.916	2202.385

show last few lines of input
chrX	49786351	49786550	X:49786351-49786550	.	+	-0.216	775.951	668.187
chrX	49786401	49786600	X:49786401-49786600	.	+	0.18	1556.617	1763.467
chrX	49786451	49786650	X:49786451-49786650	.	+	-0.373	127.91	98.961
chrX	49786501	49786700	X:49786501-49786700	.	+	-0.49	120.701	86.235
chrX

In [7]:
### print the log written by array task 1 of the MYC job
cat ${FD_LOG}/coverage_perbase_TMPRA_MYC.1.txt

Hostname:           x1-02-1.genome.duke.edu
Slurm Array Index:  1
Time Stamp:         10-12-22+14:54:47

++++++++++++++++++++++++++++++++++++++

Input:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_count/Tile_K562_hg38_20210130.MYC.stranded_pos.round.bed

show first few lines of input
chr8	126735901	126736100	8:126735901-126736100	.	+	3.216	889.87	8270.081
chr8	126735951	126736150	8:126735951-126736150	.	+	2.742	125.835	840.939
chr8	126736001	126736200	8:126736001-126736200	.	+	2.973	510.316	4005.357
chr8	126736051	126736250	8:126736051-126736250	.	+	0.518	702.549	1005.359
chr8	126736101	126736300	8:126736101-126736300	.	+	0.854	551.083	995.47

show last few lines of input
chr8	128735851	128736050	8:128735851-128736050	.	+	0.088	80.41	85.666
chr8	128735901	128736100	8:128735901-128736100	.	+	-0.095	582.526	546.063
chr8	128735951	128736150	8:128735951-128736150	.	+	0.181	909.672	1032.099
chr8	128736001	128736200	8:128736001-128736200	.	+	0.82