**Set environment**

In [1]:
### environment setup
# load the project configuration into the current shell
. ../config/config_duke.sh
# run the bedtools module-loading config in the current shell
. ../config/config_load_module_bedtools.sh
# print the resulting environment summary
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/out
CODE DIRECTORY (FD_CODE): /data/reddylab/Kuei/code
SING DIRECTORY (FD_SING): /data/reddylab/Kuei/singularity
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/code/Proj_CombEffect_ENCODE_FCC
PATH OF RESULTS (FD_RES): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc
PATH OF LOG     (FD_LOG): /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/log



**Check data**

In [11]:
# list, one entry per line, the contents of the GENCODE annotation folder
# that the annotation jobs below read from
ls -1 ${FD_RES}/results/region/annotation_gencode

gencode.v42.basic.annotation.tes.bed
gencode.v42.basic.annotation.tss.bed


## Annotate ASTARR-seq input (ATAC) peaks using GENCODE v42 TSS sites

In [6]:
# Submit a single-task Slurm array job that intersects the ASTARR-seq input
# peaks with the GENCODE TSS annotation and writes a gzipped BED-like table.
# The heredoc delimiter is quoted ('EOF'), so every ${...} inside the job
# script is expanded when the job runs, not at submission time.
# NOTE(review): 8 CPUs are requested, but the job only runs one
# `bedtools | gzip` pipeline -- confirm whether fewer would be enough.
sbatch -p ${NODE} \
    --exclude=dl-01 \
    --cpus-per-task 8 \
    --mem 8G \
    --output ${FD_LOG}/annotation_astarr_peak_macs_input_gencode_tss.txt \
    --array 0 \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh

### make a pipeline fail if ANY stage fails (not only the last one),
### so that a bedtools error is not hidden behind a successful gzip
set -o pipefail

### set input and output
FD_BED=${FD_RES}/results/region/KS91_K562_ASTARRseq_peak_macs_input
FN_BED=KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.bed
FP_BED=${FD_BED}/${FN_BED}

FD_ANN=${FD_RES}/results/region/annotation_gencode
FN_ANN=gencode.v42.basic.annotation.tss.bed
FP_ANN=${FD_ANN}/${FN_ANN}

### NOTE(review): the output name says "gencode_v24" although the annotation
### file is GENCODE v42. The name is kept unchanged because other code may
### read this exact path -- confirm before renaming.
FD_OUT=${FD_RES}/results/region/KS91_K562_ASTARRseq_peak_macs_input
FN_OUT=peak.annotation.gencode_v24_tss.bed.gz
FP_OUT=${FD_OUT}/${FN_OUT}

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### stop early with a clear message if an input file is missing/unreadable
for fp_input in "${FP_BED}" "${FP_ANN}"; do
    if [[ ! -r "${fp_input}" ]]; then
        echo "ERROR: cannot read input file: ${fp_input}" >&2
        exit 1
    fi
done

### show I/O file
echo "Input: " "${FP_BED}"
echo
echo "show first few lines of input"
head -n 5 "${FP_BED}"
echo
echo "Input: " "${FP_ANN}"
echo
echo "show first few lines of input"
head -n 5 "${FP_ANN}"
echo

### init: create output folder if not exist
mkdir -p "${FD_OUT}"

### annotation using intersect
### -wo: report each peak (A) with the overlapping annotation entry (B) and
### the overlap size; peaks without any overlap are not reported
if ! bedtools intersect -a "${FP_BED}" -b "${FP_ANN}" -wo | gzip -c > "${FP_OUT}"; then
    echo "ERROR: bedtools intersect failed; removing incomplete output: ${FP_OUT}" >&2
    rm -f "${FP_OUT}"
    exit 1
fi

### show output file
echo
echo "Output: " "${FP_OUT}"
echo
echo "show first few lines of output:"
zcat "${FP_OUT}" | head -n 5
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 29556814


**Check results**

In [8]:
# print the Slurm log written by the TSS annotation job (its --output file)
cat ${FD_LOG}/annotation_astarr_peak_macs_input_gencode_tss.txt

Hostname:           x3-01-3.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         03-07-23+16:00:01

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.bed

show first few lines of input
chr1	10015	10442
chr1	14253	14645
chr1	16015	16477
chr1	17237	17772
chr1	28903	29613

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/annotation_gencode/gencode.v42.basic.annotation.tss.bed

show first few lines of input
chr1	11868	11869	ENSG00000290825.1	.	+	DDX11L2	2	overlaps_pseudogene
chr1	12009	12010	ENSG00000223972.6	.	+	DDX11L1	2	HGNC:37102
chr1	29570	29571	ENSG00000227232.5	.	-	WASH7P	2	HGNC:38034
chr1	17436	17437	ENSG00000278267.1	.	-	MIR6859-1	3	HGNC:50039
chr1	29553	29554	ENSG00000243485.5	.	+	MIR1302-2HG	2	HGNC:52482


Output:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/peak.anno

## Annotate ASTARR-seq input (ATAC) peaks using GENCODE v42 TES sites

In [7]:
# Submit a single-task Slurm array job that intersects the ASTARR-seq input
# peaks with the GENCODE TES annotation and writes a gzipped BED-like table.
# The heredoc delimiter is quoted ('EOF'), so every ${...} inside the job
# script is expanded when the job runs, not at submission time.
# NOTE(review): 8 CPUs are requested, but the job only runs one
# `bedtools | gzip` pipeline -- confirm whether fewer would be enough.
sbatch -p ${NODE} \
    --exclude=dl-01 \
    --cpus-per-task 8 \
    --mem 8G \
    --output ${FD_LOG}/annotation_astarr_peak_macs_input_gencode_tes.txt \
    --array 0 \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config/config_duke.sh

### make a pipeline fail if ANY stage fails (not only the last one),
### so that a bedtools error is not hidden behind a successful gzip
set -o pipefail

### set input and output
FD_BED=${FD_RES}/results/region/KS91_K562_ASTARRseq_peak_macs_input
FN_BED=KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.bed
FP_BED=${FD_BED}/${FN_BED}

FD_ANN=${FD_RES}/results/region/annotation_gencode
FN_ANN=gencode.v42.basic.annotation.tes.bed
FP_ANN=${FD_ANN}/${FN_ANN}

### NOTE(review): the output name says "gencode_v24" although the annotation
### file is GENCODE v42. The name is kept unchanged because other code may
### read this exact path -- confirm before renaming.
FD_OUT=${FD_RES}/results/region/KS91_K562_ASTARRseq_peak_macs_input
FN_OUT=peak.annotation.gencode_v24_tes.bed.gz
FP_OUT=${FD_OUT}/${FN_OUT}

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### stop early with a clear message if an input file is missing/unreadable
for fp_input in "${FP_BED}" "${FP_ANN}"; do
    if [[ ! -r "${fp_input}" ]]; then
        echo "ERROR: cannot read input file: ${fp_input}" >&2
        exit 1
    fi
done

### show I/O file
echo "Input: " "${FP_BED}"
echo
echo "show first few lines of input"
head -n 5 "${FP_BED}"
echo
echo "Input: " "${FP_ANN}"
echo
echo "show first few lines of input"
head -n 5 "${FP_ANN}"
echo

### init: create output folder if not exist
mkdir -p "${FD_OUT}"

### annotation using intersect
### -wo: report each peak (A) with the overlapping annotation entry (B) and
### the overlap size; peaks without any overlap are not reported
if ! bedtools intersect -a "${FP_BED}" -b "${FP_ANN}" -wo | gzip -c > "${FP_OUT}"; then
    echo "ERROR: bedtools intersect failed; removing incomplete output: ${FP_OUT}" >&2
    rm -f "${FP_OUT}"
    exit 1
fi

### show output file
echo
echo "Output: " "${FP_OUT}"
echo
echo "show first few lines of output:"
zcat "${FP_OUT}" | head -n 5
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 29556815


**Check results**

In [9]:
# print the Slurm log written by the TES annotation job (its --output file)
cat ${FD_LOG}/annotation_astarr_peak_macs_input_gencode_tes.txt

Hostname:           x3-02-1.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         03-07-23+16:00:01

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/KS91_K562_hg38_ASTARRseq_Input.all_reps.masked.union_narrowPeak.q5.bed

show first few lines of input
chr1	10015	10442
chr1	14253	14645
chr1	16015	16477
chr1	17237	17772
chr1	28903	29613

Input:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/annotation_gencode/gencode.v42.basic.annotation.tes.bed

show first few lines of input
chr1	14409	14410	ENSG00000290825.1	.	+	DDX11L2	2	overlaps_pseudogene
chr1	13670	13671	ENSG00000223972.6	.	+	DDX11L1	2	HGNC:37102
chr1	14403	14404	ENSG00000227232.5	.	-	WASH7P	2	HGNC:38034
chr1	17368	17369	ENSG00000278267.1	.	-	MIR6859-1	3	HGNC:50039
chr1	31109	31110	ENSG00000243485.5	.	+	MIR1302-2HG	2	HGNC:52482


Output:  /data/reddylab/Kuei/out/proj_combeffect_encode_fcc/results/region/KS91_K562_ASTARRseq_peak_macs_input/peak.anno