# Annotate fragments using non-redundant TF motif archetypes

**Set environment**

In [1]:
### set env
# Load the project configuration first; later cells read FD_RES, FD_LOG and NODE,
# which are presumably defined there (TODO confirm). The -v flag appears to
# enable the server/path summary printed below.
source ../config_duke.sh -v
# Load the bedtools module setup (presumably puts `bedtools` on PATH -- TODO confirm).
source ../config_load_module_bedtools.sh

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_ENCODE_FCC/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc



In [2]:
# List the per-replicate BED files in the count folder of the KS91 K562 ASTARR-seq results.
ls ${FD_RES}/KS91_K562_ASTARRseq/count

KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep2.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep3.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep4.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep5.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep6.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Output_rep1.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep2.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep3.GATA1.unstranded.bed
KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed


In [3]:
### preview: pick one input BED file and print its full path and file name
# Expand the glob directly into an array instead of parsing `ls` output, so a
# path containing whitespace is kept as a single element.
# NOTE(review): this cell reads from count/, while the submitted job below reads
# from fragment/ -- confirm which folder is intended.
FP_BEDS=("${FD_RES}"/KS91_K562_ASTARRseq/count/*bed)

# Index of the file to preview (0 = first file in glob order)
IDX=0
FP_BED="${FP_BEDS[${IDX}]}"
FN_BED="$(basename "${FP_BED}")"

echo "${FP_BED}"
echo "${FN_BED}"

/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/count/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed


## RUN

In [4]:
# Sanity check before submitting: the log folder used by `sbatch -o` must exist,
# and NODE is the value passed to `sbatch -p` in the next cell.
ls -d ${FD_LOG}
echo ${NODE}

[0m[38;5;27m/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/log[0m
all


In [5]:
# Submit a 10-task Slurm array job (indices 0-9). Each task annotates one
# fragment BED file with the motif archetype intervals of one chromosome and
# writes the gzipped `bedtools intersect` result to the annotation folder.
# The heredoc delimiter is quoted ('EOF'), so nothing inside the script is
# expanded by the submitting shell; all variables are resolved on the node.
sbatch -p "${NODE}" \
    --mem 20G \
    --array 0-9 \
    -o "${FD_LOG}/ASTARR_annotation.%a.txt" \
    <<'EOF'
#!/bin/bash
### set directories & global variables
# config_duke.sh is expected to define FD_RES, FD_ANN and the displaytime
# helper used at the end -- TODO confirm against the config file.
source ../config_duke.sh
CHROM=chrX

### set input and output
# Expand the glob straight into an array (no `ls` parsing, whitespace-safe).
FP_BEDS=("${FD_RES}"/KS91_K562_ASTARRseq/fragment/*bed)
FP_BED="${FP_BEDS[${SLURM_ARRAY_TASK_ID}]:-}"

# The array range is hard-coded at submission, so stop early when this index
# has no matching file (too few inputs, or the glob matched nothing).
if [[ ! -f "${FP_BED}" ]]; then
    echo "ERROR: no input BED file for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi

FD_OUT="${FD_RES}/KS91_K562_ASTARRseq/annotation"
FN_OUT="$(basename "${FP_BED}").gz"
FP_OUT="${FD_OUT}/${FN_OUT}"
FP_TMP="${FP_OUT}.tmp"

### motif annotation
FD_MTF="${FD_ANN}/motif_cluster_jvierstra/hg38_archetype_motifs_v1"
FN_MTF="${CHROM}_rm_mouse_merge.bed.gz"

### print start message
timer_start=$(date +%s)
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo
echo "Input  file: " "${FP_BED}"
echo "Output file: " "${FP_OUT}"
echo
echo "Show the first few lines of the input file"
echo "${FP_BED}"
head -n 3 "${FP_BED}"
echo

### init: create output folder if not exist
mkdir -p "${FD_OUT}" || exit 1

### annotation using intersect
# -wo reports each A/B pair plus the overlap size; -F 1.0 keeps only pairs
# where the motif interval (B) is entirely covered by the fragment (A).
# pipefail makes a bedtools failure visible even though gzip succeeds, and the
# result is written to a temporary file first so that a failed run never
# leaves a truncated file under the final name.
set -o pipefail
if ! bedtools intersect -a "${FP_BED}" -b "${FD_MTF}/${FN_MTF}" -wo -F 1.0 |\
        gzip -c > "${FP_TMP}"; then
    echo "ERROR: bedtools intersect failed for ${FP_BED}" >&2
    rm -f "${FP_TMP}"
    exit 1
fi
# Restore the default so the `zcat | head` preview below is not affected.
set +o pipefail
mv -f "${FP_TMP}" "${FP_OUT}" || exit 1

### print end message
echo "Show the first few lines of the output file"
echo "${FP_OUT}"
zcat "${FP_OUT}" | head -n 3
echo

# Elapsed wall-clock time in whole seconds (shell arithmetic, no bc needed)
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 27849232


## CHECK

In [6]:
# List the gzipped annotation files written by the array job (one per input BED file).
ls ${FD_RES}/KS91_K562_ASTARRseq/annotation

[0m[38;5;9mKS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Input_rep2.GATA1.unstranded.dedups.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Input_rep3.GATA1.unstranded.dedups.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Input_rep4.GATA1.unstranded.dedups.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Input_rep5.GATA1.unstranded.dedups.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Input_rep6.GATA1.unstranded.dedups.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Output_rep1.GATA1.unstranded.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Output_rep2.GATA1.unstranded.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Output_rep3.GATA1.unstranded.bed.gz[0m
[38;5;9mKS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed.gz[0m


In [6]:
# Print the Slurm log of array task 0 (file name set by `-o ...ASTARR_annotation.%a.txt`).
cat ${FD_LOG}/ASTARR_annotation.0.txt

Slurm Array Index:  0
Time Stamp:         03-24-22+22:17:52

Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
Output file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed.gz

Show the first few lines of the input file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
chrX	47787165	47787363	chrX_47787165_47787363
chrX	47787557	47787773	chrX_47787557_47787773
chrX	47787593	47787783	chrX_47787593_47787783

Show the first few lines of the output file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed.gz
chrX	47787165	47787363	chrX_47787165_47787363	chrX	47787180	47787190	FOX/8	5.1151	10
chrX	47787165	4778736

In [7]:
# Print the Slurm log of array task 0.
# NOTE(review): duplicate of the previous cell; the saved output carries an
# earlier time stamp (03-22-22) and a non-gzipped output path, so it looks like
# a leftover from a previous run -- consider re-running or removing this cell.
cat ${FD_LOG}/ASTARR_annotation.0.txt

Slurm Array Index:  0
Time Stamp:         03-22-22+23:20:38

Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
Output file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed

Show the first few lines of the input file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
chrX	47787165	47787363	chrX_47787165_47787363
chrX	47787557	47787773	chrX_47787557_47787773
chrX	47787593	47787783	chrX_47787593_47787783

Show the first few lines of the output file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Input_rep1.GATA1.unstranded.dedups.bed
chrX	47787165	47787363	chrX_47787165_47787363	chrX	47787140	47787167	KLF/SP/2	6.8205	2
chrX	47787165	47787363	ch

In [7]:
# Print the Slurm log of array task 9, the last index of `--array 0-9`.
cat ${FD_LOG}/ASTARR_annotation.9.txt

Slurm Array Index:  9
Time Stamp:         03-24-22+22:17:52

Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed
Output file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed.gz

Show the first few lines of the input file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed
chrX	47787173	47787363	chrX_47787173_47787363
chrX	47787173	47787363	chrX_47787173_47787363
chrX	47787173	47787363	chrX_47787173_47787363

Show the first few lines of the output file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed.gz
chrX	47787173	47787363	chrX_47787173_47787363	chrX	47787180	47787190	FOX/8	5.1151	10
chrX	47787173	47787363	chrX_47787173_47787363

In [8]:
# Print the Slurm log of array task 9.
# NOTE(review): duplicate of the previous cell; the saved output carries an
# earlier time stamp (03-22-22) and a non-gzipped output path, so it looks like
# a leftover from a previous run -- consider re-running or removing this cell.
cat ${FD_LOG}/ASTARR_annotation.9.txt

Slurm Array Index:  9
Time Stamp:         03-22-22+23:20:38

Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed
Output file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed

Show the first few lines of the input file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/fragment/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed
chrX	47787173	47787363	chrX_47787173_47787363
chrX	47787173	47787363	chrX_47787173_47787363
chrX	47787173	47787363	chrX_47787173_47787363

Show the first few lines of the output file
/gpfs/fs1/data/reddylab/Kuei/out/proj_encode_fcc/KS91_K562_ASTARRseq/annotation/KS91_K562_hg38_ASTARRseq_Output_rep4.GATA1.unstranded.bed
chrX	47787173	47787363	chrX_47787173_47787363	chrX	47787163	47787180	ZNF85	10.896	7
chrX	47787173	47787363	chrX_47787173_47787363	chrX	4