# Annotate fragment using nonRedundant TF archetype

**Set environment**

In [1]:
source ../config_duke.sh -v

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_STARRseq/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect



**Check data**

In [5]:
### Loop folders
FD_BEDS=($(ls -d ${FD_RES}/count_fragment/{Input?,Input?_20x,TFX?_DMSO,TFX?_Dex}/))
for FD_BED in ${FD_BEDS[@]}; do
    SAMPLE=$(basename ${FD_BED})
    echo "SAM:${SAMPLE}; BED: ${FD_BED}"
done

SAM:Input1; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input1/
SAM:Input1_20x; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input1_20x/
SAM:Input2; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input2/
SAM:Input2_20x; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input2_20x/
SAM:Input3; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input3/
SAM:Input3_20x; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input3_20x/
SAM:Input4; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input4/
SAM:Input4_20x; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input4_20x/
SAM:Input5; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input5/
SAM:Input5_20x; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input5_20x/
SAM:TFX2_Dex; BED: /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/co

**RUN**

In [4]:
ls -d ${FD_LOG}
echo ${NODE}

[0m[38;5;27m/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/log[0m
all


In [8]:
### set env
source ../config_duke.sh
source ../config_load_module_bedtools.sh

sbatch -p ${NODE} \
    --array 0-23 \
    --mem 20G \
    -o ${FD_LOG}/annotate_fragment_chrom.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source ../config_duke.sh
CHROMS=($(seq 1 22) X Y)
CHROM=chr${CHROMS[${SLURM_ARRAY_TASK_ID}]}
TARGET=${CHROM}

### set input and output
FD_BEDS=($(ls -d ${FD_RES}/count_fragment/{Input?,Input?_20x,TFX?_DMSO,TFX?_Dex}/))
FN_BED=${TARGET}.bed.gz

### motif annotation
FD_MTF=${FD_ANN}/motif_cluster_jvierstra/hg38_archetype_motifs_v1
FN_MTF=${CHROM}_rm_mouse_merge.bed.gz

### print start message
timer_start=`date +%s`
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### loop through each sample and count each fragment of a chromosome
echo "RUN: loop through samples; annotate fragment"

for FD_BED in ${FD_BEDS[@]}; do
    SAMPLE=$(basename ${FD_BED})
    FD_OUT=${FD_RES}/annotation/${SAMPLE}
    FN_OUT=${TARGET}.bed.gz
    
    ### START Message
    echo ++++++++++++++++++++++++++++++++++++++++++++++++
    echo "Input  file: " ${FD_BED}/${FN_BED}
    echo "Output file: " ${FD_OUT}/${FN_OUT}
    echo
    echo "Show the first few lines of the input file"
    echo ${FD_BED}/${FN_BED}
    zcat ${FD_BED}/${FN_BED} | head -n 3 
    echo
    
    ### init: create output folder if not exist
    mkdir -p ${FD_OUT}
    
    ### annotation using intersect
    bedtools intersect -a ${FD_BED}/${FN_BED} -b ${FD_MTF}/${FN_MTF} -wo |\
        gzip -c > ${FD_OUT}/${FN_OUT}
    
    ### END Message
    echo "Show the first few lines of the output file"
    echo ${FD_OUT}/${FN_OUT}
    zcat ${FD_OUT}/${FN_OUT} | head -n 3
    echo
done

### print end message
timer=`date +%s`
runtime=$(echo "${timer} - ${timer_start}" | bc -l)
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 27381192


In [9]:
cat ${FD_LOG}/annotate_fragment_chrom_input20x.0.txt

Slurm Array Index:  0
Time Stamp:         10-01-21+17:55:26

RUN: loop through samples; annotate fragment
++++++++++++++++++++++++++++++++++++++++++++++++
Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input1_20x//chr1.bed.gz
Output file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/annotation_fragment/Input1_20x/chr1.bed.gz

Show the first few lines of the input file
/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input1_20x//chr1.bed.gz
chr1	13802	14757	1
chr1	13802	14759	2
chr1	13868	14793	2

Show the first few lines of the output file
/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/annotation_fragment/Input1_20x/chr1.bed.gz
chr1	13802	14757	1	chr1	13790	13810	ZNF335	6.8996	8
chr1	13802	14757	1	chr1	13792	13806	NR/3	8.1207	4
chr1	13802	14757	1	chr1	13794	13814	GC-tract	6.2461	12

++++++++++++++++++++++++++++++++++++++++++++++++
Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect/count_fragment/Input2_20x//chr1.bed.gz
Ou