# Annotate segments using JASPAR annotation

In [1]:
%%bash
wc -l /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX_Dex/chr17.bed
wc -l /data/reddylab/Kuei/out/CombEffect_STARR/count_fragment/TFX_Dex/cnt_PER1.bed
wc -l /data/reddylab/Kuei/out/CombEffect_STARR/count_segment/TFX_Dex/seg_PER1.bed

6239658 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX_Dex/chr17.bed
4827 /data/reddylab/Kuei/out/CombEffect_STARR/count_fragment/TFX_Dex/cnt_PER1.bed
3562 /data/reddylab/Kuei/out/CombEffect_STARR/count_segment/TFX_Dex/seg_PER1.bed


## Test: able to parallelize the annotation

- Realize that the files can be splitted based on the last digit of motif number
- each split is about 70 motif files

In [2]:
%%bash
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

for i in {0..9}; do
    echo $i
    ls ${FD_ANN}/MA???${i}.?.tsv.gz | wc -l
done

0
72
1
75
2
76
3
77
4
75
5
73
6
71
7
74
8
76
9
77


In [3]:
%%bash
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

for i in {0..9}; do
    echo $i
    ls ${FD_ANN}/chr17_MA???${i}.?.tsv.gz | wc -l
done

0
72
1
75
2
76
3
77
4
75
5
73
6
71
7
74
8
76
9
77


In [4]:
%%bash
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020
FN_ANN=MA0113.3.tsv.gz

wc -l ${FD_ANN}/${FN_ANN}
echo +++++++++++++++++++++++++++++++++
zcat  ${FD_ANN}/${FN_ANN}       | head -3
echo +++++++++++++++++++++++++++++++++
zcat  ${FD_ANN}/chr17_${FN_ANN} | head -3

36279 /data/reddylab/Kuei/out/annotation/JASPAR2020/MA0113.3.tsv.gz
+++++++++++++++++++++++++++++++++
chr10	11650	11667	NR3C1	814	384	+
chr10	16730	16747	NR3C1	825	399	+
chr10	16730	16747	NR3C1	826	400	-
+++++++++++++++++++++++++++++++++
chr17	82590	82607	NR3C1	808	375	+
chr17	82590	82607	NR3C1	802	367	-
chr17	82804	82821	NR3C1	838	418	+


In [17]:
%%bash
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020
SLURM_ARRAY_TASK_ID=0
CHROM=chr17
FP_ANNS=$(ls ${FD_ANN}/${CHROM}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)
for FP_ANN in ${FP_ANNS}; do
    echo  ${FP_ANN}
    zcat ${FP_ANN} | wc -l
    zcat ${FP_ANN} | head -n 3 | expand -t 12
    zcat ${FP_ANN} | tail -n 3 | expand -t 12
    echo ++++++++++++++++++++++++++++++++
done

/data/reddylab/Kuei/out/annotation/JASPAR2020/chr17/MA0030.1.tsv.gz
80710
chr17       61738       61752       FOXF2       898         495         +
chr17       65858       65872       FOXF2       813         369         -
chr17       66454       66468       FOXF2       807         360         +
chr17       83246185    83246199    FOXF2       836         400         +
chr17       83246794    83246808    FOXF2       803         356         -
chr17       83246806    83246820    FOXF2       871         451         -
++++++++++++++++++++++++++++++++
/data/reddylab/Kuei/out/annotation/JASPAR2020/chr17/MA0040.1.tsv.gz
237372
chr17       61628       61639       Foxq1       875         413         -
chr17       61731       61742       Foxq1       808         329         -
chr17       61742       61753       Foxq1       828         353         -
chr17       83246928    83246939    Foxq1       885         429         -
chr17       83247040    83247051    Foxq1       844         373         +
chr1

In [3]:
%%bash
FP_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020/chr17/MA0030.1.tsv.gz
FN_ANN=$(basename ${FP_ANN})
echo $FP_ANN
echo $FN_ANN
echo ${FN_ANN%.*.*}

/data/reddylab/Kuei/out/annotation/JASPAR2020/chr17/MA0030.1.tsv.gz
MA0030.1.tsv.gz
MA0030.1


## Annotate segments generated at the region around PER1 (Ouput; DEX)

**Give the first test**

In [17]:
%%bash
### set environment
module load bedtools2
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --mem 15G \
    -o ${FD_LOG}/prep_annotation_output_dex_per1_chr17.0.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

### set global variables
CHROM=chr17
TARGET=PER1
SAMPLE=TFX_Dex
SLURM_ARRAY_TASK_ID=0

### set input and output files
FP_ANNS=$(ls ${FD_ANN}/${CHROM}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)
FD_OUT=${FD_WRK}/annotation_segment/${SAMPLE}/${TARGET}
FD_SEG=${FD_WRK}/count_segment/${SAMPLE}
FN_SEG=seg_${TARGET}.bed

### Print start message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file: " ${FD_SEG}/${FN_SEG}
echo
echo "Show the first few lines of the input file"
echo ${FD_SEG}/${FN_SEG}

### init: create output directory
mkdir -p ${FD_OUT}

### looping through the annotation files
for FP_ANN in ${FP_ANNS}; do
    ### set annotation and output files
    FN_ANN=$(basename ${FP_ANN})
    echo "ANNOT  file: " ${FP_ANN}
    echo "Output file: " ${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    
    ### annotation segments with a binding site
    FP_BED_A=${FD_SEG}/${FN_SEG}
    FP_BED_B=${FP_ANN}
    FP_BED_O=${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo > ${FP_BED_O}
done

### Print end message
echo "Done."

EOF

Submitted batch job 25436100


**Continue Annotating: --array=1-9**

In [18]:
%%bash
### set environment
module load bedtools2
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=1-9 \
    --mem 15G \
    -o ${FD_LOG}/prep_annotation_output_dex_per1_chr17.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

### set global variables
CHROM=chr17
TARGET=PER1
SAMPLE=TFX_Dex

### set input and output files
FP_ANNS=$(ls ${FD_ANN}/${CHROM}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)
FD_OUT=${FD_WRK}/annotation_segment/${SAMPLE}/${TARGET}
FD_SEG=${FD_WRK}/count_segment/${SAMPLE}
FN_SEG=seg_${TARGET}.bed

### Print start message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file: " ${FD_SEG}/${FN_SEG}
echo
echo "Show the first few lines of the input file"
echo ${FD_SEG}/${FN_SEG}

### init: create output directory
mkdir -p ${FD_OUT}

### looping through the annotation files
for FP_ANN in ${FP_ANNS}; do
    ### set annotation and output files
    FN_ANN=$(basename ${FP_ANN})
    echo "ANNOT  file: " ${FP_ANN}
    echo "Output file: " ${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    
    ### annotation segments with a binding site
    FP_BED_A=${FD_SEG}/${FN_SEG}
    FP_BED_B=${FP_ANN}
    FP_BED_O=${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo > ${FP_BED_O}
done

### Print end message
echo "Done."

EOF

Submitted batch job 25436101


## Annotate segments generated at the region around PER1 (Ouput; DMSO)

In [16]:
%%bash
### set environment
module load bedtools2
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=0-9 \
    --mem 15G \
    -o ${FD_LOG}/prep_annotation_output_dmso_per1_chr17.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

### set global variables
CHROM=chr17
TARGET=PER1
SAMPLE=TFX_DMSO

### set input and output files
FP_ANNS=$(ls ${FD_ANN}/${CHROM}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)
FD_OUT=${FD_WRK}/annotation_segment/${SAMPLE}/${TARGET}
FD_SEG=${FD_WRK}/count_segment/${SAMPLE}
FN_SEG=seg_${TARGET}.bed

### Print start message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file: " ${FD_SEG}/${FN_SEG}
echo
echo "Show the first few lines of the input file"
echo ${FD_SEG}/${FN_SEG}

### init: create output directory
mkdir -p ${FD_OUT}

### looping through the annotation files
for FP_ANN in ${FP_ANNS}; do
    ### set annotation and output files
    FN_ANN=$(basename ${FP_ANN})
    echo "ANNOT  file: " ${FP_ANN}
    echo "Output file: " ${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    
    ### annotation segments with a binding site
    FP_BED_A=${FD_SEG}/${FN_SEG}
    FP_BED_B=${FP_ANN}
    FP_BED_O=${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo > ${FP_BED_O}
done

### Print end message
echo "Done."

EOF

Submitted batch job 25436090


## Annotate segments generated at the region around PER1 (Input)

In [15]:
%%bash
### set environment
module load bedtools2
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=0-9 \
    --mem 15G \
    -o ${FD_LOG}/prep_annotation_input_per1_chr17.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

### set global variables
CHROM=chr17
TARGET=PER1
SAMPLE=Input

### set input and output files
FP_ANNS=$(ls ${FD_ANN}/${CHROM}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)
FD_OUT=${FD_WRK}/annotation_segment/${SAMPLE}/${TARGET}
FD_SEG=${FD_WRK}/count_segment/${SAMPLE}
FN_SEG=seg_${TARGET}.bed

### Print start message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file: " ${FD_SEG}/${FN_SEG}
echo
echo "Show the first few lines of the input file"
echo ${FD_SEG}/${FN_SEG}

### init: create output directory
mkdir -p ${FD_OUT}

### looping through the annotation files
for FP_ANN in ${FP_ANNS}; do
    ### set annotation and output files
    FN_ANN=$(basename ${FP_ANN})
    echo "ANNOT  file: " ${FP_ANN}
    echo "Output file: " ${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    
    ### annotation segments with a binding site
    FP_BED_A=${FD_SEG}/${FN_SEG}
    FP_BED_B=${FP_ANN}
    FP_BED_O=${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo > ${FP_BED_O}
done

### Print end message
echo "Done."

EOF

Submitted batch job 25436080


-----

## Test Ch17 output from the `11_partition.ipynb`

Give a test

In [17]:
%%bash
### set environment
module load bedtools2
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --mem 20G \
    -o ${FD_LOG}/prep_annotation_dex_test.0.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_OUT=${FD_WRK}/data/TFX_Dex
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

### set input and output file names
SLURM_ARRAY_TASK_ID=0
CHROM=chr17
FN_SEG=seg_PER1.bed
FP_ANNS=$(ls ${FD_ANN}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)

### Print start message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file:       " ${FD_OUT}/${FN_SEG}

### looping through the annotation files
for FP_ANN in ${FP_ANNS}; do
    FN_ANN=$(basename ${FP_ANN})
    
    ### annotation segments with a binding site
    echo "ANNOT  file: " ${FD_ANN}/${CHROM}_${FN_ANN}
    echo "Output file: " ${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    FP_BED_A=${FD_OUT}/${FN_SEG}
    FP_BED_B=${FD_ANN}/${CHROM}_${FN_ANN}
    FP_BED_O=${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo > ${FP_BED_O}
done

### Print end message
echo "Done."

EOF

Submitted batch job 25398713


array: 1-4

In [15]:
%%bash
### set environment
module load bedtools2
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=1-4 \
    --mem 20G \
    -o ${FD_LOG}/prep_annotation_dex_test.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_OUT=${FD_WRK}/data/TFX_Dex
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

### set input and output file names
#SLURM_ARRAY_TASK_ID=0
CHROM=chr17
FN_SEG=seg_${CHROM}.bed
FP_ANNS=$(ls ${FD_ANN}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)

### Print start message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file:       " ${FD_OUT}/${FN_SEG}

### looping through the annotation files
for FP_ANN in ${FP_ANNS}; do
    FN_ANN=$(basename ${FP_ANN})
    
    ### annotation segments with a binding site
    echo "ANNOT  file: " ${FD_ANN}/${CHROM}_${FN_ANN}
    echo "Output file: " ${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    FP_BED_A=${FD_OUT}/${FN_SEG}
    FP_BED_B=${FD_ANN}/${CHROM}_${FN_ANN}
    FP_BED_O=${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo > ${FP_BED_O}
done

### Print end message
echo "Done."

EOF

Submitted batch job 25398698


array: 5-9

In [16]:
%%bash
### set environment
module load bedtools2
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=5-9 \
    --mem 20G \
    -o ${FD_LOG}/prep_annotation_dex_test.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_OUT=${FD_WRK}/data/TFX_Dex
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

### set input and output file names
#SLURM_ARRAY_TASK_ID=0
CHROM=chr17
FN_SEG=seg_${CHROM}.bed
FP_ANNS=$(ls ${FD_ANN}/MA???${SLURM_ARRAY_TASK_ID}.?.tsv.gz)

### Print start message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file:       " ${FD_OUT}/${FN_SEG}

### looping through the annotation files
for FP_ANN in ${FP_ANNS}; do
    FN_ANN=$(basename ${FP_ANN})
    
    ### annotation segments with a binding site
    echo "ANNOT  file: " ${FD_ANN}/${CHROM}_${FN_ANN}
    echo "Output file: " ${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    FP_BED_A=${FD_OUT}/${FN_SEG}
    FP_BED_B=${FD_ANN}/${CHROM}_${FN_ANN}
    FP_BED_O=${FD_OUT}/${CHROM}_${FN_ANN%.*.*}.bedpe
    bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo > ${FP_BED_O}
done

### Print end message
echo "Done."

EOF

Submitted batch job 25398707


rm /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX_Dex/chr17_*

## Check the intput files and output results

for each motif, take a look at the number of locations it binds in the genome and the number of segments that it binds to the specified windows of chromosome 17

In [4]:
%%bash
FD_ANN=/data/reddylab/Kuei/out/annotation/JASPAR2020

i=0
fpaths=$(ls ${FD_ANN}/MA???${i}.?.tsv.gz)
for fpath in ${fpaths}; do
    fname=$(basename ${fpath})
    echo  ${fpath}
    echo  ${fname}
    echo  ${fname%.*.*}.bed
    wc -l ${FD_ANN}/${fname}
    wc -l ${FD_ANN}/chr17_${fname}
    echo ++++++++++++++++++++++++++++++
done

/data/reddylab/Kuei/out/annotation/JASPAR2020/MA0030.1.tsv.gz
MA0030.1.tsv.gz
MA0030.1.bed
167571 /data/reddylab/Kuei/out/annotation/JASPAR2020/MA0030.1.tsv.gz
3901 /data/reddylab/Kuei/out/annotation/JASPAR2020/chr17_MA0030.1.tsv.gz
++++++++++++++++++++++++++++++
/data/reddylab/Kuei/out/annotation/JASPAR2020/MA0040.1.tsv.gz
MA0040.1.tsv.gz
MA0040.1.bed
369644 /data/reddylab/Kuei/out/annotation/JASPAR2020/MA0040.1.tsv.gz
6632 /data/reddylab/Kuei/out/annotation/JASPAR2020/chr17_MA0040.1.tsv.gz
++++++++++++++++++++++++++++++
/data/reddylab/Kuei/out/annotation/JASPAR2020/MA0050.2.tsv.gz
MA0050.2.tsv.gz
MA0050.2.bed
175791 /data/reddylab/Kuei/out/annotation/JASPAR2020/MA0050.2.tsv.gz
5457 /data/reddylab/Kuei/out/annotation/JASPAR2020/chr17_MA0050.2.tsv.gz
++++++++++++++++++++++++++++++
/data/reddylab/Kuei/out/annotation/JASPAR2020/MA0060.3.tsv.gz
MA0060.3.tsv.gz
MA0060.3.bed
228453 /data/reddylab/Kuei/out/annotation/JASPAR2020/MA0060.3.tsv.gz
6426 /data/reddylab/Kuei/out/annotation/JASPAR20