In [37]:
### set environment on DCC
module load Bedtools
source config.sh
FD_LOG=${FD_WRK}/log

sbatch -p scavenger \
    --mem 15G \
    -o ${FD_LOG}/prep_annot_per1.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source config.sh
CHROM=chr17
TARGET=PER1

### init: set input and output file
FD_ANN=${FD_BASE}/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1
FN_ANN=${CHROM}_rm_mouse.bed.gz
FD_OUT=${FD_ANN}/${TARGET}
FN_OUT=${TARGET}.bed

### print end message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file: " ${FD_ANN}/${FN_ANN}
echo "Output file: " ${FD_OUT}/${FN_OUT}
echo
echo "Show the first few lines of the input file"
echo ${FD_ANN}/${FN_ANN}
zcat ${FD_ANN}/${FN_ANN} | head


### count the fragments and arrange output to a proper bed file (chr17:PER1)
mkdir -p ${FD_OUT}
zcat ${FD_ANN}/${FN_ANN} |\
    awk -F $'\t' '($2 >= 8148000 && $2 <= 8160000)' > ${FD_OUT}/${FN_OUT}

### filter out the target TFs
cat ${FD_OUT}/${FN_OUT} | awk -F $'\t' '($4 == "NR/20")' > ${FD_OUT}/NR_20.bed
cat ${FD_OUT}/${FN_OUT} | awk -F $'\t' '($4 == "AP1/2")' > ${FD_OUT}/AP1_2.bed

### merge
bedtools merge -i ${FD_OUT}/NR_20.bed -c 4,5 -o distinct,mean > ${FD_OUT}/NR_20.merge.bed
bedtools merge -i ${FD_OUT}/AP1_2.bed -c 4,5 -o distinct,mean > ${FD_OUT}/AP1_2.merge.bed

### coverage
echo -e "chr17\t8148000\t8160000" > ${FD_OUT}/region_target.bed

FP_BED_A=${FD_OUT}/region_target.bed
FP_BED_B=${FD_OUT}/NR_20.merge.bed
FP_BED_O=${FD_OUT}/NR_20.coverage.bed
bedtools coverage -a ${FP_BED_A} -b ${FP_BED_B} -d > ${FP_BED_O}

FP_BED_A=${FD_OUT}/region_target.bed
FP_BED_B=${FD_OUT}/AP1_2.merge.bed
FP_BED_O=${FD_OUT}/AP1_2.coverage.bed
bedtools coverage -a ${FP_BED_A} -b ${FP_BED_B} -d > ${FP_BED_O}

### print end message
echo
echo "Show the first few lines of the output file"
echo ${FD_OUT}/${FN_OUT}
head ${FD_OUT}/${FN_OUT}

echo
echo "Show the first few lines of the output file"
echo ${FD_OUT}/NR_20.bed
head ${FD_OUT}/NR_20.bed

echo
echo "Show the first few lines of the output file"
echo ${FD_OUT}/AP1_2.bed
head ${FD_OUT}/AP1_2.bed

echo
echo "Show the first few lines of the output file"
echo ${FD_OUT}/NR_20.merge.bed
head ${FD_OUT}/NR_20.merge.bed

echo
echo "Show the first few lines of the output file"
echo ${FD_OUT}/AP1_2.merge.bed
head ${FD_OUT}/AP1_2.merge.bed

echo
echo "Show the first few lines of the output file"
echo ${FD_OUT}/NR_20.coverage.bed
head ${FD_OUT}/NR_20.coverage.bed

echo
echo "Show the first few lines of the output file"
echo ${FD_OUT}/AP1_2.coverage.bed
head ${FD_OUT}/AP1_2.coverage.bed

EOF

Submitted batch job 11776121


In [38]:
source config.sh
CHROM=chr17
FD_LOG=${FD_WRK}/log
cat ${FD_LOG}/prep_annot_per1.txt

Slurm Array Index: 
Input  file:  /work/kk319/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_rm_mouse.bed.gz
Output file:  /work/kk319/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/PER1/PER1.bed

Show the first few lines of the input file
/work/kk319/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/chr17_rm_mouse.bed.gz
chr17	60004	60022	ZNF140	5.6897	+	ZN140_HUMAN.H11MO.0.C	1
chr17	60004	60022	ZNF667	8.0240	-	ZN667_HUMAN.H11MO.0.C	1
chr17	60006	60015	Ebox/CAGCTG	7.9275	+	MYOD1_HUMAN.H11MO.0.A	1
chr17	60011	60031	GC-tract	12.1220	-	ZN341_HUMAN.H11MO.0.C	1
chr17	60012	60025	PRDM4	1.3083	+	PRDM4_C2H2_1	1
chr17	60017	60028	NR/19	9.6680	-	NR1D1_HUMAN.H11MO.0.B	2
chr17	60019	60035	HEN1	5.5854	-	HEN1_HUMAN.H11MO.0.C	2
chr17	60023	60042	ZNF680	6.3901	-	ZN680_HUMAN.H11MO.0.C	1
chr17	60027	60037	SMARCA1	7.5566	-	SMCA1_HUMAN.H11MO.0.C	1
chr17	60027	60040	LEF1	7.1402	+	ZN350_HUMAN.H11MO.0.C	1

Show the first few lines of the output file
/work/kk319/anno

In [9]:
FD_ANN=/work/kk319/annotation
cat /work/kk319/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/PER1/PER1.bed.gz |\
    awk -F $'\t' '($4 == "NR/20")' |\
    head

chr17	8150232	8150246	NR/20	8.3256	+	GCR_HUMAN.H11MO.0.A	2
chr17	8150831	8150845	NR/20	7.3429	-	GCR_HUMAN.H11MO.0.A	1
chr17	8151912	8151926	NR/20	11.8641	+	AR_nuclearreceptor_1	10
chr17	8154459	8154473	NR/20	15.1186	-	NR3C1_MA0113.3	11
chr17	8154460	8154474	NR/20	15.5768	+	NR3C1_MA0113.3	12
chr17	8154760	8154774	NR/20	8.2140	-	AR_nuclearreceptor_1	10
chr17	8154761	8154775	NR/20	8.1861	+	AR_nuclearreceptor_1	8


In [12]:
FD_ANN=/work/kk319/annotation
cat /work/kk319/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/PER1/PER1.bed.gz |\
    awk -F $'\t' '($4 == "AP1/2")' |\
    head

chr17	8149085	8149096	AP1/2	9.1683	+	BACH1_HUMAN.H11MO.0.A	2
chr17	8149462	8149473	AP1/2	9.1656	+	BACH1_HUMAN.H11MO.0.A	1
chr17	8150460	8150471	AP1/2	9.3536	+	BACH1_HUMAN.H11MO.0.A	3
chr17	8150585	8150596	AP1/2	7.6851	+	BACH1_HUMAN.H11MO.0.A	1
chr17	8152153	8152164	AP1/2	7.9175	+	BACH1_HUMAN.H11MO.0.A	3
chr17	8154319	8154330	AP1/2	8.2745	-	BACH1_HUMAN.H11MO.0.A	2
chr17	8157531	8157542	AP1/2	8.9634	+	BACH1_HUMAN.H11MO.0.A	1
chr17	8157831	8157842	AP1/2	8.1020	+	BACH1_HUMAN.H11MO.0.A	1
chr17	8157966	8157977	AP1/2	9.1683	-	BACH1_HUMAN.H11MO.0.A	2
chr17	8158098	8158109	AP1/2	10.8123	-	BACH1_HUMAN.H11MO.0.A	2


In [18]:
FD_ANN=/work/kk319/annotation
cat /work/kk319/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/PER1/PER1.bed.gz |\
    awk -F $'\t' '($4 == "NR/20")' |\
    head > test_PER1_NR20.bed

In [19]:
cat test_PER1_NR20.bed

chr17	8150232	8150246	NR/20	8.3256	+	GCR_HUMAN.H11MO.0.A	2
chr17	8150831	8150845	NR/20	7.3429	-	GCR_HUMAN.H11MO.0.A	1
chr17	8151912	8151926	NR/20	11.8641	+	AR_nuclearreceptor_1	10
chr17	8154459	8154473	NR/20	15.1186	-	NR3C1_MA0113.3	11
chr17	8154460	8154474	NR/20	15.5768	+	NR3C1_MA0113.3	12
chr17	8154760	8154774	NR/20	8.2140	-	AR_nuclearreceptor_1	10
chr17	8154761	8154775	NR/20	8.1861	+	AR_nuclearreceptor_1	8


In [20]:
bedtools merge -i test_PER1_NR20.bed -c 4,5 -o distinct,mean

chr17	8150232	8150246	NR/20	8.3256
chr17	8150831	8150845	NR/20	7.3429
chr17	8151912	8151926	NR/20	11.8641
chr17	8154459	8154474	NR/20	15.3477
chr17	8154760	8154775	NR/20	8.20005


In [14]:
FD_ANN=/work/kk319/annotation
cat /work/kk319/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1/PER1/PER1.bed.gz |\
    awk -F $'\t' '($4 == "NR/20")' |\
    cut -f1-5 |\
    head > test_PER1_NR20.bed

In [15]:
cat test_PER1_NR20.bed

chr17	8150232	8150246	NR/20	8.3256
chr17	8150831	8150845	NR/20	7.3429
chr17	8151912	8151926	NR/20	11.8641
chr17	8154459	8154473	NR/20	15.1186
chr17	8154460	8154474	NR/20	15.5768
chr17	8154760	8154774	NR/20	8.2140
chr17	8154761	8154775	NR/20	8.1861


In [17]:
bedtools merge -i test_PER1_NR20.bed -c 4,5 -o distinct,mean

chr17	8150232	8150246	NR/20	8.3256
chr17	8150831	8150845	NR/20	7.3429
chr17	8151912	8151926	NR/20	11.8641
chr17	8154459	8154474	NR/20	15.3477
chr17	8154760	8154775	NR/20	8.20005


In [32]:
TAB="$(printf '\t')"
echo chr17${TAB}8148000${TAB}8160000 > region_target.bed
cat region_target.bed | expand -t 20

chr17 8148000 8160000


In [36]:
echo -e "chr17\t8148000\t8160000" > region_target.bed
cat region_target.bed | expand -t 20

chr17               8148000             8160000


In [28]:
TAB="$(printf '\t')"

cat > A.bed << EOF
chr17${TAB}8150232${TAB}8151926
EOF

bedtools coverage -a A.bed -b test_PER1_NR20.bed -d | head -50

chr17	8150232	8151926	1	1
chr17	8150232	8151926	2	1
chr17	8150232	8151926	3	1
chr17	8150232	8151926	4	1
chr17	8150232	8151926	5	1
chr17	8150232	8151926	6	1
chr17	8150232	8151926	7	1
chr17	8150232	8151926	8	1
chr17	8150232	8151926	9	1
chr17	8150232	8151926	10	1
chr17	8150232	8151926	11	1
chr17	8150232	8151926	12	1
chr17	8150232	8151926	13	1
chr17	8150232	8151926	14	1
chr17	8150232	8151926	15	0
chr17	8150232	8151926	16	0
chr17	8150232	8151926	17	0
chr17	8150232	8151926	18	0
chr17	8150232	8151926	19	0
chr17	8150232	8151926	20	0
chr17	8150232	8151926	21	0
chr17	8150232	8151926	22	0
chr17	8150232	8151926	23	0
chr17	8150232	8151926	24	0
chr17	8150232	8151926	25	0
chr17	8150232	8151926	26	0
chr17	8150232	8151926	27	0
chr17	8150232	8151926	28	0
chr17	8150232	8151926	29	0
chr17	8150232	8151926	30	0
chr17	8150232	8151926	31	0
chr17	8150232	8151926	32	0
chr17	8150232	8151926	33	0
chr17	8150232	8151926	34	0
chr17	8150232	8151926	35	0
chr17	8150232	8151926	36	0
chr17	8150232	8151926	37	0
chr17	8150

In [2]:
source config.sh
CHROM=chr17
FD_ANN=${FD_BASE}/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1
FN_ANN=${CHROM}_rm_mouse.bed.gz
zcat ${FD_ANN}/${FN_ANN} | head

chr17	60004	60022	ZNF140	5.6897	+	ZN140_HUMAN.H11MO.0.C	1
chr17	60004	60022	ZNF667	8.0240	-	ZN667_HUMAN.H11MO.0.C	1
chr17	60006	60015	Ebox/CAGCTG	7.9275	+	MYOD1_HUMAN.H11MO.0.A	1
chr17	60011	60031	GC-tract	12.1220	-	ZN341_HUMAN.H11MO.0.C	1
chr17	60012	60025	PRDM4	1.3083	+	PRDM4_C2H2_1	1
chr17	60017	60028	NR/19	9.6680	-	NR1D1_HUMAN.H11MO.0.B	2
chr17	60019	60035	HEN1	5.5854	-	HEN1_HUMAN.H11MO.0.C	2
chr17	60023	60042	ZNF680	6.3901	-	ZN680_HUMAN.H11MO.0.C	1
chr17	60027	60037	SMARCA1	7.5566	-	SMCA1_HUMAN.H11MO.0.C	1
chr17	60027	60040	LEF1	7.1402	+	ZN350_HUMAN.H11MO.0.C	1

gzip: stdout: Broken pipe


In [None]:
### count the fragments and arrange output to a proper bed file (chr17:PER1)
awk -F $'\t' '($2 >= 8148987 && $2 <= 8159379)' ${FD_BED}/${FN_BED} |\
    sort    |\
    uniq -c |\
    awk '{$(NF+1)=$1;$1=""}1'   |\
    sed -e 's/^[[:space:]]*//'  |\
    sed -e 's/[[:space:]]/\t/g'  \
    > ${FD_CNT}/$(basename ${FD_BED})/${FN_CNT1}

In [None]:
### set environment on DCC
module load Bedtools
source config.sh
FD_LOG=${FD_WRK}/log

### run script using sbatch
#sbatch -pnew,all \
sbatch -p scavenger \
    --mem 15G \
    -o ${FD_LOG}/annot_fragment_input.txt \
    <<'EOF'
#!/bin/bash
### set directories & global variables
source config.sh
CHROM=chr17
SAMPLE=Input

### init: set input and output file
FD_ANN=${FD_BASE}/annotation/motif_cluster_jvierstra/hg38_archetype_motifs_v1
FN_ANN=${CHROM}_rm_mouse.bed.gz
FD_BED=${FD_WRK}/count_fragment/${SAMPLE}
FN_BED=cnt_${CHROM}.bed
FD_OUT=${FD_WRK}/annotation_fragment/${SAMPLE}
FN_OUT=${CHROM}.bed.gz

### init: set input and output file path
mkdir -p ${FD_OUT}
FP_BED_A=${FD_BED}/${FN_BED}
FP_BED_B=${FD_ANN}/${FN_ANN}
FP_BED_O=${FD_OUT}/${FN_OUT}

### print end message
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  file: " ${FP_BED_A}
echo "Input  file: " ${FP_BED_B}
echo "Output file: " ${FP_BED_O}
echo
echo "Show the first few lines of the input file"
echo ${FP_BED_A}
head ${FP_BED_A}
echo
echo ${FP_BED_B}
zcat ${FP_BED_B} | head

### RUN: annotation by intersecting two bed files
bedtools intersect -a ${FP_BED_A} -b ${FP_BED_B} -wo | gzip -cf > ${FP_BED_O}

### print end message
echo
echo "Show the first few lines of the output file"
echo ${FP_BED_O}
zcat ${FP_BED_O} | head

EOF