**Set environment**

In [1]:
# Load the project configuration and print the resolved locations as a sanity
# check. The path is relative, so this cell must be run from the notebook's own
# directory.
# NOTE(review): show_env and the FD_* variables are presumably defined by
# run_config_project.sh — confirm there.
source ../run_config_project.sh
show_env

You are working on             Duke Server: HARDAC
BASE DIRECTORY (FD_BASE):      /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO):      /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK):      /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA):      /data/reddylab/Kuei/data
CONTAINER DIR. (FD_SING):      /data/reddylab/Kuei/container

You are working with           ENCODE FCC
PATH OF PROJECT (FD_PRJ):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA    (FD_DAT):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE    (FD_NBK):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS    (FD_DOC):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG     (FD_LOG):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT APP     (FD_APP):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/app
PROJEC

## Prepare

**Check data**

In [2]:
# Print and list the directory holding the processed fragment files.
# Expansions are quoted so a path containing spaces or glob characters is
# passed to echo/ls as a single, literal argument.
FDIRY="${FD_DAT}/processed/ASTARRseq_K562_hg38_KS274_240311/fragments"
echo "${FDIRY}"
ls -- "${FDIRY}"

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/processed/ASTARRseq_K562_hg38_KS274_240311/fragments
K562_ASTARR_repeat_rep1.f3q10.fragments.bedpe
K562_ASTARR_repeat_rep1.f3q10.sorted.with_umis.dedup.rpkm.bw
K562_ASTARR_repeat_rep2.f3q10.fragments.bedpe
K562_ASTARR_repeat_rep2.f3q10.sorted.with_umis.dedup.rpkm.bw
K562_ASTARR_repeat_rep3.f3q10.fragments.bedpe
K562_ASTARR_repeat_rep3.f3q10.sorted.with_umis.dedup.rpkm.bw


In [3]:
# Print the fragment-count output directory and confirm it exists
# (ls -d lists the directory entry itself rather than its contents).
# Expansions are quoted so the path is always passed as one literal argument.
FDIRY="${FD_RES}/assay_fcc/STARR_ATAC_K562_Reddy_KS274/fragment_counts"
echo "${FDIRY}"
ls -d -- "${FDIRY}"

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/assay_fcc/STARR_ATAC_K562_Reddy_KS274/fragment_counts
[0m[38;5;27m/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/assay_fcc/STARR_ATAC_K562_Reddy_KS274/fragment_counts[0m[K


**Test: Calculate fragment -> fragment counts**

In [4]:
# Pick the replicate-1 fragment file and show its first ten lines.
# FD_INP / FN_INP / FP_INP stay set for the following test cell.
FD_INP="${FD_DAT}/processed/ASTARRseq_K562_hg38_KS274_240311/fragments"
FN_INP="K562_ASTARR_repeat_rep1.f3q10.fragments.bedpe"
FP_INP="${FD_INP}/${FN_INP}"
# head reads the file directly; no extra cat process is needed.
head -- "${FP_INP}"

chr1	14145	14485
chr1	14145	14485
chr1	14243	14498
chr1	14454	14729
chr1	14522	14725
chr1	14599	14850
chr1	14659	14864
chr1	14659	14864
chr1	14781	15268
chr1	15172	15659


In [5]:
# Prototype of the fragment -> fragment-count conversion, tried on the first
# ten lines of the input before running it on whole files.
#
# fragment_counts_bed6
#   stdin : fragments, one per line, as whitespace-separated chrom/start/end
#   stdout: one tab-separated record per distinct fragment:
#             chrom  start  end  chrom:start-end  count  .
#
# Notes:
#   * LC_ALL=C makes the sort byte-wise and independent of the session locale;
#     the third key orders fragments that share a start by numeric end.
#   * uniq -c emits "<count> <line>", so after awk's default whitespace split
#     $1 is the count and $2..$4 are the interval. Reading the count from $1
#     (instead of a position counted from the end of the line) keeps it correct
#     even if the input carries columns beyond the first three.
fragment_counts_bed6() {
    LC_ALL=C sort -k1,1 -k2,2n -k3,3n |
        uniq -c |
        awk -v OFS='\t' '{print $2, $3, $4, $2 ":" $3 "-" $4, $1, "."}'
}

head -- "${FP_INP}" | fragment_counts_bed6

chr1	14145	14485	chr1:14145-14485	2	.
chr1	14243	14498	chr1:14243-14498	1	.
chr1	14454	14729	chr1:14454-14729	1	.
chr1	14522	14725	chr1:14522-14725	1	.
chr1	14599	14850	chr1:14599-14850	1	.
chr1	14659	14864	chr1:14659-14864	2	.
chr1	14781	15268	chr1:14781-15268	1	.
chr1	15172	15659	chr1:15172-15659	1	.


## Execute

**Run: Loop each fragment file and output fragment counts**

In [6]:
# Input/output locations for the batch run.
FD_INP="${FD_DAT}/processed/ASTARRseq_K562_hg38_KS274_240311/fragments"
FD_OUT="${FD_RES}/assay_fcc/STARR_ATAC_K562_Reddy_KS274/fragment_counts"

# Parallel arrays: element i of FN_OUTS is the output name for element i of
# FN_INPS. Brace expansion generates the three replicate names in order.
FN_INPS=( K562_ASTARR_repeat_rep{1..3}.f3q10.fragments.bedpe )
FN_OUTS=( ASTARRseq_K562_KS274.hg38.Output.rep{1..3}.WGS.unstranded.bed.gz )

In [7]:
# Submit one Slurm job per replicate. Each job collapses identical fragments of
# FN_INPS[idx] into counts and writes them, gzipped, to FN_OUTS[idx] as
#     chrom  start  end  chrom:start-end  count  .
#
# Reads from the session: NODE (Slurm partition), FD_INP, FD_OUT, FD_LOG,
# FD_PRJ and the parallel arrays FN_INPS / FN_OUTS.
# Leaves FN_INP, FN_OUT, FN_LOG, FP_INP, FP_OUT and FP_LOG set to the values of
# the last replicate processed.
if [[ -z "${NODE:-}" ]]; then
    # Stop early with a clear message instead of handing sbatch an empty partition.
    echo "NODE (Slurm partition) is not set; nothing submitted." >&2
elif (( ${#FN_INPS[@]} != ${#FN_OUTS[@]} )); then
    # The arrays are paired by index, so a length mismatch would mispair files.
    echo "FN_INPS and FN_OUTS differ in length; nothing submitted." >&2
else
    # Make sure the destination directories exist before any job writes its
    # log or its output there.
    mkdir -p -- "${FD_OUT}" "${FD_LOG}"

    for idx in "${!FN_INPS[@]}"; do
        FN_INP="${FN_INPS[idx]}"
        FN_OUT="${FN_OUTS[idx]}"
        FN_LOG="fragment_count.${FN_INP}.txt"

        FP_INP="${FD_INP}/${FN_INP}"
        FP_OUT="${FD_OUT}/${FN_OUT}"
        FP_LOG="${FD_LOG}/${FN_LOG}"

        echo "Calculate fragment counts..."
        echo "${FN_INP}"
        echo "${FN_OUT}"
        echo "${FN_LOG}"

        # The here-doc delimiter is quoted, so nothing below is expanded at
        # submission time; the job receives its paths through --export.
        sbatch -p "${NODE}" \
            --exclude=dl-01 \
            --cpus-per-task 4 \
            --mem 4G \
            --output "${FP_LOG}" \
            --export="FD_PRJ=${FD_PRJ},FP_INP=${FP_INP},FP_OUT=${FP_OUT}" \
            <<'EOF'
#!/bin/bash
### set environment
source "${FD_PRJ}/scripts/config_project.sh"

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " "${SLURM_ARRAY_TASK_ID:-}"
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### show input file
echo "Input: " "${FP_INP}"
echo
echo "show first few lines of input"
fun_cat "${FP_INP}" | head
echo

### execute
### Write to a temporary file next to the final output and rename it only when
### every pipeline stage succeeded, so a failed sort/uniq/awk/gzip can never
### leave a truncated file under the final name.
FP_TMP="${FP_OUT}.tmp.${SLURM_JOB_ID:-$$}"

### pipefail is enabled only inside this subshell so that the "| head"
### previews elsewhere in the script are unaffected.
### LC_ALL=C: byte-wise, locale-independent sort; the third key orders
### fragments sharing a start by numeric end.
### uniq -c emits "<count> <line>": $1 is the count, $2..$4 the interval.
if ! (
    set -o pipefail
    LC_ALL=C sort -k1,1 -k2,2n -k3,3n -- "${FP_INP}" |
        uniq -c |
        awk -v OFS='\t' '{print $2, $3, $4, $2 ":" $3 "-" $4, $1, "."}' |
        gzip -c > "${FP_TMP}"
); then
    echo "ERROR: fragment counting failed for ${FP_INP}" >&2
    rm -f -- "${FP_TMP}"
    exit 1
fi
mv -f -- "${FP_TMP}" "${FP_OUT}"

### show output file
echo "Output: " "${FP_OUT}"
echo
echo "show first few lines of output"
fun_cat "${FP_OUT}" | head
echo

### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime "${runtime}")"
EOF
        echo
    done
fi

Calculate fragment counts...
K562_ASTARR_repeat_rep1.f3q10.fragments.bedpe
ASTARRseq_K562_KS274.hg38.Output.rep1.WGS.unstranded.bed.gz
fragment_count.K562_ASTARR_repeat_rep1.f3q10.fragments.bedpe.txt
Submitted batch job 30729251

Calculate fragment counts...
K562_ASTARR_repeat_rep2.f3q10.fragments.bedpe
ASTARRseq_K562_KS274.hg38.Output.rep2.WGS.unstranded.bed.gz
fragment_count.K562_ASTARR_repeat_rep2.f3q10.fragments.bedpe.txt
Submitted batch job 30729252

Calculate fragment counts...
K562_ASTARR_repeat_rep3.f3q10.fragments.bedpe
ASTARRseq_K562_KS274.hg38.Output.rep3.WGS.unstranded.bed.gz
fragment_count.K562_ASTARR_repeat_rep3.f3q10.fragments.bedpe.txt
Submitted batch job 30729253



## Review

**Check results**

In [8]:
# Show the Slurm log of the last replicate: FP_LOG still holds the value
# assigned in the final iteration of the submission loop.
# Quoted so that an empty or unset FP_LOG produces an error instead of a bare
# `cat` that waits on stdin.
cat -- "${FP_LOG}"

Hostname:           x1-01-2.genome.duke.edu
Slurm Array Index: 
Time Stamp:         03-30-24+14:34:55

Input:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/processed/ASTARRseq_K562_hg38_KS274_240311/fragments/K562_ASTARR_repeat_rep3.f3q10.fragments.bedpe

show first few lines of input
chr1	10421	10570
chr1	13044	13243
chr1	13044	13243
chr1	13303	13459
chr1	13483	13661
chr1	14091	14513
chr1	14129	14468
chr1	14191	14755
chr1	14231	14498
chr1	14449	14971

Output:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/assay_fcc/STARR_ATAC_K562_Reddy_KS274/fragment_counts/ASTARRseq_K562_KS274.hg38.Output.rep3.WGS.unstranded.bed.gz

show first few lines of output
chr1	10421	10570	chr1:10421-10570	1	.
chr1	13044	13243	chr1:13044-13243	2	.
chr1	13303	13459	chr1:13303-13459	1	.
chr1	13483	13661	chr1:13483-13661	1	.
chr1	14091	14513	chr1:14091-14513	1	.
chr1	14129	14468	chr1:14129-14468	1	.
chr1	14191	14755	chr1:14191-14755	1	.
chr1	14231	14498	chr1:14231-14498	1	.
chr1	14449	14971	chr1:14449-14971	