**Set environment**

In [1]:
### Load the project configuration, then the bedtools module loader.
### NOTE(review): FD_EXE is presumably defined by run_config_project.sh
### (it is listed in the show_env output) -- confirm.
source ../run_config_project.sh
source "${FD_EXE}/config_load_module_bedtools.sh"
### Print the server / project directory variables for the record
show_env

You are working on             Duke Server: HARDAC
BASE DIRECTORY (FD_BASE):      /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO):      /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK):      /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA):      /data/reddylab/Kuei/data
CONTAINER DIR. (FD_SING):      /data/reddylab/Kuei/container

You are working with           ENCODE FCC
PATH OF PROJECT (FD_PRJ):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA    (FD_DAT):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE    (FD_NBK):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS    (FD_DOC):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG     (FD_LOG):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT APP     (FD_APP):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/app
PROJEC

## Prepare

**Set assay names**

In [2]:
### Assay type (sub-folder under the results directory) and the assays to process
ASSAY_TYPE="assay_fcc"
ASSAY_NAMES=()
ASSAY_NAMES+=("STARR_ATAC_K562_Reddy_KS274")
ASSAY_NAMES+=("STARR_ATAC_K562_Reddy_KS91")

In [3]:
### List the assay folders available under the results directory (one per line)
ls -1 "${FD_RES}/assay_fcc"

[0m[38;5;27mMPRA_Tiling_K562_Tewhey_hannah[0m
[38;5;27mSTARR_ATAC_K562_Reddy_KS274[0m
[38;5;27mSTARR_ATAC_K562_Reddy_KS91[0m
[38;5;27mSTARR_WHG_K562_Reddy_A001[0m


In [4]:
### Inspect the overlap output folders of one assay
ASSAY_NAME=STARR_ATAC_K562_Reddy_KS274
ls "${FD_RES}/assay_fcc/${ASSAY_NAME}/coverage/fcc_astarr_macs/astarr_macs_input_overlap"

[0m[38;5;27moverlap_fragment[0m  [38;5;27moverlap_fragment_unique[0m


In [5]:
### Inspect the per-replicate fragment overlap files of one assay
ASSAY_NAME=STARR_ATAC_K562_Reddy_KS274
FDIRY="${FD_RES}/assay_fcc/${ASSAY_NAME}/coverage/fcc_astarr_macs/astarr_macs_input_overlap"
FDIRY="${FDIRY}/overlap_fragment"
ls "${FDIRY}"

[0m[38;5;9mASTARRseq_K562_KS274.hg38.Output.rep1.WGS.unstranded.bed.gz[0m
[38;5;9mASTARRseq_K562_KS274.hg38.Output.rep2.WGS.unstranded.bed.gz[0m
[38;5;9mASTARRseq_K562_KS274.hg38.Output.rep3.WGS.unstranded.bed.gz[0m
[38;5;27moverlap_fragment_one2one[0m


**Test loop: Assay**

In [6]:
### Submit one Slurm job per assay replicate.
### Each job reduces a fragment-vs-region overlap table to a one-to-one mapping
### (one row per fragment: the region with the largest overlap) and writes a
### per-region fragment count summary next to it.
### Reads:  ASSAY_NAMES, ASSAY_TYPE, FD_RES, FD_LOG, FD_PRJ, NODE
### Writes: ${FD_OUT}/<input name> and ${FD_OUT}/summary/region.fragment_counts.<input name>
for ASSAY_NAME in "${ASSAY_NAMES[@]}"; do

    ### show progress
    echo ==============================
    echo "Assay name: " "${ASSAY_NAME}"

    ### Set directory
    FD_TMP="${FD_RES}/${ASSAY_TYPE}/${ASSAY_NAME}/coverage/fcc_astarr_macs/astarr_macs_input_overlap"
    FD_INP="${FD_TMP}/overlap_fragment"
    FD_OUT="${FD_TMP}/overlap_fragment_one2one"

    ### Collect the replicate files with a glob instead of parsing ls output;
    ### an unmatched glob stays literal, so check that the first entry exists.
    FP_INPS=( "${FD_INP}"/*hg38*rep*bed.gz )
    if [[ ! -e "${FP_INPS[0]}" ]]; then
        echo "No input files matching ${FD_INP}/*hg38*rep*bed.gz; skipping" >&2
        continue
    fi

    ### loop through each sample
    for FP_INP in "${FP_INPS[@]}"; do

        ### output keeps the input file name; the log is named after it
        FN_INP=$(basename "${FP_INP}")

        FN_OUT=${FN_INP}
        FP_OUT=${FD_OUT}/${FN_OUT}

        FN_LOG=region.coverage.overlap.one2one.${FN_OUT}.txt
        FP_LOG=${FD_LOG}/${FN_LOG}

        ### show progress
        ### (the log directory is printed as the literal text ${FD_LOG} on purpose)
        echo --------------------------------------
        echo "Input:    "  "${FP_INP}"
        echo "Log file: " '${FD_LOG}/'"${FN_LOG}"

        ### NODE is quoted so that an empty value fails as an invalid partition
        ### instead of letting -p swallow the next option.
        ### The heredoc delimiter is quoted: nothing below is expanded here,
        ### the job only sees the variables passed through --export.
        sbatch -p "${NODE}" \
            --exclude=dl-01   \
            --cpus-per-task 4 \
            --mem 4G \
            --output "${FP_LOG}" \
            --export=FD_PRJ="${FD_PRJ}",FP_INP="${FP_INP}",FP_OUT="${FP_OUT}" \
            <<'EOF'
#!/bin/bash
### set environment
### NOTE(review): fun_cat and displaytime are presumably provided by config_func.sh -- confirm
FD_EXE=${FD_PRJ}/scripts
source "${FD_EXE}/config_func.sh"
source "${FD_EXE}/config_load_module_bedtools.sh"

### print start message
timer_start=$(date +%s)
echo "Hostname:          " "$(hostname)"
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " "$(date +"%m-%d-%y+%T")"
echo

### set input and output files
FD_OUT=$(dirname  "${FP_OUT}")
FN_OUT=$(basename "${FP_OUT}")

FD_TMP=${FD_OUT}/summary
FN_TMP=region.fragment_counts.${FN_OUT}
FP_TMP=${FD_TMP}/${FN_TMP}

### show I/O file
echo "Input file: ${FP_INP}"
echo
echo "show lines of file"
fun_cat "${FP_INP}" | head
echo

### init
mkdir -p "${FD_OUT}"
mkdir -p "${FD_TMP}"

### number of columns; the last column is the overlap size used for ranking
NCOL=$(zcat "${FP_INP}" | head -n 1 | awk '{print NF}')
if [[ -z "${NCOL}" ]]; then
    echo "ERROR: cannot read columns from input (missing or empty?): ${FP_INP}" >&2
    exit 1
fi

### execute
###   one2one mapping/filtering
###   Columns 1-6 describe the fragment, the last column is the overlap size.
###   bedtools groupby only merges consecutive rows with identical group keys,
###   so sort on every grouping column (1-6) and then on overlap (descending);
###   "first" with -full then keeps the row with the largest overlap per fragment.
zcat "${FP_INP}" \
| sort -k1,1 -k2,2n -k3,3n -k4,4 -k5,5 -k6,6 -k"${NCOL},${NCOL}nr" \
| bedtools groupby \
    -g 1-6 \
    -c "${NCOL}" \
    -o first \
    -full \
| cut -f "1-${NCOL}" \
| gzip -c \
> "${FP_OUT}"

### summary
###   region count
###   Columns 7-9 are the region; the file above is ordered by fragment, so
###   re-sort by region first, otherwise one region can be split over several rows.
zcat "${FP_OUT}" \
| sort -k7,7 -k8,8n -k9,9n \
| bedtools groupby \
    -g 7-9 \
    -c 1 \
    -o count \
| gzip -c \
> "${FP_TMP}"

### show I/O file
echo "Output file: ${FP_OUT}"
echo
echo "show lines of file"
fun_cat "${FP_OUT}" | head
echo

### show I/O file
echo "Output file: ${FP_TMP}"
echo
echo "show lines of file"
fun_cat "${FP_TMP}" | head
echo


### print end message
timer=$(date +%s)
runtime=$(( timer - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
EOF
        echo
    done
done

Assay name:  STARR_ATAC_K562_Reddy_KS274
--------------------------------------
Input:     /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/assay_fcc/STARR_ATAC_K562_Reddy_KS274/coverage/fcc_astarr_macs/astarr_macs_input_overlap/overlap_fragment/ASTARRseq_K562_KS274.hg38.Output.rep1.WGS.unstranded.bed.gz
Log file:  ${FD_LOG}/region.coverage.overlap.one2one.ASTARRseq_K562_KS274.hg38.Output.rep1.WGS.unstranded.bed.gz.txt
Submitted batch job 30744195

--------------------------------------
Input:     /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/assay_fcc/STARR_ATAC_K562_Reddy_KS274/coverage/fcc_astarr_macs/astarr_macs_input_overlap/overlap_fragment/ASTARRseq_K562_KS274.hg38.Output.rep2.WGS.unstranded.bed.gz
Log file:  ${FD_LOG}/region.coverage.overlap.one2one.ASTARRseq_K562_KS274.hg38.Output.rep2.WGS.unstranded.bed.gz.txt
Submitted batch job 30744196

--------------------------------------
Input:     /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/assay_fcc/STARR_ATAC_K562_Reddy_KS27

## Review

**Check results**

In [9]:
### Show the job log of replicate 1 to verify input, output and summary
cat "${FD_LOG}/region.coverage.overlap.one2one.ASTARRseq_K562_KS274.hg38.Output.rep1.WGS.unstranded.bed.gz.txt"

Hostname:           x2-03-3.genome.duke.edu
Slurm Array Index: 
Time Stamp:         04-04-24+14:43:03

Input file: /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/assay_fcc/STARR_ATAC_K562_Reddy_KS274/coverage/fcc_astarr_macs/astarr_macs_input_overlap/overlap_fragment/ASTARRseq_K562_KS274.hg38.Output.rep1.WGS.unstranded.bed.gz

show lines of file
chr1	14145	14485	chr1:14145-14485	2	.	chr1	14282	14614	203
chr1	14243	14498	chr1:14243-14498	1	.	chr1	14282	14614	216
chr1	14454	14729	chr1:14454-14729	1	.	chr1	14282	14614	160
chr1	14522	14725	chr1:14522-14725	1	.	chr1	14282	14614	92
chr1	14599	14850	chr1:14599-14850	1	.	chr1	14282	14614	15
chr1	16048	16175	chr1:16048-16175	1	.	chr1	16025	16338	127
chr1	16048	16176	chr1:16048-16176	1	.	chr1	16025	16338	128
chr1	16068	16310	chr1:16068-16310	1	.	chr1	16025	16338	242
chr1	16174	16300	chr1:16174-16300	2	.	chr1	16025	16338	126
chr1	16175	16300	chr1:16175-16300	1	.	chr1	16025	16338	125

Output file: /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results