**Set environment**

In [2]:
### set env
### Source the project configuration, then the bedtools module loader, and
### finally print the resolved locations with show_env.
### NOTE(review): show_env is not defined in this notebook; presumably it is
### provided by config_duke.sh -- confirm.
source ../config/config_duke.sh
source ../config/config_load_module_bedtools.sh
show_env

You are on Duke Server: HARDAC
BASE DIRECTORY:     /gpfs/fs1/data/reddylab/Kuei
PATH OF SOURCE:     /gpfs/fs1/data/reddylab/Kuei/source
PATH OF EXECUTABLE: /gpfs/fs1/data/reddylab/Kuei/exe
PATH OF ANNOTATION: /gpfs/fs1/data/reddylab/Kuei/annotation
PATH OF PROJECT:    /gpfs/fs1/data/reddylab/Kuei/GitRepo/Proj_CombEffect_ENCODE_FCC/notebooks
PATH OF RESULTS:    /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc



In [3]:
### Confirm which bedtools executable is found on PATH and report its version
which bedtools
bedtools -version

/nfs/software/helmod/apps/Core/bedtools2/2.25.0-fasrc01/bin/bedtools
bedtools v2.25.0


**Check data directory**

In [5]:
### List the fragment BED files by file name only.
### The glob is expanded directly (instead of parsing `ls` output through
### xargs) so that paths containing whitespace are not split into pieces.
for FPATH in "${FD_RES}"/Tewhey_K562_TileMPRA/fragment/*.bed; do
    if [ -e "${FPATH}" ]; then
        basename "${FPATH}"
    else
        ### an unmatched glob is left as the literal pattern; report it
        echo "No BED file found: ${FPATH}" >&2
    fi
done

Tile_K562_hg38_20200905.FADS.stranded_neg.bed
Tile_K562_hg38_20200905.FADS.stranded_pos.bed
Tile_K562_hg38_20210130.GATA1.stranded_pos.bed
Tile_K562_hg38_20210130.MYC.stranded_pos.bed


**Check chromosome**

In [6]:
### Print the first record of each region's fragment file to see which
### chromosome the region lies on (order: GATA1, MYC, FADS).
FDIRY=${FD_RES}/Tewhey_K562_TileMPRA/fragment

for FNAME in \
    Tile_K562_hg38_20210130.GATA1.stranded_pos.bed \
    Tile_K562_hg38_20210130.MYC.stranded_pos.bed \
    Tile_K562_hg38_20200905.FADS.stranded_pos.bed
do
    head -1 ${FDIRY}/${FNAME}
done

chrX	47786401	47786600	X:47786401-47786600	.	+
chr8	126735901	126736100	8:126735901-126736100	.	+
chr11	61787329	61787528	11:61554801-61555000	.	+


```
GATA1 --- chrX
MYC   --- chr8
FADS  --- chr11
```

## Calculate NUC for fragments in Tewhey K562 TileMPRA

### Region: GATA1

**RUN**

In [7]:
sbatch -p ${NODE} \
    -J bedtools_nuc_tmpra_gata1 \
    --mem 20G \
    --array 0 \
    -o ${FD_LOG}/fragment_nuc_tmpra_gata1.txt \
    <<'EOF'
#!/bin/bash
### Job: run `bedtools nuc` on the GATA1 TileMPRA fragments (chrX) and store
### the gzip-compressed result table in the fragment_nuc folder.
### The job exits with a non-zero status if an input is missing or if
### bedtools fails, so a broken run is not reported as 'Done!'.

### set directories & global variables
source ../config/config_duke.sh
CHROM=chrX
REGION=GATA1

### make a pipeline fail when ANY of its commands fails; without this the
### exit status of `bedtools nuc | gzip` would be the status of gzip only
set -o pipefail

### set input and output
FD_BED=${FD_RES}/Tewhey_K562_TileMPRA/fragment
FN_BED=Tile_K562_hg38_20210130.${REGION}.stranded_pos.bed

FD_OUT=${FD_RES}/Tewhey_K562_TileMPRA/fragment_nuc
FN_OUT=${FN_BED}.gz

### genome annotation
FD_GEN=${FD_ANN}/genome/hg38/fasta
FN_GEN=${CHROM}.fa

### print start message
timer_start=$(date +%s)
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### Show input/output file
echo "Input  file: " ${FD_BED}/${FN_BED}
echo "Output file: " ${FD_OUT}/${FN_OUT}
echo

### check requirements before doing any work
if ! command -v bedtools > /dev/null; then
    echo "ERROR: bedtools is not available on PATH" >&2
    exit 1
fi
for FPATH in "${FD_BED}/${FN_BED}" "${FD_GEN}/${FN_GEN}"; do
    if [ ! -s "${FPATH}" ]; then
        echo "ERROR: missing or empty input file: ${FPATH}" >&2
        exit 1
    fi
done

echo "Show the first few lines of the input file"
echo ${FD_BED}/${FN_BED}
head -n 3 "${FD_BED}/${FN_BED}"
echo

### init: create output folder if not exist
mkdir -p "${FD_OUT}"

### profile the nucleotide content (e.g. AT/GC fractions) of each fragment;
### on failure remove the incomplete output so it cannot be used downstream
if ! bedtools nuc \
        -fi  "${FD_GEN}/${FN_GEN}" \
        -bed "${FD_BED}/${FN_BED}" |\
        gzip -c > "${FD_OUT}/${FN_OUT}"
then
    echo "ERROR: bedtools nuc failed; removing incomplete output file" >&2
    rm -f "${FD_OUT}/${FN_OUT}"
    exit 1
fi

### Show input/output file
echo "Show the first few lines of the output file"
echo ${FD_OUT}/${FN_OUT}
zcat "${FD_OUT}/${FN_OUT}" | head -n 3
echo

### print end message
### NOTE(review): displaytime is not defined in this script; presumably it
### is provided by config_duke.sh -- confirm.
timer_stop=$(date +%s)
runtime=$(( timer_stop - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 29147842


**CHECK**

In [8]:
### Print the log of the GATA1 job (the file given to sbatch via -o)
cat ${FD_LOG}/fragment_nuc_tmpra_gata1.txt

Hostname:           dl-01.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         10-11-22+22:18:47

Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed
Output file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_nuc/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed.gz

Show the first few lines of the input file
/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed
chrX	47786401	47786600	X:47786401-47786600	.	+
chrX	47786451	47786650	X:47786451-47786650	.	+
chrX	47786501	47786700	X:47786501-47786700	.	+

Show the first few lines of the output file
/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_nuc/Tile_K562_hg38_20210130.GATA1.stranded_pos.bed.gz
#1_usercol	2_usercol	3_usercol	4_usercol	5_usercol	6_usercol	7_pct_at	8_pct_g

### Region: MYC

**RUN**

In [9]:
sbatch -p ${NODE} \
    -J bedtools_nuc_tmpra_myc \
    --mem 20G \
    --array 0 \
    -o ${FD_LOG}/fragment_nuc_tmpra_myc.txt \
    <<'EOF'
#!/bin/bash
### Job: run `bedtools nuc` on the MYC TileMPRA fragments (chr8) and store
### the gzip-compressed result table in the fragment_nuc folder.
### The job exits with a non-zero status if an input is missing or if
### bedtools fails, so a broken run is not reported as 'Done!'.

### set directories & global variables
source ../config/config_duke.sh
CHROM=chr8
REGION=MYC

### make a pipeline fail when ANY of its commands fails; without this the
### exit status of `bedtools nuc | gzip` would be the status of gzip only
set -o pipefail

### set input and output
FD_BED=${FD_RES}/Tewhey_K562_TileMPRA/fragment
FN_BED=Tile_K562_hg38_20210130.${REGION}.stranded_pos.bed

FD_OUT=${FD_RES}/Tewhey_K562_TileMPRA/fragment_nuc
FN_OUT=${FN_BED}.gz

### genome annotation
FD_GEN=${FD_ANN}/genome/hg38/fasta
FN_GEN=${CHROM}.fa

### print start message
timer_start=$(date +%s)
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### Show input/output file
echo "Input  file: " ${FD_BED}/${FN_BED}
echo "Output file: " ${FD_OUT}/${FN_OUT}
echo

### check requirements before doing any work
if ! command -v bedtools > /dev/null; then
    echo "ERROR: bedtools is not available on PATH" >&2
    exit 1
fi
for FPATH in "${FD_BED}/${FN_BED}" "${FD_GEN}/${FN_GEN}"; do
    if [ ! -s "${FPATH}" ]; then
        echo "ERROR: missing or empty input file: ${FPATH}" >&2
        exit 1
    fi
done

echo "Show the first few lines of the input file"
echo ${FD_BED}/${FN_BED}
head -n 3 "${FD_BED}/${FN_BED}"
echo

### init: create output folder if not exist
mkdir -p "${FD_OUT}"

### profile the nucleotide content (e.g. AT/GC fractions) of each fragment;
### on failure remove the incomplete output so it cannot be used downstream
if ! bedtools nuc \
        -fi  "${FD_GEN}/${FN_GEN}" \
        -bed "${FD_BED}/${FN_BED}" |\
        gzip -c > "${FD_OUT}/${FN_OUT}"
then
    echo "ERROR: bedtools nuc failed; removing incomplete output file" >&2
    rm -f "${FD_OUT}/${FN_OUT}"
    exit 1
fi

### Show input/output file
echo "Show the first few lines of the output file"
echo ${FD_OUT}/${FN_OUT}
zcat "${FD_OUT}/${FN_OUT}" | head -n 3
echo

### print end message
### NOTE(review): displaytime is not defined in this script; presumably it
### is provided by config_duke.sh -- confirm.
timer_stop=$(date +%s)
runtime=$(( timer_stop - timer_start ))
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"

EOF

Submitted batch job 29147843


**CHECK**

In [10]:
### Print the log of the MYC job (the file given to sbatch via -o)
cat ${FD_LOG}/fragment_nuc_tmpra_myc.txt

Hostname:           dl-01.genome.duke.edu
Slurm Array Index:  0
Time Stamp:         10-11-22+22:20:17

Input  file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_hg38_20210130.MYC.stranded_pos.bed
Output file:  /gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_nuc/Tile_K562_hg38_20210130.MYC.stranded_pos.bed.gz

Show the first few lines of the input file
/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment/Tile_K562_hg38_20210130.MYC.stranded_pos.bed
chr8	126735901	126736100	8:126735901-126736100	.	+
chr8	126735951	126736150	8:126735951-126736150	.	+
chr8	126736001	126736200	8:126736001-126736200	.	+

Show the first few lines of the output file
/gpfs/fs1/data/reddylab/Kuei/out/proj_combeffect_encode_fcc/Tewhey_K562_TileMPRA/fragment_nuc/Tile_K562_hg38_20210130.MYC.stranded_pos.bed.gz
#1_usercol	2_usercol	3_usercol	4_usercol	5_usercol	6_usercol	7_pct_at	8_p