## Set up script to split chromosomes within a bed files

In [1]:
%%writefile splitBed.sh
#############################################
### Usage ./splitBed.sh input.bed outdir
#############################################

### get directory from command line
FP_IN=$1
FD_OT=$2

### loop through the input bed file and 
### grep each chromosome to a output file
for chr in `cut -f 1 $FP_IN | sort | uniq`;
do
    echo $chr
    grep -w $chr $FP_IN > $FD_OT/$chr.bed
done

Overwriting splitBed.sh


In [2]:
!cat splitBed.sh

#############################################
### Usage ./splitBed.sh input.bed outdir
#############################################

### get directory from command line
FP_IN=$1
FD_OT=$2

### loop through the input bed file and 
### grep each chromosome to a output file
for chr in `cut -f 1 $FP_IN | sort | uniq`;
do
    echo $chr
    grep -w $chr $FP_IN > $FD_OT/$chr.bed
done


## Input

In [7]:
%%bash
FD_ALIGN=/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments
ls -d $FD_ALIGN/Input*/

/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/Input1/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/Input2/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/Input3/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/Input4/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/Input5/


In [3]:
%%bash
### set environment
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=0-4 \
    --mem 8G \
    -o ${FD_LOG}/prep_splitbed_input.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_ALIGN=/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments

### get data directory and set output directory
FD_DATS=($(ls -d ${FD_ALIGN}/Input*/))
FD_DAT=${FD_DATS[${SLURM_ARRAY_TASK_ID}]}
FD_OUT=/data/reddylab/Kuei/out/CombEffect_STARR/data/"$(basename -- $FD_DAT)"
FP_DAT=${FD_DAT}/*.bedpe

### create the output directory in not exist
mkdir -p $FD_OUT
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  directory:  " ${FD_DAT}
echo "Input  file:       " ${FP_DAT}
echo "Output directory:  " ${FD_OUT}

echo "Show the first few lines of the input file"
head ${FP_DAT}

### run script to split the bed file
chmod +x splitBed.sh
echo "Splitting the input file"
./splitBed.sh ${FP_DAT} ${FD_OUT}
echo "Done!"

EOF

Submitted batch job 25311130


## Output DMSO

In [9]:
%%bash
FD_ALIGN=/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments
ls -d $FD_ALIGN/TFX*_DMSO/

/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX2_DMSO/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX3_DMSO/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX4_DMSO/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX5_DMSO/


In [11]:
%%bash
### set environment
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=0-3 \
    --mem 8G \
    -o ${FD_LOG}/prep_splitbed_dmso.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_ALIGN=/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments

### get data directory and set output directory
FD_DATS=($(ls -d ${FD_ALIGN}/TFX*_DMSO/))
FD_DAT=${FD_DATS[${SLURM_ARRAY_TASK_ID}]}
FD_OUT=/data/reddylab/Kuei/out/CombEffect_STARR/data/"$(basename -- $FD_DAT)"
FP_DAT=${FD_DAT}/*.bedpe

### create the output directory in not exist
mkdir -p $FD_OUT
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  directory:  " ${FD_DAT}
echo "Input  file:       " ${FP_DAT}
echo "Output directory:  " ${FD_OUT}

echo "Show the first few lines of the input file"
head ${FP_DAT}

### run script to split the bed file
chmod +x splitBed.sh
echo "Splitting the input file"
./splitBed.sh ${FP_DAT} ${FD_OUT}
echo "Done!"

EOF

Submitted batch job 25311157


## Output Dex

In [13]:
%%bash
FD_ALIGN=/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments
ls -d $FD_ALIGN/TFX*_Dex/

/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX2_Dex/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX3_Dex/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX4_Dex/
/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments/TFX5_Dex/


In [15]:
%%bash
### set environment
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --array=0-3 \
    --mem 8G \
    -o ${FD_LOG}/prep_splitbed_dex.%a.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_ALIGN=/data/reddylab/gjohnson/whole_genome_STARRseq/wgss3/alignment_and_processing/alignments

### get data directory and set output directory
FD_DATS=($(ls -d ${FD_ALIGN}/TFX*_Dex/))
FD_DAT=${FD_DATS[${SLURM_ARRAY_TASK_ID}]}
FD_OUT=/data/reddylab/Kuei/out/CombEffect_STARR/data/"$(basename -- $FD_DAT)"
FP_DAT=${FD_DAT}/*.bedpe

### create the output directory in not exist
mkdir -p $FD_OUT
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Input  directory:  " ${FD_DAT}
echo "Input  file:       " ${FP_DAT}
echo "Output directory:  " ${FD_OUT}

echo "Show the first few lines of the input file"
head ${FP_DAT}

### run script to split the bed file
chmod +x splitBed.sh
echo "Splitting the input file"
./splitBed.sh ${FP_DAT} ${FD_OUT}
echo "Done!"

EOF

Submitted batch job 25311210


## Merge each chromosome for each sample

### Test looping

In [10]:
%%bash
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_BEDS=($(ls -d ${FD_WRK}/data/Input?/))

for FD_BED in ${FD_BEDS[@]}; do
   echo ${FD_BED}
done

/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input2/
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input3/
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input4/
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input5/


In [31]:
%%bash
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_BEDS=($(ls -d ${FD_WRK}/data/TFX?_DMSO/))

for FD_BED in ${FD_BEDS[@]}; do
   echo ${FD_BED}
done

/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_DMSO/
/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_DMSO/
/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_DMSO/
/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX5_DMSO/


In [33]:
%%bash
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_BEDS=($(ls -d ${FD_WRK}/data/TFX?_Dex/))

for FD_BED in ${FD_BEDS[@]}; do
   echo ${FD_BED}
done

/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_Dex/
/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_Dex/
/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_Dex/
/data/reddylab/Kuei/out/CombEffect_STARR/data/TFX5_Dex/


In [19]:
ls /data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr*.bed

/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr10.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr11.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr12.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr13.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr14.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr15.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr16.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr17.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr18.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr19.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr1.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr20.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr21.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr22.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr2.bed
/data/reddylab/Kuei/out/CombEffect_STARR/data/Input1/chr3

In [28]:
%%bash
CHROMS=({1..22} X Y)
for CHROM in ${CHROMS[@]}; do
    echo ${CHROM}
done

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
X
Y


### Merge each chromosome for Inputs

In [29]:
%%bash
### set environment
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --mem 8G \
    -o ${FD_LOG}/prep_merge_input.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_BEDS=($(ls -d ${FD_WRK}/data/Input?/))
FD_OUT=${FD_WRK}/data/Input

### create merged directory
mkdir -p ${FD_OUT}

### merge for each chromosome
CHROMS=({1..22} X Y)
for CHROM in ${CHROMS[@]}; do
    ### init: create new file
    FN_OUT=chr${CHROM}.bed
    echo -n "" > ${FD_OUT}/${FN_OUT}
    
    ### merging
    for FD_BED in ${FD_BEDS[@]}; do
        ### count each file before merged
        wc -l ${FD_BED}/${FN_OUT}
        
        ### merge files
        cat   ${FD_BED}/${FN_OUT} >> ${FD_OUT}/${FN_OUT}
    done
    
    ### count file after merged
    wc -l ${FD_OUT}/${FN_OUT}
    echo
done

echo Done

EOF

Submitted batch job 25422007


In [30]:
%%bash
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log
cat ${FD_LOG}/prep_merge_input.txt

1586763 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input1//chr1.bed
1719418 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input2//chr1.bed
1985257 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input3//chr1.bed
1619069 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input4//chr1.bed
1300621 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input5//chr1.bed
8211128 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input/chr1.bed

1517981 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input1//chr2.bed
1639757 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input2//chr2.bed
1890272 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input3//chr2.bed
1547377 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input4//chr2.bed
1249993 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input5//chr2.bed
7845380 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input/chr2.bed

1232997 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input1//chr3.bed
1329924 /data/reddylab/Kuei/out/CombEffect_STARR/data/Input2//chr3.bed
1531280 

### Merge each chromosome for Output (DMSO)

In [36]:
%%bash
### set environment
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --mem 8G \
    -o ${FD_LOG}/prep_merge_output_dmso.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_BEDS=($(ls -d ${FD_WRK}/data/TFX?_DMSO/))
FD_OUT=${FD_WRK}/data/TFX_DMSO

### create merged directory
mkdir -p ${FD_OUT}

### merge for each chromosome
CHROMS=({1..22} X Y)
for CHROM in ${CHROMS[@]}; do
    ### init: create new file
    FN_OUT=chr${CHROM}.bed
    echo -n "" > ${FD_OUT}/${FN_OUT}
    
    ### merging
    for FD_BED in ${FD_BEDS[@]}; do
        ### count each file before merged
        wc -l ${FD_BED}/${FN_OUT}
        
        ### merge files
        cat   ${FD_BED}/${FN_OUT} >> ${FD_OUT}/${FN_OUT}
    done
    
    ### count file after merged
    wc -l ${FD_OUT}/${FN_OUT}
    echo
done

echo Done

EOF

Submitted batch job 25422073


In [39]:
%%bash
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log
cat ${FD_LOG}/prep_merge_output_dmso.txt

3848855 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_DMSO//chr1.bed
2345265 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_DMSO//chr1.bed
2702080 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_DMSO//chr1.bed
2531183 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX5_DMSO//chr1.bed
11427383 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX_DMSO/chr1.bed

3391586 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_DMSO//chr2.bed
2091450 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_DMSO//chr2.bed
2421659 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_DMSO//chr2.bed
2239378 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX5_DMSO//chr2.bed
10144073 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX_DMSO/chr2.bed

2643287 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_DMSO//chr3.bed
1625019 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_DMSO//chr3.bed
1865831 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_DMSO//chr3.bed
1710389 /data/reddylab/Kuei/out/CombEf

In [42]:
5302 + 2456 + 2758 + 2555
#13071

13071

### Merge each chromosome for Output (Dex)

In [37]:
%%bash
### set environment
module load perl
module load gcc
source /data/reddylab/software/miniconda2/bin/activate alex_dev
export PATH=/data/reddylab/software/homer/bin/:$PATH

### set log file directory
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log

### run script using sbatch
sbatch -pnew,all \
    --mem 8G \
    -o ${FD_LOG}/prep_merge_output_dex.txt \
    <<'EOF'
#!/bin/bash
### set directories
FD_WRK=/data/reddylab/Kuei/out/CombEffect_STARR
FD_BEDS=($(ls -d ${FD_WRK}/data/TFX?_Dex/))
FD_OUT=${FD_WRK}/data/TFX_Dex

### create merged directory
mkdir -p ${FD_OUT}

### merge for each chromosome
CHROMS=({1..22} X Y)
for CHROM in ${CHROMS[@]}; do
    ### init: create new file
    FN_OUT=chr${CHROM}.bed
    echo -n "" > ${FD_OUT}/${FN_OUT}
    
    ### merging
    for FD_BED in ${FD_BEDS[@]}; do
        ### count each file before merged
        wc -l ${FD_BED}/${FN_OUT}
        
        ### merge files
        cat   ${FD_BED}/${FN_OUT} >> ${FD_OUT}/${FN_OUT}
    done
    
    ### count file after merged
    wc -l ${FD_OUT}/${FN_OUT}
    echo
done

echo Done

EOF

Submitted batch job 25422074


In [40]:
%%bash
FD_LOG=/gpfs/fs1/data/reddylab/Kuei/out/CombEffect_STARR/log
cat ${FD_LOG}/prep_merge_output_dex.txt

3991734 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_Dex//chr1.bed
2316427 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_Dex//chr1.bed
3039977 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_Dex//chr1.bed
3715191 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX5_Dex//chr1.bed
13063329 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX_Dex/chr1.bed

3529288 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_Dex//chr2.bed
2054980 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_Dex//chr2.bed
2695597 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_Dex//chr2.bed
3298694 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX5_Dex//chr2.bed
11578559 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX_Dex/chr2.bed

2707973 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX2_Dex//chr3.bed
1562721 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX3_Dex//chr3.bed
2040708 /data/reddylab/Kuei/out/CombEffect_STARR/data/TFX4_Dex//chr3.bed
2491843 /data/reddylab/Kuei/out/CombEffect_STARR/da

In [41]:
6730 + 2904 + 3811 + 4970
#18415

18415