# Siklenka_6621_201109A5 - chip_seq
This notebook will create all the necessary files, scripts and folders to pre-process the aforementioned project. It is designed to be used in a Jupyter server deployed on a system running SLURM. The majority of the scripts and heavy-lifting processes are wrapped up in sbatch scripts. As an end user, in order to pre-process the samples provided in the spreadsheet, you simply need to *run the entire notebook* (Cell > Run all) and the system should take care of the rest for you.
#### Create necessary folder(s)

In [1]:
%%bash
# Create the data and processing folders that the later cells write into.
# PROJECT_ROOT keeps its trailing slash so the resulting paths are spelled
# exactly as in the rest of the notebook.
PROJECT_ROOT=/data/reddylab/Alex/collab/20190701_Matt/
for project_dir in \
    data/chip_seq/metadata \
    data/chip_seq/raw_reads \
    data/chip_seq/processed_raw_reads \
    processing/chip_seq/scripts \
    processing/chip_seq/jsons \
    processing/chip_seq/logs
do
    mkdir -p ${PROJECT_ROOT}/${project_dir}
done

Save metadata file

In [2]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//data/chip_seq/metadata/chip_seq_download_metadata.Siklenka_6621_201109A5.txt
Sequencing core project	Sequencing core library name	Name	Paired-end or single-end	Genome	Library type	Control
Siklenka_6621_201109A5	KS157-CREneg-g5-K27ac-rep1	KS157.CREneg.g5.K27ac.rep1	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREneg-g5-K27ac-rep2	KS157.CREneg.g5.K27ac.rep2	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREneg-g5-K27ac-rep3	KS157.CREneg.g5.K27ac.rep3	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREpos-NTC-K27ac-rep1	KS157.CREpos.NTC.K27ac.rep1	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREpos-NTC-K27ac-rep2	KS157.CREpos.NTC.K27ac.rep2	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREpos-NTC-K27ac-rep3	KS157.CREpos.NTC.K27ac.rep3	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREpos-g5-K27ac-rep1	KS157.CREpos.g5.K27ac.rep1	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREpos-g5-K27ac-rep2	KS157.CREpos.g5.K27ac.rep2	PE	mm10	ChIP-seq	
Siklenka_6621_201109A5	KS157-CREpos-g5-K27ac-rep3	KS157.CREpos.g5.K27ac.rep3	PE	mm10	ChIP-seq	


Writing /data/reddylab/Alex/collab/20190701_Matt//data/chip_seq/metadata/chip_seq_download_metadata.Siklenka_6621_201109A5.txt


#### Download FASTQ from dukeds
Create file to download FASTQ files

In [None]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/download_Siklenka_6621_201109A5.sh
#!/bin/bash
# Download the Siklenka_6621_201109A5 project with ddsclient into the
# raw_reads folder of the chip_seq data directory.
PROJECT=Siklenka_6621_201109A5
CHIP_SEQ_DATA=/data/reddylab/Alex/collab/20190701_Matt//data/chip_seq
# Path of the sample sheet; defined for reference, not used by this script.
METADATA=${CHIP_SEQ_DATA}/metadata/chip_seq_download_metadata.${PROJECT}.txt
RAW_READS_DIR=${CHIP_SEQ_DATA}/raw_reads

# Make sure the download destination exists
mkdir -p ${RAW_READS_DIR}/

module load ddsclient
ddsclient download -p ${PROJECT} ${RAW_READS_DIR}/${PROJECT}


Execute file to download files

In [None]:
%%script --out blocking_job_str bash
# Submit a job that runs the download script over ssh on the transfer host.
# sbatch's stdout ("Submitted batch job <id>") is captured in blocking_job_str.
DOWNLOAD_LOG=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/logs/Siklenka_6621_201109A5_download_fastq_files.out
DOWNLOAD_SCRIPT=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/download_Siklenka_6621_201109A5.sh
sbatch --output=${DOWNLOAD_LOG} \
    --partition=all,new \
    --wrap="ssh aeb84@Hardac-xfer.genome.duke.edu 'sh ${DOWNLOAD_SCRIPT}' "

Extract blocking job id

In [None]:
import re

# Parse the job id out of the sbatch output captured from the download
# submission. A raw string is used so that \d is a regex class rather than an
# invalid string escape, and a missing id fails with an explicit message.
job_match = re.match(r'Submitted batch job (\d+)', blocking_job_str)
if job_match is None:
    raise RuntimeError('sbatch did not report a job id: %r' % (blocking_job_str,))
blocking_job = job_match.group(1)

#### Merge lanes of FASTQ files

In [None]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/merge_lanes_Siklenka_6621_201109A5.sh
#!/bin/bash
#SBATCH --array=0-8%20
# Merge the per-lane FASTQ files of one sample and rename the merged file
# from the sequencing core library name to the sample name in the metadata.
#
# Runs as a SLURM array job: each task handles the metadata entry selected by
# SLURM_ARRAY_TASK_ID. The metadata lists 9 samples, hence tasks 0-8 (the same
# range that is used when the script is submitted).
ORDER=Siklenka_6621_201109A5
RAW_DATA_DIR=/data/reddylab/Alex/collab/20190701_Matt//data/chip_seq/raw_reads/${ORDER}
PROCESSED_DATA_DIR=/data/reddylab/Alex/collab/20190701_Matt//data/chip_seq/processed_raw_reads/${ORDER}
METADATA=/data/reddylab/Alex/collab/20190701_Matt//data/chip_seq/metadata/chip_seq_download_metadata.Siklenka_6621_201109A5.txt

# Merged files are written relative to the current directory
mkdir -p ${PROCESSED_DATA_DIR}
cd ${PROCESSED_DATA_DIR}

# Locate the exact spelling of the required column headers. Any non-zero grep
# status aborts: 1 means "no match", 2 means grep itself failed (for example
# because the metadata file cannot be read).
seq_name_header=$(/bin/grep -Eoi "sequencing.?core.?library.?name" ${METADATA})
if [[ $? != 0 ]];
then
    echo -e "ERROR: Sequencing core library name not found in ${METADATA}"
    exit 1
fi

name_header=$(/bin/grep -Poi "\tname\t" ${METADATA})
if [[ $? != 0 ]];
then
    echo -e "ERROR: Library Name column not found in ${METADATA}"
    exit 1
fi
# The unquoted echo drops the surrounding tabs, leaving only the header text
name_header=$(echo ${name_header} | cut -f2)

seq_type_header=$(head -1 ${METADATA} | /bin/grep -Poi "paired.?end.?or.?single.?end")
if [[ $? != 0 ]];
then
    echo -e "ERROR: Paired-end or single-end column not found in ${METADATA}"
    exit 1
fi

# Extract each column and keep the entry that belongs to this array task
sample_seq_name=$(/data/reddylab/software/bin/print_tab_cols.awk -v cols="${seq_name_header}" ${METADATA} \
    | awk -v SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} 'NR==SLURM_ARRAY_TASK_ID+1{print}');
sample_name=$(/data/reddylab/software/bin/print_tab_cols.awk -v cols="${name_header}" ${METADATA} \
    | awk -v SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} 'NR==SLURM_ARRAY_TASK_ID+1{print}');
seq_type=$(/data/reddylab/software/bin/print_tab_cols.awk -v cols="${seq_type_header}" ${METADATA} \
    | awk -v SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} 'NR==SLURM_ARRAY_TASK_ID+1{print}');


for read_pair in R1 R2 UMI;
do
    # All lane files (_L###_) of this sample for the current read type
    sample_files=$(/bin/ls ${RAW_DATA_DIR}/${sample_seq_name/ /}_S[0-9]*_L[0-9][0-9][0-9]_${read_pair}_* 2> /dev/null)
    if [[ $? != 0 ]]; # If no samples found with that read_pair, continue
    then
        continue;
    fi
    # R1 is always processed; R2 and UMI only for paired-end samples
    if [[ ${read_pair} == "R1" || (${seq_type/ /} == "PE" || ${seq_type/ /} == "pe") ]];
    then
        # Merge all lanes: the output name is the first lane file's name
        # without its _L###_ lane tag
        merged=$(basename $(echo ${sample_files} | awk '{print $1}') | sed -e 's/_L[0-9]\{3\}_/_/')
        cat ${sample_files} > ${merged};

        # Rename samples with our sample Names: drop the _S<n> tag, turn
        # _R1_/_R2_/_UMI_ into .R1./.R2./.UMI. and drop the numeric chunk
        # in front of .fastq
        dest_filename=$(basename $(echo ${merged} | awk '{print $1}') | sed -r 's/\_S[0-9]+//; s/\_(R1|R2|UMI)\_/\.\1\./; s/\.[0-9]+\.fastq/\.fastq/')
        mv ${merged} ${dest_filename}

        # Swap the sequencing core library name for the sample name
        cleaned_dest_filename=${dest_filename/${sample_seq_name/ /}/${sample_name/ /}}

        # Single-end files carry no read tag
        if [[ ${seq_type/ /} == "SE" || ${seq_type/ /} == "se" ]];
        then
            cleaned_dest_filename=${cleaned_dest_filename/.R1/}
        fi
        
        mv ${dest_filename} ${cleaned_dest_filename}
    fi
done


Execute file to merge lanes of FASTQ files

In [None]:
%%script --out blocking_job_str bash -s "$blocking_job"
# Submit the lane-merging array (one task per sample, tasks 0-8). It starts
# only after the job whose id arrives as $1 has completed successfully.
# sbatch's stdout is captured in blocking_job_str.
UPSTREAM_JOB_ID=$1
MERGE_LOG_PATTERN=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/logs/Siklenka_6621_201109A5_merge_fastq_files_%a.out
MERGE_SCRIPT=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/merge_lanes_Siklenka_6621_201109A5.sh
sbatch --output=${MERGE_LOG_PATTERN} \
    --partition=all,new \
    --dependency=afterok:${UPSTREAM_JOB_ID} \
    --array=0-8%20 \
    ${MERGE_SCRIPT}

Extract blocking job id

In [None]:
import re

# Parse the job id out of the sbatch output captured from the lane-merging
# submission. A raw string is used so that \d is a regex class rather than an
# invalid string escape, and a missing id fails with an explicit message.
job_match = re.match(r'Submitted batch job (\d+)', blocking_job_str)
if job_match is None:
    raise RuntimeError('sbatch did not report a job id: %r' % (blocking_job_str,))
blocking_job = job_match.group(1)

In [7]:
%%bash
# Leftover maintenance cell: it only defines the raw/processed read folders.
# The commands that used them are commented out, so running it changes nothing.
ORDER=Siklenka_6621_201109A5
CHIP_SEQ_DATA=/data/reddylab/Alex/collab/20190701_Matt//data/chip_seq
RAW_DATA_DIR=${CHIP_SEQ_DATA}/raw_reads/${ORDER}
PROCESSED_DATA_DIR=${CHIP_SEQ_DATA}/processed_raw_reads/${ORDER}
# mv ${PROCESSED_DATA_DIR/chip_seq/rna_seq} ${PROCESSED_DATA_DIR}
# ls -la  ${PROCESSED_DATA_DIR}

#### Create JSON files for CWL pipeline files

In [8]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/cwl_json_gen_Siklenka_6621_201109A5.sh
#!/bin/bash
# Run the GGR-cwl JSON generator for the ChIP-seq samples listed in the
# metadata, pointing it at the processed reads and writing into the jsons folder.
PROJECT_ROOT=/data/reddylab/Alex/collab/20190701_Matt/
ORDER=Siklenka_6621_201109A5
PROCESSED_DATA_DIR=${PROJECT_ROOT}/data/chip_seq/processed_raw_reads/${ORDER}
METADATA=${PROJECT_ROOT}/data/chip_seq/metadata/chip_seq_download_metadata.${ORDER}.txt
JSON_DIR=${PROJECT_ROOT}/processing/chip_seq/jsons
JSON_GENERATOR=/data/reddylab/software/cwl/GGR-cwl/v1.0/json-generator/run.py

# --mem and --nthreads carry the same values as the #SBATCH --mem and
# --cpus-per-task settings of the pipeline script in this notebook
python ${JSON_GENERATOR} \
    -m ${METADATA} \
    -d ${PROCESSED_DATA_DIR} \
    -o ${JSON_DIR} \
    -t chip-seq \
    --fastq-gzipped \
    --mem 24000 \
    --nthreads 16 \
    --separate-jsons



Writing /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/cwl_json_gen_Siklenka_6621_201109A5.sh


Execute file to create JSON files

In [9]:
%%script --out blocking_job_str bash -s "$blocking_job"
# Submit the JSON generation script. The id of the previously submitted job
# arrives as $1; the job is held until that job has completed successfully so
# that "Run all" executes the steps in order instead of starting this one
# immediately. sbatch's stdout is captured in blocking_job_str.
source /data/reddylab/software/miniconda2/bin/activate cwl10
sbatch -o /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/logs/Siklenka_6621_201109A5_cwl_json_gen.out \
 -p all,new \
 --depend afterok:$1 \
 /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/cwl_json_gen_Siklenka_6621_201109A5.sh

Extract blocking job id

In [10]:
import re

# Parse the job id out of the sbatch output captured from the JSON generation
# submission. A raw string is used so that \d is a regex class rather than an
# invalid string escape, and a missing id fails with an explicit message.
job_match = re.match(r'Submitted batch job (\d+)', blocking_job_str)
if job_match is None:
    raise RuntimeError('sbatch did not report a job id: %r' % (blocking_job_str,))
blocking_job = job_match.group(1)

#### Create SLURM array master bash file for pe samples

In [2]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/Siklenka_6621_201109A5-pe.sh
#!/bin/bash
#SBATCH --job-name=cwl_chip_seq
#SBATCH --output=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/logs/Siklenka_6621_201109A5-pe-%a.out
#SBATCH --mail-user=aeb84@duke.edu,ks476@duke.edu
#SBATCH --mail-type=FAIL,END
#SBATCH --mem=24000
#SBATCH --cpus-per-task=16

# SLURM array master script: every array task runs the paired-end ChIP-seq CWL
# pipeline on the JSON file whose name ends in its SLURM_ARRAY_TASK_ID.

PROCESSING_DIR=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq
RUN_NAME=Siklenka_6621_201109A5-pe

# Prepend the tool folders to PATH; each one is put in front of the previous,
# so the last entry of the list ends up first in PATH
for tool_dir in \
    /data/reddylab/software/bin \
    /data/reddylab/software/cwl/bin \
    /data/reddylab/software/preseq_v2.0 \
    /data/reddylab/software/rsem-1.2.21/ \
    /data/reddylab/software/phantompeakqualtools-1.2/ \
    /data/reddylab/software/miniconda2/envs/cwl10/bin
do
    export PATH="${tool_dir}:$PATH"
done

for required_module in bedtools2 fastqc samtools bowtie2 java
do
    module load ${required_module}
done

# For Fastqc
export DISPLAY=:0.0

# Make sure temporary files and folders are created in a specific folder
export TMPDIR="${PROCESSING_DIR}/tmpdirs/tmp-${RUN_NAME}-${SLURM_ARRAY_TASK_ID}-"
mkdir -p ${TMPDIR}

# PATH, DISPLAY and TMPDIR are handed through to the tools run by cwltool
cwltool --debug \
    --non-strict \
    --preserve-environment PATH \
    --preserve-environment DISPLAY \
    --preserve-environment TMPDIR \
    --outdir ${PROCESSING_DIR}/${RUN_NAME} \
    --no-container \
    /data/reddylab/software/cwl/GGR-cwl/v1.0/ChIP-seq_pipeline/pipeline-pe.cwl \
    ${PROCESSING_DIR}/jsons/chip_seq_download_metadata.${RUN_NAME}-${SLURM_ARRAY_TASK_ID}.json

# Delete any tmpdir not removed by cwltool
rm -rf ${TMPDIR}


Overwriting /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/Siklenka_6621_201109A5-pe.sh


Execute SLURM array master file

In [3]:
%%script --out blocking_job_str bash -s "$blocking_job"
# Submit the pipeline array (one task per sample, tasks 0-8). The id of the
# previously submitted job arrives as $1; the array is held until that job has
# completed successfully so that "Run all" does not start the pipeline before
# the preceding step is done. sbatch's stdout is captured in blocking_job_str.
source /data/reddylab/software/miniconda2/bin/activate cwl10
sbatch -p all,new \
 --depend afterok:$1 \
 --array 0-8%20 \
 /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/Siklenka_6621_201109A5-pe.sh

Extract blocking job id

In [13]:
import re

# Parse the job id out of the sbatch output captured from the pipeline array
# submission. A raw string is used so that \d is a regex class rather than an
# invalid string escape, and a missing id fails with an explicit message.
job_match = re.match(r'Submitted batch job (\d+)', blocking_job_str)
if job_match is None:
    raise RuntimeError('sbatch did not report a job id: %r' % (blocking_job_str,))
blocking_job = job_match.group(1)

In [16]:
# Display the job id currently stored in blocking_job
blocking_job

'23238632'

In [18]:
%%bash -s "$blocking_job"
# Send a notification mail once the job stored in blocking_job has ended,
# whatever its exit state (afterany): the dependent job only echoes a message,
# and --mail-type=BEGIN mails the moment it is allowed to start.
# The job id is taken from blocking_job ($1) instead of being hard-coded, so
# the cell keeps working when the notebook is re-run.
sbatch --mail-user=ks476@duke.edu --mail-type=BEGIN --dependency=afterany:$1 --wrap "echo K27ac samples finished"

#### Create QC generating script

In [14]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/generate_qc_cell_Siklenka_6621_201109A5-pe.sh
#!/bin/bash
#SBATCH --job-name=qc
#SBATCH --output=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/logs/qc_gen.Siklenka_6621_201109A5-pe.out

# Build qc.txt in the pipeline output folder by running the paired-end
# ChIP-seq stats script on every sample that has a *PBC.txt file there.
source /data/reddylab/software/miniconda2/bin/activate alex
PIPELINE_OUT_DIR=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/Siklenka_6621_201109A5-pe
cd ${PIPELINE_OUT_DIR}

# Sample names are the *PBC.txt file names without the .PBC.txt suffix
SAMPLE_PREFIXES=$(/bin/ls -1 *PBC.txt | sed 's@.PBC.txt@@')

python /data/reddylab/software/cwl/bin/generate_stats_chipseq_paired_end.py ./ \
    -samples ${SAMPLE_PREFIXES} \
    > qc.txt

Writing /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/generate_qc_cell_Siklenka_6621_201109A5-pe.sh


Generate QCs for Siklenka_6621_201109A5-pe

In [15]:
%%script --out blocking_job_str bash -s "$blocking_job"
# Submit the QC script; it starts only after the job whose id arrives as $1
# has completed successfully. sbatch's stdout is captured in blocking_job_str.
UPSTREAM_JOB_ID=$1
QC_SCRIPT=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/generate_qc_cell_Siklenka_6621_201109A5-pe.sh
sbatch --partition=all,new \
    --dependency=afterok:${UPSTREAM_JOB_ID} \
    ${QC_SCRIPT}

Extract blocking job id

In [None]:
import re

# Parse the job id out of the sbatch output captured from the QC submission.
# A raw string is used so that \d is a regex class rather than an invalid
# string escape, and a missing id fails with an explicit message.
job_match = re.match(r'Submitted batch job (\d+)', blocking_job_str)
if job_match is None:
    raise RuntimeError('sbatch did not report a job id: %r' % (blocking_job_str,))
blocking_job = job_match.group(1)

#### Create plot generating script

In [None]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/generate_plot.Siklenka_6621_201109A5-pe.sh
#!/bin/bash
#SBATCH --job-name=generate_fingerplot
#SBATCH --output=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/logs/generate-plot.Siklenka_6621_201109A5-pe-%a.out

# This script is a subscript run from `countFactors_standard.sh` for
# data insertion into the ChIP-DB web application. Intended for samples
# that follow the Reddy Lab sequencing sample naming conventions.
#
# Runs as a SLURM array job: each task reads one sample row of the metadata,
# writes a small per-sample metadata file and runs plotFingerprint on the
# sample's BAM file (together with its input control BAM, when the sample
# names one in the Control column).

METADATA=/data/reddylab/Alex/collab/20190701_Matt//data/chip_seq/metadata/chip_seq_download_metadata.Siklenka_6621_201109A5.txt
IN_DIR=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/Siklenka_6621_201109A5-pe
OUT_DIR=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/Siklenka_6621_201109A5-pe

# Initialize and read in the indices for each field
FLOWCELL_INDEX=-1; FACTOR_INDEX=-1; INPUT_CTRL_INDEX=-1
# The header is lower-cased, so every comparison below must be lower case too
HEADER=$(head -n 1 ${METADATA} | tr '[:upper:]' '[:lower:]')
IFS=$'\t' read -ra ADDR <<< "$HEADER"
for i in "${!ADDR[@]}"; do
        # If index denotes field, change the index. Cut is 1-indexed, add 1
        if [[ ${ADDR[$i]} = 'name' ]] ; then
                FACTOR_INDEX=$((i+1))
        elif [[ ${ADDR[$i]} = 'experiment name' ]] || [[ ${ADDR[$i]} = 'sequencing core project' ]] ; then
                FLOWCELL_INDEX=$((i+1))
        elif [[ ${ADDR[$i]} = 'control' ]]; then
                INPUT_CTRL_INDEX=$((i+1))
        fi
done

# Task ids are 0-based and line 1 of the metadata is the header,
# so task N reads line N+2
LINE_NUM=$((${SLURM_ARRAY_TASK_ID}+2))
FILE_LINE=$(sed -n "${LINE_NUM}p" ${METADATA})
FLOWCELL=$(echo "$FILE_LINE" | cut -f${FLOWCELL_INDEX})
FACTOR=$(echo "$FILE_LINE" | cut -f${FACTOR_INDEX})
FACTOR_FILE=$(/bin/ls -1 ${IN_DIR}/${FACTOR}*.bam.bai | sed "s/.bai//")

# Only look up the input control when the metadata has a Control column and
# this sample's entry in it is not empty
INPUT_CTRL=""
INPUT_CTRL_FILE=""
if [[ ${INPUT_CTRL_INDEX} != -1 ]];
then
        INPUT_CTRL=$(echo "$FILE_LINE" | cut -f${INPUT_CTRL_INDEX})
fi
if [[ -n "${INPUT_CTRL}" ]];
then
        INPUT_CTRL_FILE=$(/bin/ls -1 ${IN_DIR}/${INPUT_CTRL}*bam)
fi
echo "Factor is: ${FACTOR}, file is: ${FACTOR_FILE}"
echo "Flowcell is: ${FLOWCELL}, Input control is: ${INPUT_CTRL}"

# Write sample metadata to file with name, timestamp and
# additional information
METADATA_FILE="${OUT_DIR}/${FACTOR}_metadata.txt"
# Date part of the metadata file's last status change time
TIMESTAMP=$(stat -c%z ${METADATA} | cut -d' ' -f1)
HEADER="Factor\tFlowcell\tInput_control\tTimestamp"
echo -e ${HEADER} > ${METADATA_FILE}
echo -e "${FACTOR}\t${FLOWCELL}\t${INPUT_CTRL}\t${TIMESTAMP}" >> ${METADATA_FILE}


# Case where sample has w/ Input control
if [[ -n "${INPUT_CTRL}" ]];
then
        echo "Sample, ${FACTOR_FILE}, has Input control ${INPUT_CTRL}"
        plotFingerprint -b ${FACTOR_FILE} ${INPUT_CTRL_FILE} \
        --labels ${FACTOR} ${INPUT_CTRL} \
        --outQualityMetrics ${OUT_DIR}/${FACTOR}_QCmetrics.txt \
        -T "Fingerprint of ${FACTOR}" \
        -plot ${OUT_DIR}/${FACTOR}.png \
        --outRawCounts ${OUT_DIR}/${FACTOR}_counts.tab
# Case where sample has no control
else
        echo "Sample, ${FACTOR_FILE}, has no controls"
        plotFingerprint -b ${FACTOR_FILE} \
        --labels ${FACTOR} \
        --outQualityMetrics ${OUT_DIR}/${FACTOR}_QCmetrics.txt \
        -T "Fingerprint of ${FACTOR}" \
        -plot ${OUT_DIR}/${FACTOR}.png \
        --outRawCounts ${OUT_DIR}/${FACTOR}_counts.tab
fi

# Always report success, whatever plotFingerprint returned
exit 0

Generate plots and data for website

In [None]:
%%script --out blocking_job_str bash -s "$blocking_job"
# Submit the plot-generating array (one task per sample, tasks 0-8, at most 5
# running at a time). It starts only after the job whose id arrives as $1 has
# completed successfully. sbatch's stdout is captured in blocking_job_str.
source /data/reddylab/software/miniconda2/bin/activate cwl10
UPSTREAM_JOB_ID=$1
PLOT_SCRIPT=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/generate_plot.Siklenka_6621_201109A5-pe.sh
sbatch --partition=all,new \
    --dependency=afterok:${UPSTREAM_JOB_ID} \
    --array=0-8%5 \
    ${PLOT_SCRIPT}

Extract blocking job id

In [None]:
import re

# Parse the job id out of the sbatch output captured from the plot-generating
# submission. A raw string is used so that \d is a regex class rather than an
# invalid string escape, and a missing id fails with an explicit message.
job_match = re.match(r'Submitted batch job (\d+)', blocking_job_str)
if job_match is None:
    raise RuntimeError('sbatch did not report a job id: %r' % (blocking_job_str,))
blocking_job = job_match.group(1)

#### Create data upload script

In [None]:
%%writefile /data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/data_upload_Siklenka_6621_201109A5-pe.sh
#!/bin/bash
#SBATCH --job-name=data_upload
#SBATCH --output=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/logs/data_upload.Siklenka_6621_201109A5-pe.out

# Run data_upload.py on the pipeline output folder, logging to the logs folder.
# NOTE(review): -u, -d and -c are all given the literal value None; confirm
# that this is what data_upload.py expects.
source /data/reddylab/software/miniconda2/bin/activate alex
PROCESSING_DIR=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq
RUN_NAME=Siklenka_6621_201109A5-pe
python /gpfs/fs1/data/reddylab/software/cwl/ggr-cwl-ipynb-gen/data_upload.py \
    -i ${PROCESSING_DIR}/${RUN_NAME} \
    -u None \
    -d None \
    -c None \
    -o ${PROCESSING_DIR}/logs/${RUN_NAME}_data_upload.log

### Upload ChIP-seq to web-application

In [None]:
%%script --out blocking_job_str bash -s "$blocking_job"
# Submit the data upload script; it starts only after the job whose id arrives
# as $1 has completed successfully. sbatch's stdout is captured in
# blocking_job_str.
source /data/reddylab/software/miniconda2/bin/activate alex
UPSTREAM_JOB_ID=$1
UPLOAD_SCRIPT=/data/reddylab/Alex/collab/20190701_Matt//processing/chip_seq/scripts/data_upload_Siklenka_6621_201109A5-pe.sh
sbatch --partition=all,new \
    --dependency=afterok:${UPSTREAM_JOB_ID} \
    ${UPLOAD_SCRIPT}

Extract blocking job id

In [None]:
import re

# Parse the job id out of the sbatch output captured from the data upload
# submission. A raw string is used so that \d is a regex class rather than an
# invalid string escape, and a missing id fails with an explicit message.
job_match = re.match(r'Submitted batch job (\d+)', blocking_job_str)
if job_match is None:
    raise RuntimeError('sbatch did not report a job id: %r' % (blocking_job_str,))
blocking_job = job_match.group(1)