Skip to content

Commit

Permalink
- Added HISAT2 and HTSeq as possible options for the aligner and counter tools
Browse files Browse the repository at this point in the history
- Modified job submission and configuration files for analysis step to allow user to specify the aligner and counter used in analysis
- Restructured the separate aligner index and reference folders into a single genome folder with sub-folders for the indexes and the reference
- Fixed a bug with single-end file handling in the pre-processing streaming script
- Fixed the EMR job submission command to handle complex extra arguments for the aligner and counter
  • Loading branch information
andr-kun committed Oct 13, 2016
1 parent c64ea2e commit b81fa15
Show file tree
Hide file tree
Showing 11 changed files with 236 additions and 77 deletions.
10 changes: 7 additions & 3 deletions analysis_job.config
@@ -1,5 +1,5 @@
[job_config]
name = Analysis job
name = FASTQ alignment and quantification
action_on_failure = CONTINUE
analysis_script = run_pipeline_multiple_files.py
analysis_script_s3_location = s3://[YOUR-BUCKET]/scripts
Expand All @@ -17,7 +17,11 @@ annotation_file =
# Option for strand specificity is NONE|FIRST_READ_TRANSCRIPTION_STRAND|SECOND_READ_TRANSCRIPTION_STRAND
strand_specificity = NONE
run_picard = True
star_extra_args =
# Option for aligner tools is STAR or HISAT2
aligner_tool = STAR
aligner_extra_args =
# Option for counter tools is featureCount or HTSeq
counter_tool = featureCount
counter_extra_args = -t exon -g gene_name
picard_extra_args =
region = us-west-2
region = us-west-2
3 changes: 1 addition & 2 deletions emr_cluster.config
Expand Up @@ -8,8 +8,7 @@ bootstrap_scripts_local_location = source/cluster_creator
upload_bootstrap_scripts = True

software_installer_location = s3://[YOUR-BUCKET]/...
genome_ref_location = s3://[YOUR-BUCKET]/...
star_ref_location = s3://[YOUR-BUCKET]/...
genome_folder_location = s3://[YOUR-BUCKET]/...

[EMR_nodes]
key_name = [YOUR KEY NAME]
Expand Down
6 changes: 3 additions & 3 deletions launch_cluster.py
Expand Up @@ -12,8 +12,8 @@


def check_configuration(config):
if not utility.check_config(config, "EMR", ["release_label", "software_installer_location", "genome_ref_location",
"star_ref_location"]):
if not utility.check_config(config, "EMR", ["release_label", "software_installer_location",
"genome_folder_location"]):
return False

if not utility.check_upload_config(config["EMR"], "upload_bootstrap_scripts", "bootstrap_scripts",
Expand Down Expand Up @@ -93,7 +93,7 @@ def build_command(config):
if bootstrap_script == "install_software.sh":
bootstrap_action_args = [config["EMR"]["software_installer_location"]]
elif bootstrap_script == "copy_reference.sh":
bootstrap_action_args = [config["EMR"]["genome_ref_location"], config["EMR"]["star_ref_location"]]
bootstrap_action_args = [config["EMR"]["genome_folder_location"]]

bootstrap_actions.append({
"Name": bootstrap_script,
Expand Down
37 changes: 11 additions & 26 deletions source/cluster_creator/copy_reference.sh
@@ -1,38 +1,23 @@
#!/bin/bash
# copies reference files from S3 & unzips
# input args:
# $1 - AWS S3 URI for genome reference location
# $2 - AWS S3 URI for STAR reference location
# $1 - AWS S3 URI for location containing genome reference, star index and hisat index

# want to terminate on error
set -e
set -o pipefail

function unzip_files() {
# unzip any .gz files in current directory or any subdirectories
# determine if there are any .gz files; note that without this test, the xargs command would fail with a null input
zip_files=$( find -L . -name "*.gz" -print0 )
if [ "$zip_files" != "" ] ; then
# unzip all the .gz files using as many processors as possible
find -L . -name "*.gz" -print0 | xargs -P0 -0 gunzip
fi
}

ref_dir=/mnt/ref
genome_dir=genome_ref
star_dir=star_ref

mkdir $ref_dir
pushd $ref_dir > /dev/null
aws s3 sync $1 $ref_dir

# Genome Ref
aws s3 sync $1 $genome_dir
pushd $ref_dir/$genome_dir > /dev/null
unzip_files
popd > /dev/null
pushd $ref_dir
# unzip any .gz files in current directory or any subdirectories
# determine if there are any .gz files; note that without this test, the xargs command would fail with a null input
zip_files=$( find -L . -name "*.gz" -print0 )
if [ "$zip_files" != "" ] ; then
# unzip all the .gz files using as many processors as possible
find -L . -name "*.gz" -print0 | xargs -0 -n1 -P0 gunzip
fi
popd

# STAR Ref
aws s3 sync $2 $star_dir
pushd $ref_dir/$star_dir > /dev/null
unzip_files
popd > /dev/null
12 changes: 12 additions & 0 deletions source/cluster_creator/install_software.sh
Expand Up @@ -32,13 +32,24 @@ fc=$( find -name "featureCounts"|grep bin )
sr_path=${fc%featureCounts}
ln -s $sr_path subread

# Install HISAT2
unzip hisat2*.zip
hisat_dir=$( find . -maxdepth 1 -type d -name "hisat2*")
ln -s $hisat_dir hisat

# Install HTSeq
sudo yum install python27-devel python27-numpy python27-matplotlib -y
sudo pip install pysam
sudo pip install htseq

# Install samtools
tar -xjf samtools*.tar.bz2
sam_dir=$( find . -maxdepth 1 -type d -name "samtools*" )
pushd $sam_dir > /dev/null
make
sudo make install
popd > /dev/null
ln -s $sam_dir samtools

# Install htslib
hts_dir=$( find $sam_dir -maxdepth 1 -type d -name "htslib-*" )
Expand Down Expand Up @@ -78,6 +89,7 @@ popd > /dev/null
mkdir /mnt/output

# Install python dependencies for framework
sudo yum install python27-Cython -y
sudo pip install pandas
sudo pip install boto3
sudo python3 -m pip install boto3
Expand Down
3 changes: 3 additions & 0 deletions source/cluster_creator/prepare_install_files.sh
Expand Up @@ -22,6 +22,9 @@ cd $tmp
# STAR
wget -O STAR-2.5.2a.tar.gz https://github.com/alexdobin/STAR/archive/2.5.2a.tar.gz

# HISAT2
wget -O hisat2-2.0.4.zip ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.0.4-Linux_x86_64.zip

# subread
wget -O subread-1.5.0-p3-Linux-x86_64.tar.gz https://sourceforge.net/projects/subread/files/subread-1.5.0-p3/subread-1.5.0-p3-Linux-x86_64.tar.gz/download

Expand Down
2 changes: 1 addition & 1 deletion source/preprocessing/preprocess_streaming.sh
Expand Up @@ -208,7 +208,7 @@ while read f ; do
<( paste - - - - < $fq_1 ) <( paste - - - - < $fq_2 ) | gzip --fast > $f
else
# single file
awk '{printf("%s%s",$0,(NR%4==0)?"\n":"\t")}' $fq_1 |gzip --fast > $f
awk '{printf("%s%s",$0,(NR%4==0)?"\n":"\t")}' $se_id |gzip --fast > $f
fi

# upload data & remove scratch directory
Expand Down

0 comments on commit b81fa15

Please sign in to comment.