Skip to content

Commit

Permalink
- Added HISAT2 and HTSeq as possible options for the aligner and counter tools
Browse files Browse the repository at this point in the history
- Modified job submission and configuration files for analysis step to allow user to specify the aligner and counter used in analysis
- Restructured the separate aligner index and reference folders into a single genome folder with sub-folders for the indexes and the reference
- Fixed a bug with single-end file handling in the pre-processing streaming script
- Fixed the EMR job submission command to handle complex extra arguments for the aligner and counter
  • Loading branch information
andr-kun committed Oct 13, 2016
1 parent c64ea2e commit b81fa15
Show file tree
Hide file tree
Showing 11 changed files with 236 additions and 77 deletions.
10 changes: 7 additions & 3 deletions analysis_job.config
@@ -1,5 +1,5 @@
[job_config]
name = Analysis job
name = FASTQ alignment and quantification
action_on_failure = CONTINUE
analysis_script = run_pipeline_multiple_files.py
analysis_script_s3_location = s3://[YOUR-BUCKET]/scripts
Expand All @@ -17,7 +17,11 @@ annotation_file =
# Option for strand specificity is NONE|FIRST_READ_TRANSCRIPTION_STRAND|SECOND_READ_TRANSCRIPTION_STRAND
strand_specificity = NONE
run_picard = True
star_extra_args =
# Option for aligner tools is STAR or HISAT2
aligner_tool = STAR
aligner_extra_args =
# Option for counter tools is featureCount or HTSeq
counter_tool = featureCount
counter_extra_args = -t exon -g gene_name
picard_extra_args =
region = us-west-2
region = us-west-2
3 changes: 1 addition & 2 deletions emr_cluster.config
Expand Up @@ -8,8 +8,7 @@ bootstrap_scripts_local_location = source/cluster_creator
upload_bootstrap_scripts = True

software_installer_location = s3://[YOUR-BUCKET]/...
genome_ref_location = s3://[YOUR-BUCKET]/...
star_ref_location = s3://[YOUR-BUCKET]/...
genome_folder_location = s3://[YOUR-BUCKET]/...

[EMR_nodes]
key_name = [YOUR KEY NAME]
Expand Down
6 changes: 3 additions & 3 deletions launch_cluster.py
Expand Up @@ -12,8 +12,8 @@


def check_configuration(config):
if not utility.check_config(config, "EMR", ["release_label", "software_installer_location", "genome_ref_location",
"star_ref_location"]):
if not utility.check_config(config, "EMR", ["release_label", "software_installer_location",
"genome_folder_location"]):
return False

if not utility.check_upload_config(config["EMR"], "upload_bootstrap_scripts", "bootstrap_scripts",
Expand Down Expand Up @@ -93,7 +93,7 @@ def build_command(config):
if bootstrap_script == "install_software.sh":
bootstrap_action_args = [config["EMR"]["software_installer_location"]]
elif bootstrap_script == "copy_reference.sh":
bootstrap_action_args = [config["EMR"]["genome_ref_location"], config["EMR"]["star_ref_location"]]
bootstrap_action_args = [config["EMR"]["genome_folder_location"]]

bootstrap_actions.append({
"Name": bootstrap_script,
Expand Down
37 changes: 11 additions & 26 deletions source/cluster_creator/copy_reference.sh
@@ -1,38 +1,23 @@
#!/bin/bash
# copies reference files from S3 & unzips
# input args:
# $1 - AWS S3 URI for genome reference location
# $2 - AWS S3 URI for STAR reference location
# $1 - AWS S3 URI for location containing genome reference, star index and hisat index

# want to terminate on error
set -e
set -o pipefail

function unzip_files() {
# unzip any .gz files in current directory or any subdirectories
# determine if there are any .gz files; note that without this test, the xargs command would fail with a null input
zip_files=$( find -L . -name "*.gz" -print0 )
if [ "$zip_files" != "" ] ; then
# unzip all the .gz files using as many processors as possible
find -L . -name "*.gz" -print0 | xargs -P0 -0 gunzip
fi
}

ref_dir=/mnt/ref
genome_dir=genome_ref
star_dir=star_ref

mkdir $ref_dir
pushd $ref_dir > /dev/null
aws s3 sync $1 $ref_dir

# Genome Ref
aws s3 sync $1 $genome_dir
pushd $ref_dir/$genome_dir > /dev/null
unzip_files
popd > /dev/null
pushd $ref_dir
# unzip any .gz files in current directory or any subdirectories
# determine if there are any .gz files; note that without this test, the xargs command would fail with a null input
zip_files=$( find -L . -name "*.gz" -print0 )
if [ "$zip_files" != "" ] ; then
# unzip all the .gz files using as many processors as possible
find -L . -name "*.gz" -print0 | xargs -0 -n1 -P0 gunzip
fi
popd

# STAR Ref
aws s3 sync $2 $star_dir
pushd $ref_dir/$star_dir > /dev/null
unzip_files
popd > /dev/null
12 changes: 12 additions & 0 deletions source/cluster_creator/install_software.sh
Expand Up @@ -32,13 +32,24 @@ fc=$( find -name "featureCounts"|grep bin )
sr_path=${fc%featureCounts}
ln -s $sr_path subread

# Install HISAT2
unzip hisat2*.zip
hisat_dir=$( find . -maxdepth 1 -type d -name "hisat2*")
ln -s $hisat_dir hisat

# Install HTSeq
sudo yum install python27-devel python27-numpy python27-matplotlib -y
sudo pip install pysam
sudo pip install htseq

# Install samtools
tar -xjf samtools*.tar.bz2
sam_dir=$( find . -maxdepth 1 -type d -name "samtools*" )
pushd $sam_dir > /dev/null
make
sudo make install
popd > /dev/null
ln -s $sam_dir samtools

# Install htslib
hts_dir=$( find $sam_dir -maxdepth 1 -type d -name "htslib-*" )
Expand Down Expand Up @@ -78,6 +89,7 @@ popd > /dev/null
mkdir /mnt/output

# Install python dependencies for framework
sudo yum install python27-Cython -y
sudo pip install pandas
sudo pip install boto3
sudo python3 -m pip install boto3
Expand Down
3 changes: 3 additions & 0 deletions source/cluster_creator/prepare_install_files.sh
Expand Up @@ -22,6 +22,9 @@ cd $tmp
# STAR
wget -O STAR-2.5.2a.tar.gz https://github.com/alexdobin/STAR/archive/2.5.2a.tar.gz

# HISAT2
wget -O hisat2-2.0.4.zip ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.0.4-Linux_x86_64.zip

# subread
wget -O subread-1.5.0-p3-Linux-x86_64.tar.gz https://sourceforge.net/projects/subread/files/subread-1.5.0-p3/subread-1.5.0-p3-Linux-x86_64.tar.gz/download

Expand Down
2 changes: 1 addition & 1 deletion source/preprocessing/preprocess_streaming.sh
Expand Up @@ -208,7 +208,7 @@ while read f ; do
<( paste - - - - < $fq_1 ) <( paste - - - - < $fq_2 ) | gzip --fast > $f
else
# single file
awk '{printf("%s%s",$0,(NR%4==0)?"\n":"\t")}' $fq_1 |gzip --fast > $f
awk '{printf("%s%s",$0,(NR%4==0)?"\n":"\t")}' $se_id |gzip --fast > $f
fi

# upload data & remove scratch directory
Expand Down

0 comments on commit b81fa15

Please sign in to comment.