In [None]:
%%bash
# Record when and where this notebook was run: current date, OS release,
# hostname, CPU details and memory totals, separated by a dashed rule.
rule="------------"

printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n' "${rule}" ""

# Operating system / distribution details
lsb_release -a
printf '%s\n' "" "${rule}"

# Machine name (label and value are printed on separate lines)
printf '%s\n' "HOSTNAME: "
hostname
printf '%s\n' "" "${rule}"

# CPU details
printf '%s\n' "Computer Specs:" ""
lscpu
printf '%s\n' "" "${rule}" ""

# Memory totals
printf '%s\n' "Memory Specs" ""
free -mh

### Set variables

In [None]:
# Environment variables consumed by the %%bash cells in this notebook.

# Set data directories
%env crab_data=/home/sam/data/C_bairdi/RNAseq
%env hemat_data=/home/sam/data/Hematodinium/RNAseq
%env wd=/home/sam/analyses

# Programs
# Each %env line needs the name=value form; a bare path sets nothing,
# which left ${seqtk} undefined for the extraction cell.
%env samtools=/home/sam/programs/samtools-1.9/samtools
%env seqtk=/home/sam/programs/seqtk-1.3/seqtk

#### Input data are here:

FastAs: https://gannet.fish.washington.edu/Atumefaciens/20200114_cbai_MEGAN_read_extractions/
Trimmed-FastQs: https://gannet.fish.washington.edu/Atumefaciens/20191218_cbai_fastp_RNAseq_trimming/

In [None]:
%%bash

######################################################
# Extract the R1/R2 FastQ reads whose IDs appear in the FastA files of each
# species' data directory.
#
# For every data directory (crab_data, hemat_data) this cell:
#   1. creates ${wd}/<YYYYMMDD>.<species>_megan_reads
#   2. writes the header lines (minus the leading ">") of every *.fasta file
#      to <YYYYMMDD>.<species>.seqtk.read_id.list
#   3. runs "seqtk subseq" with that list on every *R1*.gz and *R2*.gz file,
#      writing <YYYYMMDD>.<species>.megan_R1.fq and ..._R2.fq
#
# Environment:
#   crab_data, hemat_data - data directories (at least one must be set)
#   wd                    - parent directory for the output directories
#   seqtk                 - optional path to seqtk (default set below)
#
# NOTE(review): the previous version globbed *.fasta / *R1*.gz / *R2*.gz from
# inside the freshly created (and therefore empty) output directory. Inputs
# are now read from each data directory instead - confirm that this is where
# the FastA and trimmed FastQ files live.
######################################################

timestamp=$(date +%Y%m%d)

# Fall back to the install location when the notebook environment does not
# provide ${seqtk}.
seqtk_bin="${seqtk:-/home/sam/programs/seqtk-1.3/seqtk}"

# Print a diagnostic message to stderr.
log_err() {
  printf '%s\n' "$*" >&2
}

# Print the species name: the 5th "/"-separated field of the given path
# (e.g. /home/sam/data/C_bairdi/RNAseq -> C_bairdi).
species_from_dir() {
  awk -F"/" '{print $5}' <<<"$1"
}

# Write the header lines of the given FastA files, minus the leading ">",
# to a list file.
# Arguments: $1 - list file to (re)create; $2.. - FastA files
collect_fasta_ids() {
  local list_file=$1
  shift
  local fasta

  # Start from an empty list so a rerun on the same day does not duplicate IDs.
  : > "${list_file}" || return 1

  for fasta in "$@"; do
    echo "Pulling FastA IDs from ${fasta}"
    echo ""
    # sub() only succeeds on lines starting with ">", so only headers print.
    awk 'sub(/^>/, "")' < "${fasta}" >> "${list_file}" || return 1
  done
}

# Run "seqtk subseq" on each FastQ file and concatenate the results.
# Arguments: $1 - read label for log messages (R1 or R2)
#            $2 - seqtk ID list file
#            $3 - output FastQ file to (re)create
#            $4.. - input FastQ files
extract_reads() {
  local label=$1
  local list_file=$2
  local out_fq=$3
  shift 3
  local fastq

  # Start from an empty file so a rerun on the same day does not duplicate reads.
  : > "${out_fq}" || return 1

  for fastq in "$@"; do
    echo "Extracting ${label} reads from ${fastq}"
    echo ""
    "${seqtk_bin}" subseq "${fastq}" "${list_file}" >> "${out_fq}" || return 1
  done
}

# Build the ID list and the R1/R2 FastQ files for one data directory.
# Arguments: $1 - data directory holding *.fasta, *R1*.gz and *R2*.gz files
# Returns:   0 on success, 1 if inputs are missing or any step fails
process_data_dir() {
  local data_dir=$1
  local species out_dir prefix seqtk_list
  local -a fastas r1_fastqs r2_fastqs

  # Get species name
  species=$(species_from_dir "${data_dir}")
  if [[ -z "${species}" ]]; then
    log_err "Could not determine species name from ${data_dir}"
    return 1
  fi

  # An unmatched glob stays literal, so test that the first entry exists.
  fastas=("${data_dir}"/*.fasta)
  r1_fastqs=("${data_dir}"/*R1*.gz)
  r2_fastqs=("${data_dir}"/*R2*.gz)
  if [[ ! -e "${fastas[0]}" ]]; then
    log_err "No FastA files found in ${data_dir}"
    return 1
  fi
  if [[ ! -e "${r1_fastqs[0]}" || ! -e "${r2_fastqs[0]}" ]]; then
    log_err "No R1/R2 FastQ files found in ${data_dir}"
    return 1
  fi

  # Make the output directory; outputs are written by path rather than via cd
  # so that relative input paths keep working.
  out_dir="${wd}/${timestamp}.${species}_megan_reads"
  mkdir --parents "${out_dir}" || return 1
  prefix="${out_dir}/${timestamp}.${species}"
  seqtk_list="${prefix}.seqtk.read_id.list"

  # Create FastA IDs list to use for sequence extraction
  collect_fasta_ids "${seqtk_list}" "${fastas[@]}" || return 1

  # Extract corresponding R1 and R2 reads using seqtk FastA ID list
  extract_reads "R1" "${seqtk_list}" "${prefix}.megan_R1.fq" "${r1_fastqs[@]}" || return 1
  extract_reads "R2" "${seqtk_list}" "${prefix}.megan_R2.fq" "${r2_fastqs[@]}" || return 1

  echo "-------------------------------------"
  # Print output directory and list its files (subshell keeps our cwd unchanged)
  ( cd "${out_dir}" && pwd && ls -ltrh )
  echo ""
  echo "-------------------------------------"
  echo ""
}

# Validate the environment, then process every configured data directory.
# Returns: 0 if every directory succeeded, 1 otherwise
main() {
  local dir
  local status=0
  local -a data_dirs=()

  for dir in "${crab_data:-}" "${hemat_data:-}"; do
    [[ -n "${dir}" ]] && data_dirs+=("${dir}")
  done

  if (( ${#data_dirs[@]} == 0 )); then
    log_err "Neither crab_data nor hemat_data is set; nothing to do"
    return 1
  fi
  if [[ -z "${wd:-}" ]]; then
    log_err "wd (output parent directory) is not set"
    return 1
  fi
  if ! command -v "${seqtk_bin}" >/dev/null 2>&1; then
    log_err "seqtk not found or not executable: ${seqtk_bin}"
    return 1
  fi

  # Keep going after a failed directory, but report the failure at the end.
  for dir in "${data_dirs[@]}"; do
    process_data_dir "${dir}" || status=1
  done

  return "${status}"
}

main
