## Use taxonomic read classifications from MEGAN6 to extract Phylum-specific FastQs from Arthropoda and Alveolata

In [1]:
%%bash
# Record when and where this notebook was run, so the analysis can be
# traced back to a specific date, machine and hardware configuration.

# Date stamp
printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "------------"

# Operating system release details
lsb_release -a
printf '\n%s\n' "------------"

# Machine name
printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "------------"

# CPU details
printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "------------"

# Memory usage, human-readable units
printf '%s\n\n' "Memory Specs"
free -mh

TODAY'S DATE:
Tue Jan 21 13:35:55 PST 2020
------------

Distributor ID:	Ubuntu
Description:	Ubuntu 16.04.6 LTS
Release:	16.04
Codename:	xenial

------------
HOSTNAME: 
swoose

------------
Computer Specs:

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                24
On-line CPU(s) list:   0-23
Thread(s) per core:    2
Core(s) per socket:    6
Socket(s):             2
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 44
Model name:            Intel(R) Xeon(R) CPU           X5670  @ 2.93GHz
Stepping:              2
CPU MHz:               2925.783
BogoMIPS:              5851.61
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              12288K
NUMA node0 CPU(s):     0-23
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr

No LSB modules are available.


### Set variables

In [3]:
# Set data directories
# NOTE: the read-extraction cell below takes the species name from the 5th
# "/"-separated field of crab_data and hemat_data (here: C_bairdi and
# Hematodinium), so the depth of these paths matters.
%env crab_data=/home/sam/data/C_bairdi/RNAseq
%env hemat_data=/home/sam/data/Hematodinium/RNAseq
# Directory under which the per-species output directories are created
%env wd=/home/sam/analyses

# Programs
# Path to the seqtk executable; the extraction cell runs "seqtk subseq" with it
%env seqtk=/home/sam/programs/seqtk-1.3/seqtk

env: crab_data=/home/sam/data/C_bairdi/RNAseq
env: hemat_data=/home/sam/data/Hematodinium/RNAseq
env: wd=/home/sam/analyses
env: seqtk=/home/sam/programs/seqtk-1.3/seqtk


#### Input data are here:

FastAs: https://gannet.fish.washington.edu/Atumefaciens/20200114_cbai_MEGAN_read_extractions/

Trimmed-FastQs: https://gannet.fish.washington.edu/Atumefaciens/20191218_cbai_fastp_RNAseq_trimming/

In [None]:
%%bash

# Extract the FastQ reads whose IDs appear in the FastA files found in each
# input directory, using "seqtk subseq".
#
# Environment variables read by this cell:
#   crab_data, hemat_data - input directories; each is searched for
#                           *.fasta, *R1*.gz and *R2*.gz files
#   wd                    - directory under which output directories are made
#   seqtk                 - path to the seqtk executable
#
# For each input directory the following are written:
#   ${wd}/<YYYYMMDD>.<species>_megan_reads/
#     <YYYYMMDD>.<species>.seqtk.read_id.list
#     <YYYYMMDD>.<species>.megan_R1.fq
#     <YYYYMMDD>.<species>.megan_R2.fq

# Make unmatched globs expand to nothing instead of the literal pattern,
# so a missing set of input files is detected rather than passed to a tool.
shopt -s nullglob

timestamp=$(date +%Y%m%d)

#######################################
# Write the sequence IDs of FastA files to a list file, one ID line per
# header line (the header text with its leading ">" removed).
# The list file is emptied first, so re-running does not duplicate IDs.
# Arguments: $1   - output list file
#            $2.. - FastA files to read
# Outputs:   progress messages on stdout
#######################################
write_read_id_list() {
  local id_list=$1
  shift
  local fasta

  : > "${id_list}"
  for fasta in "$@"; do
    echo "Pulling FastA IDs from ${fasta}"
    echo ""
    # sub() returns 1 only for lines starting with ">", so only header
    # lines are printed; no separate grep is needed.
    awk 'sub(/^>/, "")' "${fasta}" >> "${id_list}"
  done
}

#######################################
# Extract the reads named in an ID list from one or more FastQ files.
# The output file is emptied first, so re-running does not duplicate reads.
# Globals:   seqtk (read) - path to the seqtk executable
# Arguments: $1   - read label used in progress messages (e.g. R1)
#            $2   - read ID list file
#            $3   - output FastQ file
#            $4.. - input FastQ files
# Outputs:   progress messages on stdout
#######################################
extract_reads() {
  local label=$1
  local id_list=$2
  local out_fq=$3
  shift 3
  local fastq

  : > "${out_fq}"
  for fastq in "$@"; do
    echo "Extracting ${label} reads from ${fastq}"
    echo ""
    "${seqtk}" subseq "${fastq}" "${id_list}" >> "${out_fq}"
  done
}

for directory in "${crab_data}" "${hemat_data}"; do
  if [[ ! -d "${directory}" ]]; then
    echo "Skipping input directory '${directory}': not set or not a directory" >&2
    continue
  fi

  # Fail early with a clear message; an empty wd would otherwise place
  # the output directory directly under "/".
  : "${wd:?wd must be set to the analysis directory}"
  : "${seqtk:?seqtk must be set to the seqtk executable}"

  # Get species name: 5th "/"-separated field of the input path
  species=$(awk -F"/" '{print $5}' <<< "${directory}")
  if [[ -z "${species}" ]]; then
    echo "Cannot determine species name from '${directory}'" >&2
    exit 1
  fi

  # Collect input files before changing directory
  fastas=("${directory}"/*.fasta)
  r1_fastqs=("${directory}"/*R1*.gz)
  r2_fastqs=("${directory}"/*R2*.gz)
  if (( ${#fastas[@]} == 0 || ${#r1_fastqs[@]} == 0 || ${#r2_fastqs[@]} == 0 )); then
    echo "Skipping ${directory}: missing *.fasta, *R1*.gz or *R2*.gz files" >&2
    continue
  fi

  # Make new directory and change to that directory
  out_dir="${wd}/${timestamp}.${species}_megan_reads"
  mkdir --parents "${out_dir}" || exit 1
  cd "${out_dir}" || exit 1

  # Set seqtk list filename
  seqtk_list=${timestamp}.${species}.seqtk.read_id.list

  # Set output FastQ filenames
  R1_fq=${timestamp}.${species}.megan_R1.fq
  R2_fq=${timestamp}.${species}.megan_R2.fq

  ######################################################
  # Create FastA IDs list to use for sequence extraction
  ######################################################
  write_read_id_list "${seqtk_list}" "${fastas[@]}"

  ######################################################
  # Extract corresponding R1 and R2 reads using seqtk FastA ID list
  ######################################################
  extract_reads "R1" "${seqtk_list}" "${R1_fq}" "${r1_fastqs[@]}"
  extract_reads "R2" "${seqtk_list}" "${R2_fq}" "${r2_fastqs[@]}"

  echo "-------------------------------------"
  # Print working directory and list files
  pwd
  ls -ltrh
  echo ""
  echo "-------------------------------------"
  echo ""
done
