In [None]:
# Imports a parser from cogent
from cogent.parse.fasta import MinimalFastaParser as parse

In [None]:
# Number of worker processes/threads to use for the multi-threaded steps in this notebook.
# NOTE(review): presumably meant to be the value passed to usearch `-threads` — confirm.
nprocs = 4

In [None]:
# Preview the first 10 lines of the quality-controlled input FASTA.
# Per the original note, this file was created by the QC_basic notebook.
!head -n 10 ../../SeqData/16SfinalQC.fasta

In [None]:
# Dereplicate: keep only the unique full-length sequences to work with.
# -sizeout annotates each unique sequence with its abundance (size=N).
# Caveat from the original author: this could run out of memory if the
# sequence files are too large.
# The thread count is taken from `nprocs` (config cell) rather than a hard-coded 4.
!usearch -derep_fulllength ../../SeqData/16SfinalQC.fasta -fastaout ../../SeqData/16SfinalQC.unique.fasta -sizeout -threads {nprocs}

In [None]:
# Preview the first 10 lines of the dereplicated (unique) sequences.
!head -n 10 ../../SeqData/16SfinalQC.unique.fasta

In [None]:
# Sort the unique sequences by their size (cluster abundance).
# -minsize 2 excludes singletons; change it to 1 to include them.
!usearch -sortbysize ../../SeqData/16SfinalQC.unique.fasta \
-fastaout ../../SeqData/16SfinalQC.unique.sorted.fasta \
-minsize 2

In [None]:
# Cluster the size-sorted unique sequences into OTUs; the OTU sequences
# are written to otus.fasta.
!usearch -cluster_otus ../../SeqData/16SfinalQC.unique.sorted.fasta \
-otus ../../SeqData/otus.fasta

In [None]:
# Preview the first 10 lines of the OTU sequences.
# Note: the OTU names are currently not in QIIME format - a script from Edgar can be used to fix this.
!head -n 10 ../../SeqData/otus.fasta

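Next, we remove the unwanted OTU sequences (those listed in `to_remove_tax.accnos`) from the OTU centroids. (The earlier wording here referred to extracting the ITS2 sections, which does not match this 16S data set or the `remove.seqs` step below.)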

In [None]:
%%mothur
remove.seqs(accnos=../../SeqData/to_remove_tax.accnos, fasta=../../SeqData/otus.fasta)

In [None]:
# Preview the filtered OTU sequences (otus.pick.fasta), i.e. the file used as the
# OTU database further down.
# The previous path, data/finalQC_usearchfmt.fasta, is not produced by any cell
# visible in this notebook.
# NOTE(review): confirm that otus.pick.fasta is the file meant to be inspected here.
!head ../../SeqData/otus.pick.fasta

In [None]:
# Pull out the sample identifier and add it to every FASTA header of the full
# QC'd read set as a barcode label, so it can be used later.
# Header lines are split on "_"; the first field (presumably the sample ID - confirm)
# is appended as ";barcodelabel=<first field>". Sequence lines pass through unchanged.
# The sed step strips the ";>" left behind because the first field still carries
# the leading ">".
# Why: the OTU centroids were built by repeatedly cutting down and refining the
# fasta (including removing Euk, etc.). The original, complete QC'd fasta is now
# mapped back against those centroids; anything that does not match is not kept.
# awk strings are 1-indexed, so the header test uses substr($1,1,1). A start index
# of 0 is unspecified by POSIX and returns an empty string in some awk implementations,
# which would leave every header unlabelled.
!awk -F"_" \
'BEGIN{OFS=";"}{ if ( substr($1,1,1) == ">"){ print $0,"barcodelabel=",$1 } else { print $0 } }' \
../../SeqData/16SfinalQC.fasta | \
sed 's/;>//' > ../../SeqData/16SfinalQC.usearch.fasta

In [None]:
# Preview the first 10 lines of the barcode-labelled reads.
!head -n 10 ../../SeqData/16SfinalQC.usearch.fasta

In [None]:
# This is where reads are actually assigned to OTUs, at a 97% sequence-identity threshold.
#   Input:    the complete QC'd, barcode-labelled reads (16SfinalQC.usearch.fasta)
#   Database: the filtered OTU seed sequences (otus.pick.fasta) made above
#   Output:   readmap.uc, which records how the reads map to the OTU seed database
#
# This might take a while. Per the original notes, depending on the clustering
# algorithm (e.g. pairwise) it could take days on the same number of processors,
# which is why the centroid-based usearch approach is preferred. Whether it is more
# biologically relevant is an open question, though Edgar shows it is not that bad.
#
# The thread count is taken from `nprocs` (config cell) rather than a hard-coded 4.
!usearch -usearch_global ../../SeqData/16SfinalQC.usearch.fasta \
-db ../../SeqData/otus.pick.fasta \
-strand plus -id 0.97 \
-uc ../../SeqData/readmap.uc \
-threads {nprocs}

In [None]:
# Build the OTU table from the read map.
# The table lists each OTU ID and, across all samples, which OTUs each sample has sequences from.
!python /opt/virt_env/bin/uc2otutab.py ../../SeqData/readmap.uc \
> ../../SeqData/otu_table.txt

In [None]:
# Convert the text OTU table into a JSON-format BIOM table.
# An existing output file is removed first; the original author added this to mitigate a biom bug.
!test -f ../../SeqData/otu_table.biom && rm ../../SeqData/otu_table.biom
!biom convert -i ../../SeqData/otu_table.txt -o ../../SeqData/otu_table.biom --table-type="OTU table" --to-json

In [None]:
# Write a summary of the BIOM table to a text file.
# An existing summary file is removed first; the original author added this to mitigate a biom bug.
!test -f ../../SeqData/otu_table_summary.txt && rm ../../SeqData/otu_table_summary.txt
!biom summarize-table -i ../../SeqData/otu_table.biom -o ../../SeqData/otu_table_summary.txt

In [None]:
# Print the OTU table summary, which gives the overall information about the data.
# How to read it (per the original notes):
#   "Num obs"     = number of OTUs
#   "total count" = total number of sequences

!cat ../../SeqData/otu_table_summary.txt

This leaves us with the following data:

- OTU sequences: `../../SeqData/otus.pick.fasta`
- OTU table: `../../SeqData/otu_table.biom`

We have 4.8k different OTUs, and 1.6M sequences