In [None]:
# Imports a parser from cogent
from cogent.parse.fasta import MinimalFastaParser as parse

In [None]:
%load_ext mothurmagic
# Loads the mothurmagic extension so that mothur commands can be run from this notebook
# by putting %%mothur on the first line of a cell.

In [None]:
# Processor/thread count meant to apply to the whole workflow;
# keep the -threads / -O values used by the shell cells in sync with it.
nprocs = 4

In [None]:
# Peek at the first 10 lines of the input reads.
# This file was created using the QC_basic notebook.
!head -n 10 ../../SeqData/16SfinalQC.fasta

In [None]:
%%mothur
unique.seqs(fasta=../../SeqData/16SfinalQC.fasta)

In [None]:
# Map each representative ("seed") sequence ID to the number of reads it stands for.
# Every non-blank line of the names file is read as "<seedID>\t<id1>,<id2>,...".
counts = {}

with open("../../SeqData/16SfinalQC.names") as f:
    for line in f:
        # Drop the line ending so it does not stay attached to the last read ID.
        line = line.rstrip("\r\n")
        if not line:
            # Tolerate blank/trailing lines instead of failing on tuple unpacking.
            continue
        # Split only on the first tab: seed ID on the left, comma-separated members on the right.
        seedID, seqIDs = line.split("\t", 1)
        count = len(seqIDs.split(","))
        counts[seedID] = count

In [None]:
# Re-write the unique sequences with their abundance in the header so the file looks
# like usearch output: ">seqID;size=XXX;".
# Both files are opened in one `with` so the input handle is closed as well as the output.
# NOTE(review): counts[n] assumes the fasta label returned by the parser is exactly the
# seed ID used as key in the names file - confirm if headers carry extra description text.
with open("../../SeqData/16SfinalQC.unique.fasta") as unique_fasta, \
     open("../../SeqData/16SfinalQC.unique.usearch_names.fasta", "w") as f:
    for n, s in parse(unique_fasta):
        f.write(">%s;size=%s;\n%s\n" % (n, counts[n], s))

Alternatively, we could do the dereplication and size annotation in a single step with usearch:

!usearch -derep_fulllength data/finalQC.fasta -output data/finalQC.unique.fasta -sizeout -threads 20

In [None]:
# Dereplicate the QC'd reads with usearch; -sizeout writes the abundance of each
# unique sequence into its label as a size= annotation.
# This could run out of memory if seq files are too large.
# No -minseqlength is passed, so usearch's default minimum length applies; to change it
# add "-minseqlength N" (typed with a plain ASCII hyphen and followed by a value).
!usearch -derep_fulllength ../../SeqData/16SfinalQC.fasta -fastaout ../../SeqData/16SfinalQC.unique.fasta -sizeout -threads {nprocs}

In [None]:
# First 10 lines of the dereplicated fasta.
!head -n 10 ../../SeqData/16SfinalQC.unique.fasta

In [None]:
# Sort the unique sequences by cluster size.
# -minsize 2 excludes the singletons; change it to 1 if you want to include them
# (or just skip this step) - but you should just get rid of them.
!usearch -sortbysize ../../SeqData/16SfinalQC.unique.fasta \
-fastaout ../../SeqData/16SfinalQC.unique.sorted.fasta \
-minsize 2

In [None]:
# Last 10 lines of the size-sorted fasta, i.e. the smallest clusters that were kept.
!tail -n 10 ../../SeqData/16SfinalQC.unique.sorted.fasta

In [None]:
# Create the OTU centroids ("seeds") that later steps cluster reads against.
# The default minimum identity is 97% - using more than 97% is not recommended.
!usearch -cluster_otus ../../SeqData/16SfinalQC.unique.sorted.fasta \
-otus ../../SeqData/otus.fasta

In [None]:
# Note that the OTU names are currently not in QIIME format - a script from Edgar can fix this.
!head -n 10 ../../SeqData/otus.fasta

In [None]:
# Taxonomy has to be assigned before the Euks., etc. can be pulled out.
# Input (-i): the OTU centroid fasta (still working with unique seqs).
# Output (-o): a directory; the filtering step further down reads
#   otus_tax_assignments.txt from it.
# -O: number of parallel jobs to start, taken from nprocs.

!parallel_assign_taxonomy_uclust.py -i ../../SeqData/otus.fasta -o ../../SeqData/otus.tax -O {nprocs}
# Reference set / taxonomy map flags, currently disabled:
#-r data/97_Silva_111_rep_set_no_ambig.fasta \
#-t data/Silva_111_taxa_map_full.txt \



In [None]:
# Build the list of OTU IDs we want to remove: every line of the taxonomy assignments
# that mentions Chloroplast, Eukaryota, Unassigned or mitochondria; keep only column 1.
# The pattern could be changed to pull out different groups.
# These primers actually had good Archaeal targets - so, it would be okay to include them.
!grep -E "Chloroplast|Eukaryota|Unassigned|mitochondria" \
../../SeqData/otus.tax/otus_tax_assignments.txt \
| awk '{print $1}' > ../../SeqData/to_remove_tax.accnos

In [None]:
# Line count of the removal list = number of OTUs that will be removed (one ID per line).
!wc -l ../../SeqData/to_remove_tax.accnos

In [None]:
# Sanity check: the first 10 IDs that are about to be removed.
!head -n 10 ../../SeqData/to_remove_tax.accnos

In [None]:
%%mothur
remove.seqs(fasta=../../SeqData/otus.fasta, accnos=../../SeqData/to_remove_tax.accnos)

In [None]:
# Look at the OTU centroids that remain after remove.seqs (mothur writes them to otus.pick.fasta).
!head ../../SeqData/otus.pick.fasta

In [None]:
# Pulling out the sample identifier.
# For every header line of the finalQC file, append ";barcodelabel=<text before the first '_'>"
# so each read can be attributed to its sample later; sequence lines are passed through unchanged.
# How it works: awk prints "<header>;barcodelabel=;><sample>" and sed then removes the ";>".
# Next we will see how these reads map to the defined centroids (after removing Euk, etc.).
# Basically, we cut, cut, refined our fasta to make our OTU centroids.
# THEN, we go back to our original QC'd total fasta file and throw it all against these nicely defined seeds.
# Anything that doesn't match, we won't keep.
# awk strings are 1-indexed: substr($1,1,1) is the first character. A start index of 0 is
# unspecified by POSIX and awk implementations disagree on what it returns.
!awk -F"_" \
'BEGIN{OFS=";"}{ if ( substr($1,1,1) == ">"){ print $0,"barcodelabel=",$1 } else { print $0 } }' \
../../SeqData/16SfinalQC.fasta | \
sed 's/;>//' > ../../SeqData/16SfinalQC.usearch.fasta

In [None]:
# Check that the headers now carry the barcodelabel= annotation.
!head -n 10 ../../SeqData/16SfinalQC.usearch.fasta

In [None]:
# This is where the actual OTUs are being assigned. We choose 97% sequence ID threshold here.
# This might take a while - depending on the clustering algorithm, like pairwise...
# it would take, like, days on the same number of processors.
# This is why usearch (centroid-based) is so much better.
# But is it more biologically relevant? ... maybe, maybe not.
# Edgar is showing it's not that bad.

# We take our total QC data (with barcode labels added).
# We compare it to the otus.pick.fasta database we made above.
# We produce a readmap.uc file which tells us how the reads from our finalQC file map to that seed database.
# The thread count comes from nprocs.

!usearch -usearch_global ../../SeqData/16SfinalQC.usearch.fasta \
-db ../../SeqData/otus.pick.fasta \
-strand plus -id 0.97 \
-uc ../../SeqData/readmap.uc \
-threads {nprocs}

In [None]:
# Makes an OTU table from the read map produced by usearch_global.
# It will tell me the OTU ID, and then for all the samples, which OTUs it has sequences from.
# NOTE(review): the script location is a hard-coded absolute path into a virtualenv - adjust for your machine.
!python /opt/virt_env/bin/uc2otutab.py ../../SeqData/readmap.uc > ../../SeqData/otu_table.txt

In [None]:
# Convert the text OTU table to a JSON-format biom file.
# Any earlier otu_table.biom is deleted first - this is to mitigate a biom bug.
!rm -f ../../SeqData/otu_table.biom
!biom convert -i ../../SeqData/otu_table.txt -o ../../SeqData/otu_table.biom --table-type="OTU table" --to-json

In [None]:
# Summarize the biom table into a text report.
# Any earlier summary file is deleted first - this is to mitigate a biom bug.
!rm -f ../../SeqData/otu_table_summary.txt
!biom summarize-table -i ../../SeqData/otu_table.biom -o ../../SeqData/otu_table_summary.txt

In [None]:
# Print the summary report; it gives the overall data info:
#   "Num observations" = number of OTUs
#   "Total count"      = total number of sequences
# For reference, Chantal had a 50% reduction after QC.

!cat ../../SeqData/otu_table_summary.txt

This leaves us with the following data:
OTU sequences: ../../SeqData/otus.pick.fasta
OTU table: ../../SeqData/otu_table.biom

We have 4.8k different OTUs, and 1.6M sequences