In [1]:
# Imports a parser from cogent
from cogent.parse.fasta import MinimalFastaParser as parse

In [2]:
%load_ext mothurmagic
# Loads mothurmagic so we can run mothur in the notebook using %%mothur at the top of the cell

In [3]:
# applies for the whole segment
nprocs = 4

In [4]:
# Checking out data file.
# This file was created using the QC_basic notebook.
!head ../../SeqData/16SfinalQC.fasta

>67.2_0
GACGTAGGGTGCAAGCGTTGTCCGGATTTATTGGGCATAAAGAGCTCGTAGGCGGCTTGTTGCGTCGACCGTGAAAACCTACCGCTTAACGGTGGGCTTGCGGTCGATACGGGCAGGCTAGAGTTCGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCCGATACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAG
>74.2_1
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAG
>70.2_140
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAG
>19_180
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCG

In [5]:
%%mothur
unique.seqs(fasta=../../SeqData/16SfinalQC.fasta)

mothur > unique.seqs(fasta=../../SeqData/16SfinalQC.fasta)
1814142	361189

Output File Names:
../../SeqData/16SfinalQC.names
../../SeqData/16SfinalQC.unique.fasta


mothur > quit()


In [9]:
counts = {}

with open("../../SeqData/16SfinalQC.names") as f:
    for line in f:
        seedID, seqIDs = line.split("\t")
        count = len(seqIDs.split(","))
        counts[seedID] = count 

In [12]:
# Adds the counts from this dictionary to a new file so it looks like a usearch file with "size=XXX"

with open("../../SeqData/16SfinalQC.unique.usearch_names.fasta", "w") as f:
    for n, s in parse(open("../../SeqData/16SfinalQC.unique.fasta")):
        f.write(">%s;size=%s;\n%s\n"%(n,counts[n],s)) 

We could also just do this with usearch 

!usearch -derep_fulllength data/finalQC.fasta -output data/finalQC.unique.fasta -sizeout -threads 20

In [7]:
# This could run out of memory if seq files are too large
!usearch -derep_fulllength ../../SeqData/16SfinalQC.fasta -fastaout ../../SeqData/16SfinalQC.unique.fasta -sizeout -threads 4

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:04 576Mb  100.0% Reading ../../SeqData/16SfinalQC.fasta
00:08 618Mb 1814142 seqs, 361189 uniques, 285281 singletons (79.0%)
00:08 618Mb Min size 1, median 1, max 40844, avg 5.02
00:15 618Mb  100.0% Writing ../../SeqData/16SfinalQC.unique.fasta


In [8]:
!head ../../SeqData/16SfinalQC.unique.fasta

>56_65;size=40844;
TACGGAGGGTGCAAGCGTTATCCGGATTCACTGGGTTTAAAGGGTGCGTAGGCGGACATGTAAGTCCGTGGTGAAATCTC
CGAGCTTAACTCGGAAACTGCCATGGATACTATATGTCTTGAATGTTGTGGAGGTTAGCGGAATATGTCATGTAGCGGTG
AAATGCATAGATATGACATAGAACACCAATTGCGAAGGCAGCTGGCTACACAAATATTGACGCTGAGGCACGAAAGCGTG
GGGATCAAACAG
>74.2_175;size=28737;
TACGGAGGGTGCAAGCGTTATCCGGATTCACTGGGTTTAAAGGGTGCGTAGGCGGACATGTAAGTCCGTGGTGAAATCTC
CAAGCTTAACTTGGAAACTGCCATGGATACTATATGTCTTGAATGTTGTGGAGGTTAGCGGAATATGTCATGTAGCGGTG
AAATGCATAGATATGACATAGAACACCAATTGCGAAGGCAGCTGGCTACACAAATATTGACGCTGAGGCACGAAAGCGTG
GGGATCAAACAG


In [9]:
# Sequences are sorted by size
# Here the size of clusters - we are excluding the singletons here
# You would change minsize to 1 if you wanted to include singletons
# Or, you know, just not do this step.
# But you should just get rid of them.
!usearch -sortbysize ../../SeqData/16SfinalQC.unique.fasta -fastaout ../../SeqData/16SfinalQC.unique.sorted.fasta -minsize 2

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:01 148Mb  100.0% Reading ../../SeqData/16SfinalQC.unique.fasta
00:01 114Mb Getting sizes                                        
00:02 115Mb Sorting 75908 sequences
00:03 115Mb  100.0% Writing output


In [10]:
!tail ../../SeqData/16SfinalQC.unique.sorted.fasta

>1_443424;size=2;
GACGAGGGGTGCAAACGTTATTCGGAATGATTGGGCGTAAAGGGTGCGTAGGCGGCTTATTAAGTCAACTGTTAAATTTC
TCAGCCTAACTGGGAGTATGCGGTAGAAACTGATAGGCTTGAGGATGGAAGAGAGAAGTAGAATTCTCGGAGTAGCGGTT
AAATGCGTAGATCTCGAGAGGAACACCGATGGCGAAGGCAGCTTCTTGGTCCATTTCTGACGCTGAGGCACGAAAGCGTG
GGGAGCAAACAG
>61_617277;size=2;
TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGGGAGCGTAGGTGGATATTTAAGTGGGATGTGAAATACT
CGGGCTTAACCTGAGTGCTGCATTCCAAACTGGATATCTAGAGTGCAGGAGAGGAAAGTAGAATTCCTAGTGTAGCGGTG
AAATGCGTAGAGATTAGGAAGAATACCAGTGGCGAAGGCGACTTTCTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTG
GGGAGCAAACAG


In [11]:
# Here we are just creating the centroids, or "seeds" for future clustering
# Default is 97% minimum ID - not recommended to use more than 97%.

!usearch -cluster_otus ../../SeqData/16SfinalQC.unique.sorted.fasta -otus ../../SeqData/otus.fasta

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:47  45Mb  100.0% 10003 OTUs, 8593 chimeras (11.3%)


In [13]:
# You do need to assign taxonomy in order to pull out the Euks., etc.
# Input is your fasta file
# Output is a fasta with taxonomy assinged (still working with unique seqs)
# It's using the database that we got when recently updating QIIME.

!parallel_assign_taxonomy_uclust.py -i ../../SeqData/otus.fasta -o ../../SeqData/otus.tax -O 4
#-r data/97_Silva_111_rep_set_no_ambig.fasta \
#-t data/Silva_111_taxa_map_full.txt \



In [14]:
# This makes a file of what we want to remove
# Could change this to pull out different groups.
# These primers actually had good Archaeal targets - so, it would be okay to include them.
!egrep "Chloroplast|Eukaryota|Unassigned|mitochondria" \
../../SeqData/otus.tax/otus_tax_assignments.txt \
| awk '{print $1}' > ../../SeqData/to_remove_tax.accnos

In [15]:
# wc is number of lines of the taxa that will be removed
!wc -l ../../SeqData/to_remove_tax.accnos

     794 ../../SeqData/to_remove_tax.accnos


In [16]:
# Looking at what you're removing
!head ../../SeqData/to_remove_tax.accnos

76_30945
99_111952
4_100185
39_180566
77_28994
69.2_9798
22_26542
1_81582
21_66858
50_18960


In [24]:
# We can blast a few of these to see what they might be.
! grep -A 1 ">99_111952" ../../SeqData/otus.fasta
# Clone matches might be artifacts of previous errors?

>99_111952
TACGTAGGTGGCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGCGCGCAGGTGGTGCGGTAAGTCTGTCGTGAAATCTC


In [25]:
%%mothur
remove.seqs(fasta=../../SeqData/otus.fasta, accnos=../../SeqData/to_remove_tax.accnos)

mothur > remove.seqs(fasta=../../SeqData/otus.fasta, accnos=../../SeqData/to_remove_tax.accnos)
Removed 794 sequences from your fasta file.

Output File Names:
../../SeqData/otus.pick.fasta


mothur > quit()


In [32]:
# Pulling out the sample identifier.
# It is adding a portion to the finalQC file that has the barcode label.
# Then we can use this later
# Now we will see how these reads map to the defined centroids (after removing EuK, etc.)
# Basically, we cut, cut, refined our fasta to make our OTU centroids.
# THEN, we went back to our original QC'd total fasta file and will throw it all against these nicely defined seeds.
# Anything that doesn't match, we won't keep.
!awk -F"_" \
'BEGIN{OFS=";"}{ if ( substr($1,0,1) == ">"){ print $0,"barcodelabel=",$1 } else { print $0 } }' \
../../SeqData/16SfinalQC.fasta | \
sed 's/;>//' > ../../SeqData/16SfinalQC.usearch.fasta

In [33]:
!head ../../SeqData/16SfinalQC.usearch.fasta

>67.2_0;barcodelabel=67.2
GACGTAGGGTGCAAGCGTTGTCCGGATTTATTGGGCATAAAGAGCTCGTAGGCGGCTTGTTGCGTCGACCGTGAAAACCTACCGCTTAACGGTGGGCTTGCGGTCGATACGGGCAGGCTAGAGTTCGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCCGATACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAG
>74.2_1;barcodelabel=74.2
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAG
>70.2_140;barcodelabel=70.2
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAG
>19_180;barcodelabel=19
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAA

In [34]:
# This is where the actual OTUs are being assigned. We choose 97% sequence ID threshold here.
# This might take a while - Depending ont he clustering algorithm, like pairwise... 
# it would take, like, days on the same number of processors.
# This is why usearch (centroid-based) is so much better
# But is it more biologically relevant? ... maybe, maybe not.
# Edgar is showing it's not that bad.

# We take our total QC data
# We compare it to the otusn.pick.fasta database we made above
# We produce a readmap.uc file which tells us how the reads from our finalQC file map to the otusn seed database.

!usearch -usearch_global ../../SeqData/16SfinalQC.usearch.fasta \
-db ../../SeqData/otus.pick.fasta \
-strand plus -id 0.97 \
-uc ../../SeqData/readmap.uc \
-threads 4

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:00 6.4Mb  100.0% Reading ../../SeqData/otus.pick.fasta
00:00 4.0Mb  100.0% Masking
00:00 4.9Mb  100.0% Word stats
00:00 4.9Mb  100.0% Alloc rows
00:00  14Mb  100.0% Build index
00:59  49Mb  100.0% Searching 16SfinalQC.usearch.fasta, 92.8% matched


In [35]:
# Makes an OTU table
# It will tell me the OTU ID, and then for all the samples, which OTUs it has sequences from.
!python /opt/virt_env/bin/uc2otutab.py ../../SeqData/readmap.uc > ../../SeqData/otu_table.txt

../../SeqData/readmap.uc 100.0%   


In [36]:
# Issues with biom table formatting
!if [ -f ../../SeqData/otu_table.biom ]; then rm ../../SeqData/otu_table.biom; fi #This is to mitigate a biom bug
!biom convert -i ../../SeqData/otu_table.txt -o ../../SeqData/otu_table.biom --table-type="OTU table" --to-json

In [37]:
# Issues with biom table formatting
!if [ -f ../../SeqData/otu_table_summary.txt ]; then rm ../../SeqData/otu_table_summary.txt; fi #This is to mitigate a biom bug
!biom summarize-table -i ../../SeqData/otu_table.biom -o ../../SeqData/otu_table_summary.txt

In [38]:
# This tells us the overall data info
# Num obs = OTUs
# total count = total seqs
# Chantal had 50% reduction after QC.

!cat ../../SeqData/otu_table_summary.txt

Num samples: 102
Num observations: 9209
Total count: 1681659
Table density (fraction of non-zero values): 0.125

Counts/sample summary:
 Min: 57.0
 Max: 51691.0
 Median: 16022.500
 Mean: 16486.853
 Std. dev.: 13077.038
 Sample Metadata Categories: None provided
 Observation Metadata Categories: None provided

Counts/sample detail:
 16: 57.0
 66: 63.0
 14: 64.0
 45: 69.0
 63: 85.0
 60: 87.0
 48: 125.0
 57: 139.0
 5: 140.0
 11: 195.0
 8: 296.0
 54: 447.0
 18: 705.0
 9: 881.0
 68: 917.0
 30: 1019.0
 20: 1108.0
 27: 1517.0
 51: 1520.0
 25: 1544.0
 47: 1570.0
 105: 1987.0
 6: 2865.0
 68.2: 2870.0
 17: 2877.0
 37: 2984.0
 40: 3165.0
 2: 3312.0
 97: 3498.0
 46: 3634.0
 29: 3752.0
 42: 3971.0
 75.2: 4587.0
 24: 5249.0
 15: 5615.0
 75: 6663.0
 22: 7646.0
 43: 10188.0
 92: 11094.0
 94: 11506.0
 4: 13031.0
 100: 13409.0
 98: 13715.0
 83: 13803.0
 71.2: 14147.0
 79: 14677.0
 10: 14798.0
 53: 14870.0
 73.2: 15183.0
 106: 15378.0
 77.2

This leaves us with the following data:
OTU sequences: ../../SeqData/otus.pick.fasta
OTU table: ../../SeqData/otu_table.biom

We have 4.8k different OTUs, and 1.6M sequences