In [1]:
# Imports a parser from cogent
from cogent.parse.fasta import MinimalFastaParser as parse

In [1]:
%load_ext mothurmagic
# Loads mothurmagic so we can run mothur in the notebook using %%mothur at the top of the cell

In [2]:
# Number of processors/threads intended for the parallel steps in this notebook.
# NOTE(review): the usearch and QIIME cells below pass a literal 4 rather than
# referencing this variable, so changing it here alone does not affect them.
nprocs = 4

In [3]:
# Checking out the data file: preview the first few records of the QC'd reads.
# This file was created using the QC_basic notebook.
!head ../../SeqData/16SfinalQC.fasta

>67.2_0
TTGCCAGCCGCCGCGGTAAGACGTAGGGTGCAAGCGTTGTCCGGATTTATTGGGCATAAAGAGCTCGTAGGCGGCTTGTTGCGTCGACCGTGAAAACCTACCGCTTAACGGTGGGCTTGCGGTCGATACGGGCAGGCTAGAGTTCGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCCGATACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTTGTAGTCCC
>74.2_1
TTGCCAGCCGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGAAACCCTGGTAGTCC
>67_2
GTGCCAGCAGCCGCGGTAATACAGAGGGTGCAAGCGTTGTTCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGTGCGGTAAGTCTTTTGTGAAATCTCCGGGCTCAACTCGGAGCCTGCAGAAGAAACTGCCGTGCTGGAGTATGGGAGAGGTGAGTGGAATTCCCGGTGTAGCGGTGAAATGCGTAGATATCGGGAGGAACACCTGTGGCGAAAGCGGCTCACTGGACCATCACTGACGCTGATGCACGAAAGCTAGGGGAGCAAACAGGATTAGAAACCCCTGTAGTCC
>1_3
GTGCCAGCCGCCGCGGTAATACAGAGGTGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCCTTCTAAGTCA

In [8]:
%%mothur
unique.seqs(fasta=../../SeqData/16SfinalQC.fasta)

mothur > unique.seqs(fasta=../../SeqData/16SfinalQC.fasta)
1788698	804218

Output File Names:
../../SeqData/16SfinalQC.names
../../SeqData/16SfinalQC.unique.fasta


mothur > quit()


In [9]:
# Build a lookup of seed (representative) sequence ID -> number of reads it stands for,
# read from the .names file written by unique.seqs.
# Each line is parsed as "<seedID><TAB><comma-separated member IDs>".
counts = {}

with open("../../SeqData/16SfinalQC.names") as names_file:
    for line in names_file:
        line = line.strip()
        if not line:
            # Tolerate blank / trailing empty lines instead of failing on unpacking.
            continue
        # Split on the first tab only, so exactly two fields are always produced.
        seedID, seqIDs = line.split("\t", 1)
        counts[seedID] = len(seqIDs.split(","))

In [12]:
# Adds the counts from this dictionary to a new file so it looks like a usearch file with "size=XXX"
# Both files are opened in a single `with` so the input handle is closed as well as the output.

with open("../../SeqData/16SfinalQC.unique.fasta") as unique_fasta, \
     open("../../SeqData/16SfinalQC.unique.usearch_names.fasta", "w") as f:
    # The parser yields (label, sequence) pairs; the label is the seed ID used as key in `counts`.
    for n, s in parse(unique_fasta):
        f.write(">%s;size=%s;\n%s\n"%(n,counts[n],s))

Alternatively, usearch can dereplicate and add the size annotations in a single step:

!usearch -derep_fulllength data/finalQC.fasta -output data/finalQC.unique.fasta -sizeout -threads 20

In [18]:
# Dereplicate with usearch; -sizeout appends ";size=N;" to each unique sequence's label.
# This could run out of memory if seq files are too large
# NOTE: this overwrites the 16SfinalQC.unique.fasta written by mothur's unique.seqs above.
# -minseqlength is intentionally omitted; if a length filter is wanted, pass it with an
# ASCII hyphen and a numeric value.
!usearch -derep_fulllength ../../SeqData/16SfinalQC.fasta -fastaout ../../SeqData/16SfinalQC.unique.fasta -sizeout -threads 4

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:05 656Mb  100.0% Reading ../../SeqData/16SfinalQC.fasta
00:10 703Mb 1788698 seqs, 804218 uniques, 671925 singletons (83.6%)
00:10 703Mb Min size 1, median 1, max 1650, avg 2.22
00:30 703Mb  100.0% Writing ../../SeqData/16SfinalQC.unique.fasta


In [19]:
!head ../../SeqData/16SfinalQC.unique.fasta

>28_165;size=1650;
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATCCGGATTCACTGGGTTTAAAGGGTGCGTAGGCGGACATGT
AAGTCCGTGGTGAAATCTCCGAGCTTAACTCGGAAACTGCCATGGATACTATATGTCTTGAATGTTGTGGAGGTTAGCGG
AATATGTCATGTAGCGGTGAAATGCATAGATATGACATAGAACACCAATTGCGAAGGCAGCTGGCTACACAAATATTGAC
GCTGAGGCACGAAAGCGTGGGGATCAAACAGGATTAGAAACCCCAGTAGTCC
>44_563;size=1510;
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATCCGGATTCACTGGGTTTAAAGGGTGCGTAGGCGGACATGT
AAGTCCGTGGTGAAATCTCCGAGCTTAACTCGGAAACTGCCATGGATACTATATGTCTTGAATGTTGTGGAGGTTAGCGG
AATATGTCATGTAGCGGTGAAATGCATAGATATGACATAGAACACCAATTGCGAAGGCAGCTGGCTACACAAATATTGAC
GCTGAGGCACGAAAGCGTGGGGATCAAACAGGATTAGAAACCCTAGTAGTCC


In [20]:
# Sort the dereplicated sequences by their size annotation (number of identical reads).
# -minsize 2 excludes the singletons (size=1) at the same time.
# You would change minsize to 1 if you wanted to include singletons,
# or, you know, just not do this filtering.
# But you should just get rid of them.
!usearch -sortbysize ../../SeqData/16SfinalQC.unique.fasta -fastaout ../../SeqData/16SfinalQC.unique.sorted.fasta -minsize 2

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:02 337Mb  100.0% Reading ../../SeqData/16SfinalQC.unique.fasta
00:02 303Mb Getting sizes                                        
00:05 304Mb Sorting 132293 sequences
00:08 304Mb  100.0% Writing output


In [23]:
!tail ../../SeqData/16SfinalQC.unique.sorted.fasta

>98_199509;size=2;
GTGCCAGCAGCCGCGGTAACACGTAGGCACCAAGCGTTGTCCGGATTTATTGGGCGTAAAGAGCTCGTAGGCGGTTGAGT
AAGTCGGGTGTGAAAACTCTGGGCTTAACCCAGAGCCGCCACCCGATACTGCTCTGACTTGAGTTCGGTAGGGGAGCAGG
GAATTCCTAGTGTAGCGGTGAAATGCGCAGATATTAGGAGGAACACCGGTGGCGAAGGCGCTGCTCTGGGCCGATACTGA
CGCTGAGGAGCGAAAGCGTGGGTAGCAAACAGGATTAGAAACCCGAGTAGTCC
>7_648581;size=2;
GTGCCAGCCGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGTACGTAGGCGGATCCTT
AAGTCGGTGGTGAAATCCTGAAGCTCAACTTCAGAACTGCCTTCGATACTGGGGACCTTGAGTTCGGGAGAGGTGAGTGG
AACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGATACTGAC
GCTGAGGTACGAAAGCGTGGGGAGCAAACAGGATTAGAAACCCTTGTAGTCC


In [25]:
# Here we are just creating the centroids, or "seeds", that reads get mapped against later.
# Default is 97% minimum ID - not recommended to use more than 97%.
# The log for this step also reports the chimeras it found.

!usearch -cluster_otus ../../SeqData/16SfinalQC.unique.sorted.fasta -otus ../../SeqData/otus.fasta

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:36  51Mb  100.0% 5079 OTUs, 3885 chimeras (2.9%)


In [26]:
!head ../../SeqData/otus.fasta
# Note that the OTU names are currently not in QIIME format - we can use a script from Edgar to fix this.

>28_165
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATCCGGATTCACTGGGTTTAAAGGGTGCGTAGGCGGACATGT
AAGTCCGTGGTGAAATCTCCGAGCTTAACTCGGAAACTGCCATGGATACTATATGTCTTGAATGTTGTGGAGGTTAGCGG
AATATGTCATGTAGCGGTGAAATGCATAGATATGACATAGAACACCAATTGCGAAGGCAGCTGGCTACACAAATATTGAC
GCTGAGGCACGAAAGCGTGGGGATCAAACAGGATTAGAAACCCCAGTAGTCC
>13_1570
GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTGTT
AAGACCGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTGGTGACTGGCAAGCTAGAGTATGGCAGAGGGGGGTAG
AATTCCACGTGTAGCAGTGAAATGCGTAGAGATGTGGAGGAATACCGATGGCGAAGGCAGCCCCCTGGGCCAATACTGAC
GCTCATGCACGAAAGCGTGGGGAGCAAACAGGATTAGAAACCCCAGTAGTCC


In [9]:
# You do need to assign taxonomy in order to pull out the Euks., etc.
# Input (-i) is your fasta file (still working with unique seqs).
# Output (-o) is a directory; the assignments end up in otus_tax_assignments.txt inside it,
# which is the file the removal step reads.
# -O 4: presumably the number of parallel jobs - confirm against the QIIME documentation.

!parallel_assign_taxonomy_uclust.py -i ../../SeqData/otus.fasta -o ../../SeqData/otus.tax -O 4
# Optional Silva 111 reference arguments, currently disabled:
#-r data/97_Silva_111_rep_set_no_ambig.fasta \
#-t data/Silva_111_taxa_map_full.txt \



In [14]:
# This makes a file of what we want to remove: the first column (OTU ID) of every
# taxonomy-assignment line that matches one of the unwanted groups.
# Could change this to pull out different groups.
# These primers actually had good Archaeal targets - so, it would be okay to include them.
# grep -E is used instead of the obsolescent egrep alias; the pattern is case-sensitive as written.
!grep -E "Chloroplast|Eukaryota|Unassigned|mitochondria" \
../../SeqData/otus.tax/otus_tax_assignments.txt \
| awk '{print $1}' > ../../SeqData/to_remove_tax.accnos

In [15]:
# wc is number of lines of the taxa that will be removed
!wc -l ../../SeqData/to_remove_tax.accnos

     220 ../../SeqData/to_remove_tax.accnos


In [16]:
# Looking at what you're removing
!head ../../SeqData/to_remove_tax.accnos

23_181260
71_103928
104_391181
61_89028
62_245684
49_60737
50_418125
13_47397
64_98226
64_84554


In [3]:
%%mothur
remove.seqs(fasta=../../SeqData/otus.fasta, accnos=../../SeqData/to_remove_tax.accnos)

mothur > remove.seqs(fasta=../../SeqData/otus.fasta, accnos=../../SeqData/to_remove_tax.accnos)
Removed 220 sequences from your fasta file.

Output File Names:
../../SeqData/otus.pick.fasta


mothur > quit()


In [None]:
# NOTE(review): this cell was never executed (In [None]) and its path does not follow the
# ../../SeqData/ layout used by the executed cells; the usearch-formatted file is created
# and previewed in the next two cells. Confirm whether this cell is still needed.
!head data/finalQC_usearchfmt.fasta

In [10]:
# Pulling out the sample identifier.
# Every header line ">sample_readnum" gets ";barcodelabel=sample" appended, where "sample"
# is the text before the first "_" (awk field 1 with -F"_").
# Then we can use this later
# Now we will see how these reads map to the defined centroids (after removing EuK, etc.)
# Basically, we cut, cut, refined our fasta to make our OTU centroids.
# THEN, we went back to our original QC'd total fasta file and will throw it all against these nicely defined seeds.
# Anything that doesn't match, we won't keep.
#
# awk strings are 1-indexed: substr($1,1,1) is the first character. With a start of 0,
# gawk/mawk return an empty string, so no header would ever be labelled there.
# print joins its arguments with OFS=";", giving "...;barcodelabel=;>sample";
# the sed call then removes that ";>" so the label reads "barcodelabel=sample".
!awk -F"_" \
'BEGIN{OFS=";"}{ if ( substr($1,1,1) == ">"){ print $0,"barcodelabel=",$1 } else { print $0 } }' \
../../SeqData/16SfinalQC.fasta | \
sed 's/;>//' > ../../SeqData/16SfinalQC.usearch.fasta

In [12]:
!head ../../SeqData/16SfinalQC.usearch.fasta

>67.2_0;barcodelabel=67.2
TTGCCAGCCGCCGCGGTAAGACGTAGGGTGCAAGCGTTGTCCGGATTTATTGGGCATAAAGAGCTCGTAGGCGGCTTGTTGCGTCGACCGTGAAAACCTACCGCTTAACGGTGGGCTTGCGGTCGATACGGGCAGGCTAGAGTTCGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCCGATACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTTGTAGTCCC
>74.2_1;barcodelabel=74.2
TTGCCAGCCGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTCAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTGATACTGTCGATCTGGAGTATGGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCCATTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGAAACCCTGGTAGTCC
>67_2;barcodelabel=67
GTGCCAGCAGCCGCGGTAATACAGAGGGTGCAAGCGTTGTTCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGTGCGGTAAGTCTTTTGTGAAATCTCCGGGCTCAACTCGGAGCCTGCAGAAGAAACTGCCGTGCTGGAGTATGGGAGAGGTGAGTGGAATTCCCGGTGTAGCGGTGAAATGCGTAGATATCGGGAGGAACACCTGTGGCGAAAGCGGCTCACTGGACCATCACTGACGCTGATGCACGAAAGCTAGGGGAGCAAACAGGATTAGAAACCCCTGTAGTCC
>1_3;barcodelabel=1
GTGCCAGCCGCCGCGGTAA

In [13]:
# This is where the actual OTUs are being assigned. We choose 97% sequence ID threshold here (-id 0.97).
# This might take a while. Depending on the clustering algorithm (e.g. pairwise),
# it would take, like, days on the same number of processors.
# This is why usearch (centroid-based) is so much better
# But is it more biologically relevant? ... maybe, maybe not.
# Edgar is showing it's not that bad.

# We take our total QC data
# We compare it to the otus.pick.fasta database we made above
# We produce a readmap.uc file which tells us how the reads from our finalQC file map to the OTU seed database.

!usearch -usearch_global ../../SeqData/16SfinalQC.usearch.fasta \
-db ../../SeqData/otus.pick.fasta \
-strand plus -id 0.97 \
-uc ../../SeqData/readmap.uc \
-threads 4

usearch v8.0.1623_i86osx32, 4.0Gb RAM (17.2Gb total), 4 cores
(C) Copyright 2013-15 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

Licensed to: tlw59@cornell.edu

00:00 4.4Mb  100.0% Reading ../../SeqData/otus.pick.fasta
00:00 2.9Mb  100.0% Masking
00:00 3.8Mb  100.0% Word stats
00:00 3.8Mb  100.0% Alloc rows
00:00 9.2Mb  100.0% Build index
01:04  45Mb  100.0% Searching 16SfinalQC.usearch.fasta, 90.0% matched


In [14]:
# Makes an OTU table from the read-to-OTU mapping in readmap.uc.
# It will tell me the OTU ID, and then for all the samples, which OTUs each has sequences from.
# NOTE(review): uc2otutab.py is called through an absolute, machine-specific path - adjust for your environment.
!python /opt/virt_env/bin/uc2otutab.py ../../SeqData/readmap.uc > ../../SeqData/otu_table.txt

../../SeqData/readmap.uc 100.0%   


In [18]:
# Issues with biom table formatting
# Remove any existing output file before converting - this is to mitigate a biom bug.
!test ! -f ../../SeqData/otu_table.biom || rm ../../SeqData/otu_table.biom
!biom convert -i ../../SeqData/otu_table.txt -o ../../SeqData/otu_table.biom --table-type="OTU table" --to-json

In [19]:
# Issues with biom table formatting
# Remove any existing summary file before regenerating it - this is to mitigate a biom bug.
!test ! -f ../../SeqData/otu_table_summary.txt || rm ../../SeqData/otu_table_summary.txt
!biom summarize-table -i ../../SeqData/otu_table.biom -o ../../SeqData/otu_table_summary.txt

In [20]:
# This tells us the overall data info
# Num observations = OTUs
# Total count = total seqs
# Counts/sample detail lists the samples from fewest to most sequences.
# Chantal had 50% reduction after QC.

!cat ../../SeqData/otu_table_summary.txt

Num samples: 102
Num observations: 4858
Total count: 1610247
Table density (fraction of non-zero values): 0.224

Counts/sample summary:
 Min: 39.0
 Max: 50457.0
 Median: 14993.000
 Mean: 15786.735
 Std. dev.: 12730.953
 Sample Metadata Categories: None provided
 Observation Metadata Categories: None provided

Counts/sample detail:
 16: 39.0
 66: 51.0
 14: 52.0
 45: 52.0
 63: 60.0
 60: 71.0
 48: 106.0
 57: 111.0
 5: 112.0
 11: 155.0
 54: 202.0
 8: 272.0
 18: 681.0
 9: 702.0
 68: 749.0
 30: 788.0
 20: 978.0
 51: 1133.0
 25: 1452.0
 27: 1492.0
 47: 1547.0
 105: 1924.0
 68.2: 2304.0
 37: 2567.0
 2: 2724.0
 6: 2753.0
 17: 2780.0
 40: 3177.0
 97: 3421.0
 46: 3507.0
 75.2: 3531.0
 29: 3593.0
 42: 3889.0
 24: 4732.0
 75: 5307.0
 15: 5433.0
 22: 7388.0
 43: 9744.0
 92: 10664.0
 94: 11131.0
 71.2: 11226.0
 4: 12464.0
 100: 12963.0
 83: 13254.0
 98: 13335.0
 73.2: 13400.0
 71: 13490.0
 10: 13661.0
 79: 14321.0
 53: 14521.0
 106: 149

This leaves us with the following data:
OTU sequences: ../../SeqData/otus.pick.fasta
OTU table: ../../SeqData/otu_table.biom

We have about 4.9k different OTUs (4,858) and 1.6M sequences (1,610,247) across 102 samples.