In [1]:
#Required Python libraires
from subprocess import call
import pandas as pd
import numpy as np
%load_ext rpy2.ipython


#src
from src import classify #holds helper functions used in this notebook

# Annotation of ASVs Using Sanger Amplicons
ASVs were generated using qiime 2. This notebook will use 16S amplicons sequences using sanger sequencing to ensure proper annotation of the ASVs as our SynCom members. We will first align all SynCom sanger amplicons to one another to find a threshold for the percent identity, then use that identity to annotate ASVs.

 Required dependencies to reproduce Notebook results:
    - Nucleotide-Nucleotide BLAST 2.7.1+
    
## Data files and their descriptions:
##### data/ASV-table-rarefied-1493.tsv  
- A table seperated file with ASV ids and the abundance of the respective ASV id in each treatment  

##### data/rep-seqs.fasta
- fasta file of ASVs and their representative sequence

##### data/References.fasta
- ASV references used for ASV annotation

Source code for custom functions can be found in src/classify.py

In [2]:
with open("data/References.fasta") as fasta:
    print('The following are the reference sanger reads that we will used to classify reads from Miseq')
    for line in fasta:
        print(line.strip())

The following are the reference sanger reads that we will used to classify reads from Miseq
>R1-V4
TAGGGAATCTTCCGCAATGGACGAAAGTCTGACGGAGCAACGCCGCGTGAGTGATGAAGGCTTTCGGGTCGTAAAACTCTGTTGTTAGGGAAGAACAA
GTGCTAGTTGAATAAGCTGGCACCTTGACGGTACCTAACCAGAAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTT
ATCCGGAATTATTGGGCGTAAAGCGCGCGCAGGTGGTTTCTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGAGA
CTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTA
ACTGACACTGAGGCGCGAAAGCGTGGGGAGCAAACA
>R4-V4
CAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAA
GGGGACGAGGTTAACARCCTCGTTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCG
TTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTTAAGTCAGATGTGAAATCCCCGGGCTTAACCTGGGAACTGCATTTGAAACTGGCA
GGCTTGAGTCTTGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAA
AGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACA
>R34-V4
TGGGGAATATTGCACAAT

In [3]:
qf = 'data/References.fasta'
sf = 'data/rep-seqs.fasta'
f = 'data/ASV-table-rarefied-1493.tsv'
o='data/Classified-feature-table_rarefied.csv'
classify.classify_with_references(qf,sf,f,o)

Launching: blastn -query data/References.fasta -outfmt 6 -out tempfile -subject data/rep-seqs.fasta


In [4]:
dat = pd.read_csv(o)
dat

Unnamed: 0,#OTU ID,1-High-R1-1,1-High-R1-2,1-High-R34-1,1-High-R4-1,1-High-R4-2,1-High-R60-1,1-High-R60-2,1-High-R79-1,1-High-R79-2,...,R4-rep-3-High-3,R4-rep-3-No-1,R4-rep-3-No-2,R4-rep-3-No-3,R4-rep-3-Reg-3,R4-rep-3-Reg-4,R4-rep-3-Reg-5,R4-rep-3-Reg-6,R4-rep-4-High-4,classification
0,9f7c449150b6d22402bb0165995d7144,167.0,156.0,42.0,0.0,12.0,84.0,208.0,180.0,193.0,...,7.0,10.0,16.0,13.0,8.0,54.0,82.0,12.0,0.0,R4-V4
1,ddd1920008ac6e63128cc80da116a9c5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,4.0,1.0,739.0,
2,6bb413e62bd71f30c72c6c2d780e76c4,75.0,49.0,136.0,0.0,3.0,228.0,105.0,59.0,83.0,...,2.0,5.0,2.0,5.0,5.0,46.0,60.0,5.0,0.0,R4-V4
3,2184f98774585f9579f1edced9811dc3,35.0,40.0,22.0,0.0,0.0,60.0,83.0,47.0,55.0,...,13.0,19.0,18.0,5.0,20.0,103.0,84.0,18.0,0.0,R4-V4
4,84f1b2ec9c3978357b0c6fa8ad083de0,35.0,38.0,34.0,0.0,3.0,61.0,70.0,53.0,29.0,...,4.0,13.0,19.0,14.0,19.0,153.0,109.0,24.0,0.0,R4-V4
5,7761cf667ebf9551f446bd8410a0bd13,65.0,61.0,36.0,0.0,8.0,65.0,100.0,59.0,79.0,...,6.0,9.0,6.0,7.0,9.0,50.0,54.0,6.0,0.0,R4-V4
6,9dfd601cf6f271bd79d1a5eaa21b389e,79.0,58.0,51.0,0.0,2.0,61.0,79.0,60.0,65.0,...,7.0,9.0,11.0,8.0,6.0,52.0,55.0,3.0,0.0,R4-V4
7,40cfbfb6528cb679092fa1cf6a11304e,67.0,74.0,36.0,2.0,17.0,15.0,0.0,12.0,23.0,...,42.0,52.0,47.0,53.0,41.0,17.0,11.0,24.0,0.0,
8,b496204e96c1699ba766bf7aa3aa58da,80.0,46.0,27.0,0.0,13.0,56.0,85.0,67.0,53.0,...,1.0,5.0,6.0,7.0,4.0,52.0,48.0,8.0,0.0,R4-V4
9,591ae4834e667dc6c6d577ced6f58ebb,76.0,60.0,31.0,0.0,5.0,65.0,80.0,53.0,67.0,...,5.0,4.0,4.0,4.0,5.0,32.0,55.0,1.0,0.0,R4-V4


# Nucleotide Blast Classification Added to Abundance Data
We have sucessfully annotated OTU ID codes with appropriate reference sequences. Annotation has been saved to the columns 'classification'. ASVs that met 95% identity to multiple references were annotated with both isolate IDs. For example, ASV:0728ca617d4b8a4f1d77b2326939682b was annotated as R1-V4 and R61-V4