###### Imports


In [54]:
# Imports
from qiime2 import Metadata
import pandas as pd
from qiime2 import Artifact, sdk
from qiime2.plugins.dada2.methods import denoise_pyro # The samples were obtained through pyrosequencing
import qiime2.plugins.vsearch.methods as vsm
import qiime2.plugins.phylogeny.methods as pm
from qiime2.plugins import phylogeny
from qiime2.plugins.alignment.methods import mafft
import qiime2.plugins.metadata.actions as metadata_actions
import qiime2.plugins.feature_classifier.actions as feature_classifier_actions
from qiime2.plugins.feature_classifier.pipelines import classify_consensus_vsearch
from qiime2.plugins.feature_classifier.pipelines import classify_consensus_blast
from qiime2.plugins.feature_table.visualizers import tabulate_seqs
import qiime2.plugins.feature_table.actions as fta
import matplotlib.pyplot as plt
import matplotlib 
import seaborn as sns
from Bio import SeqIO


In [27]:
# Useful functions
matplotlib.use('module://ipykernel.pylab.backend_inline')
pm = sdk.PluginManager()
def see(artifact):
    from_format = artifact.format
    if issubclass(from_format, sdk.plugin_manager.SingleFileDirectoryFormatBase):
        from_format = artifact.format.file.format
    return set(pm.transformers[from_format].keys())
import os
import pandas
import qiime2
import tempfile

def v2frame(viz_fp: str) -> list:
    '''viz_fp is a path to the qiime2 visualization object'''
    viz = qiime2.Visualization.load(viz_fp)
    with tempfile.TemporaryDirectory() as tmpdir:
        viz.export_data(tmpdir)
        fp = os.path.join(tmpdir, 'quality-plot.html')
        ov = os.path.join(tmpdir, 'overview.html')
        dfs = pandas.read_html(fp, index_col=0)
        df2s = pandas.read_html(ov, index_col=0)
    return dfs + df2s

def get_tax(tax: str) -> str:
    if not ';' in tax:
        return tax
    tax = tax.split(';')
    for t in tax[::-1]:
        if ' ' == t[0]:
            if 'uncultured' in t:
                continue
            return t.strip().replace(';', '')
        if t[-1:-3] == '_s':
            continue
        return t

### Jellyfish microbiome analysis

#### Sample background


#### Importing the data

The fastq files contain single-end reads, sequenced with the 454 GS FLX+ via a pyrosequencing approach

```{bash}
qiime tools import \
    --type 'SampleData[SequencesWithQuality]' \
    --input-path data.tsv \
    --output-path jelly.qza \
    --input-format SingleEndFastqManifestPhred33V2
```


In [11]:
raw_data = Artifact.load('artifacts/jelly.qza')

### Raw data exploration

The initial data exploration was performed with fastqc. There is a high variation in read length, though the majority are in the 450 - 500 bp range. As expected for short-read sequencing, read quality drops at higher read lengths, necessitating trimming. Surprisingly, the sample has little adapter content.

#### Blasting overrepresenting sequences

To get a rough idea of what microorganisms were represented most in the sample, I used data from `fastqc`'s overrepresented sequences module.
The sequences listed in the module were combined, clustered together to remove redundant reads with `cd-hit`, then blasted against the custom 16s rRNA database.

- `cd-hit` grouped 238 overrepresented sequences from the 7 samples into only 17 clusters
- The following describes the best 3 hits for the top 3 most overrepresented sequences
  - 1. Entoplasma, Mesoplasma, Lebetimonas
  - 2. _Ferruginivarius sediminum_, Azospirillum, Desulfosporosinus
  - 3. Flavobacteria, Dokdonia, Joostella


### Quality control
The dada2 plugin denoises sequences and dereplicates them 


In [12]:
denoised_Ftable, denoised_Seqs, denoised_stats = denoise_pyro(raw_data, trunc_len=600) # Will begin filtering at 600 bp
denoised_Ftable.save('artifacts/denoised.qza')
denoised_Seqs.save('artifacts/denoised_seqs.qza')
stats_viz, = metadata_actions.tabulate(input=denoised_stats.view(Metadata))
stats_viz.save('vis/denoised.qzv')

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada.R --input_directory /tmp/qiime2/sc31/data/54baf4e3-b7ca-4f9c-a76c-8897cbea851c/data --output_path /tmp/tmpvop5ny4s/output.tsv.biom --output_track /tmp/tmpvop5ny4s/track.tsv --filtered_directory /tmp/tmpvop5ny4s --truncation_length 600 --trim_left 0 --max_expected_errors 2.0 --truncation_quality_score 2 --max_length Inf --pooling_method independent --chimera_method consensus --min_parental_fold 1.0 --allow_one_off False --num_threads 1 --learn_min_reads 250000 --homopolymer_gap_penalty 1 --band_size 32

R version 4.2.2 (2022-10-31) 


Loading required package: Rcpp


KeyboardInterrupt: 

In [13]:
# Importing previous
denoised_FtableDF = denoised_Ftable.view(pd.DataFrame).T
denoised_FtableDF.index.names = ['Feature ID']
denoised_FtableMD = denoised_Ftable.view(Metadata)
seqs_viz, = fta.tabulate_seqs(data=denoised_Seqs) # Tabulate sequences to 
denoised_SeqsDF = denoised_Seqs.view(Metadata).to_dataframe()
seqs_viz.save('vis/sequences.qzv')

FeatureData[Sequence]
FeatureTable[Frequency]


'vis/sequences.qzv'

##### Clustering
Once the sequences have been dereplicated, they can be clustered them into operational taxonomic units (OTUs). Given that jellyfish microorganisms may not be as well represented in databases, I'll be using de novo clustering rather than reference-based

In [28]:
otuFreqs, otuSeqs = vsm.cluster_features_de_novo(denoised_Seqs, denoised_Ftable, 0.99)
otuFreqs.save('artifacts/otuFreqs.qza')
otuSeqs.save('artifacts/otuSeqs.qza')

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --cluster_size /tmp/tmpgqn7og7s --id 0.99 --centroids /tmp/q2-DNAFASTAFormat-82pdf45f --uc /tmp/tmp9mboselc --qmask none --xsize --threads 1 --minseqlength 1 --fasta_width 0



vsearch v2.22.1_linux_x86_64, 22.9GB RAM, 16 cores
https://github.com/torognes/vsearch

Reading file /tmp/tmpgqn7og7s 100%
15600 nt in 26 seqs, min 600, max 600, avg 600
Sorting by abundance 100%
Counting k-mers 100%
Clustering 100%
Sorting clusters 100%
Writing clusters 100%
Clusters: 20 Size min 1, max 3, avg 1.3
Singletons: 16, 61.5% of seqs, 80.0% of clusters


'artifacts/otuSeqs.qza'

In [33]:
otuSeqsDF = otuSeqs.view(Metadata).to_dataframe()

Unnamed: 0_level_0,Sequence
Feature ID,Unnamed: 1_level_1
8331adf13c184313c464be9c7b21097c,CACTCTTGCGAGCATACTACTCAGGCGGAGTACTTAACGCGTTAGC...
b5f4256d637420120c2e71bc85666bfd,CATTCTTGCGAACGTACTCCCCAGGTGGGATACTTATCACTTTCGC...
00aabc54895675bbc65b0c1fdaf1fd07,TAACCTTGCGGCCGTACTCCCCAGGCGGTGTGCTTAATGCGTTAGC...
fe008f8f1f5a1776177e14bae97e10e2,CACACTTGCGTGCGTACTCCCCAGGCGGAACACTTAACGCGTTGGC...
3a4c4c7d2b8e5a489eb2d1fddc95e025,TAATCTTGCGACCGTACTCCCCAGGCGGAATGCTTAATCCGTTAGG...
6d4121b0c9e7eda284b4ab5bf571dc86,TAGTCTTGCGACCGTAGTCCCCAGGCGGAGTGCTTAACGCGTTAGC...
c6eddca0dc7ee0b43403e873ba6c8cfc,TAGTCTTGCGACCGTACTCCCCAGGCGGAGAACTTAACGCGTTAGC...
d4c1d85ff4ca5efdc346801641b2d9a2,TAATCTTGCGACCGTACTCCCCAGGCGGTCTACTTAACGCGTTAGC...
c00ee893109657877cc78cca52003ca9,CAGCCTTGCGACCATACTCCCCAGGCGGAACACTTAACGCTTTCGC...
d6f1d18cadc754defc476dd14e4f4c58,TAATCTTGCGACCGTACTCCCCAGGCGGTTCATTTAATGCGTTAGC...


#### Pairing sequences with their frequencies in Pandas


In [None]:
# display(denoised_FtableDF)
freq_seq = denoised_FtableDF.merge(denoised_SeqsDF, on='Feature ID')
freq_seq['Feature ID'] = freq_seq.index
freq_seq = freq_seq.reset_index(drop=True)
display(freq_seq)

Unnamed: 0,sample-1,sample-2,sample-3,sample-4,sample-5,sample-6,sample-7,Sequence,Feature ID
0,0.0,0.0,92.0,0.0,0.0,5247.0,1983.0,CACTCTTGCGAGCATACTACTCAGGCGGAGTACTTAACGCGTTAGC...,8331adf13c184313c464be9c7b21097c
1,4268.0,2209.0,325.0,392.0,0.0,0.0,0.0,CACTCTTGCGAGCATACTACTCAGGCGGAGTACTTAACGCGTTAGC...,b37b36ef2cd58aa19929a8ae4f8f4c21
2,671.0,277.0,97.0,30.0,0.0,387.0,42.0,CATTCTTGCGAACGTACTCCCCAGGTGGGATACTTATCACTTTCGC...,b5f4256d637420120c2e71bc85666bfd
3,94.0,0.0,298.0,284.0,0.0,0.0,6.0,TAACCTTGCGGCCGTACTCCCCAGGCGGTGTGCTTAATGCGTTAGC...,00aabc54895675bbc65b0c1fdaf1fd07
4,0.0,0.0,0.0,0.0,681.0,0.0,0.0,CACTCTTGCGAGCATACTACTCAGGCGGAGTACTTAACGCGTTAGC...,3c5ee7863e02dd5604419d803a014014
5,235.0,0.0,0.0,0.0,0.0,121.0,0.0,CACACTTGCGTGCGTACTCCCCAGGCGGAACACTTAACGCGTTGGC...,fe008f8f1f5a1776177e14bae97e10e2
6,188.0,0.0,0.0,0.0,0.0,0.0,0.0,TAATCTTGCGACCGTACTCCCCAGGCGGAATGCTTAATCCGTTAGG...,3a4c4c7d2b8e5a489eb2d1fddc95e025
7,0.0,0.0,0.0,0.0,0.0,163.0,0.0,TAACCTTGCGGCCGTACTCCCCAGGCGGTGTGCTTAATGCGTTAGC...,0c2db32fc52f027a04e433a93c0669df
8,0.0,0.0,0.0,0.0,76.0,0.0,0.0,TAACCTTGCGGCCGTACTCCCCAGGCGGTGTGCTTAATGCGTTAGC...,5f460fa8105db752feb6166a52b135d2
9,0.0,0.0,0.0,0.0,0.0,45.0,0.0,TAGTCTTGCGACCGTAGTCCCCAGGCGGAGTGCTTAACGCGTTAGC...,6d4121b0c9e7eda284b4ab5bf571dc86


In [None]:
sns.heatmap(freq_seq.iloc[:, :-2])
plt.show()

<Figure size 640x480 with 2 Axes>

### Taxonomic analyses


#### Alignment-based classification


##### Blast


In [34]:
# Run search
classified, top_hits = classify_consensus_blast(otuSeqs, ref_seqs, ref_ids)
classified.save('artifacts/blast_seqs.qza')
top_hits.save('artifacts/blast_hits.qza')

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: blastn -query /tmp/qiime2/sc31/data/6ce41b78-4959-4a10-ad3a-6fa008e4eb34/data/dna-sequences.fasta -evalue 0.001 -strand both -outfmt 6 -subject /tmp/qiime2/sc31/data/4dca7356-2c27-4284-8679-b3ec0974b1ab/data/dna-sequences.fasta -perc_identity 80.0 -qcov_hsp_perc 80.0 -max_target_seqs 10 -out /tmp/q2-BLAST6Format-24ykq59q



'artifacts/blast_hits.qza'

In [39]:
blastDF = blast.view(Metadata).to_dataframe()
blastTopDF = blast_top_hits.view(Metadata).to_dataframe()

##### Vsearch


In [16]:
# Run search
classified, top_hits = classify_consensus_vsearch(denoised_Seqs, ref_seqs, ref_ids)
classified.save('artifacts/vsearch_seqs.qza')
top_hits.save('artifacts/vsearch_hits.qza')


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --usearch_global /tmp/qiime2/sc31/data/bf89e318-4049-42b4-b410-ebf99b324ef1/data/dna-sequences.fasta --id 0.8 --query_cov 0.8 --strand both --maxaccepts 10 --maxrejects 0 --db /tmp/qiime2/sc31/data/4dca7356-2c27-4284-8679-b3ec0974b1ab/data/dna-sequences.fasta --threads 1 --output_no_hits --blast6out /tmp/q2-BLAST6Format-e2m8jf6c



vsearch v2.22.1_linux_x86_64, 22.9GB RAM, 16 cores
https://github.com/torognes/vsearch

Reading file /tmp/qiime2/sc31/data/4dca7356-2c27-4284-8679-b3ec0974b1ab/data/dna-sequences.fasta 100%
1272167297 nt in 866608 seqs, min 302, max 4563, avg 1468
Masking 100%
Counting k-mers 100%
Creating k-mer index 100%
Searching 100%
Matching unique query sequences: 26 of 26 (100.00%)


'artifacts/vsearch_hits.qza'

In [17]:
vsearchDF = vsearch_seqs.view(Metadata).to_dataframe()
vsearchTopDF = vsearch_top_hits.view(Metadata).to_dataframe()

#### Machine learning


## Diversity analyses

#### Phylogeny reconstruction
- Reconstructing a phylogenetic tree from the otu sequences may allow us to identify unknown otus by placing them in relation to those that were identified. 
- qiime2 offers several different 

In [65]:
# First, generate a multiple sequence alignment with mafft
alignedotus = mafft(otuSeqs)
iqtreeUR = pm.iqtree_ultrafast_bootstrap(alignedotus.alignment)
raxmlUR = pm.raxml_rapid_bootstrap(alignedotus.alignment)
fasttreeUR = pm.fasttree(alignedotus.alignment)
iqtreeR = pm.midpoint_root(iqtreeUR.tree)
raxmlR = pm.midpoint_root(raxmlUR.tree)
fasttreeR = pm.midpoint_root(fasttreeUR.tree)

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: mafft --preservecase --inputorder --thread 1 /tmp/qiime2/sc31/data/6ce41b78-4959-4a10-ad3a-6fa008e4eb34/data/dna-sequences.fasta



inputfile = orig
20 x 600 - 600 d
nthread = 1
nthreadpair = 1
nthreadtb = 1
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 20 (thread    0)
done.

Constructing a UPGMA tree (efffree=0) ... 
   10 / 20
done.

Progressive alignment 1/2... 
STEP     6 / 19 (thread    0)
Reallocating..done. *alloclen = 2203
STEP    19 / 19 (thread    0)
done.

Making a distance matrix from msa.. 
    0 / 20 (thread    0)
done.

Constructing a UPGMA tree (efffree=1) ... 
   10 / 20
done.

Progressive alignment 2/2... 
STEP     5 / 19 (thread    0)
Reallocating..done. *alloclen = 2202
STEP    19 / 19 (thread    0)
done.

disttbfast (nuc) Version 7.515
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
1 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > outpu

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: iqtree -bb 1000 -st DNA --runs 1 -s /tmp/qiime2/sc31/data/3267aa82-df09-4459-92db-2277b184680d/data/aligned-dna-sequences.fasta -m MFP -pre /tmp/tmpdxraaq6_/q2iqtreeufboot -nt 1

IQ-TREE multicore version 2.2.0.3 COVID-edition for Linux 64-bit built Aug  2 2022
Developed by Bui Quang Minh, James Barbetti, Nguyen Lam Tung,
Olga Chernomor, Heiko Schmidt, Dominik Schrempf, Michael Woodhams, Ly Trong Nhan.

Host:    bravo15-pop (AVX2, FMA3, 22 GB RAM)
Command: iqtree -bb 1000 -st DNA --runs 1 -s /tmp/qiime2/sc31/data/3267aa82-df09-4459-92db-2277b184680d/data/aligned-dna-sequences.fasta -m MFP -pre /tmp/tmpdxraaq6_/q2iqtreeufboot -nt 1
Seed:    81051 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Thu Jun  1 06:56:10 2023
Kernel:  AVX+FMA - 1 t

FastTree Version 2.1.11 Double precision (No SSE3)
Alignment: /tmp/qiime2/sc31/data/3267aa82-df09-4459-92db-2277b184680d/data/aligned-dna-sequences.fasta
Nucleotide distances: Jukes-Cantor Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (2 rounds range 10) +ML-NNI opt-each=1
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jukes-Cantor, CAT approximation with 20 rate categories
Initial topology in 0.00 seconds
Refining topology: 17 rounds ME-NNIs, 2 rounds ME-SPRs, 9 rounds ML-NNIs
Total branch-length 2.047 after 0.04 sec
ML-NNI round 1: LogLk = -6854.969 NNIs 5 max delta 5.69 Time 0.05
Switched to using 20 rate categories (CAT approximation)
Rate categories were divided by 0.823 so that average rate = 1.0
CAT-based log-likelihoods may not be comparable across runs
Use -gamma for approximate but comparable Gamma(20) log-likelihoods
ML-NNI round 2: LogLk = -6015.470 NNIs 1 max delta 6.30 Time 0.07
ML-NNI round 3: LogLk = -6015.351 NNIs 0 max delta 0.00 Time 0.07
T

In [77]:
iqtreeUR.tree.save('artifacts/iqtreeUR.qza')
raxmlUR.tree.save('artifacts/raxmlUR.qza')
fasttreeUR.tree.save('artifacts/fasttreeUR.qza')
iqtreeR.rooted_tree.save('artifacts/iqtreeR.qza')
raxmlR.rooted_tree.save('artifacts/raxmlR.qza')
fasttreeR.rooted_tree.save('artifacts/fasttree.qza')

'artifacts/fasttree.qza'

## Data sources

- https://www.ebi.ac.uk/ena/browser/view/PRJEB8518
- https://docs.qiime2.org/2023.5/data-resources/ High-quality reference OTUs
- The `see` and `v2frame` functions were obtained from [this](https://forum.qiime2.org/t/how-to-capture-a-value-from-a-summary-and-pipe-it/19783/5) link via user thermokarst


# Load previous artifacts

In [78]:
# Load previous artifacts
denoised_Seqs = Artifact.load('artifacts/denoised_seqs.qza')
denoised_Ftable = Artifact.load('artifacts/denoised.qza')
otuFreqs = Artifact.load('artifacts/otuFreqs.qza')
otuSeqs = Artifact.load('artifacts/otuSeqs.qza')
ref_ids = Artifact.load('../data/artifacts/AllIDs.qza')
ref_seqs = Artifact.load('../data/artifacts/AllSeqs.qza')
blast = Artifact.load('artifacts/blast_seqs.qza')
blast_top_hits = Artifact.load('artifacts/blast_hits.qza')
vsearch_seqs = Artifact.load('artifacts/vsearch_seqs.qza')
vsearch_top_hits = Artifact.load('artifacts/vsearch_hits.qza')
iqtreeUR = Artifact.load('artifacts/iqtreeUR.qza')
raxmlUR = Artifact.load('artifacts/raxmlUR.qza')
fasttreeUR = Artifact.load('artifacts/fasttreeUR.qza')
iqtreeR = Artifact.load('artifacts/iqtreeR.qza')
raxmlR = Artifact.load('artifacts/raxmlR.qza')
fasttreeR = Artifact.load('artifacts/fasttreeR.qza')