# Download GNPS Data

In [1]:
import networkx as nx
from gnpsdata import taskresult
import os
from gnpsdata import workflow_fbmn
import pandas as pd
import csv
from qiime2 import Visualization

In [2]:
task = "cf6e14abf5604f47b28b467a513d3532"

In [3]:
# Downloading raw data from GNPS
def download_graphml(task, output_file):
    taskresult.download_task_resultfile(task, "gnps_molecular_network_graphml/", output_file)

def get_graphml_network(task):
    taskresult.download_task_resultfile(task, "gnps_molecular_network_graphml/", "temp.graphml")

    G = nx.read_graphml("temp.graphml")

    return G

def download_quantification(task, output_file):
    taskresult.download_task_resultfile(task, "quantification_table/", output_file)

def download_metadata(task, output_file):
    taskresult.download_task_resultfile(task, "metadata_merged/", output_file)

def download_mgf(task, output_file):
    taskresult.download_task_resultfile(task, "spectra_reformatted/", output_file)
    
# Qiime2 Data
def download_qiime2(task, output_file):
    taskresult.download_task_resultfile(task, "qiime2_output/qiime2_table.qza", output_file)

def download_qiime2_manifest(task, output_file):
    taskresult.download_task_resultfile(task, "qiime2_output/qiime2_manifest.tsv", output_file)

def download_qiime2_metadata(task, output_file):
    taskresult.download_task_resultfile(task, "qiime2_output/qiime2_metadata.tsv", output_file)

In [4]:
# Download quantification and manifest
os.makedirs("../data", exist_ok=True)
download_quantification(task, "../data/quant.csv")
download_qiime2_manifest(task, "../data/manifest.csv")
# Downloading metadata
workflow_fbmn.download_metadata(task, "../data/unprocessed_metadata.tsv")

# Changing Metadata and Manifest Column name

In [5]:
#read metadata file
metadata = pd.read_csv("../data/unprocessed_metadata.tsv", sep = "\t", index_col=False)
#rename 1st column to "#OTU ID
metadata = metadata.rename(columns={"filename":"#OTU ID"})
#convert back to .tsv
metadata.to_csv('../data/metadata.tsv', sep="\t", index=False)

# Import Into Qiime2
## Convert .tsv to .biom

In [6]:
# Replace the following file names with your own
input_file = '../data/quant.csv'
output_file = '../data/biom_quant.tsv'

# Open the input CSV file
with open(input_file, 'r') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)  # Get the header row
    
    # Get the indexes of the first three columns
    indexes = [0, 1, 2]
    
    # Get the indexes of columns with "Peak area" in the header
    peak_area_indexes = [i for i in range(len(header)) if 'Peak area' in header[i]]
    
    # Open the output TSV file
    with open(output_file, 'w', newline='') as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t')
        
        # Write the header row with updated column names
        new_header = ['#OTU ID'] + [header[i].replace('Peak area', '') for i in peak_area_indexes]
        writer.writerow(new_header)
        
        # Loop through the remaining rows of the input CSV file
        for row in reader:
            # Extract the values from the first three columns
            otu_id = row[0]
            
            # Extract the values from columns with "Peak area" in the header
            peak_area_values = [row[i].replace('Peak area', '') for i in peak_area_indexes]
            
            # Write a row to the output TSV file
            new_row = [otu_id] + peak_area_values
            writer.writerow(new_row)


In [7]:
! biom convert \
  -i ../data/biom_quant.tsv \
  -o ../data/quant.biom --to-hdf5

In [8]:
! qiime tools import \
  --input-path ../data/quant.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path ../data/qiime_table.qza

[32mImported ../data/quant.biom as BIOMV210Format to ../data/qiime_table.qza[0m
[0m

# ANOVA

In [9]:
p_formula = 'ATTRIBUTE_Year~ATTRIBUTE_Sample_Area+ATTRIBUTE_Latitude'

In [10]:
! qiime longitudinal anova \
  --m-metadata-file ../data/metadata.tsv \
  --p-formula p_formula \
  --p-sstype 'I' \
  --o-visualization ../data/metadata.qzv

[31m[1mPlugin error from longitudinal:

  Formula not valid: missing tilde.
  Enter a valid formula in format "y ~ model".

Debug info has been saved to /tmp/qiime2-q2cli-err-dhxzr35h.log[0m
[0m

# Visualization

In [11]:
Visualization.load('../data/metadata.qzv')

# Principal Coordinate Analysis (PCoA) & Distance Matrix

In [12]:
! qiime diversity beta \
  --i-table ../data/qiime_table.qza \
  --p-metric canberra_adkins \
  --o-distance-matrix ../data/distance_matrix.qza

[32mSaved DistanceMatrix to: ../data/distance_matrix.qza[0m
[0m

## PCoA

In [13]:
! qiime diversity pcoa \
  --i-distance-matrix ../data/distance_matrix.qza \
  --o-pcoa ../data/pcoa.qza

[32mSaved PCoAResults to: ../data/pcoa.qza[0m
[0m

# Emperor plot

In [14]:
! qiime emperor plot \
  --i-pcoa ../data/pcoa.qza \
  --m-metadata-file ../data/metadata.tsv \
  --o-visualization ../data/emperor_plot.qzv \
  --p-ignore-missing-samples

[32mSaved Visualization to: ../data/emperor_plot.qzv[0m
[0m

# Visualization

In [15]:
Visualization.load("../data/emperor_plot.qzv")

# Classifier Data/Heat Map

In [23]:
metadata_column = 'ATTRIBUTE_Sample_Area'
estimator = 'RandomForestClassifier'
n_estimators = 500
random_state = 123


In [24]:
! qiime sample-classifier classify-samples \
  --i-table ../data/qiime_table.qza \
  --m-metadata-file ../data/metadata.tsv \
  --m-metadata-column $metadata_column \
  --p-optimize-feature-selection \
  --p-parameter-tuning \
  --p-estimator $estimator \
  --p-n-estimators $n_estimators \
  --p-random-state $random_state \
  --o-accuracy-results ../data/accuracy_results.qzv \
  --o-feature-importance ../data/feature_importance.qza \
  --o-heatmap ../data/heatmap.qzv \
  --o-model-summary ../data/model_summary.qzv \
  --o-predictions ../data/predictions.qza \
  --o-probabilities ../data/probabilities.qza \
  --o-sample-estimator ../data/sample_estimator.qza \
  --o-test-targets ../data/test_targets.qza \
  --o-training-targets ../data/training_targets.qza 


[31m[1mPlugin error from sample-classifier:

  Missing samples in metadata: {'SD_01-2018_29_b.mzXML ', 'SD_10_2018_19_a.mzXML ', 'SD_01-2018_28_a.mzXML ', 'SD_01-2018_5_b.mzXML ', 'SD_01-2018_12_a.mzXML ', 'SD_10_2018_18_b.mzXML ', 'SD_12-2017_3_b.mzXML ', 'SD_12-2017_1_a.mzXML ', 'SD_10_2018_14_b.mzXML ', 'SD_01-2018_13_a.mzXML ', 'SD_12-2017_24_a.mzXML ', 'SD_01-2018_17_a.mzXML ', 'SD_01-2018_11_a.mzXML ', 'SD_10_2018_12_a.mzXML ', 'SD_01-2018_14_b.mzXML ', 'SD_10_2018_18_a.mzXML ', 'SD_01-2018_10_b.mzXML ', 'SD_12-2017_2_b.mzXML ', 'SD_10_2018_29_b.mzXML ', 'SD_10_2018_16_b.mzXML ', 'SD_12-2017_25_a.mzXML ', 'SD_01-2018_16_b.mzXML ', 'SD_10_2018_29_a.mzXML ', 'SD_01-2018_24_b.mzXML ', 'SD_12-2017_26_a.mzXML ', 'SD_12-2017_14_b.mzXML ', 'SD_10_2018_3_a.mzXML ', 'SD_12-2017_16_a.mzXML ', 'SD_12-2017_1_b.mzXML ', 'SD_12-2017_4_a.mzXML ', 'SD_01-2018_30_b.mzXML ', 'SD_10_2018_23_a.mzXML ', 'SD_12-2017_21_a.mzXML ', 'SD_10_2018_23_b.mzXML ', 'SD_12-2017_13_b.mzXML ', 'SD_12-2017_7_a.mz

# Visualization

In [18]:
Visualization.load('../data/heatmap.qzv')

# PermANOVA

In [19]:
metadata_column_permanova = 'ATTRIBUTE_Sample_Area'

In [25]:
! qiime diversity beta-group-significance \
  --i-distance-matrix ../data/distance_matrix.qza \
  --m-metadata-file ../data/metadata.tsv \
  --m-metadata-column $metadata_column_permanova \
  --o-visualization ../data/permanova.qzv

# Visualization

In [21]:
Visualization.load('../data/permanova.qzv')

ValueError: ../data/permanova.qzv does not exist.