# Data retrieval and denoising

In [2]:
# Import the required packages and set up the data and results directories

import os
import qiime2
from qiime2 import Visualization
import pandas as pd

# Working directories: `data_dir` receives downloaded/intermediate files,
# `results_dir` receives the .qzv visualizations written further down.
data_dir = 'Data'
results_dir = "Results"

# Create whichever of the two directories does not exist yet.
for _folder in (data_dir, results_dir):
    os.makedirs(_folder, exist_ok=True)

## 1. Data retrieval

Download the sequencing data and the corresponding metadata using q2-fondue.

In [3]:
# Write the one-column accession table (header `id`, then the project
# accession) that is imported as NCBIAccessionIDs in the next step.
# Written from Python rather than `echo -e`, because `echo -e` is not portable:
# some shells (e.g. dash) print the `-e` literally into the file.
with open(os.path.join(data_dir, '0-study-id.tsv'), 'w') as study_ids:
    study_ids.write('id\nPRJEB19491\n')

In [4]:
# `bin` directory of the conda environment whose `qiime` executable is used for
# the fondue steps below. The default is the original hard-coded location; set
# the FONDUE_ENV environment variable to point at a different installation.
fondue_env = os.environ.get('FONDUE_ENV', '/home/jovyan/.conda/envs/fondue/bin')

In [5]:
%%script env fondue_env="$fondue_env" data_dir="$data_dir" bash

# Stop at the first failing command, so that `fondue get-all` is never run
# against a missing or stale accession artifact.
set -euo pipefail

# Prepend the fondue environment to PATH so that qiime
# can find all required executables.
export PATH="$fondue_env:$PATH"

# Wrap the accession table in a QIIME 2 artifact.
"$fondue_env/qiime" tools import \
    --type NCBIAccessionIDs \
    --input-path "$data_dir/0-study-id.tsv" \
    --output-path "$data_dir/0-study-id.qza"

# Fetch sequences and metadata for the accession(s); outputs are written to
# $data_dir/0-fondue-output.
"$fondue_env/qiime" fondue get-all \
    --i-accession-ids "$data_dir/0-study-id.qza" \
    --p-email kai.wang@usys.ethz.ch \
    --output-dir "$data_dir/0-fondue-output"

Imported Data/0-study-id.tsv as NCBIAccessionIDsDirFmt to Data/0-study-id.qza


QIIME is caching your current deployment for improved performance. This may take a few moments and should only happen once per deployment.
Plugin error from fondue:

  Neither single- nor paired-end sequences could be downloaded. Please check your accession IDs.

Debug info has been saved to /tmp/qiime2-q2cli-err-_643op38.log


CalledProcessError: Command 'b'\n# append the env location to PATH so that qiime\n# can find all required executables\nexport PATH=$fondue_env:$PATH\n    \n$fondue_env/qiime tools import \\\n      --type NCBIAccessionIDs \\\n      --input-path $data_dir/0-study-id.tsv \\\n      --output-path $data_dir/0-study-id.qza\n\n$fondue_env/qiime fondue get-all \\\n    --i-accession-ids $data_dir/0-study-id.qza \\\n    --p-email kai.wang@usys.ethz.ch \\\n    --output-dir $data_dir/0-fondue-output\n'' returned non-zero exit status 1.

## 2. Data export

In [None]:
# Unpack the paired-end reads artifact from the fondue output into a plain
# directory ($data_dir/0-paired_reads); its MANIFEST file is read further down.
!qiime tools export \
    --input-path $data_dir/0-fondue-output/paired_reads.qza \
    --output-path $data_dir/0-paired_reads

In [None]:
# Decompress the exported reads in place (*.fastq.gz -> *.fastq).
# Uses $data_dir, like the export cell, instead of a hard-coded "Data".
!gunzip $data_dir/0-paired_reads/*.fastq.gz

In [13]:
%%script env fondue_env="$fondue_env" data_dir="$data_dir" bash
# Export the metadata artifact from the fondue output to a plain directory.
# Expansions are quoted so that paths containing spaces keep working.
set -euo pipefail
export PATH="$fondue_env:$PATH"
"$fondue_env/qiime" tools export \
    --input-path "$data_dir/0-fondue-output/metadata.qza" \
    --output-path "$data_dir/0-exported-metadata"

Exported Data/0-fondue-output/metadata.qza as SRAMetadataDirFmt to directory Data/0-exported-metadata


In [207]:
# Load the exported SRA metadata table (tab-separated).
sra_metadata_path = f'{data_dir}/0-exported-metadata/sra-metadata.tsv'
metadata = pd.read_csv(sra_metadata_path, sep='\t')

In [208]:
# Count how many rows carry each sample description.
sample_descriptions = metadata['Description [sample]']
sample_descriptions.value_counts()

rumen fluid grass     6
solid phase hay       6
liquid phase grass    6
liquid phase corn     6
solid phase corn      6
rumen fluid corn      6
liquid phase hay      6
rumen fluid hay       6
solid phase grass     6
Name: Description [sample], dtype: int64

In [209]:
# Split each description at its LAST space, so "rumen fluid corn" becomes
# Phase="rumen fluid" and Diets="corn".
description_parts = metadata['Description [sample]'].str.rsplit(" ", n=1, expand=True)
metadata[['Phase', "Diets"]] = description_parts

# Keep only the run ID and the two derived columns.
metadata = metadata.loc[:, ['ID', 'Phase', 'Diets']]
metadata.head()

Unnamed: 0,ID,Phase,Diets
0,ERR1842970,rumen fluid,corn
1,ERR1842971,rumen fluid,corn
2,ERR1842972,rumen fluid,corn
3,ERR1842973,rumen fluid,grass
4,ERR1842974,rumen fluid,grass


In [26]:
# Positional split: first 27 rows -> bacterial set, next 27 rows -> archaeal set.
# NOTE(review): this relies on the row order of `metadata` matching the order
# of the manifest split further down — confirm that both are sorted by run ID.
metadata_bac = metadata.iloc[:27]
metadata_arc = metadata.iloc[27:54]

In [30]:
# Persist both metadata subsets as tab-separated files without the index column.
bac_metadata_path = f'{data_dir}/0-metadata_bac.tsv'
arc_metadata_path = f'{data_dir}/0-metadata_arc.tsv'
metadata_bac.to_csv(bac_metadata_path, sep='\t', index=False)
metadata_arc.to_csv(arc_metadata_path, sep='\t', index=False)

## 3. Data import

In [215]:
# Read the MANIFEST file written by `qiime tools export` into the reads directory.
manifest_path = f'{data_dir}/0-paired_reads/MANIFEST'
manifest = pd.read_csv(manifest_path)

In [216]:
# Turn the bare file names into "$PWD/<data_dir>/0-paired_reads/<name>.fastq".
# The files were gunzipped earlier, so the trailing ".gz" is dropped; stripping
# the suffix explicitly (instead of cutting at character 31) keeps this correct
# for accession IDs of any length. `data_dir` replaces the hard-coded "Data".
manifest['filename'] = (
    f'$PWD/{data_dir}/0-paired_reads/'
    + manifest['filename'].str.replace(r'\.gz$', '', regex=True)
)

In [217]:
# Reshape from one row per (sample, direction) to one row per sample, with one
# column per value of `direction` holding the corresponding file path.
manifest = manifest.pivot(index=['sample-id'], columns='direction', values='filename')

In [218]:
# Move `sample-id` from the index back into a regular column.
manifest = manifest.reset_index()

In [221]:
# Rename the direction columns to the headers used in the manifest file
# that is imported below.
manifest_headers = {
    "forward": "forward-absolute-filepath",
    "reverse": "reverse-absolute-filepath",
}
manifest = manifest.rename(columns=manifest_headers)

In [232]:
# Positional split mirroring the metadata split: rows 0-26 -> bacterial set,
# rows 27-53 -> archaeal set.
# NOTE(review): assumes the manifest rows are in the same order as `metadata` — confirm.
manifest_bac = manifest.iloc[:27]
manifest_arc = manifest.iloc[27:54]

In [239]:
# Write both manifests as tab-separated files without the index column.
tsv_options = {'sep': '\t', 'index': False}
manifest_bac.to_csv(f'{data_dir}/0-manifest_bac', **tsv_options)
manifest_arc.to_csv(f'{data_dir}/0-manifest_arc', **tsv_options)

In [235]:
# Peek at the first lines of the archaeal manifest. The option is placed before
# the file operand, because only GNU head accepts options after it.
!head -n 3 $data_dir/0-manifest_arc

sample-id	forward-absolute-filepath	reverse-absolute-filepath
ERR1842997	$PWD/Data/0-paired_reads/ERR1842997_00_L001_R1_001.fastq	$PWD/Data/0-paired_reads/ERR1842997_00_L001_R2_001.fastq
ERR1842998	$PWD/Data/0-paired_reads/ERR1842998_00_L001_R1_001.fastq	$PWD/Data/0-paired_reads/ERR1842998_00_L001_R2_001.fastq


In [240]:
# Import the archaeal reads listed in the manifest as a paired-end artifact.
# Paths use $data_dir (defined in the setup cell) instead of a hard-coded "Data".
!qiime tools import \
    --type "SampleData[PairedEndSequencesWithQuality]" \
    --input-format PairedEndFastqManifestPhred33V2 \
    --input-path $data_dir/0-manifest_arc \
    --output-path $data_dir/1-seqs_arc.qza

[32mImported Data/0-manifest_arc as PairedEndFastqManifestPhred33V2 to Data/1-seqs_arc.qza[0m
[0m

In [241]:
# Import the bacterial reads listed in the manifest as a paired-end artifact.
# Paths use $data_dir (defined in the setup cell) instead of a hard-coded "Data".
!qiime tools import \
    --type "SampleData[PairedEndSequencesWithQuality]" \
    --input-format PairedEndFastqManifestPhred33V2 \
    --input-path $data_dir/0-manifest_bac \
    --output-path $data_dir/1-seqs_bac.qza

[32mImported Data/0-manifest_bac as PairedEndFastqManifestPhred33V2 to Data/1-seqs_bac.qza[0m
[0m

## 4. Denoising-Bacteria

In [247]:
# Build the demux summary visualization for the bacterial reads and show it inline.
# Paths use $data_dir / $results_dir from the setup cell instead of hard-coded names.
!qiime demux summarize \
    --i-data $data_dir/1-seqs_bac.qza \
    --o-visualization $results_dir/1-seqs_bac.qzv

Visualization.load(f'{results_dir}/1-seqs_bac.qzv')

In [253]:
# Denoise the bacterial paired-end reads with DADA2; writes the feature table,
# the representative sequences and the denoising statistics to $data_dir.
# NOTE(review): the trim-left / trunc-len values were presumably chosen from the
# demux summary above — confirm and record the rationale.
!qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_dir/1-seqs_bac.qza \
    --p-trim-left-f 28 \
    --p-trim-left-r 19 \
    --p-trunc-len-f 207 \
    --p-trunc-len-r 199 \
    --p-n-threads 3 \
    --o-table $data_dir/1-feature-table_bac.qza \
    --o-representative-sequences $data_dir/1-rep-seqs_bac.qza \
    --o-denoising-stats $data_dir/1-dada2-stats_bac.qza

[32mSaved FeatureTable[Frequency] to: Data/1-feature-table_bac.qza[0m
[32mSaved FeatureData[Sequence] to: Data/1-rep-seqs_bac.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/1-dada2-stats_bac.qza[0m
[0m

In [1]:
# Summarize the bacterial feature table together with its sample metadata.
# Paths use $data_dir / $results_dir from the setup cell instead of hard-coded names.
!qiime feature-table summarize \
    --i-table $data_dir/1-feature-table_bac.qza \
    --m-sample-metadata-file $data_dir/0-metadata_bac.tsv \
    --o-visualization $results_dir/1-feature-table_bac.qzv

[32mSaved Visualization to: Results/1-feature-table_bac.qzv[0m
[0m

NameError: name 'Visualization' is not defined

In [3]:
# Display the bacterial feature-table summary inline. Requires the setup cell to
# have been run in this kernel (it provides `Visualization` and `results_dir`).
Visualization.load(f'{results_dir}/1-feature-table_bac.qzv')

In [4]:
# Tabulate the DADA2 denoising statistics of the bacterial run and show them inline.
# Paths use $data_dir / $results_dir from the setup cell instead of hard-coded names.
!qiime metadata tabulate \
    --m-input-file $data_dir/1-dada2-stats_bac.qza \
    --o-visualization $results_dir/1-dada2-stats_bac.qzv

Visualization.load(f'{results_dir}/1-dada2-stats_bac.qzv')

[32mSaved Visualization to: Results/1-dada2-stats_bac.qzv[0m
[0m

## 5. Denoising-Archaea

In [252]:
# Build the demux summary visualization for the archaeal reads and show it inline.
# Paths use $data_dir / $results_dir from the setup cell instead of hard-coded names.
!qiime demux summarize \
    --i-data $data_dir/1-seqs_arc.qza \
    --o-visualization $results_dir/1-seqs_arc.qzv

Visualization.load(f'{results_dir}/1-seqs_arc.qzv')

[32mSaved Visualization to: Results/1-seqs_arc.qzv[0m
[0m

In [9]:
# Denoise the archaeal paired-end reads with DADA2; writes the feature table,
# the representative sequences and the denoising statistics to $data_dir.
# NOTE(review): the trim-left / trunc-len values differ from the bacterial run and
# were presumably chosen from the archaeal demux summary — confirm and record why.
!qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $data_dir/1-seqs_arc.qza \
    --p-trim-left-f 25 \
    --p-trim-left-r 20 \
    --p-trunc-len-f 247 \
    --p-trunc-len-r 200 \
    --p-n-threads 3 \
    --o-table $data_dir/1-feature-table_arc.qza \
    --o-representative-sequences $data_dir/1-rep-seqs_arc.qza \
    --o-denoising-stats $data_dir/1-dada2-stats_arc.qza

[32mSaved FeatureTable[Frequency] to: Data/1-feature-table_arc.qza[0m
[32mSaved FeatureData[Sequence] to: Data/1-rep-seqs_arc.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/1-dada2-stats_arc.qza[0m
[0m

In [2]:
# Summarize the archaeal feature table together with its sample metadata and
# show the result inline.
# Paths use $data_dir / $results_dir from the setup cell instead of hard-coded names.
!qiime feature-table summarize \
    --i-table $data_dir/1-feature-table_arc.qza \
    --m-sample-metadata-file $data_dir/0-metadata_arc.tsv \
    --o-visualization $results_dir/1-feature-table_arc.qzv

Visualization.load(f'{results_dir}/1-feature-table_arc.qzv')

[32mSaved Visualization to: Results/1-feature-table_arc.qzv[0m
[0m

In [1]:
# Tabulate the DADA2 denoising statistics of the archaeal run and show them inline.
# Requires the setup cell to have been run in this kernel (it provides
# `Visualization`, `data_dir` and `results_dir`).
!qiime metadata tabulate \
    --m-input-file $data_dir/1-dada2-stats_arc.qza \
    --o-visualization $results_dir/1-dada2-stats_arc.qzv

Visualization.load(f'{results_dir}/1-dada2-stats_arc.qzv')

[32mSaved Visualization to: Results/1-dada2-stats_arc.qzv[0m
[0m

NameError: name 'Visualization' is not defined