# Step 0: Fetching data

In [1]:
# importing all required packages and setting data directory
import os
import qiime2
from qiime2 import Visualization
import pandas as pd


# Directory that holds everything downloaded or derived in this notebook.
# Kept as a plain string because it is interpolated into shell commands
# (`$data_dir`) and f-strings further down.
data_dir = 'RawData'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(data_dir, exist_ok=True)

## Creating a TSV file with the study ID `PRJEB19491` and converting it to a QIIME 2 artifact

In [2]:
# Study ID PRJEB19491
# Written from Python instead of `echo -e`, whose escape handling differs
# between shells; the file content (header line + one accession) is the same.
with open(f'{data_dir}/study-id.tsv', 'w') as handle:
    handle.write('id\nPRJEB19491\n')

In [3]:
# Import the study-id TSV as a QIIME 2 artifact of type NCBIAccessionIDs.
# NOTE(review): the qiime executable is addressed by an absolute,
# machine-specific path into the `fondue` conda environment.
!/home/jovyan/.conda/envs/fondue/bin/qiime tools import \
      --type NCBIAccessionIDs \
      --input-path $data_dir/study-id.tsv \
      --output-path $data_dir/study-id.qza

[32mImported RawData/study-id.tsv as NCBIAccessionIDsDirFmt to RawData/study-id.qza[0m


## Fetch sequence and metadata using `q2-fondue`

Running the fondue command from this notebook raised errors, so this step was run in a terminal instead. **TODO:** debug this later.

In [4]:
# Intentionally left commented out: this command was run in a terminal rather
# than from the notebook. Kept here as a record of how the data under
# $data_dir/fondue-output was fetched.
# !/home/jovyan/.conda/envs/fondue/bin/qiime fondue get-all \
#       --i-accession-ids $data_dir/study-id.qza \
#       --p-email kai.wang@usys.ethz.ch \
#       --output-dir $data_dir/fondue-output

## Checking downloaded data

### A) Checking if there are samples that failed to download

In [5]:
# Export failed_runs.qza into the fondue-output directory so that the
# resulting table can be read with pandas in the next cell.
!/home/jovyan/.conda/envs/fondue/bin/qiime tools export \
      --input-path $data_dir/fondue-output/failed_runs.qza \
      --output-path $data_dir/fondue-output/

[32mExported RawData/fondue-output/failed_runs.qza as SRAFailedIDsDirFmt to directory RawData/fondue-output/[0m


In [6]:
# Preview the first rows of the exported failed-run table.
failed_runs = pd.read_csv(f'{data_dir}/fondue-output/sra-failed-ids.tsv', sep='\t')
failed_runs.head(6)

Unnamed: 0,ID,Error message


#### 🚩 All samples were downloaded successfully.

### B) Exploring the sequencing data a little bit

In [7]:
# Build a demux summary visualization for the paired reads artifact.
# NOTE(review): these two commands use `qiime` from PATH, whereas the other
# cells call the executable inside the `fondue` conda environment.
!qiime demux summarize \
      --i-data $data_dir/fondue-output/paired_reads.qza \
      --o-visualization $data_dir/fondue-output/paired_reads.qzv

# Same summary for the single reads artifact.
!qiime demux summarize \
      --i-data $data_dir/fondue-output/single_reads.qza \
      --o-visualization $data_dir/fondue-output/single_reads.qzv

[32mSaved Visualization to: RawData/fondue-output/paired_reads.qzv[0m
[0m[32mSaved Visualization to: RawData/fondue-output/single_reads.qzv[0m
[0m

In [8]:
# Load the paired-reads summary and display it as the cell output.
paired_summary = Visualization.load(f'{data_dir}/fondue-output/paired_reads.qzv')
paired_summary

In [9]:
# Load the single-reads summary and display it as the cell output.
single_summary = Visualization.load(f'{data_dir}/fondue-output/single_reads.qzv')
single_summary

#### 😏 As expected, there were no single-end sequencing data.

    Copy the paired sequences and their visualization to the parent folder for the convenience of downstream analysis.

In [10]:
# Copy the paired-reads artifact and its visualization up into the data
# directory. Uses $data_dir, like the other shell cells, so that changing the
# data directory in the first cell is enough.
!cp $data_dir/fondue-output/paired_reads* $data_dir/

### C) Exploring and modifying metadata

In [12]:
# Export metadata.qza into the fondue-output directory so that the resulting
# table can be read with pandas in the next cell.
!/home/jovyan/.conda/envs/fondue/bin/qiime tools export \
      --input-path $data_dir/fondue-output/metadata.qza \
      --output-path $data_dir/fondue-output/

[32mExported RawData/fondue-output/metadata.qza as SRAMetadataDirFmt to directory RawData/fondue-output/[0m


In [13]:
# Read the exported SRA metadata table; the path is built from data_dir,
# consistent with the other cells, instead of a hard-coded 'RawData'.
metadata = pd.read_csv(f'{data_dir}/fondue-output/sra-metadata.tsv', sep='\t')

In [14]:
# List the columns available in the metadata table.
metadata.columns

Index(['ID', 'Experiment ID', 'Biosample ID', 'Bioproject ID', 'Study ID',
       'Sample Accession', 'Organism', 'Library Source', 'Library Layout',
       'Library Selection', 'Instrument', 'Platform', 'Bases', 'Spots',
       'Avg Spot Len', 'Bytes', 'Public', 'Alias [sample]', 'Center Name',
       'Description [sample]', 'Ena checklist [sample]',
       'Ena-first-public [run]', 'Ena-first-public [study]',
       'Ena-last-update [run]', 'Ena-last-update [study]',
       'Insdc center name [sample]', 'Insdc first public [sample]',
       'Insdc last update [sample]', 'Insdc status [sample]', 'Library Name',
       'Name', 'Sample name [sample]', 'Sra accession [sample]', 'Tax ID',
       'Title', 'Title [sample]'],
      dtype='object')

#### 🚩 `Description [sample]` is the only column of interest. We split it into two columns, `Phase` and `Diets`, and save the new metadata.

In [15]:
# Count how many rows carry each distinct sample description.
metadata['Description [sample]'].value_counts()

rumen fluid hay       6
liquid phase hay      6
solid phase grass     6
liquid phase corn     6
rumen fluid grass     6
liquid phase grass    6
solid phase corn      6
rumen fluid corn      6
solid phase hay       6
Name: Description [sample], dtype: int64

In [16]:
# Split on the last space only, e.g. "rumen fluid hay" -> ("rumen fluid", "hay").
metadata[['Phase', 'Diets']] = (
    metadata['Description [sample]']
    .str.rsplit(' ', n=1, expand=True)
)

In [17]:
# Keep only the run ID and the Phase / Diets columns.
metadata = metadata.loc[:, ['ID', 'Phase', 'Diets']]

In [18]:
# Save the trimmed metadata into the data directory.
# index=False keeps the pandas row index out of the file; without it the TSV
# gets an unnamed first column of 0..N-1 ahead of `ID`.
# NOTE(review): QIIME 2 expects the identifier column first — confirm that
# downstream steps read this file as QIIME 2 metadata.
metadata.to_csv(f'{data_dir}/metadata.tsv', sep='\t', index=False)

## 🥳🥳🥳 Data is ready! Go SuperGrass!