In [None]:
!pip install jsonapi_client
!pip install pandas

In [None]:
# declarations:
# study_id is the MGnify study accession whose analyses are fetched further down.
# GALAXY_INPUTS is not defined in this notebook — presumably injected by the
# Galaxy environment before execution; TODO confirm.
study_id = GALAXY_INPUTS['study_id']

In [None]:
from jsonapi_client import Session
import pandas as pd
import requests
import os

###################
# Define functions
###################

########################################################
# input: study ACCESSION from MGnify
# https://www.ebi.ac.uk/metagenomics/browse/studies
# example: GetAnalysisFromStudy('MGYS00006539')
#
#
# output: list with 2 objects
# when output[0] == None, the study has no analysis data (output[1] is None as well)
# when output[0] == 1, output[1] = df with all analysis metadata (accession, date, version, ...) for each analysis in the study
########################################################

def GetAnalysisFromStudy(_study_id):
    """Collect every analysis record of one MGnify study into a DataFrame.

    Parameters
    ----------
    _study_id : str
        Study accession, interpolated into the 'studies/<id>/analyses' endpoint.

    Returns
    -------
    list
        [None, None] when the endpoint yields no analyses; otherwise
        [1, DataFrame], where the DataFrame is the concatenation of one
        flattened record per analysis plus a 'url' column holding the
        record's self link.
    """
    endpoint = f'studies/{_study_id}/analyses'
    analysis_frames = []

    with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
        for resource in mgnify.iterate(endpoint):
            # flatten the nested JSON record into a one-row frame
            frame = pd.json_normalize(resource.json)
            frame['url'] = str(resource.links.self)
            analysis_frames.append(frame)

    # an empty list means the study has no analysis data
    if not analysis_frames:
        return [None, None]

    return [1, pd.concat(analysis_frames)]

In [None]:
####################
# we get a DF with one row per analysis of the study <study_id>
####################

analysis_status, df_samples = GetAnalysisFromStudy(study_id)

# GetAnalysisFromStudy returns [None, None] for a study without analyses.
# Fail here with a clear message instead of a TypeError in a later cell
# that tries to index into None.
if analysis_status is None:
    raise ValueError(f"Study {study_id} has no analysis data on MGnify")

df_samples

In [None]:
####################################
# from <df_samples> we extract the analysis accessions of the study
#
# analysis_accesions is a flat list: one entry per row of df_samples,
# taken from the column 'attributes.accession'
####################################

analysis_accesions = df_samples['attributes.accession'].to_list()
analysis_accesions

In [None]:
##################################
# For each analysis of the study we try to get the taxonomic assignments from MGnify.
# If an analysis offers no matching download, it is skipped
# (this happens e.g. when the version of the MGnify pipeline is too old).
#
# We try "OTUs and taxonomic assignments for SSU rRNA" first and fall back to
# "All reads encoding SSU rRNA", because the label differs between pipeline versions.
#
# All analysis files are saved in 'outputs/collection', so that they are automatically
# exported into Galaxy as one collection.
#
# Analysis files are saved under 'outputs/collection/<analysis-accession>-<sample-accession>.tsv'
##################################

data_output_folder = 'outputs/collection'
data_type = "TSV"
# download labels, in order of preference
data_labels = [
    "OTUs and taxonomic assignments for SSU rRNA",
    "All reads encoding SSU rRNA",
]

# for each analysis of the study
for analysis_accession in analysis_accesions:
    print(f"start grab for {analysis_accession}")

    # Get all downloads for one analysis
    with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
        dfs = []
        for r in mgnify.iterate(f'analyses/{analysis_accession}/downloads'):
            df = pd.json_normalize(r.json)
            df['url'] = str(r.links.self)
            dfs.append(df)

    # pd.concat raises ValueError on an empty list, so an analysis without
    # any downloads is skipped explicitly
    if not dfs:
        print(f'no analysis found for {analysis_accession}')
        continue
    main_df = pd.concat(dfs)

    # pick the URL of the first TSV download whose description matches one of
    # the labels; url stays "" when none of the labels matches
    c1 = main_df["attributes.file-format.name"] == data_type
    url = ""
    for data_label in data_labels:
        c2 = main_df["attributes.description.description"] == data_label
        candidate_urls = main_df.loc[(c1 & c2), "url"]
        if candidate_urls.size > 0:
            url = candidate_urls.iloc[0]
            break

    if url == "":
        print(f'no analysis found for {analysis_accession}')
        continue

    ##########################
    # download the data
    #########################

    os.makedirs(data_output_folder, exist_ok=True)

    # a timeout keeps one stalled connection from hanging the whole notebook
    try:
        response = requests.get(url, timeout=60)
    except requests.RequestException as error:
        print(f"Could not download file, got error: {error}")
        continue

    # a Response is falsy for 4xx/5xx status codes; skip only this analysis so
    # the remaining ones are still downloaded
    if not response:
        print(f"Could not download file, got response: {response.status_code}")
        continue

    print(url)
    # sample id of the df_samples row belonging to this analysis accession
    sample_id = df_samples.loc[
        df_samples['attributes.accession'] == analysis_accession,
        'relationships.sample.data.id',
    ].iloc[0]
    data_output_path = os.path.join(data_output_folder, f"{analysis_accession}-{sample_id}.tsv")
    # write the raw bytes so the file is stored exactly as served, independent
    # of the guessed response encoding and the platform's default encoding
    with open(data_output_path, "wb") as f:
        f.write(response.content)

     