In [39]:
!pip install jsonapi_client

Defaulting to user installation because normal site-packages is not writeable


In [40]:
# Biome lineages (MGnify notation) whose samples are collected below.
inputBioms = [
    "root:Host-associated:Human:Digestive system:Oral:Subgingival plaque",
    "root:Host-associated:Human:Digestive system:Oral:Supragingival plaque",
]

# Name of this run; used as a sub-folder of outputs/collection/.
runName = "run_01"

# Number of samples after which GetBiomSamplesByIds stops fetching, per biome.
max_sample_count = 10

In [41]:
from jsonapi_client import Session
import pandas as pd
import requests
import os

###################
# Define Functions
###################

########################################################
# input: biom string from MGnify
# https://www.ebi.ac.uk/metagenomics/browse/biomes/
# example: GetBiomSamplesByIds("root:Engineered:Bioreactor")
#
#
# output: df with the first <max_sample_count> samples of the biome
########################################################
def GetBiomSamplesByIds(biom_id: str):
    """Fetch the first samples of an MGnify biome as a DataFrame.

    Parameters
    ----------
    biom_id : str
        Biome lineage as used by MGnify, e.g. "root:Engineered:Bioreactor".

    Returns
    -------
    pandas.DataFrame
        One row per sample: the sample's JSON flattened with
        ``pd.json_normalize`` plus a ``url`` column holding the resource's
        self link. The per-sample frames are concatenated as they are, so
        the row index is not renumbered.

    Raises
    ------
    ValueError
        If the biome has no samples. ``pd.concat`` would raise the same
        exception type for an empty list, but without saying which biome
        was affected.

    Notes
    -----
    Fetching stops once the number of collected samples equals the
    module-level ``max_sample_count``. The comparison is an equality test,
    so a value that is never reached (e.g. 0 or a negative number) means
    every sample is fetched.
    """
    sample_rows = []
    with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
        samples = mgnify.iterate(f'biomes/{biom_id}/samples')
        for fetched, sample in enumerate(samples, start=1):
            row = pd.json_normalize(sample.json)
            row['url'] = str(sample.links.self)
            sample_rows.append(row)
            if fetched == max_sample_count:
                break

    if not sample_rows:
        raise ValueError(f"No samples found for biome '{biom_id}'")
    return pd.concat(sample_rows)

########################################################
# input: study ACCESSION from mgnify
# https://www.ebi.ac.uk/metagenomics/browse/studies
# example: GetAnalysisFromStudy("MGYS00006539")
#
#
# output: list with 2 objects
# when output[0] == None, the study has no analysis data (output[1] is None as well)
# when output[0] == 1, output[1] = df with the analysis metadata (accession, date, version, ...) for each sample in the study
########################################################

def GetAnalysisFromStudy(_study_id):
    """Collect the analyses listed for one MGnify study.

    Parameters
    ----------
    _study_id
        Study accession, inserted into the ``studies/<id>/analyses`` path.

    Returns
    -------
    list
        ``[None, None]`` when the study lists no analyses, otherwise
        ``[1, frame]`` where ``frame`` has one row per analysis (its JSON
        flattened with ``pd.json_normalize`` plus a ``url`` column with the
        resource's self link).
    """
    def _as_row(resource):
        # Flatten one API resource and remember where it came from.
        row = pd.json_normalize(resource.json)
        row['url'] = str(resource.links.self)
        return row

    with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
        analysis_rows = [
            _as_row(resource)
            for resource in mgnify.iterate(f'studies/{_study_id}/analyses')
        ]

    # A study without any analysis is signalled with a pair of Nones.
    if not analysis_rows:
        return [None, None]

    return [1, pd.concat(analysis_rows)]

In [42]:

# Descriptions of the MGnify download that holds the SSU taxonomy table, in
# order of preference. Both are tried because of different pipeline versions.
ssu_data_labels = (
    "OTUs and taxonomic assignments for SSU rRNA",
    "All reads encoding SSU rRNA",
)

for biom in inputBioms:
    df_samples = GetBiomSamplesByIds(biom)

    sample_accessions = df_samples['attributes.accession'].to_list()
    studies = df_samples['relationships.studies.data'].to_list()

    ####################
    # For each sample we go through its studies and test whether they have
    # analysis data. The data of the first study that has some is saved in
    # <study_analaysises>; studies that were already checked are not
    # requested a second time.
    ####################

    # list of df with analysis data from studies
    study_analaysises = []
    # study accessions of studies without analysis
    studies_without_analysis = []
    # study accessions of studies whose analysis data is in <study_analaysises>
    studies_Ids = []

    for study in studies:
        # The studies entry is scanned in its string form: it is split on
        # single quotes and every piece containing 'MGYS' is taken as a
        # study accession of this sample.
        s_ids_sample = [s for s in str(study).split("'") if 'MGYS' in s]

        for st in s_ids_sample:
            if st in studies_Ids:
                # we already have analysis data for this study
                break
            if st in studies_without_analysis:
                # already known to be empty, try the next study of the sample
                continue
            study_result = GetAnalysisFromStudy(st)
            if study_result[0] is None:
                # remember that this study has no analysis
                studies_without_analysis.append(st)
                continue
            # keep the analysis data and stop looking for this sample
            studies_Ids.append(st)
            study_analaysises.append(study_result[1])
            break

    if not study_analaysises:
        # pd.concat raises on an empty list; without any analysis data there
        # is nothing to download for this biome.
        print(f'no analysis data found for any study of {biom}')
        continue

    # Merge all analysis data into one large df
    study_analaysis = pd.concat(study_analaysises)

    # One entry per sample, in the order of <sample_accessions>:
    # [number of analyses, list of analysis accessions]
    analysis_accesions = []
    for sample_acc in sample_accessions:
        belongs_to_sample = study_analaysis['relationships.sample.data.id'] == sample_acc
        acc = study_analaysis.loc[belongs_to_sample, 'attributes.accession'].to_list()
        analysis_accesions.append([len(acc), acc])

    ##################################
    # For each of our samples we try to get the taxonomic assignments from
    # MGnify. If none of the analysis accessions of a sample offers a
    # matching file, the sample is skipped
    # (this happens when e.g. the version of the MGnify pipeline is too old).
    #
    # All analysis files are saved below 'outputs/collection', so that they
    # are automatically exported into Galaxy as one collection:
    # 'outputs/collection/<runName>/<biom>/<analysis-accession>-<sample-accession>.tsv'
    ##################################

    data_output_folder = f'outputs/collection/{runName}/{biom}'
    data_type = "TSV"

    # for each of our samples
    for sample_acc, analysis_sample in zip(sample_accessions, analysis_accesions):
        url = ""
        # for each analysis accession of the sample
        for analysis_accession in analysis_sample[1]:
            print(f"start grab for {analysis_accession}")

            # Get all downloads for one analysis
            with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
                dfs = []
                for r in mgnify.iterate(f'analyses/{analysis_accession}/downloads'):
                    df = pd.json_normalize(r.json)
                    df['url'] = str(r.links.self)
                    dfs.append(df)

            if not dfs:
                # analysis without any download; try the next accession
                continue

            main_df = pd.concat(dfs)

            # take the first TSV download matching one of the labels
            is_tsv = main_df["attributes.file-format.name"] == data_type
            for data_label in ssu_data_labels:
                has_label = main_df["attributes.description.description"] == data_label
                matches = main_df.loc[(is_tsv & has_label), "url"]
                if matches.size > 0:
                    url = matches.iloc[0]
                    break

            if url != "":
                # <analysis_accession> now names the analysis the file belongs to
                break

        if url == "":
            print(f'no analysis found for {analysis_sample}')
            continue

        ##########################
        # download the data
        ##########################

        os.makedirs(data_output_folder, exist_ok=True)

        response = requests.get(url)

        if not response:
            print(f"Could not download file, got response: {response.status_code}")
            # Skip only this sample; one failed download should not stop
            # the downloads of the remaining samples.
            continue

        print(url)
        data_output_path = os.path.join(data_output_folder, f"{analysis_accession}-{sample_acc}.tsv")
        with open(data_output_path, "w", encoding="utf-8") as f:
            f.write(response.text)

start grab for MGYA00661793
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661793/file/ERZ14207391_FASTA_SSU_OTU.tsv
start grab for MGYA00661792
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661792/file/ERZ14207393_FASTA_SSU_OTU.tsv
start grab for MGYA00661791
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661791/file/ERZ14207392_FASTA_SSU_OTU.tsv
start grab for MGYA00661789
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661789/file/ERZ14207394_FASTA_SSU_OTU.tsv
start grab for MGYA00661787
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661787/file/ERZ14207366_FASTA_SSU_OTU.tsv
start grab for MGYA00661785
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661785/file/ERZ14207367_FASTA_SSU_OTU.tsv
start grab for MGYA00661784
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661784/file/ERZ14207368_FASTA_SSU_OTU.tsv
start grab for MGYA00661783
https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00661783/file/ERZ14207369_FA

In [43]:
from functools import reduce

def _collapse_to_rank(count_table, rank):
    """Sum the counts of `count_table` per taxon at position `rank` of the lineage.

    `count_table` has a "#KEY" column with ';'-separated lineages and one
    count column per sample. Rows whose lineage has fewer than `rank`
    entries are dropped. In the kept labels "__" is replaced by "_taxa_".
    The input frame is not modified.
    """
    lineages = [key.split(";") for key in count_table["#KEY"]]
    deep_enough = pd.Series(
        [len(parts) >= rank for parts in lineages],
        index=count_table.index,
        dtype=bool,
    )
    collapsed = count_table.loc[deep_enough].copy()
    collapsed["#KEY"] = [
        parts[rank - 1].replace("__", "_taxa_")
        for parts in lineages
        if len(parts) >= rank
    ]
    return collapsed.groupby("#KEY").sum().reset_index()


def MergeOnTaxaLevelFunction(taxa, runName,biomName):
    """Merge the downloaded count tables of one biome and collapse them to a rank.

    Every file in ./outputs/collection/<runName>/<biomName> is read as a
    tab-separated table whose header is on the second line. The first
    column and a "taxid" column (if present) are dropped; the remaining two
    columns are taken as the counts (named after the file) and the lineage
    ("#KEY"). All tables are outer-merged on "#KEY", missing counts become 0.

    Parameters
    ----------
    taxa : str
        One of "superkingdom", "kingdom", "phylum", "class", "order",
        "family", "genus", "species", "all", "only counts".
        - a rank name: counts are summed per taxon of that rank and written
          to merged/mergedOn_<taxa>_<biomName>.tsv
        - "all": the collapsed tables for lineage positions 2 to 8
          (kingdom to species) are stacked into
          merged/mergedOn_all_<biomName>.tsv
        - "only counts": the merged table is written unchanged to
          merged/countTable_<biomName>.tsv
    runName : str
        Run folder below ./outputs/collection/.
    biomName : str
        Folder with the per-sample tables below the run folder.

    Raises
    ------
    KeyError
        If `taxa` is not one of the values listed above.

    If the biome folder contains no files, a message is printed and nothing
    is written.
    """
    merged_dir = f"./outputs/collection/{runName}/merged/"
    os.makedirs(merged_dir, exist_ok=True)

    # 1-based position of each rank inside the ';'-separated lineage.
    mappingFromTaxaToNumber = {
        "superkingdom":1,
        "kingdom":2,
        "phylum":3,
        "class":4,
        "order":5,
        "family":6,
        "genus":7,
        "species":8,
        "all" : 10,         # to prevent key error in converting rank to number
        "only counts" : 10  # only counts and all will be handled when creating the tables
    }
    rankToMerge = mappingFromTaxaToNumber[taxa]

    biome_dir = f"./outputs/collection/{runName}/{biomName}"

    tables = []
    # sorted() makes the column order of the output reproducible
    for file in sorted(os.listdir(biome_dir)):
        # Read the table and remove the columns that are not needed
        df = pd.read_csv(f"{biome_dir}/{file}",sep="\t",header=1)
        df = df.iloc[: , 1:]
        if("taxid" in df.columns):
            df = df.drop(columns=["taxid"])
        sample_column = file.split("/")[-1].replace(".tsv","")
        df.columns = [sample_column,"#KEY"]
        # lineage first, counts second
        tables.append(df[["#KEY",sample_column]])

    if not tables:
        # reduce() has nothing to merge for an empty folder
        print(f"no count tables found in {biome_dir}, nothing to merge")
        return

    # Merge all tables into one
    df = reduce(lambda df1,df2:pd.merge(df1,df2,on="#KEY",how="outer"),tables)
    df = df.fillna(0)

    if(taxa == "only counts"):
        df.to_csv(f"{merged_dir}countTable_{biomName}.tsv",sep="\t",index=False)

    if(rankToMerge < 10):
        _collapse_to_rank(df, rankToMerge).to_csv(f"{merged_dir}mergedOn_{taxa}_{biomName}.tsv",sep="\t",index=False)

    if(taxa == "all"):
        # Each rank is collapsed with its own position i. Checking the
        # lineage length against rankToMerge (10 for "all") would discard
        # every row.
        per_rank = [_collapse_to_rank(df, i) for i in range(2,9)]
        result = pd.concat(per_rank)
        result.to_csv(f"{merged_dir}mergedOn_{taxa}_{biomName}.tsv",sep="\t",index=False)


In [47]:
# Content of the run folder; listing it also fails early when the run folder
# itself does not exist.
countFoldersForRun = os.listdir(f"./outputs/collection/{runName}")

for folder in inputBioms:
    print(folder)
    if not os.path.isdir(f"./outputs/collection/{runName}/{folder}"):
        # No folder means no table was downloaded for this biome; merging
        # would fail when listing the missing folder.
        print(f"no downloaded tables found for {folder}, skipping")
        continue
    # one merged table per rank, plus the stacked table and the raw counts
    for taxa in ["superkingdom","kingdom","phylum","class","order","family","genus","species","all","only counts"]:
        MergeOnTaxaLevelFunction(taxa,runName,folder)

root:Host-associated:Human:Digestive system:Oral:Subgingival plaque
root:Host-associated:Human:Digestive system:Oral:Supragingival plaque
