In [1]:
!pip install jsonapi_client


Defaulting to user installation because normal site-packages is not writeable


In [2]:
from jsonapi_client import Session
import pandas as pd


###################
# Definie Functions
###################

# retuns list of id`s for all Studies to Biom
def GetBiomStudyIds(biom_id: str):
    with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
        
        biomes_dfs = []
        for r in mgnify.iterate(f'biomes/{biom_id}/studies'):
            biom_df = pd.json_normalize(r.json)
            biom_df['url'] = str(r.links.self)
            biomes_dfs.append(biom_df)
        
    main_biomes_df = pd.concat(biomes_dfs)

    main_biomes_df = main_biomes_df.dropna()

    return main_biomes_df['id'].to_list()


# returns List of analysis ids for list of studies
def GetAnalysesIdsOfStudies(study_ids : list[str]):
    biomes_dfs = []
    for study in study_ids:
        with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
            for r in mgnify.iterate(f'studies/{study}/analyses'):
                biom_df = pd.json_normalize(r.json)
                biom_df['url'] = str(r.links.self)
                biomes_dfs.append(biom_df)

                # only take first studie

                break
        
    biomes_dfs = pd.concat(biomes_dfs)['id'].to_list()
    return biomes_dfs



In [3]:
import pandas as pd
import numpy as np
#########################
# normalization functions
#########################


# norms in tool: vst, rlog, tmm, RLE, Upperquartile

# => look if possible to implement / else: try to implement  with tool and use other normalisation here.

# maybe try  Min-Max scaling, Z-score normalization, decimal scaling, and log transformation

#df = pd.read_csv('outputs/collection/(root:Environmental:Aquatic:Marine)MGYS00006028.tsv',sep='\t')

#returns normalized df
def NormalizeDf(_inputDF, method):
    
    if(method == "softmax"):
        return SoftmaxNormalization(_inputDF)
    elif(method == "standart"):
        return StandartNormalization(_inputDF)
    elif(method == "log"):
        return LogNormalization(_inputDF)
    elif(method == "log10"):
        return Log10Normalization(_inputDF)
    elif(method == "sigmoid"):
        return SigmoidNormalization(_inputDF)
    elif(method == "quantile"):
        return quantile_normalize(_inputDF)
    else:
        print(f"no normalization method found with the name {method}")
        return _inputDF # return unput df to prevent crashes

def StandartNormalization(_inputDF):
    for col in _inputDF.columns.tolist():
        if(col in ['superkingdom','kingdom','phylum']):
            continue
        _divider = sum(_inputDF[col])
        _inputDF[col] = _inputDF[col] / _divider
    return _inputDF
        ### normalization part

def SoftmaxNormalization(_inputDF):
    _inputDF = StandartNormalization(_inputDF)
    _inputDF.replace(0.0,np.nan)
    numeric_columns = _inputDF.select_dtypes(include=[np.number]).columns

    for col in numeric_columns:
        _inputDF[col] = np.exp(_inputDF[col]) / np.sum(np.exp(_inputDF[col]))
        pass

    return _inputDF


def LogNormalization(_inputDF):
    numeric_columns = _inputDF.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        _inputDF[col] = _inputDF[col].apply(np.log)
    return _inputDF.replace(-np.inf, 0)

def Log10Normalization(_inputDF):
    numeric_columns = _inputDF.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        _inputDF[col] = _inputDF[col].apply(np.log10)
    return _inputDF.replace(-np.inf, 0)

def SigmoidNormalization(_inputDF):
    _inputDF = StandartNormalization(_inputDF)
    def sigmoid(x):
        return ((1 / (1 + np.exp(-x))) - .5) * 2    # -.5 and time to so we get values between 0 and 1
    _inputDF = _inputDF.apply(lambda x: sigmoid(x) if np.issubdtype(x.dtype, np.number) else x)

    return _inputDF


# _inputDF.apply(lambda x: (((1 / (1 + np.exp(-x))) - .5) * 2)  if np.issubdtype(x.dtype, np.number) else x)

# ??
def quantile_normalize(df):
    normalized_df = df.copy()
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    ranks = df[numeric_columns].rank(method='average')
    mean_ranks = ranks.mean(axis=1)
    
    sorted_ranks = mean_ranks.sort_values()
    quantiles = sorted_ranks / len(sorted_ranks)
    #return quantiles
    for column in numeric_columns:
        if 0 in df[column].values:
            df[column].replace(0, np.nan, inplace=True)
            sorted_column = df[column].sort_values()
            normalized_df[column] = np.interp(df[column], sorted_column, quantiles)
            normalized_df[column].fillna(0, inplace=True)
        else:
            sorted_column = df[column].sort_values()
            normalized_df[column] = np.interp(df[column], sorted_column, quantiles)

    return normalized_df
# testing
"""
df = pd.read_csv('test/testForNorm/norm_test_1.tsv',sep='\t')

df2 = df.copy()

from datetime import datetime

t1 = datetime.now()

df = quantile_normalize(df)
"""
#df.to_csv("test_soft_1.tsv",sep='\t')

#t2 = datetime.now()

#df2 = quantile_normalize(df2)

#df2.to_csv("test_soft_2.tsv",sep='\t')

#t3 = datetime.now()

#print(f'{(t2-t1).total_seconds()},{(t3-t2).total_seconds()}')

#df.to_csv('outputs/collection/root:Environmental:Terrestrial:Soil_20_2_softmax.tsv',sep='\t')
df

FileNotFoundError: [Errno 2] No such file or directory: 'test/testForNorm/norm_test_1.tsv'

In [None]:
from jsonapi_client import Session
import pandas as pd
import requests
import os

# root:Environmental:Aquatic:Marine
#"root:Environmental:Terrestrial:Soil"
biom_ids = ["root:Environmental:Terrestrial:Soil","root:Environmental:Aquatic:Marine"]

normalization_methods = ['softmax','log','sigmoid','quantile']

phylumDepths = [1,2]

max_studies = 20

for biom_id in biom_ids:

    #########################################
    # Get All Studies for Biom
    #########################################

    study_ids = GetBiomStudyIds(biom_id)
    print(f'found {len(study_ids)} studies for {biom_id}')
    #########################################
    # Create outputFolder
    #########################################

    data_tsvs = []


    #########################################
    # remove Studies IDS wich result in dying Kernel

    # maybe only use ver 5.0 for smaller files?
    #########################################
    
    remove_ids = ["MGYS00003194"]
    for id in remove_ids:
        if(id in study_ids):
            study_ids.remove(id)
    

    data_output_folder = 'outputs/collection'
    os.makedirs(data_output_folder, exist_ok=True)
    """
    if(len(study_ids) > max_studies):
        study_ids = study_ids[:max_studies]
    """
    found_studies = 0

    for study_id in study_ids:
        if(found_studies == max_studies):
            break
        print(f"start {study_id}")
        #########################################
        # Get all downloads for one study
        #########################################
        with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
            
            dfs = []
            for r in mgnify.iterate(f'studies/{study_id}/downloads'):
                df = pd.json_normalize(r.json)
                df['url'] = str(r.links.self)
                dfs.append(df)

        # if no data for studie is found / studie has no data
        if(dfs == []):
            continue
        main_df = pd.concat(dfs)

        # print(main_df)
        
        ##########################
        # get specific data url
        #########################



        data_type = "TSV"
        data_label = "Phylum level taxonomies SSU (TSV)"#"Phylum level taxonomies LSU (TSV)"
        pipeline_version = "5.0"


        c1 = main_df["attributes.file-format.name"] == data_type
        c2 = main_df["attributes.description.description"] == data_label
        c3 = main_df["relationships.pipeline.data.id"] == pipeline_version

        url_df = main_df.loc[(c1 & c2 & c3), "url"] #just added pipeline version
        if(len(url_df.index) > 0):
            url = url_df.iloc[0]
        else:
            print(f"Found no {data_type} with description {data_label} for {study_id}")
            continue

        ##########################
        # download the data
        #########################

        response = requests.get(url)

        if not response:
            print(f"Could not download file for {study_id}, got response: {response.status_code}")
            continue # break brakes whole loop

        found_studies += 1

        print(url)

        data_output_path = os.path.join(data_output_folder, f"({biom_id}){study_id}.tsv")
        with open(data_output_path, "w") as f:
            f.write(response.text)

        data_tsv = pd.read_table(data_output_path, sep="\t")

        # open tsv header
        #header_row = data_tsv.iloc[0,:].tolist()
        if("superkingdom" not in data_tsv.columns):
            data_tsv.insert(0,"superkingdom",'Unassigned')
        data_tsv = data_tsv.set_index(['superkingdom','kingdom','phylum'])

        data_tsvs.append(data_tsv)

    print(f'found {found_studies} for {biom_id}')

    total_tsv = pd.concat(data_tsvs,axis=1)
    total_tsv = total_tsv.fillna(0)

    #drop unassinged at pylum level for normalization
    if('phylum' in total_tsv.columns):
        total_tsv.drop(total_tsv[total_tsv['phylum'] == 'Unassigned'].index, inplace = True)

    #save first go
    data_output_path = os.path.join(data_output_folder, f"{biom_id}_{found_studies}_-1_none.tsv")

    total_tsv.to_csv(data_output_path, sep="\t")



    # read without index so we can sum easely
    total_tsv = pd.read_table(data_output_path, sep="\t")

    clean_output_path = data_output_path


    for normalization_method in normalization_methods:
        base_total_tsv = pd.read_table(clean_output_path,sep='\t',index_col=1)                  # load tsv with no norm
        print(f'start {normalization_method}')
        base_total_tsv = NormalizeDf(base_total_tsv,normalization_method)
        data_output_path = os.path.join(data_output_folder, f"{biom_id}_{found_studies}_-1_{normalization_method}.tsv")
        base_total_tsv.to_csv(data_output_path, sep="\t")                           # save as normed tsv

    ###############################
    # colapse rows to specifig depth (depth on phylum level (ignoring Kingdom and Superkingdom))
    ###############################
    for index_depth in phylumDepths:
        index_seperator = '_'
        # Load tsv with no norm
        total_tsv = pd.read_table(clean_output_path, sep="\t")

        index_row = total_tsv.iloc[:,2].tolist()

        for i in range(len(index_row)):
            index = index_row[i].split(index_seperator)
            current_index = []
            if(len(index) < index_depth):
                current_index = index
            else:
                current_index = index[:index_depth]
            index_row[i] = index_seperator.join(current_index)
        
        total_tsv.iloc[:,2] = index_row

        total_tsv = total_tsv.groupby(["superkingdom","kingdom","phylum"]).sum()

        # drop unassinged in phylumn


        
        # save table of phylum level
        data_output_path = os.path.join(data_output_folder, f"{biom_id}_{found_studies}_{index_depth}_none.tsv")
        total_tsv.to_csv(data_output_path, sep="\t")


        clean_output_path_phylum = data_output_path
        # normalize table

        for normalization_method in normalization_methods:
            total_tsv = pd.read_table(clean_output_path_phylum,sep='\t',index_col=1)
            print(f'start {normalization_method}')
            total_tsv = NormalizeDf(total_tsv,normalization_method)
            data_output_path = os.path.join(data_output_folder, f"{biom_id}_{found_studies}_{index_depth}_{normalization_method}.tsv")
            total_tsv.to_csv(data_output_path, sep="\t")

        # add to galaxy (manually)
        # put(data_output_path)             #remove coment for Galaxy (used to export from notebook to galaxy)

main_df

: 