In [1]:
import json
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Study and analysis identifiers; they are substituted into the download URL
# and reused to name the exported CSV file.
STUDY_ID = "ST001142"
ANALYSIS_ID = "AN001875"

Effettua una richiesta GET all'URL

In [2]:
# Build the download URL for the configured study/analysis pair.
path = "https://www.metabolomicsworkbench.org/data/study_textformat_view.php?JSON=YES&STUDY_ID={study_id}&ANALYSIS_ID={analysis_id}"
url = path.format(study_id=STUDY_ID, analysis_id=ANALYSIS_ID)

# A timeout (seconds) keeps the notebook from hanging on an unresponsive server;
# raise_for_status() turns HTTP error codes into exceptions.
response = requests.get(url, timeout=60)
response.raise_for_status()

# The JSON text is read from the first <pre> element of the returned page.
soup = BeautifulSoup(response.text, "html.parser")
pre_tag = soup.find('pre')
if pre_tag is None:
    # Fail here instead of leaving `data` undefined (or stale from an earlier
    # run) for the cells below.
    raise ValueError(f"No <pre> element found in the response from {url}")

json_text = pre_tag.text

# Parse once; a decode failure stops the notebook with the URL in the message
# rather than being printed and ignored.
try:
    data = json.loads(json_text)
except json.JSONDecodeError as e:
    raise ValueError(f"Response from {url} does not contain valid JSON: {e}") from e


Estrai i dati da "SUBJECT_SAMPLE_FACTORS" e "MS_METABOLITE_DATA"

In [3]:
sample_factors = data.get("SUBJECT_SAMPLE_FACTORS", [])
raw_metabolite_rows = data.get("MS_METABOLITE_DATA", {}).get("Data", [])

# Map each sample ID to its factors merged with the additional sample data;
# on a key clash the additional sample data wins.
factors_data = {
    item.get("Sample ID"): {
        **item.get("Factors", {}),
        **item.get("Additional sample data", {}),
    }
    for item in sample_factors
}

# Metabolite names, in the same order as the rows of `metabolite_data`.
metabolites = [row['Metabolite'] for row in raw_metabolite_rows]

# Copy each row without its 'Metabolite' key instead of deleting the key in
# place: `data` stays untouched, so this cell can be re-run safely.
metabolite_data = [
    {key: value for key, value in row.items() if key != 'Metabolite'}
    for row in raw_metabolite_rows
]


Converti i dati estratti in DataFrame pandas e trasponili

In [4]:
# Build each frame and transpose it in one step so that rows are indexed by
# sample ID (factors) and by the keys of each metabolite row (measurements).
factors_df = pd.DataFrame(factors_data).transpose()
metabolite_df = pd.DataFrame(metabolite_data, index=metabolites).transpose()


Unisci i due DataFrame utilizzando "Sample ID" come chiave

In [5]:
# Join factors and metabolite values on their shared index; with pandas'
# default inner join only index labels present in both frames are kept.
result_df = factors_df.merge(metabolite_df, left_index=True, right_index=True)

In [7]:
# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# Write the merged table, labelling the index column "Sample ID".
result_df.to_csv(output_dir / f"{STUDY_ID}_{ANALYSIS_ID}.csv", index=True, index_label="Sample ID")