In [30]:
!pip install pymzml boto3
!pip install pyarrow
!pip install fastparquet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m33.6 MB

In [9]:
import pymzml
import gzip
import os
import zipfile
import pandas as pd
import numpy as np
import boto3
from botocore import UNSIGNED
from botocore.client import Config

In [10]:
s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))

bucket_name = 'nasa-osdr'

base_path = 'OSD-832/version-2/metabolomics/'

temp_dir = '/tmp/osd_data'
os.makedirs(temp_dir, exist_ok=True)

In [11]:
# Characterizing a single file for debugging

key = base_path + 'GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz'
filename = os.path.basename(key)
local_filepath = os.path.join(temp_dir, filename)

print(f"Downloading {filename} for inspection...")
s3_client.download_file(bucket_name, key, local_filepath)

print(f"Inspecting {filename}...")
spectra = pymzml.run.Reader(local_filepath)

spectrum_to_inspect = next(iter(spectra))
attributes = [attr for attr in dir(spectrum_to_inspect) if not attr.startswith('_')]
print(f"\nAttributes: {attributes}")
print(f"\nMS Level: {spectrum_to_inspect.ms_level}")

if len(spectrum_to_inspect.i) > 0:
    highest_mz, highest_intensity = max(zip(spectrum_to_inspect.mz, spectrum_to_inspect.i), key=lambda item: item[1])
    print(f"Highest Peak (m/z, Intensity): ({highest_mz}, {highest_intensity})")
else:
    print("Highest Peak (m/z, Intensity): (Spectrum is empty)")

print(f"Total number of peaks in this spectrum: {len(spectrum_to_inspect.i)}")
print(f"Scan Time (Retention Time): {spectrum_to_inspect.scan_time_in_minutes()} minutes")
print(f"Number of spectra found: {len(list(spectra))}")
print(spectra)

os.remove(local_filepath)

Downloading GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz for inspection...
Inspecting GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz...


MS Level: 1
Highest Peak (m/z, Intensity): (141.95888723059318, 17880.0)
Total number of peaks in this spectrum: 2347
Scan Time (Retention Time): 0.002083333333 minutes
Number of spectra found: 17539
<pymzml.run.Reader object at 0x74b421e72780>


In [12]:
results = []
metadata_key = 'OSD-832/version-2/metadata/OSD-832_metadata_OSD-832-ISA.zip'
local_zip_path = os.path.join(temp_dir, 'metadata.zip')

try:
    print(f"Downloading {os.path.basename(metadata_key)}...")
    s3_client.download_file(bucket_name, metadata_key, local_zip_path)
    print("Download complete.")

    print("Unzipping metadata file...")
    with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)
        list_of_files = zip_ref.namelist()
        print(f"\nList of zip files:")
        print(f"{list_of_files}\n")
    print("Unzip complete.")

    unzipped_files = os.listdir(temp_dir)
    meta_files = [f for f in unzipped_files if f.startswith('s_') and f.endswith('.txt')]

    if meta_files:
        meta_filename = meta_files[0]
        local_metadata_path = os.path.join(temp_dir, meta_filename)
        print(f"Found sample metadata file: {meta_filename}")

        df_meta = pd.read_csv(local_metadata_path, sep='\t')

        print(df_meta.columns.tolist())
        print("\nSuccessfully loaded metadata. Displaying first 5 rows:")
        display(df_meta.head())
        
    else:
        print("Error: Could not load sample metadata file.")

except Exception as e:
    print(f"An error occurred: {e}")

Downloading OSD-832_metadata_OSD-832-ISA.zip...
Download complete.
Unzipping metadata file...

List of zip files:
['i_Investigation.txt', 's_OSD-832.txt', 'a_OSD-832_metabolite-profiling_mass-spectrometry_TripleTOF 5600(AB Sciex).txt']

Unzip complete.
Found sample metadata file: s_OSD-832.txt
['Source Name', 'Sample Name', 'Characteristics[Organism]', 'Term Source REF', 'Term Accession Number', 'Characteristics[Strain]', 'Term Source REF.1', 'Term Accession Number.1', 'Characteristics[Animal Source]', 'Characteristics[Genotype]', 'Term Source REF.2', 'Term Accession Number.2', 'Characteristics[Sex]', 'Term Source REF.3', 'Term Accession Number.3', 'Factor Value[Hindlimb Unloading]', 'Factor Value[Ionizing Radiation]', 'Term Source REF.4', 'Term Accession Number.4', 'Characteristics[Age at Irradiation]', 'Unit', 'Term Source REF.5', 'Term Accession Number.5', 'Characteristics[Material Type]', 'Term Source REF.6', 'Term Accession Number.6', 'Protocol REF', 'Parameter Value[Diet]', 'Para

Unnamed: 0,Source Name,Sample Name,Characteristics[Organism],Term Source REF,Term Accession Number,Characteristics[Strain],Term Source REF.1,Term Accession Number.1,Characteristics[Animal Source],Characteristics[Genotype],...,Term Accession Number.11,Parameter Value[absorbed radiation dose rate],Protocol REF.2,Parameter Value[Euthanasia Method],Term Source REF.12,Term Accession Number.12,Parameter Value[Sample Storage Temperature],Unit.3,Term Source REF.13,Term Accession Number.13
0,9394,9394 plasma,Rattus norvegicus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,WAG/Rij/Cmcr,OSD,https://osdr.nasa.gov/,Medical College of Wisconsin,Wild Type,...,http://purl.obolibrary.org/obo/NCIT_C48660,Not Applicable,sample collection,Guillotine,SNOMEDCT,http://purl.bioontology.org/ontology/SNOMEDCT/...,-80.0,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027
1,9395,9395 plasma,Rattus norvegicus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,WAG/Rij/Cmcr,OSD,https://osdr.nasa.gov/,Medical College of Wisconsin,Wild Type,...,http://purl.obolibrary.org/obo/NCIT_C48660,Not Applicable,sample collection,Guillotine,SNOMEDCT,http://purl.bioontology.org/ontology/SNOMEDCT/...,-80.0,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027
2,9396,9396 plasma,Rattus norvegicus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,WAG/Rij/Cmcr,OSD,https://osdr.nasa.gov/,Medical College of Wisconsin,Wild Type,...,http://purl.obolibrary.org/obo/NCIT_C48660,Not Applicable,sample collection,Guillotine,SNOMEDCT,http://purl.bioontology.org/ontology/SNOMEDCT/...,-80.0,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027
3,9397,9397 plasma,Rattus norvegicus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,WAG/Rij/Cmcr,OSD,https://osdr.nasa.gov/,Medical College of Wisconsin,Wild Type,...,http://purl.obolibrary.org/obo/NCIT_C48660,Not Applicable,sample collection,Guillotine,SNOMEDCT,http://purl.bioontology.org/ontology/SNOMEDCT/...,-80.0,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027
4,9398,9398 plasma,Rattus norvegicus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,WAG/Rij/Cmcr,OSD,https://osdr.nasa.gov/,Medical College of Wisconsin,Wild Type,...,http://purl.obolibrary.org/obo/NCIT_C48660,Not Applicable,sample collection,Guillotine,SNOMEDCT,http://purl.bioontology.org/ontology/SNOMEDCT/...,-80.0,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027


In [13]:
assay_files = [f for f in os.listdir(temp_dir) if f.startswith('a_') and f.endswith('.txt')]

if assay_files:
    assay_filename = assay_files[0]
    local_assay_path = os.path.join(temp_dir, assay_filename)
    print(f"Found assay file: {assay_filename}")

    df_assay = pd.read_csv(local_assay_path, sep='\t')
    
    print("\nDisplaying first 5 rows of the assay file:")
    display(df_assay.head())
else:
    print("Error: Could not load assay file.")

Found assay file: a_OSD-832_metabolite-profiling_mass-spectrometry_TripleTOF 5600(AB Sciex).txt

Displaying first 5 rows of the assay file:


Unnamed: 0,Sample Name,Protocol REF,Parameter Value[Processed Amount],Unit,Term Source REF,Term Accession Number,Extract Name,Protocol REF.1,Parameter Value[Instrument],Parameter Value[Ion Source],Parameter Value[Analyzer],MS Assay Name,Raw Spectral Data File,Parameter Value[Raw Spectral Data File (mzML)]
0,9394 plasma,Extraction,100.0,microliter,UO,http://purl.obolibrary.org/obo/UO_0000101,9394 plasma,Mass Spectrometry,AB SCIEX TripleTOF 5600,electrospray ionization (ESI),Quadrupole time-of-flight (Q-TOF),metabolomics,"GLDS-703_metabolomics_pos49.raw.zip, GLDS-703_...",GLDS-703_metabolomics_082120_Marek_Raber_Plasm...
1,9395 plasma,Extraction,100.0,microliter,UO,http://purl.obolibrary.org/obo/UO_0000101,9395 plasma,Mass Spectrometry,AB SCIEX TripleTOF 5600,electrospray ionization (ESI),Quadrupole time-of-flight (Q-TOF),metabolomics,"GLDS-703_metabolomics_pos23.raw.zip, GLDS-703_...",GLDS-703_metabolomics_082120_Marek_Raber_Plasm...
2,9396 plasma,Extraction,100.0,microliter,UO,http://purl.obolibrary.org/obo/UO_0000101,9396 plasma,Mass Spectrometry,AB SCIEX TripleTOF 5600,electrospray ionization (ESI),Quadrupole time-of-flight (Q-TOF),metabolomics,"GLDS-703_metabolomics_pos22.raw.zip, GLDS-703_...",GLDS-703_metabolomics_082120_Marek_Raber_Plasm...
3,9397 plasma,Extraction,100.0,microliter,UO,http://purl.obolibrary.org/obo/UO_0000101,9397 plasma,Mass Spectrometry,AB SCIEX TripleTOF 5600,electrospray ionization (ESI),Quadrupole time-of-flight (Q-TOF),metabolomics,"GLDS-703_metabolomics_pos15.raw.zip, GLDS-703_...",GLDS-703_metabolomics_082120_Marek_Raber_Plasm...
4,9398 plasma,Extraction,100.0,microliter,UO,http://purl.obolibrary.org/obo/UO_0000101,9398 plasma,Mass Spectrometry,AB SCIEX TripleTOF 5600,electrospray ionization (ESI),Quadrupole time-of-flight (Q-TOF),metabolomics,"GLDS-703_metabolomics_pos12.raw.zip, GLDS-703_...",GLDS-703_metabolomics_082120_Marek_Raber_Plasm...


In [14]:
meta_subset = df_meta[['Sample Name', 'Factor Value[Ionizing Radiation]', 'Factor Value[Hindlimb Unloading]']]
assay_subset = df_assay[['Sample Name', 'Parameter Value[Raw Spectral Data File (mzML)]']]

df_combined = pd.merge(assay_subset, meta_subset, on='Sample Name')
df_combined['Parameter Value[Raw Spectral Data File (mzML)]'] = df_combined['Parameter Value[Raw Spectral Data File (mzML)]'].str.split(', ')
df_combined = df_combined.explode('Parameter Value[Raw Spectral Data File (mzML)]')

display(df_combined.head())

Unnamed: 0,Sample Name,Parameter Value[Raw Spectral Data File (mzML)],Factor Value[Ionizing Radiation],Factor Value[Hindlimb Unloading]
0,9394 plasma,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,sham-irradiated,Normally Loaded Control
0,9394 plasma,GLDS-703_metabolomics_091920_Raber_Marek_Negat...,sham-irradiated,Normally Loaded Control
1,9395 plasma,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,sham-irradiated,Normally Loaded Control
1,9395 plasma,GLDS-703_metabolomics_091920_Raber_Marek_Negat...,sham-irradiated,Normally Loaded Control
2,9396 plasma,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,sham-irradiated,Normally Loaded Control


In [7]:
s3_file_keys = []
print(f"Finding all files in s3://{bucket_name}/{base_path}...")
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=base_path)

Finding all files in s3://nasa-osdr/OSD-832/version-2/metabolomics/...


In [None]:
first_page = next(iter(pages))

print(f"\nThe 'page' object: {type(first_page)}")
print(f"Keys: {list(first_page.keys())}")

if 'Contents' in first_page:
    contents = first_page['Contents']
    print(f"\nThe 'Contents' key holds: {type(contents)}")

    if contents:
        first_obj = contents[0]
        print(f"\nEach item inside 'Contents': {type(first_obj)}")
        
        print(f"\nThe keys for a single file object: {list(first_obj.keys())}")

        print(f"\nThe 'Key' for the first file: '{first_obj['Key']}'")


The 'page' object: <class 'dict'>
Keys: ['ResponseMetadata', 'IsTruncated', 'Contents', 'Name', 'Prefix', 'MaxKeys', 'EncodingType', 'KeyCount']

The 'Contents' key holds: <class 'list'>

Each item inside 'Contents': <class 'dict'>

The keys for a single file object: ['Key', 'LastModified', 'ETag', 'ChecksumAlgorithm', 'ChecksumType', 'Size', 'StorageClass']

[Step 5] The 'Key' for the first file: 'OSD-832/version-2/metabolomics/GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz'


In [15]:
for page in pages:
    for obj in page.get('Contents', []):
        if obj['Key'].endswith('.mzML.gz'):
            s3_file_keys.append(obj['Key'])

print(f"Found {len(s3_file_keys)} total files to process.")

Found 133 total files to process.


In [21]:
first_key = s3_file_keys[0]
first_filename = os.path.basename(first_key)
local_first_filename = os.path.join(temp_dir, first_filename)

print(f"Downloading '{first_filename}' to inspect...")
s3_client.download_file(bucket_name, first_key, local_first_filename)

run_first_filename = pymzml.run.Reader(local_first_filename)
first_spectrum = next(iter(run_first_filename))

print(f"\nEach item from the 'run' object: {type(first_spectrum)}")

os.remove(local_first_filename)

Downloading 'GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz' to inspect...

Each item from the 'run' object: <class 'pymzml.spec.Spectrum'>


In [20]:
total_files = len(s3_file_keys)
for x, key in enumerate(s3_file_keys):
    try:
        filename = os.path.basename(key)
        local_filepath = os.path.join(temp_dir, filename)

        percent_complete = ((x + 1) / total_files) * 100
        
        print(f"({x + 1}/{total_files}) [{percent_complete:.2f}%] Downloading {filename}...")
        s3_client.download_file(bucket_name, key, local_filepath)
        
        print(f"({x + 1}/{total_files}) [{percent_complete:.2f}%] Processing {filename}...")
        run = pymzml.run.Reader(local_filepath)
        
        if run:
            file_total_intensity = 0
            num_spectra = 0
            ms1_scans = 0
            ms2_scans = 0
            total_base_peak_intensity = 0.0
            total_peaks_in_file = 0
            for spectrum in run:
                file_total_intensity += sum(spectrum.i)
                num_spectra += 1
                if spectrum.ms_level == 1:
                    ms1_scans += 1
                elif spectrum.ms_level == 2:
                    ms2_scans += 1

                if len(spectrum.i) > 0:
                    i = np.max(spectrum.i)
                    total_base_peak_intensity += i

                total_peaks_in_file += len(spectrum.i)

            avg_peaks_per_scan = total_peaks_in_file / num_spectra if num_spectra > 0 else 0

            results.append({
                'filename': filename,
                'total_intensity': file_total_intensity,
                'tic_datapoints': num_spectra,
                'ms1_scans': ms1_scans,
                'ms2_scans': ms2_scans,
                'sum_base_peak_intensity': total_base_peak_intensity,
                'avg_peaks_per_scan': avg_peaks_per_scan
            })
            print(f"({x + 1}/{total_files}) [{percent_complete:.2f}%] Successfully processed {filename}.")

        else:
            print(f"Skipping: {filename} is empty or invalid")
        
        os.remove(local_filepath)
        print(f"Finished and cleaned up {filename}.")

    except Exception as e:
        print(f"Failed to process {key}: {e}")

(1/133) [0.75%] Downloading GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz...
(1/133) [0.75%] Processing GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz...
(1/133) [0.75%] Successfully processed GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz.
Finished and cleaned up GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos1.mzML.gz.
(2/133) [1.50%] Downloading GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos10.mzML.gz...
(2/133) [1.50%] Processing GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos10.mzML.gz...
(2/133) [1.50%] Successfully processed GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos10.mzML.gz.
Finished and cleaned up GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos10.mzML.gz.
(3/133) [2.26%] Downloading GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos11.mzML.gz...
(3/133) [2.26%] Processing GLDS-703_metabolomics_082120_Marek_Raber_Plasma_pos11.mzML.gz...
(3/133) [2.26%] Successfully processed GLDS-703_metabolomics_082120_Mar

In [22]:
df_features = pd.DataFrame(results)

print("\nDataFrame with features:")
display(df_features)


DataFrame with features:


Unnamed: 0,filename,total_intensity,tic_datapoints,ms1_scans,ms2_scans,sum_base_peak_intensity,avg_peaks_per_scan
0,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,1.368131e+10,17540,2973,14567,323813056.0,6566.761345
1,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,1.155506e+10,16563,3038,13525,342596704.0,4973.952364
2,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,1.153483e+10,16677,3030,13647,341492416.0,4930.622354
3,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,1.143695e+10,16571,3034,13537,343744672.0,4910.967473
4,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,1.122497e+10,16879,3038,13841,319452896.0,4832.251377
...,...,...,...,...,...,...,...
128,GLDS-703_metabolomics_091920_Raber_Marek_Negat...,5.110776e+09,18423,3797,14626,46674604.0,14070.876296
129,GLDS-703_metabolomics_091920_Raber_Marek_Negat...,7.086478e+09,18638,3813,14825,143802784.0,13582.883786
130,GLDS-703_metabolomics_091920_Raber_Marek_Negat...,6.737794e+09,18523,3807,14716,144499936.0,13510.817200
131,GLDS-703_metabolomics_091920_Raber_Marek_Negat...,7.099848e+09,18527,3800,14727,220994080.0,13497.215469


In [23]:
df_final = pd.merge(df_features, df_combined, left_on='filename', right_on='Parameter Value[Raw Spectral Data File (mzML)]', how='left')

print("Final Merged DataFrame:")
display(df_final.head())

Final Merged DataFrame:


Unnamed: 0,filename,total_intensity,tic_datapoints,ms1_scans,ms2_scans,sum_base_peak_intensity,avg_peaks_per_scan,Sample Name,Parameter Value[Raw Spectral Data File (mzML)],Factor Value[Ionizing Radiation],Factor Value[Hindlimb Unloading]
0,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,13681310000.0,17540,2973,14567,323813056.0,6566.761345,QC Blank,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,Not Applicable,Not Applicable
1,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,11555060000.0,16563,3038,13525,342596704.0,4973.952364,9429 plasma,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,mixed radiation field,Hindlimb Unloaded
2,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,11534830000.0,16677,3030,13647,341492416.0,4930.622354,9447 plasma,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,mixed radiation field,Normally Loaded Control
3,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,11436950000.0,16571,3034,13537,343744672.0,4910.967473,9398 plasma,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,sham-irradiated,Normally Loaded Control
4,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,11224970000.0,16879,3038,13841,319452896.0,4832.251377,9442 plasma,GLDS-703_metabolomics_082120_Marek_Raber_Plasm...,mixed radiation field,Normally Loaded Control


In [32]:
codespace_path = 'data/cleaned_data/OSD-832_final_processed_data.parquet'

os.makedirs(os.path.dirname(codespace_path), exist_ok=True)

df_final.to_parquet(codespace_path, engine='fastparquet')