<a href="https://colab.research.google.com/github/Palaeoprot/PRIDE/blob/main/PRIDE_Parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import xml.etree.ElementTree as ET
import csv
from collections import defaultdict

# --- Configuration ---
# Path of the ProteomeXchange XML document handed to parse_xml(). It has to be
# the XML document itself; the parser's own error hint warns against supplying
# an HTML page instead.
INPUT_XML = '/content/PXD058447.xml'  # hard-coded input location
# Destination of the CSV written by main() (one row per dataset file).
OUTPUT_CSV = '/content/proteomex_data_extracted.csv'

# Namespace prefix -> URI table for ElementTree lookups.
# NOTE(review): none of the find()/findall() calls in this file pass NS, so it
# appears to be unused - confirm before relying on or removing it.
NS = {
    'px': 'http://www.proteomexchange.org/schemas/ProteomeXchange.xsd',
    'cv': 'http://psidev.info/psi/cv/1.1',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
}

# Map cvParam accession numbers to readable labels.
# get_cv_value() returns these labels *instead of* the cvParam's own ``name``
# attribute whenever the accession is listed here, so every substring test the
# extractors run on the returned name sees the label below, not the XML text.
CV_MAP = {
    # Species
    'MS:1001469': 'Species Scientific Name',
    'MS:1001467': 'NCBI TaxID',

    # Review Level
    'MS:1002854': 'Review Level',

    # Repository Support
    'MS:1002856': 'Repository Support',

    # Dataset Origin
    'MS:1002868': 'Dataset Origin',

    # Contact roles
    'MS:1000586': 'Contact Name',
    'MS:1000589': 'Contact Email',
    'MS:1000590': 'Contact Affiliation',
    'MS:1002037': 'Role: Dataset Submitter',
    'MS:1002332': 'Role: Lab Head',

    # Publication
    'MS:1001922': 'Publication DOI',

    # Keyword
    'MS:1001925': 'Keyword',

    # File types (note: these labels do not contain the word "URI")
    'MS:1002846': 'Associated raw file',
    'MS:1002850': 'Peak list file',
    'MS:1002848': 'Result file',
    'MS:1002851': 'Other type file',

    # Dataset links
    'MS:1002852': 'Dataset FTP Location',
    'MS:1001930': 'PRIDE Project URI',
}


def parse_xml(file_path):
    """Load an XML document and hand back its root element.

    Progress and failure messages are printed rather than raised: on any
    problem (malformed XML, missing file, anything else) a diagnostic is
    printed and ``None`` is returned. For a missing file, the ``.xml`` and
    ``.html`` entries found in ``/content`` are listed as a hint.
    """
    document_root = None
    try:
        parsed_root = ET.parse(file_path).getroot()
        print(f"✅ Successfully parsed XML file: {file_path}")
        print(f"📄 Root element: {parsed_root.tag}")
        # Only publish the result once the status lines went out as well.
        document_root = parsed_root
    except ET.ParseError as parse_problem:
        print(f"❌ XML Parse Error: {parse_problem}")
        print(f"💡 Make sure you're using the XML file, not HTML!")
    except FileNotFoundError as missing:
        print(f"❌ File not found: {missing}")
        print(f"💡 Available files to check:")
        import os
        if os.path.exists('/content'):
            candidates = [
                entry for entry in os.listdir('/content')
                if entry.endswith(('.xml', '.html'))
            ]
            for entry in candidates:
                print(f"   - {entry}")
    except Exception as problem:
        print(f"❌ Unexpected error: {problem}")
    return document_root


def get_cv_value(cv_param, default_name="Unknown"):
    """Return ``(label, value)`` for a cvParam element.

    The label is the CV_MAP entry for the element's ``accession``; when the
    accession is not mapped, the element's ``name`` attribute is used, and
    ``default_name`` if that attribute is absent too. The value is the raw
    ``value`` attribute (``None`` when missing).
    """
    fallback_label = cv_param.get('name', default_name)
    label = CV_MAP.get(cv_param.get('accession'), fallback_label)
    return label, cv_param.get('value')


def extract_dataset_identifier(root):
    """Extract the dataset accession and DOI from DatasetIdentifier elements.

    Each cvParam is classified by its accession first and by its ``name``
    text as a fallback. The "accession number version number" parameter is
    skipped explicitly: its name also contains the word "accession", so a
    plain substring test would let the version value overwrite the real
    accession.

    Args:
        root: Root element of the parsed document.

    Returns:
        dict with the keys ``'PXD Accession'`` and/or ``'DOI'`` for whichever
        identifiers are present; empty if none are found.
    """
    data = {}
    for identifier in root.findall('.//DatasetIdentifier'):
        for cv in identifier.findall('cvParam'):
            accession = cv.get('accession')
            label = (cv.get('name') or '').lower()
            value = cv.get('value')

            # The version number is not an identifier; never store it.
            if accession == 'MS:1001921' or 'version number' in label:
                continue

            if accession == 'MS:1001919' or 'accession' in label:
                data['PXD Accession'] = value
            elif accession == 'MS:1001922' or 'doi' in label:
                data['DOI'] = value
    return data


def extract_summary(root):
    """Collect the fields of the DatasetSummary element.

    Returns an empty dict when the root has no ``DatasetSummary`` child.
    Otherwise the result always holds ``Title``, ``Announce Date``,
    ``Hosting Repository`` (attributes, ``''`` when absent) and the stripped
    ``Description`` text; ``Review Level`` and ``Repository Support`` are
    added only when their cvParam exists.
    """
    summary_el = root.find('DatasetSummary')
    if summary_el is None:
        return {}

    record = {}
    attribute_columns = (
        ('Title', 'title'),
        ('Announce Date', 'announceDate'),
        ('Hosting Repository', 'hostingRepository'),
    )
    for column, attribute in attribute_columns:
        record[column] = summary_el.get(attribute, '')
    record['Description'] = summary_el.findtext('Description', '').strip()

    # Optional cvParam-backed columns: keep the value, ignore the label.
    cv_columns = (
        ('Review Level', './/ReviewLevel/cvParam'),
        ('Repository Support', './/RepositorySupport/cvParam'),
    )
    for column, path in cv_columns:
        node = summary_el.find(path)
        if node is not None:
            record[column] = get_cv_value(node)[1]

    return record


def extract_species(root):
    """Read species name and taxonomy id from the first Species element.

    Only the first ``SpeciesList/Species`` element is inspected. A cvParam
    whose resolved label contains "scientific" fills ``'Species'``; one
    containing "taxid" fills ``'TaxID'``. Returns an empty dict when no
    Species element exists.
    """
    first_species = root.find('SpeciesList/Species')
    if first_species is None:
        return {}

    result = {}
    for param in first_species.findall('cvParam'):
        label, value = get_cv_value(param)
        lowered = label.lower()
        if 'scientific' in lowered:
            result['Species'] = value
        elif 'taxid' in lowered:
            result['TaxID'] = value
    return result


def extract_instruments(root):
    """Join the names of all instrument cvParams into one column.

    Every cvParam directly under an ``InstrumentList/Instrument`` element
    that has a non-empty ``name`` contributes that name; the names are joined
    with ``'; '`` in document order under the key ``'Instrument'``.
    """
    model_names = [
        param.get('name')
        for param in root.findall('InstrumentList/Instrument/cvParam')
        if param.get('name')
    ]
    return {'Instrument': '; '.join(model_names)}


def extract_modifications(root):
    """Summarise the ModificationList as one ``'; '``-joined string.

    Each cvParam directly under ``ModificationList`` is rendered as
    ``"<name> (<accession>)"``; a missing attribute shows up as ``None``.
    The result is stored under the key ``'Modifications'``.
    """
    rendered = (
        '{} ({})'.format(param.get('name'), param.get('accession'))
        for param in root.findall('ModificationList/cvParam')
    )
    return {'Modifications': '; '.join(rendered)}


def extract_contacts(root):
    """Pull name, e-mail and affiliation for the submitter and the lab head.

    A contact counts as submitter when it contains a cvParam with accession
    ``MS:1002037`` and as lab head when it contains ``MS:1002332`` (the
    submitter check wins if both are present). Contacts with neither role are
    ignored. For a recognised role, the three ``'<Role> Name'``,
    ``'<Role> Email'`` and ``'<Role> Affiliation'`` keys are always written;
    details that are not present are stored as ``None``.
    """
    data = {}
    for contact in root.findall('ContactList/Contact'):
        if contact.find(".//cvParam[@accession='MS:1002037']") is not None:
            role = 'Submitter'
        elif contact.find(".//cvParam[@accession='MS:1002332']") is not None:
            role = 'Lab Head'
        else:
            role = 'Contact'

        # Last occurrence of each detail wins; absent details stay None.
        details = {
            'Contact Name': None,
            'Contact Email': None,
            'Contact Affiliation': None,
        }
        for param in contact.findall('cvParam'):
            label, value = get_cv_value(param)
            if label in details:
                details[label] = value

        if role in ('Submitter', 'Lab Head'):
            data[role + ' Name'] = details['Contact Name']
            data[role + ' Email'] = details['Contact Email']
            data[role + ' Affiliation'] = details['Contact Affiliation']
    return data


def extract_publications(root):
    """Gather publication DOIs into a single ``'; '``-joined column.

    A cvParam under ``PublicationList/Publication`` contributes its value
    when its resolved label contains "doi" and the value is non-empty. The
    result is stored under the key ``'Publications (DOIs)'``.
    """
    doi_values = []
    for param in root.findall('PublicationList/Publication/cvParam'):
        label, value = get_cv_value(param)
        if 'doi' not in label.lower():
            continue
        if value:
            doi_values.append(value)
    return {'Publications (DOIs)': '; '.join(doi_values)}


def extract_keywords(root):
    """Extract every keyword entry of the KeywordList.

    All cvParams directly under ``KeywordList`` that carry a non-empty
    ``value`` are kept and joined with ``'; '`` (the separator the other
    list-valued columns use), so a document with several keyword parameters
    no longer loses everything after the first one. With a single keyword
    parameter the result is that parameter's value unchanged.

    Args:
        root: Root element of the parsed document.

    Returns:
        dict with the single key ``'Keywords'``; the value is ``''`` when no
        keyword parameter has a value.
    """
    values = [
        cv.get('value')
        for cv in root.findall('KeywordList/cvParam')
        if cv.get('value')
    ]
    return {'Keywords': '; '.join(values)}


def extract_full_dataset_links(root):
    """Find the FTP location and PRIDE URI among the full-dataset links.

    For every cvParam under ``FullDatasetLinkList/FullDatasetLink``, a
    resolved label containing "ftp" stores the value as ``'FTP Location'``;
    otherwise a label containing "pride" stores it as ``'PRIDE URI'``. Later
    matches overwrite earlier ones.
    """
    links = {}
    for param in root.findall('FullDatasetLinkList/FullDatasetLink/cvParam'):
        label, value = get_cv_value(param)
        lowered = label.lower()
        if 'ftp' in lowered:
            links['FTP Location'] = value
        elif 'pride' in lowered:
            links['PRIDE URI'] = value
    return links


def extract_files(root):
    """Extract one record per DatasetFile element.

    A file's cvParam usually carries both facts at once: which kind of file
    it is (through its accession/name) and where it lives (through its
    value). get_cv_value() replaces the name of mapped accessions with a
    CV_MAP label that does not contain "URI", so the parameter's own ``name``
    attribute is checked as well; otherwise the location of every mapped file
    type would be discarded and ``'File URI'`` would stay empty.

    Args:
        root: Root element of the parsed document.

    Returns:
        list of dicts with the keys ``'File ID'``, ``'File Name'``,
        ``'File Type'`` and ``'File URI'`` (type and URI default to ``''``).
    """
    files = []
    for f in root.findall('DatasetFileList/DatasetFile'):
        file_info = {
            'File ID': f.get('id'),
            'File Name': f.get('name'),
            'File Type': '',
            'File URI': ''
        }

        for cv in f.findall('cvParam'):
            name, value = get_cv_value(cv)
            raw_name = cv.get('name') or ''
            if 'URI' in name or 'URI' in raw_name:
                file_info['File URI'] = value
                # A mapped label (no "URI" in it) is the readable file type.
                if 'URI' not in name:
                    file_info['File Type'] = name
            else:
                file_info['File Type'] = name
        files.append(file_info)
    return files


def main():
    """Parse INPUT_XML and write one CSV row per dataset file to OUTPUT_CSV.

    Dataset-level metadata is extracted once and copied onto every file row
    (file columns come first and are never overwritten by metadata). When the
    document lists no files, a single row with empty file columns is written
    so the CSV still carries the metadata. Failures are reported with printed
    messages instead of being raised.
    """
    print("🔄 Starting PRIDE XML parsing...")

    root = parse_xml(INPUT_XML)
    if root is None:
        print("❌ Failed to parse XML file. Please check the file path and format.")
        return

    # Dataset-level metadata; later extractors win on duplicate keys.
    print("📊 Extracting metadata...")
    extractors = (
        extract_dataset_identifier,
        extract_summary,
        extract_species,
        extract_instruments,
        extract_modifications,
        extract_contacts,
        extract_publications,
        extract_keywords,
        extract_full_dataset_links,
    )
    metadata = {}
    for extractor in extractors:
        metadata.update(extractor(root))

    # Per-file rows, with a placeholder row when the document has no files.
    print("📁 Extracting file information...")
    placeholder = dict.fromkeys(['File ID', 'File Name', 'File Type', 'File URI'], '')
    files = extract_files(root) or [placeholder]

    # Copy the metadata onto each row without touching existing columns.
    for row in files:
        for column, value in metadata.items():
            row.setdefault(column, value)

    print("💾 Writing to CSV...")
    fieldnames = list(files[0].keys())
    try:
        with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(files)

        print(f"✅ Successfully extracted data to '{OUTPUT_CSV}'")
        print(f"📁 Total files included: {len(files)}")
        print(f"📋 Columns extracted: {len(fieldnames)}")

        # Short preview of what was extracted.
        print(f"\n📖 Sample metadata:")
        print(f"   Title: {metadata.get('Title', 'N/A')}")
        print(f"   PXD: {metadata.get('PXD Accession', 'N/A')}")
        print(f"   Species: {metadata.get('Species', 'N/A')}")
        print(f"   Files: {len(files)} files")

    except Exception as e:
        print(f"❌ Error writing CSV: {e}")

# Run the extraction only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()

🔄 Starting PRIDE XML parsing...
❌ XML Parse Error: mismatched tag: line 9, column 121
💡 Make sure you're using the XML file, not HTML!
❌ Failed to parse XML file. Please check the file path and format.
