In [1]:
import streamlit as st
import requests
import plotly.express as px
import pandas as pd
import io
from ftplib import FTP
import os
import subprocess
# Use Streamlit's wide page layout.
st.set_page_config(layout="wide")


# Function to get the SDRF annotation count
def fetch_data():
    """Query the PRIDE Archive search API for project counts.

    Two searches are issued: one with ``keyword=sdrf`` and one without any
    filter. The count for each is read from the ``total_records`` response
    header.

    Returns:
        A ``(number_sdrfs, number_pxds)`` tuple of ints.

    Raises:
        requests.RequestException: if a request fails, times out, or the
            server answers with an HTTP error status.
        KeyError: if a response has no ``total_records`` header.
    """
    sdrfs_url = "https://www.ebi.ac.uk/pride/ws/archive/v2/search/projects?keyword=sdrf"
    all_pxds_url = "https://www.ebi.ac.uk/pride/ws/archive/v2/search/projects?"

    def total_records(url):
        # A timeout keeps the app from hanging forever on a stalled
        # connection; raise_for_status surfaces HTTP errors directly instead
        # of as a confusing missing-header KeyError.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return int(response.headers['total_records'])

    number_sdrfs = total_records(sdrfs_url)
    number_pxds = total_records(all_pxds_url)

    return number_sdrfs, number_pxds

# Function to create the donut chart
def plot_donut_chart(number_sdrfs, number_pxds):
    """Render a donut chart of SDRF-annotated vs. non-annotated projects.

    Args:
        number_sdrfs: Number of projects counted as SDRF-annotated.
        number_pxds: Total number of projects; the "Not annotated" slice is
            ``number_pxds - number_sdrfs``.

    Returns:
        None. The figure is displayed through ``st.plotly_chart``.
    """
    annotated = number_sdrfs
    not_annotated = number_pxds - number_sdrfs
    labels = ['SDRF-annotated', 'Not annotated']
    sizes = [annotated, not_annotated]

    # Create the donut chart using Plotly; slice colours are fixed per label
    # through color_discrete_map, and hole=0.4 turns the pie into a donut.
    fig = px.pie(
        names=labels,
        values=sizes,
        color=labels,
        color_discrete_map={"SDRF-annotated": "orange", "Not annotated": "lightgrey"},
        hole=0.4,
        labels={'labels': 'Status'},
        title=f"PRIDE Projects with SDRF Annotation: {annotated}/{number_pxds}"
    )

    # Update chart layout for styling: show percentage and label on each
    # slice and pull the first slice (SDRF-annotated) slightly outwards.
    fig.update_traces(textinfo='percent+label', pull=[0.1, 0])
    # fig.update_layout(margin=dict(t=20, b=20, l=20, r=20))  # Adjust margins

    # Show the plot in Streamlit
    st.plotly_chart(fig)


# Function to get SDRF FTP link from project accession
def get_sdrf_ftp(pxd, df):
    """Return an FTP link to the SDRF file of a PRIDE project, or None.

    Two strategies are tried in order:

    1. Indexed SDRF: probe ``sdrf.tsv`` and then ``sdrf.txt`` in the
       project's archive directory, which is built from the year and month
       of the project's ``publicationDate`` in ``df``.
    2. Non-indexed SDRF: list the project's files through the PRIDE files
       endpoint and take the first file whose name contains "sdrf" and that
       exposes an 'FTP Protocol' location.

    Args:
        pxd: Project accession, matched against ``df['accession']``.
        df: DataFrame with ``accession`` and ``publicationDate`` columns;
            ``publicationDate`` is split on "-" to obtain year and month.

    Returns:
        The FTP link as a string, or None when no SDRF file is found.

    Raises:
        IndexError: if ``pxd`` has no row in ``df``.
        requests.RequestException: if the files request fails, times out,
            or returns an HTTP error status.
    """
    # First try if it's an indexed SDRF: get publicationDate from df.
    publication_date = df[df['accession'] == pxd]['publicationDate'].values[0]
    date_parts = publication_date.split("-")
    year = date_parts[0]
    month = date_parts[1]

    for fname in ("sdrf.tsv", "sdrf.txt"):
        ftp_link = f"ftp://ftp.pride.ebi.ac.uk/pride/data/archive/{year}/{month}/{pxd}/{fname}"
        try:
            load_sdrf_from_ftp(ftp_link)
        except Exception:
            # Best-effort probe: any failure means this candidate is not
            # usable, so move on to the next file name instead of giving up
            # after the first miss.
            continue
        print(f"{fname} found and loaded.")
        return ftp_link

    # Neither indexed name worked; fall back to the project's file listing
    # (this only works if it's a non-indexed SDRF).
    files_url = f"https://www.ebi.ac.uk/pride/ws/archive/v2/projects/{pxd}/files"
    response = requests.get(files_url, timeout=30)
    response.raise_for_status()
    for f in response.json():
        if 'sdrf' in f['fileName'].lower():
            ftp_link = next((loc['value'] for loc in f['publicFileLocations']
                             if loc['name'] == 'FTP Protocol'), None)
            if ftp_link:
                return ftp_link
    return None

def load_sdrf_from_ftp(ftp_link):
    """
    Download an SDRF file from a given FTP link and return it as a pandas DataFrame.
    Supports .tsv, .txt, .csv, and .xlsx files.

    Args:
        ftp_link: Link of the form ``ftp://ftp.pride.ebi.ac.uk/<dir>/<file>``;
            the host prefix is stripped and the rest is split into the
            directory to change into and the file name to retrieve.

    Returns:
        The parsed file as a DataFrame. Tab- and comma-separated files are
        decoded as ISO-8859-1.

    Raises:
        ValueError: if the file extension is not one of the supported ones
            (raised after the download has completed).
        Exception: FTP and socket errors from connecting, logging in,
            changing directory or retrieving the file propagate unchanged.
    """
    # Parse FTP path
    ftp_root = "ftp.pride.ebi.ac.uk"
    path_parts = ftp_link.replace(f"ftp://{ftp_root}/", "").split("/")
    directory = "/".join(path_parts[:-1])
    filename = path_parts[-1]
    extension = os.path.splitext(filename)[-1].lower()

    # Connect and download file to memory. The context manager closes the
    # connection even when login/cwd/retrbinary raises, so a missing file
    # no longer leaks an open FTP session.
    buffer = io.BytesIO()
    with FTP(ftp_root) as ftp:
        ftp.login()
        ftp.cwd(directory)
        ftp.retrbinary(f"RETR {filename}", buffer.write)
    buffer.seek(0)

    # Read based on file extension
    if extension in [".tsv", ".txt"]:
        return pd.read_csv(buffer, sep="\t", encoding="ISO-8859-1")
    elif extension == ".csv":
        return pd.read_csv(buffer, encoding="ISO-8859-1")
    elif extension == ".xlsx":
        return pd.read_excel(buffer)
    else:
        raise ValueError(f"Unsupported file format: {extension}")

def validate_sdrf(file_path):
    """Validate an SDRF file with the ``parse_sdrf`` command-line tool.

    Returns a ``(status, message)`` pair. ``status`` is True when the tool
    exits with code 0 and False for any other exit code; in both cases
    ``message`` is the tool's stripped standard output. When the
    ``parse_sdrf`` executable cannot be found, ``status`` is None and
    ``message`` is an installation hint.
    """
    command = ["parse_sdrf", "validate-sdrf", "--sdrf_file", file_path]
    try:
        # check=False: a non-zero exit code is reported through the return
        # value below rather than raised as CalledProcessError.
        completed = subprocess.run(
            command, capture_output=True, text=True, check=False
        )
    except FileNotFoundError:
        return None, "SDRF validation tool not found. Install `sdrf-pipelines` via `pip install sdrf-pipelines`."

    succeeded = completed.returncode == 0
    return succeeded, completed.stdout.strip()



# Fetch the SDRF-annotated and total project counts, then render them as a
# donut chart. Both counts stay available as module-level names.
number_sdrfs, number_pxds = fetch_data()
plot_donut_chart(number_sdrfs, number_pxds)


2025-04-18 14:36:12.625 
  command:

    streamlit run /home/compomics/miniconda3/envs/publicdata/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [None]:
for proj in all_projects:
    pxd = proj['accession']
    files = requests.get(f"https://www.ebi.ac.uk/pride/ws/archive/v2/projects/{pxd}/files").json()
    for f in files:
        if 'sdrf' in f['fileName'].lower():  # matches 'plasma_sdrf', 'mysdrf.tsv', etc.
            # Handle as SDRF file
