In [2]:
import os

import pandas as pd

This Jupyter notebook prepares ProteoBench input files for software tools whose output spreads the information needed by ProteoBench across more than one file. Each section below combines those files into a single table that can be uploaded to ProteoBench.

# MSAID output preparation

In [None]:
# Point both variables at directories on your own machine before running the cells below.
# input_path: directory that contains the proteingroups.tsv and precursors.tsv files
input_path = "./path/to/your/MSAID/output/dir/"
# output_path: directory where the output files will be saved
output_path = "./path/to/your/desired/output/dir/"

In [None]:
# Load the protein-group and precursor tables from input_path.
# os.path.join is used instead of string concatenation so that input_path
# works with or without a trailing path separator.
protein_file = pd.read_csv(os.path.join(input_path, "proteingroups.tsv"), sep="\t")
precursor_file = pd.read_csv(os.path.join(input_path, "precursors.tsv"), sep="\t")

In [None]:
# Map the proteins to the precursors using the "PROTEIN_IDS" column in the precursor file
def add_fasta_headers(prec_df, protein_df):
    """Return a copy of ``prec_df`` with a ``FASTA_HEADERS`` column added.

    Each ``PROTEIN_IDS`` entry of ``prec_df`` is split on ";" and every
    resulting ID (whitespace-stripped) is looked up among the ``PROTEIN_IDS``
    values of ``protein_df``. The headers that are found are joined with "; ".

    Parameters
    ----------
    prec_df : pandas.DataFrame
        Precursor table; must have a ``PROTEIN_IDS`` column. It is not modified.
    protein_df : pandas.DataFrame
        Protein table; must have ``PROTEIN_IDS`` and ``FASTA_HEADERS`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns of ``prec_df`` plus ``FASTA_HEADERS``.
        The value is ``None`` when ``PROTEIN_IDS`` is missing or when none of
        the IDs has a (non-empty, non-missing) header.
    """
    # Dictionary for fast ID -> header look-up
    protein_to_header = dict(zip(protein_df["PROTEIN_IDS"], protein_df["FASTA_HEADERS"]))

    def get_fasta_headers(protein_ids):
        # Missing entries are read by pandas as NaN (a float), which has no .split
        if not isinstance(protein_ids, str):
            return None
        looked_up = (protein_to_header.get(protein_id.strip()) for protein_id in protein_ids.split(";"))
        # Keep only real, non-empty headers; a NaN header is truthy and would break the join
        headers = [header for header in looked_up if isinstance(header, str) and header]
        return "; ".join(headers) if headers else None

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    return prec_df.assign(FASTA_HEADERS=prec_df["PROTEIN_IDS"].apply(get_fasta_headers))

In [20]:
# Attach the FASTA headers to the precursor table via its protein IDs
prec_df_with_headers = add_fasta_headers(prec_df=precursor_file, protein_df=protein_file)

In [None]:
# Write the precursor table with FASTA headers into output_path.
# os.path.join keeps the file path valid whether or not output_path ends
# with a path separator.
prec_df_with_headers.to_csv(
    os.path.join(output_path, "precursors_with_headers.tsv"), sep="\t", index=False
)  # This file can be uploaded to ProteoBench

# AlphaDIA output preparation

In [None]:
# Change paths to your local paths
input_path = "./path/to/your/AlphaDIA/output/dir/"  # path should contain the precursors.tsv and precursor.matrix.tsv files
output_path = "./path/to/your/desired/output/dir/"  # path where the output files will be saved

In [4]:
# Load the two AlphaDIA precursor tables from input_path.
# os.path.join is used instead of string concatenation so that input_path
# works with or without a trailing path separator.
precursors_long = pd.read_csv(os.path.join(input_path, "precursors.tsv"), sep="\t")
precursor_matrix = pd.read_csv(os.path.join(input_path, "precursor.matrix.tsv"), sep="\t")

In [None]:
# Preview the precursor matrix read from precursor.matrix.tsv
# (the name `precursors` is not defined anywhere in this notebook and raises NameError on a fresh kernel)
precursor_matrix

Unnamed: 0,mod_seq_charge_hash,LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01,LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02,LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03,LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01,LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02,LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03
0,15481211221274,4.388767e+08,4.542098e+08,4.327669e+08,4.844609e+08,0.000000e+00,4.324494e+08
1,159521083995512,3.251849e+08,0.000000e+00,0.000000e+00,3.145478e+08,0.000000e+00,0.000000e+00
2,321833340036170,5.105902e+08,7.067959e+08,6.510211e+08,0.000000e+00,0.000000e+00,0.000000e+00
3,724298168523893,4.821941e+07,0.000000e+00,2.532368e+07,1.973107e+08,1.942472e+08,1.853911e+08
4,842012188086435,0.000000e+00,0.000000e+00,0.000000e+00,3.732418e+08,3.725704e+08,3.663040e+08
...,...,...,...,...,...,...,...
62543,18445773035312378183,0.000000e+00,0.000000e+00,0.000000e+00,2.110023e+08,2.820278e+08,0.000000e+00
62544,18445899706933904523,6.227492e+07,7.216148e+07,1.495587e+08,0.000000e+00,0.000000e+00,0.000000e+00
62545,18445980366100704500,0.000000e+00,0.000000e+00,0.000000e+00,1.284837e+08,1.158165e+08,7.537017e+07
62546,18446123786064586938,8.531072e+08,9.023949e+08,8.646071e+08,8.663234e+08,9.044181e+08,8.176428e+08


In [None]:
# Annotate every row of the precursor matrix with its precursor-level information,
# matched on "mod_seq_charge_hash".
# The annotation columns are de-duplicated first: if precursors.tsv lists the same
# precursor on several rows (presumably once per run — TODO confirm), merging the raw
# rows would repeat the matching matrix row once per occurrence.
precursor_matrix_with_precursor_info = pd.merge(
    precursor_matrix,
    precursors_long[
        ["genes", "decoy", "mods", "mod_sites", "sequence", "charge", "mod_seq_charge_hash"]
    ].drop_duplicates(),
    on="mod_seq_charge_hash",
)

In [10]:
# Write the annotated precursor matrix into output_path.
# os.path.join keeps the file path valid whether or not output_path ends
# with a path separator.
precursor_matrix_with_precursor_info.to_csv(
    os.path.join(output_path, "precursor_matrix_with_precursor_info.tsv"), sep="\t", index=False
)  # This file can be uploaded to ProteoBench