# MSAID output preparation

This Jupyter notebook adds protein information (FASTA headers) from `proteingroups.tsv` to `precursors.tsv` and saves the result as `precursors_with_headers.tsv` for upload to ProteoBench.

In [16]:
from pathlib import Path

import pandas as pd

In [17]:
# Adjust both folders to your local setup (they may be the same folder).
# output_path: folder that receives precursors_with_headers.tsv
output_path = "/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/"
# input_path: folder from which proteingroups.tsv and precursors.tsv are read
input_path = "/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/"

In [18]:
# Load the protein-group and precursor tables (tab-separated) from input_path.
# Change the file names below if your proteingroups.tsv / precursors.tsv files
# are named differently.
# Path(...) / name inserts the separator itself, so input_path works with or
# without a trailing slash.
protein_file = pd.read_csv(Path(input_path) / 'proteingroups.tsv', sep='\t')
precursor_file = pd.read_csv(Path(input_path) / 'precursors.tsv', sep='\t')

In [19]:
# Map the proteins to the precursors using the "PROTEIN_IDS" column in the precursor file
def add_fasta_headers(prec_df: pd.DataFrame, protein_df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``FASTA_HEADERS`` column to the precursor table.

    Each entry of ``prec_df['PROTEIN_IDS']`` is split on ``;``. Every
    individual ID (surrounding whitespace stripped) is looked up among
    ``protein_df['PROTEIN_IDS']`` and the matching
    ``protein_df['FASTA_HEADERS']`` values are joined with ``'; '``.

    Args:
        prec_df: Precursor table with a ``PROTEIN_IDS`` column. It is
            modified in place: the new column is written onto this object.
        protein_df: Protein table with ``PROTEIN_IDS`` and ``FASTA_HEADERS``
            columns.

    Returns:
        ``prec_df`` itself. ``FASTA_HEADERS`` holds the joined headers, or
        ``None`` where the ID entry is missing or none of its IDs has a
        header.
    """
    # Lookup table ID -> header for fast access.
    protein_to_header = dict(zip(protein_df['PROTEIN_IDS'], protein_df['FASTA_HEADERS']))

    def get_fasta_headers(protein_ids):
        # Empty cells are read by pandas as float NaN, which has no .split().
        if not isinstance(protein_ids, str):
            return None
        headers = [protein_to_header.get(pid.strip()) for pid in protein_ids.split(';')]
        # Keep only real, non-empty header strings. A missing header (NaN) is
        # truthy, so a plain truthiness test would let it through to join().
        headers = [header for header in headers if isinstance(header, str) and header]
        return '; '.join(headers) if headers else None

    prec_df['FASTA_HEADERS'] = prec_df['PROTEIN_IDS'].apply(get_fasta_headers)

    return prec_df

In [20]:
# Annotate the precursor table with FASTA headers. The function writes the new
# column onto precursor_file itself and returns that same DataFrame.
prec_df_with_headers = add_fasta_headers(prec_df=precursor_file, protein_df=protein_file)

In [21]:
# Write the annotated precursor table as a tab-separated file without the
# index column. Change the file name below if you want a different output file.
# Path(...) / name inserts the separator itself, so output_path works with or
# without a trailing slash.
prec_df_with_headers.to_csv(Path(output_path) / 'precursors_with_headers.tsv', sep='\t', index=False)