# Peptide-Protein Mapping

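This notebook maps each peptide to its position within the protein sequence(s) of its protein group. For every match, the output records the peptide together with its flanking amino acids in the form `prev.PEPTIDE.next`, where `.` stands in for a missing flanking residue at a protein terminus.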

---

## Workflow

1. Load input files (proteins, peptides)
2. Create a dictionary mapping each `Protein.Group` to its peptide sequences
3. Search for peptide matches within protein sequences
4. Extract peptide context
5. Merge results and save output


## Import required libraries 

In [None]:
import pandas as pd
import re


## Define file paths and parameters

In [None]:
# --- Inputs -----------------------------------------------------------------
# Tab-separated protein table and tab-separated peptide/precursor table.
input_proteins_file = "PGs_processed_250522.tsv"
input_peptides_file = "precursors_all samples, filtered, normalized_250522.tsv"

# --- Outputs ----------------------------------------------------------------
# `output_file` holds the intermediate per-protein matches (CSV);
# `final_output_file` is the peptide table with the context column added (TSV).
output_file = "results.csv"
final_output_file = "precursors_all samples, filtered, normalized_250522_Q.tsv"

# Number of protein rows read per chunk when streaming the protein table.
chunk_size = 500


## Load peptide data and create mapping dictionary

In [None]:
# Parse the peptide table once; the complete table is kept for the final merge.
peptides_df_complete = pd.read_csv(input_peptides_file, sep='\t', encoding='UTF-8')

# Two-column view used for matching. It is derived from the table already in
# memory instead of parsing the same file a second time. Rows missing either
# essential value are dropped.
peptides_df = peptides_df_complete[['Stripped.Sequence', 'Protein.Group']].dropna(
    subset=['Stripped.Sequence', 'Protein.Group']
)

# Map each Protein.Group to the array of its unique peptide sequences.
peptides_dict = peptides_df.groupby('Protein.Group')['Stripped.Sequence'].unique().to_dict()


## Define peptide matching function

In [None]:
# Regex fragments for the ambiguity codes a peptide may carry; every other
# character of a peptide is matched literally.
_AMBIGUOUS_RESIDUES = {'Z': '[QE]', 'B': '[DN]', 'J': '[LI]', 'X': '.'}


def find(protein_sequence, peptide_sequence):
    """
    Search for peptide sequences within the given protein sequence.

    Ambiguity codes in a peptide are expanded before matching: Z matches Q or
    E, B matches D or N, J matches L or I, and X matches any residue.

    Args:
        protein_sequence: The protein sequence to search. Anything that is not
            a non-empty string (e.g. NaN from a missing table cell) yields None.
        peptide_sequence: An iterable of peptide strings. Entries that are not
            non-empty strings are skipped.

    Returns:
        A ', '-joined string with one 'prev.MATCH.next' entry per match, where
        MATCH is the matched stretch of the protein and prev/next are the
        residues on either side ('.' at a protein terminus), or None if
        nothing matched.
    """
    # A missing sequence arrives as NaN (a float); re.finditer would raise on it.
    if not isinstance(protein_sequence, str) or not protein_sequence:
        return None

    results = []
    for peptide in peptide_sequence:
        # An empty pattern would "match" at every position of the protein.
        if not isinstance(peptide, str) or not peptide:
            continue

        # Build the pattern residue by residue so that only the ambiguity
        # codes act as regex syntax; anything else is escaped.
        pattern = ''.join(
            _AMBIGUOUS_RESIDUES.get(residue, re.escape(residue))
            for residue in peptide
        )

        for match in re.finditer(pattern, protein_sequence):
            start, end = match.span()
            previous = protein_sequence[start - 1] if start > 0 else '.'
            following = protein_sequence[end] if end < len(protein_sequence) else '.'
            results.append(f'{previous}.{match.group()}.{following}')

    return ', '.join(results) if results else None


## Process protein data in chunks and search for matching peptides

In [None]:

# Read the proteins file lazily, `chunk_size` rows at a time, so the whole
# table never has to be held in memory at once.
proteins_df = pd.read_csv(
    input_proteins_file,
    sep='\t',
    encoding='UTF-8',
    usecols=['Protein.Group', 'UniProt_Sequence'],
    chunksize=chunk_size,
)


def process_row(row):
    """Return the flanked peptide matches for one (Protein.Group, sequence) row."""
    seq = row['UniProt_Sequence']
    # A missing sequence survives split/explode as NaN; there is nothing to search.
    if not isinstance(seq, str):
        return None
    peptides = peptides_dict.get(row['Protein.Group'], [])
    return find(seq, peptides)


# Process each chunk
for i, chunk in enumerate(proteins_df):
    # A protein group may list several ';'-separated sequences: one row each.
    chunk['UniProt_Sequence'] = chunk['UniProt_Sequence'].str.split(';')
    chunk = chunk.explode('UniProt_Sequence')

    # Keep only the rows in which at least one peptide was found.
    chunk['results'] = chunk.apply(process_row, axis=1)
    chunk = chunk[chunk['results'].notna()]

    # The first chunk truncates the file and writes the header; later chunks
    # append. Appending unconditionally would stack this run's output (and a
    # second header line) on top of a results file left by a previous run.
    first_chunk = i == 0
    chunk.to_csv(
        output_file,
        mode='w' if first_chunk else 'a',
        index=False,
        header=first_chunk,
    )

    print(f"Processed rows {i * chunk_size}-{(i + 1) * chunk_size}")


## Extract stripped peptide sequences and prepare final dataset

In [None]:
# Load intermediate results
df = pd.read_csv(output_file)

# Split every ', '-joined results cell into individual context strings
# (one per match), trim whitespace, and drop blanks and repeats.
contexts = (
    df['results']
    .astype(str)
    .str.replace('"', '', regex=False)
    .str.split(',')
    .explode()
    .str.strip()
)
contexts = contexts[contexts != ''].drop_duplicates().reset_index(drop=True)

# The stripped peptide is the run of capital letters between the two
# separator dots of 'prev.PEPTIDE.next'.
stripped = contexts.str.extract(r'\.([A-Z]+)\.', expand=False)

df = pd.DataFrame({
    'Stripped.Sequence': stripped,
    'Stripped.Sequence_Q': contexts,
})

# Entries that do not parse (e.g. a stray header line in the intermediate
# file) get a NaN key. pandas matches NaN keys to each other when merging, so
# drop them rather than let them attach to peptides with a missing sequence.
df = df.dropna(subset=['Stripped.Sequence']).reset_index(drop=True)


## Merge enriched data with original peptide dataset and save final output

In [None]:
# Attach the flanked context (Stripped.Sequence_Q) to every row of the complete
# peptide table; peptides without a match keep NaN in that column.
# NOTE(review): the join key is Stripped.Sequence alone, so a peptide that has
# more than one distinct context in `df` will yield one output row per
# context — confirm that this fan-out is intended.
df_final = peptides_df_complete.merge(df, how='left', on='Stripped.Sequence')

# Write the enriched table as TSV and report where it went.
df_final.to_csv(final_output_file, sep='\t', index=False)
print(f"Final table saved to '{final_output_file}'.")
