In [None]:
def read_mgf_peaks(file_path):
    """Read peak m/z values from an MGF-style text file as rounded integers.

    Blank lines and lines starting with BEGIN, END, PEPMASS or CHARGE are
    skipped. For every other line the first whitespace-separated token is
    parsed as a float and rounded to the nearest integer; lines whose first
    token is not a finite number are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A sorted list of integer masses. The list always contains 0, which
        is seeded before any line is read.
    """
    spectrum = [0]  # start with 0
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith(('BEGIN', 'END', 'PEPMASS', 'CHARGE')):
                continue
            try:
                mz = float(line.split()[0])
                spectrum.append(round(mz))
            except (ValueError, OverflowError):
                # ValueError: token is not a number (or is NaN, which cannot
                # be rounded). OverflowError: token is +/-inf. Anything else
                # is a genuine error and must not be silenced.
                continue
    return sorted(spectrum)

In [None]:
#@title Amino acid mass mapping and prefix mass calculation

# Integer residue mass -> one-letter amino-acid code.
mass_to_aa = {
    57: 'G',
    71: 'A',
    87: 'S',
    97: 'P',
    99: 'V',
    101: 'T',
    103: 'C',
    113: 'I',
    114: 'N',
    115: 'D',
    128: 'K',
    129: 'E',
    131: 'M',
    137: 'H',
    147: 'F',
    156: 'R',
    163: 'Y',
    186: 'W',
}

# The residue masses used to grow candidate peptides, in ascending order.
amino_acid_masses = sorted(mass_to_aa)

def compute_prefix_mass(peptide):
    """Return the cumulative (prefix) masses of a peptide.

    Args:
        peptide: List of residue masses, e.g. [57, 71].

    Returns:
        A list one element longer than ``peptide``: entry 0 is 0 and entry k
        is the summed mass of the first k residues, e.g. [0, 57, 128].
    """
    running_totals = [0]
    for residue_mass in peptide:
        # Each entry extends the previous prefix by one more residue.
        running_totals.append(running_totals[-1] + residue_mass)
    return running_totals

In [None]:
#@title count and store the occurrences of masses

def count_masses(masses):
    """Count how many times each mass occurs.

    Args:
        masses: Iterable of (hashable) mass values.

    Returns:
        A dict mapping each distinct mass to its number of occurrences,
        with keys in first-seen order.
    """
    occurrences = {}
    for value in masses:
        occurrences[value] = occurrences.get(value, 0) + 1
    return occurrences

def mass(peptide):
    """Return the total mass of a peptide given as a list of residue masses."""
    total = sum(peptide)
    return total

def parent_mass(spectrum):
    """Return the largest mass in the spectrum.

    The largest peak is taken to be the mass of the whole peptide.
    """
    heaviest = max(spectrum)
    return heaviest


In [None]:
#@title calculate the mass of each possible fragment that could be generated from the peptide

# Linear spectrum
def linear_spectrum(peptide):
    """Return the sorted masses of every linear fragment of the peptide.

    The spectrum lists the masses of the fragments that would appear if the
    peptide broke at various points; each mass corresponds to one possible
    contiguous fragment. No wrap-around fragments are generated.

    Args:
        peptide: List of residue masses.

    Returns:
        A sorted list containing 0 plus one mass per contiguous sub-peptide.
    """
    cumulative = compute_prefix_mass(peptide)
    size = len(peptide)
    # A fragment spanning residues start..end-1 weighs the difference of
    # the two prefix masses that bracket it.
    fragment_masses = [0] + [
        cumulative[end] - cumulative[start]
        for start in range(size)
        for end in range(start + 1, size + 1)
    ]
    fragment_masses.sort()
    return fragment_masses

# Cyclic spectrum: includes wraparound subpeptides
def cyclic_spectrum(peptide):
    """Return the sorted masses of every fragment of a cyclic peptide.

    Args:
        peptide: List of residue masses, read around the ring.

    Returns:
        A sorted list containing 0, the mass of every contiguous fragment,
        and the mass of every fragment that wraps around the end of the list.
    """
    cumulative = compute_prefix_mass(peptide)
    whole = cumulative[-1]
    last = len(peptide)
    masses = [0]
    for start in range(last):
        for end in range(start + 1, last + 1):
            fragment = cumulative[end] - cumulative[start]
            masses.append(fragment)
            # A fragment touching neither end of the list has a complement
            # that wraps around; its mass is what is left of the whole ring.
            if start != 0 and end != last:
                masses.append(whole - fragment)
    masses.sort()
    return masses


In [None]:
#@title checking consistency

# Check consistency using manual counts
def is_consistent(peptide, spectrum):
    """Tell whether a peptide's linear fragments fit inside the spectrum.

    Args:
        peptide: Candidate peptide as a list of residue masses.
        spectrum: Experimental spectrum as a list of masses.

    Returns:
        True when no fragment mass of the peptide's linear spectrum occurs
        more often than it does in ``spectrum``; False otherwise.
    """
    observed = count_masses(spectrum)
    predicted = count_masses(linear_spectrum(peptide))
    # A mass missing from the experimental spectrum counts as observed 0 times.
    return all(
        needed <= observed.get(fragment_mass, 0)
        for fragment_mass, needed in predicted.items()
    )

'''
The is_consistent() function checks whether a candidate peptide's linear fragment masses appear in the experimental spectrum without exceeding their observed counts.
 This ensures the peptide is still a valid candidate for further expansion in the sequencing algorithm.
'''

"\nThe is_consistent() function checks whether a candidate peptide's linear fragment masses appear in the experimental spectrum without exceeding their observed counts.\n This ensures the peptide is still a valid candidate for further expansion in the sequencing algorithm.\n"

In [None]:
#@title expanding peptides

def expand(peptides):
    """Extend every peptide by one residue in every possible way.

    Args:
        peptides: List of peptides, each a list of residue masses.

    Returns:
        A new list holding, for each input peptide in order, one copy per
        entry of ``amino_acid_masses`` with that mass appended. The input
        peptides are not modified.
    """
    return [
        candidate + [residue_mass]
        for candidate in peptides
        for residue_mass in amino_acid_masses
    ]

# Convert to amino acid letters
def decode_peptide(peptide):
    """Translate a list of residue masses into a one-letter peptide string.

    Masses that have no entry in ``mass_to_aa`` are rendered as '?'.
    """
    letters = []
    for residue_mass in peptide:
        letters.append(mass_to_aa.get(residue_mass, '?'))
    return ''.join(letters)

In [None]:
def score(peptide, spectrum, cyclic=True):
    """Score a peptide by the number of masses it shares with a spectrum.

    Args:
        peptide: Candidate peptide as a list of residue masses.
        spectrum: Experimental spectrum as a list of masses.
        cyclic: When True (the default) the peptide is scored against its
            cyclic theoretical spectrum; when False, against its linear
            theoretical spectrum.

    Returns:
        The size of the multiset intersection of the theoretical and
        experimental spectra: each shared mass contributes the smaller of
        its two occurrence counts.
    """
    # Honor the `cyclic` flag, which was previously accepted but ignored.
    theo = cyclic_spectrum(peptide) if cyclic else linear_spectrum(peptide)
    theo_counts = count_masses(theo)
    spec_counts = count_masses(spectrum)

    return sum(
        min(count, spec_counts[m])
        for m, count in theo_counts.items()
        if m in spec_counts
    )

def trim(leaderboard, spectrum, N):
    """Keep only the best-scoring peptides of the leaderboard.

    Args:
        leaderboard: List of candidate peptides (lists of residue masses).
        spectrum: Experimental spectrum the candidates are scored against.
        N: Number of top places to keep; the selection itself is done by
            ``get_top_peptides``.

    Returns:
        The list of peptides selected by ``get_top_peptides`` from the
        scored leaderboard.
    """
    scored = [(peptide, score(peptide, spectrum)) for peptide in leaderboard]
    # The scored list used to be built and then discarded, so the function
    # returned None; hand it to the selection step and return its result.
    return get_top_peptides(scored, N)

def get_top_peptides(scored, N):
    """Select the top-N peptides by score, keeping ties with the N-th place.

    Args:
        scored: List of ``(peptide, score)`` pairs. It is not modified.
        N: Number of top places to keep.

    Returns:
        The peptides ordered by descending score (equal scores keep their
        input order). If there are more than N candidates, every peptide
        whose score is at least the N-th best score is kept, so the result
        can be longer than N. Returns an empty list when N <= 0.
    """
    if N <= 0:
        # Without this guard, N == 0 would index the last (lowest) score as
        # the threshold and keep every candidate.
        return []

    # Stable sort: ties stay in their original relative order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    if len(ranked) <= N:
        return [pair[0] for pair in ranked]

    threshold = ranked[N - 1][1]
    return [pair[0] for pair in ranked if pair[1] >= threshold]



In [None]:
#@title main BB algorithm

def leaderboard_cyclopeptide_sequencing(spectrum, N):
    """Search for the peptide whose theoretical spectrum best matches `spectrum`.

    Starting from the empty peptide, each round extends every candidate by
    one residue, discards candidates heavier than the parent mass, compares
    candidates that hit the parent mass against the current leader, and
    passes the survivors through ``trim``. The search ends when no
    candidates remain.

    Args:
        spectrum: Experimental spectrum as a list of masses.
        N: Leaderboard size passed to ``trim``.

    Returns:
        The best-scoring peptide found whose mass equals the parent mass,
        as a list of residue masses. Returns an empty list if none beat the
        score of the empty peptide.
    """
    leaderboard = [[]]
    leader_peptide = []
    # Cache the leader's score instead of recomputing it on every comparison.
    leader_score = score(leader_peptide, spectrum)
    parent = parent_mass(spectrum)

    while leaderboard:
        # Collect survivors in one pass rather than calling list.remove()
        # on a list while looping over its copy (quadratic per round).
        survivors = []
        for peptide in expand(leaderboard):
            m = mass(peptide)
            if m > parent:
                continue  # too heavy: can never match the parent mass
            if m == parent:
                candidate_score = score(peptide, spectrum)
                # Strict '>' keeps the first peptide found among equal scores.
                if candidate_score > leader_score:
                    leader_peptide = peptide
                    leader_score = candidate_score
            survivors.append(peptide)
        leaderboard = trim(survivors, spectrum, N)

    return leader_peptide



# Load the experimental peaks (rounded to integer masses) from the spectrum file.
spectrum = read_mgf_peaks("CCMSLIB00000531485.txt")
# Run the leaderboard search with a leaderboard size of N=20.
result = leaderboard_cyclopeptide_sequencing(spectrum, N=20)

# Show the winner both as one-letter codes and as its raw residue masses.
print("Best-scoring peptide:", decode_peptide(result), "→", result)


Best-scoring peptide: PNNIMDG → [97, 114, 114, 113, 131, 115, 57]
