In [2]:
import collections

# Distinct integer amino-acid masses, in ascending order; these are the
# building blocks used when candidate peptides are extended by one residue.
INTEGER_MASSES = sorted({
    57, 71, 87, 97, 99, 101, 103, 113, 114,
    115, 128, 129, 131, 137, 147, 156, 163, 186,
})

def get_cyclospectrum(peptide_masses): # Same as in BA4E
    """Return the theoretical cyclospectrum of a cyclic peptide.

    The spectrum holds mass 0, the mass of every contiguous subpeptide of
    length 1..n-1 read around the circle (one entry per start position),
    and the mass of the whole peptide.

    Args:
        peptide_masses: Indexable sequence (list or tuple) of residue masses.

    Returns:
        collections.Counter mapping each mass to its multiplicity in the
        spectrum. An empty peptide yields ``Counter({0: 1})``.
    """
    spectrum = collections.Counter({0: 1})
    if not peptide_masses:
        return spectrum  # Spectrum of the empty peptide is just {0: 1}

    n = len(peptide_masses)

    # window_sums[start] is the mass of the subpeptide that begins at `start`
    # and has the current length. Growing each window by one residue per
    # outer iteration avoids re-summing a slice for every (start, length)
    # pair, which keeps the whole computation at O(n^2).
    window_sums = [0] * n
    for length in range(1, n):      # Subpeptide lengths 1 .. n-1
        for start in range(n):      # Start position in the original peptide
            # Modulo wraps the index around to mimic the circular peptide.
            window_sums[start] += peptide_masses[(start + length - 1) % n]
            spectrum[window_sums[start]] += 1

    # The full peptide is counted once, not once per rotation.
    spectrum[sum(peptide_masses)] += 1
    return spectrum

# From BA4E
def expand_peptide_candidates(current_peptides_set, amino_acid_masses_list):
    """Extend every candidate peptide by one residue in every possible way.

    Args:
        current_peptides_set: Iterable of peptides, each a tuple of masses.
        amino_acid_masses_list: Masses that may be appended to a peptide.

    Returns:
        A set with one tuple per (peptide, mass) pair: the peptide with that
        mass appended at the end.
    """
    return {
        peptide + (mass,)
        for peptide in current_peptides_set
        for mass in amino_acid_masses_list
    }

def calculate_score_for_peptide(peptide_mass_tuple, experimental_spectrum_counter):
    """Score a peptide against an experimental spectrum.

    The score is the number of masses shared between the peptide's
    theoretical cyclospectrum and the experimental spectrum, counted with
    multiplicity: each mass contributes the smaller of its two counts.

    Args:
        peptide_mass_tuple: Peptide as a tuple of integer residue masses
            (no amino-acid letters are involved, only masses).
        experimental_spectrum_counter: Mapping from mass to its number of
            occurrences in the experimental spectrum.

    Returns:
        The integer score.
    """
    theoretical = get_cyclospectrum(list(peptide_mass_tuple))
    # Masses absent from the experimental spectrum count as zero occurrences.
    return sum(
        min(count, experimental_spectrum_counter.get(mass, 0))
        for mass, count in theoretical.items()
    )

def trim_leaderboard_cut(candidate_peptides_list, experimental_spectrum_counter, N_cutoff):
    """Keep the N best-scoring candidate peptides, including ties.

    Args:
        candidate_peptides_list: Iterable of peptides (tuples of masses).
        experimental_spectrum_counter: Mapping from mass to its multiplicity
            in the experimental spectrum; used to score each candidate.
        N_cutoff: Leaderboard size. A value of zero or less yields an empty
            leaderboard.

    Returns:
        A set of peptides. When there are more than ``N_cutoff`` candidates,
        every peptide whose score is at least the score of the N-th best one
        is kept, so the result may hold more than ``N_cutoff`` peptides.
    """
    # A non-positive cutoff keeps nothing. Guarding here also stops a negative
    # value from being used as an index into the ranked scores below.
    if N_cutoff <= 0:
        return set()

    # Score every candidate exactly once.
    scored = [
        (calculate_score_for_peptide(peptide, experimental_spectrum_counter), peptide)
        for peptide in candidate_peptides_list
    ]

    # Nothing to trim when the board is not over capacity.
    if len(scored) <= N_cutoff:
        return {peptide for _, peptide in scored}

    # Score of the N-th best candidate; everything tied with it survives.
    ranked_scores = sorted((score for score, _ in scored), reverse=True)
    cutoff_score = ranked_scores[N_cutoff - 1]
    return {peptide for score, peptide in scored if score >= cutoff_score}

def leaderboard_cyclopeptide_sequencing(experimental_spectrum_input_list, N_leaderboard_size):
    """Find a high-scoring cyclic peptide for an experimental spectrum.

    Starting from the empty peptide, the leaderboard is repeatedly expanded
    by one residue from ``INTEGER_MASSES``. Peptides heavier than the parent
    mass (the largest mass in the spectrum) are dropped, peptides that hit
    the parent mass exactly compete for the leader position, and the
    remaining candidates are trimmed to the best ``N_leaderboard_size``
    (with ties) before the next round.

    Args:
        experimental_spectrum_input_list: Iterable of integer masses.
        N_leaderboard_size: Number of top candidates kept after each round.

    Returns:
        The leader peptide as its masses joined by ``-``. The empty string is
        returned when no peptide beats the empty peptide's score, and also
        when the spectrum itself is empty.
    """
    experimental_spectrum_as_counter = collections.Counter(experimental_spectrum_input_list)

    # An empty spectrum has no parent mass; the only consistent answer is the
    # empty peptide, so return it instead of letting max() raise.
    if not experimental_spectrum_as_counter:
        return ""

    parent_mass = max(experimental_spectrum_as_counter)

    # The leader starts as the empty peptide; a candidate must score strictly
    # higher to replace the current leader.
    leader_peptide = ()
    leader_score = calculate_score_for_peptide(leader_peptide, experimental_spectrum_as_counter)

    leaderboard = {()}
    while leaderboard:
        survivors = []
        for peptide in expand_peptide_candidates(leaderboard, INTEGER_MASSES):
            peptide_mass = sum(peptide)
            if peptide_mass > parent_mass:
                continue  # Too heavy: can never reach the parent mass again

            if peptide_mass == parent_mass:
                score = calculate_score_for_peptide(peptide, experimental_spectrum_as_counter)
                if score > leader_score:
                    leader_peptide, leader_score = peptide, score

            survivors.append(peptide)

        if not survivors:
            break  # Every expansion overshot the parent mass

        leaderboard = trim_leaderboard_cut(
            survivors,
            experimental_spectrum_as_counter,
            N_leaderboard_size,
        )

    return "-".join(map(str, leader_peptide))


if __name__ == "__main__":
    # Input file layout as read below: the first line holds the leaderboard
    # size N, the second line holds the whitespace-separated spectrum masses.
    file_path = "../data/rosalind_ba4g.txt"

    N_input, spectrum_input_str = 0, ""
    with open(file_path, "r") as f:
        N_input = int(f.readline().strip())
        spectrum_input_str = f.readline().strip()

    spectrum_list = [int(token) for token in spectrum_input_str.split()]
    result_peptide_str = leaderboard_cyclopeptide_sequencing(spectrum_list, N_input)

    # Print the leader peptide as dash-separated masses.
    print(result_peptide_str)

97-115-101-137-114-128-103-131-156-115-113-113-128
