# Translate an RNA String into an Amino Acid String

In [0]:
from collections import Counter
from collections import defaultdict

In [0]:
# Maps each of the 64 three-letter RNA codons (alphabet A, C, G, U) to a
# one-letter amino-acid code; UAA, UAG and UGA map to the marker 'Stop'.
CODON_TABLE = {
    'UUU': 'F',     'CUU': 'L',     'AUU': 'I',     'GUU': 'V',
    'UUC': 'F',     'CUC': 'L',     'AUC': 'I',     'GUC': 'V',
    'UUA': 'L',     'CUA': 'L',     'AUA': 'I',     'GUA': 'V',
    'UUG': 'L',     'CUG': 'L',     'AUG': 'M',     'GUG': 'V',
    'UCU': 'S',     'CCU': 'P',     'ACU': 'T',     'GCU': 'A',
    'UCC': 'S',     'CCC': 'P',     'ACC': 'T',     'GCC': 'A',
    'UCA': 'S',     'CCA': 'P',     'ACA': 'T',     'GCA': 'A',
    'UCG': 'S',     'CCG': 'P',     'ACG': 'T',     'GCG': 'A',
    'UAU': 'Y',     'CAU': 'H',     'AAU': 'N',     'GAU': 'D',
    'UAC': 'Y',     'CAC': 'H',     'AAC': 'N',     'GAC': 'D',
    'UAA': 'Stop',  'CAA': 'Q',     'AAA': 'K',     'GAA': 'E',
    'UAG': 'Stop',  'CAG': 'Q',     'AAG': 'K',     'GAG': 'E',
    'UGU': 'C',     'CGU': 'R',     'AGU': 'S',     'GGU': 'G',
    'UGC': 'C',     'CGC': 'R',     'AGC': 'S',     'GGC': 'G',
    'UGA': 'Stop',  'CGA': 'R',     'AGA': 'R',     'GGA': 'G',
    'UGG': 'W',     'CGG': 'R',     'AGG': 'R',     'GGG': 'G'
}

In [0]:
def rna_to_peptide(rna):
  """Translate an RNA string into a string of one-letter amino-acid codes.

  Codons are read from position 0 in steps of three and looked up in
  CODON_TABLE. Translation ends at the first 'Stop' codon or at the end of
  the string, whichever comes first. A trailing remainder of one or two
  bases is ignored because it does not form a codon.

  Args:
    rna: RNA string over the alphabet used by CODON_TABLE.

  Returns:
    The translated peptide (without any stop marker).

  Raises:
    KeyError: if a complete codon is not a key of CODON_TABLE.
  """
  residues = []
  # Stop before an incomplete trailing codon instead of failing on it.
  usable = len(rna) - len(rna) % 3
  for idx in range(0, usable, 3):
    residue = CODON_TABLE[rna[idx:idx+3]]
    if residue == 'Stop':
      break
    residues.append(residue)
  return ''.join(residues)

In [0]:
# Example: translate a sample RNA string and print the resulting peptide.
print(rna_to_peptide('AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'))

MAMAPRTEINSTRING


# Find Substrings of a Genome Encoding a Given Amino Acid String

In [0]:
def dna_to_rna(genome):
    """Return a copy of ``genome`` in which every 'T' is written as 'U'."""
    pieces = genome.split('T')
    return 'U'.join(pieces)

In [0]:
def rna_to_dna(genome):
    """Return a copy of ``genome`` in which every 'U' is written as 'T'."""
    pieces = genome.split('U')
    return 'T'.join(pieces)

In [0]:
def rev_complement(genome):
    """Return the reverse complement of an RNA string.

    Each base is swapped with its partner (A<->U, C<->G) and the result is
    reversed. A character other than A, U, C or G raises ``KeyError``.
    """
    pairing = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'}
    complement = ''.join(pairing[base] for base in genome)
    return complement[::-1]

In [0]:
def get_substring(genome, peptide):
  """Collect the substrings of ``genome`` that encode ``peptide``.

  Every window of ``3 * len(peptide)`` characters is transcribed with
  dna_to_rna and translated with rna_to_peptide, both as read and as its
  reverse complement. A window is reported when either translation equals
  ``peptide``.

  Args:
    genome: DNA string to scan.
    peptide: target string of one-letter amino-acid codes.

  Returns:
    The matching substrings of ``genome`` in order of their start position.
  """
  window = len(peptide) * 3
  transcript = dna_to_rna(genome)
  matches = []
  for start in range(len(transcript) - window + 1):
    fragment = transcript[start:start + window]
    # The reverse strand is only translated when the forward strand fails.
    if (rna_to_peptide(fragment) != peptide
        and rna_to_peptide(rev_complement(fragment)) != peptide):
      continue
    matches.append(genome[start:start + window])
  return matches

In [0]:
# Example: print, one per line, every substring of the sample genome that
# get_substring reports for the peptide 'MA'.
ans = get_substring('ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA','MA')
for read in ans:
  print(read)

ATGGCC
GGCCAT
ATGGCC


# Generate the Theoretical Spectrum of a Cyclic Peptide

In [0]:
# Integer mass for each of the 20 one-letter amino-acid codes.
# Two pairs share a mass: I/L (113) and K/Q (128).
amino_acid = {
    'G':57,
    'A':71,
    'S':87,
    'P':97,
    'V':99,
    'T':101,
    'C':103,
    'I':113,
    'L':113,
    'N':114,
    'D':115,
    'K':128,
    'Q':128,
    'E':129,
    'M':131,
    'H':137,
    'F':147,
    'R':156,
    'Y':163,
    'W':186
}

In [0]:
def generate_mass(mass, ans, k):
  """Append to ``ans`` the sum of every cyclic window of ``k`` masses.

  The first ``k - 1`` entries of ``mass`` are repeated at the end so that
  windows wrapping around the end of the list are included.

  Args:
    mass: list of residue masses.
    ans: list that receives the window sums (mutated in place).
    k: window length.
  """
  wrapped = mass + mass[0:(k-1)]
  window_count = len(wrapped) - k + 1
  ans.extend(sum(wrapped[start:start+k]) for start in range(window_count))

In [0]:
def theoretical_spectrum(peptide):
  """Return the sorted theoretical spectrum of a cyclic peptide.

  The spectrum holds 0, the total mass, and the mass of every cyclic
  subpeptide of length 1 to ``len(peptide) - 1``.

  Args:
    peptide: string of one-letter amino-acid codes (keys of amino_acid).

  Returns:
    Sorted list of integer masses; ``[0]`` for an empty peptide.

  Raises:
    KeyError: if a character of ``peptide`` is not a key of amino_acid.
  """
  mass = [amino_acid[ch] for ch in peptide]
  if not mass:
    # An empty peptide has only the zero mass; do not add the total as well.
    return [0]
  ans = [0, sum(mass)]
  for length in range(1, len(mass)):
    generate_mass(mass, ans, length)
  return sorted(ans)

In [0]:
# Example: print the space-separated theoretical spectrum of a sample peptide.
print(*theoretical_spectrum('GNCANADVSRVSH'))

0 57 71 71 87 87 99 99 103 114 114 115 137 156 171 174 185 185 186 186 186 194 214 217 224 243 255 256 274 281 285 288 288 300 301 308 323 342 342 342 345 359 371 372 380 395 399 402 411 429 441 457 459 470 473 474 479 482 486 494 498 528 528 530 536 556 557 566 569 573 588 596 597 623 627 642 643 645 650 660 665 667 668 683 687 713 714 722 737 741 744 753 754 774 780 782 782 812 816 824 828 831 836 837 840 851 853 869 881 899 908 911 915 930 938 939 951 965 968 968 968 987 1002 1009 1010 1022 1022 1025 1029 1036 1054 1055 1067 1086 1093 1096 1116 1124 1124 1124 1125 1125 1136 1139 1154 1173 1195 1196 1196 1207 1211 1211 1223 1223 1239 1239 1253 1310


# Compute the Number of Peptides of Given Total Mass

In [0]:
# The 18 distinct integer masses that occur as values of amino_acid.
unique_mass = [57,71,87,97,99,101,103,113,114,115,128,129,131,137,147,156,163,186]

In [0]:
def no_of_peptides(current_sum, mass, dp):
  """Count the ordered sequences of unique_mass entries that reach ``mass``.

  Args:
    current_sum: mass accumulated so far (0 for the initial call).
    mass: target total mass.
    dp: memo dictionary mapping an accumulated mass to its count; it is
      filled in as the recursion proceeds.

  Returns:
    The number of ways to extend ``current_sum`` to exactly ``mass``.
  """
  if current_sum >= mass:
    # Exactly on target counts once; overshooting counts nothing.
    return 1 if current_sum == mass else 0
  if current_sum not in dp:
    ways = 0
    for step in unique_mass:
      ways += no_of_peptides(current_sum + step, mass, dp)
    dp[current_sum] = ways
    return ways
  return dp[current_sum]

In [0]:
# Example: count the peptides of total mass 1024, starting from an
# accumulated mass of 0 and an empty memo table.
no_of_peptides(0, 1024, dict())

14712706211

# Find a Cyclic Peptide with Theoretical Spectrum Matching an Ideal Spectrum 

In [0]:
def generate_circular_spectrum(mass):
  """Return the sorted cyclic spectrum of a peptide given as a mass list.

  The spectrum holds 0, the total mass, and the sum of every cyclic window
  of length 1 to ``len(mass) - 1``.

  Args:
    mass: list of residue masses.

  Returns:
    Sorted list of masses; ``[0]`` for an empty list.
  """
  if not mass:
    # With no residues the total is 0 too; report the zero mass only once.
    return [0]
  ans = [0, sum(mass)]
  for length in range(1, len(mass)):
    generate_mass(mass, ans, length)
  return sorted(ans)

In [0]:
def expand(unit, peptides):
  """Replace each peptide in ``peptides`` by all its one-residue extensions.

  The list is updated in place. Extensions keep the order of the original
  peptides and, within one peptide, the order of ``unit``.

  Args:
    unit: iterable of masses that may be appended.
    peptides: list of peptides, each a list of masses (mutated in place).
  """
  # A single slice assignment keeps the caller's list object and avoids
  # the repeated front-of-list pops.
  peptides[:] = [peptide + [m] for peptide in peptides for m in unit]

In [0]:
def is_consistent(peptide, experimental):
  """Tell whether a peptide's linear spectrum fits inside ``experimental``.

  The linear spectrum is 0 plus the sum of every contiguous run of
  ``peptide``. Masses are matched with multiplicity: a mass that occurs
  twice in the linear spectrum must occur at least twice in
  ``experimental``.

  Args:
    peptide: list of residue masses.
    experimental: list of masses of the experimental spectrum.

  Returns:
    True when every linear-spectrum mass is available, otherwise False.
  """
  size = len(peptide)
  fragments = [0] + [
      sum(peptide[start:start + width])
      for width in range(1, size + 1)
      for start in range(size - width + 1)
  ]
  # Counter subtraction keeps only the masses that are in short supply.
  shortfall = Counter(fragments) - Counter(experimental)
  return not shortfall

In [0]:
def cyclopeptide_sequencing(spectrum):
  """Find the cyclic peptides whose theoretical spectrum equals ``spectrum``.

  Candidates are grown one residue at a time from the spectrum masses that
  are also amino-acid masses. A candidate whose mass equals the largest
  spectrum mass is reported if its cyclic spectrum matches and is then
  dropped. Any other candidate is kept only while its linear spectrum is
  consistent with ``spectrum``. The search runs until no candidate is left,
  so peptides of any length can be found.

  Args:
    spectrum: list of integer masses; it does not need to be sorted.

  Returns:
    List of strings, one per matching peptide, each a '-'-joined sequence
    of residue masses. Empty when ``spectrum`` is empty or nothing matches.
  """
  if not spectrum:
    return []
  # Sort once so the comparison with the sorted cyclic spectrum is fair.
  target = sorted(spectrum)
  parent_mass = target[-1]
  known_masses = set(amino_acid.values())
  unit = [mass for mass in set(spectrum) if mass in known_masses]
  peptides = [[]]
  ans = []
  while peptides:
    expand(unit, peptides)
    survivors = []
    for peptide in peptides:
      if sum(peptide) == parent_mass:
        # Full-mass candidates are final: report a match, never extend.
        if generate_circular_spectrum(peptide) == target:
          ans.append(peptide)
      elif is_consistent(peptide, spectrum):
        survivors.append(peptide)
    # A candidate heavier than the parent mass is never consistent, so the
    # board eventually empties and the loop terminates.
    peptides = survivors
  return ['-'.join([str(p) for p in item]) for item in ans]

In [0]:
# Example: sequence a sample spectrum and print the reported peptides
# separated by spaces.
ans = cyclopeptide_sequencing([0, 113 , 128 , 186 , 241 , 299 , 314 , 427])
print(*ans)

128-113-186 128-186-113 113-128-186 113-186-128 186-128-113 186-113-128


# Compute the Score of a Cyclic Peptide Against a Spectrum

In [0]:
def compute_score(peptide, spectrum):
  """Score a cyclic peptide against an experimental spectrum.

  The score is the number of masses shared, with multiplicity, between
  theoretical_spectrum(peptide) and ``spectrum``.

  Args:
    peptide: string of one-letter amino-acid codes.
    spectrum: list of masses of the experimental spectrum.

  Returns:
    The number of shared masses.
  """
  expected = Counter(theoretical_spectrum(peptide))
  observed = Counter(spectrum)
  # Multiset intersection keeps the smaller count of each shared mass.
  shared = expected & observed
  return sum(shared.values())

In [0]:
# Example: score the cyclic peptide 'NQEL' against a sample spectrum.
compute_score('NQEL', [0, 99 ,113 ,114 ,128 ,227 ,257 ,299 ,355 ,356 ,370 ,371 ,484])

11

# Implement LeaderboardCyclopeptideSequencing

In [0]:
#it will be called from __main__
def leaderboard_seq(N, spectrum):
    """Return the best-scoring cyclic peptide for an experimental spectrum.

    Candidate peptides (lists of integer masses) are grown one residue at a
    time. After each round, candidates heavier than the parent mass are
    dropped and the board is cut back to the ``N`` best linear scores, ties
    included. Every candidate whose mass equals the parent mass is scored
    as a cyclic peptide; the first one to reach the highest score wins.

    Args:
        N: number of top-scoring candidates (plus ties) kept per round;
            expected to be at least 1.
        spectrum: experimental spectrum as a list of integer masses; its
            largest entry is used as the parent mass.

    Returns:
        The leader peptide as a list of masses, or ``[]`` when ``spectrum``
        is empty or no parent-mass candidate scores above zero.
    """
    # All 18 distinct amino-acid masses; 57 (glycine) must be part of the
    # alphabet or peptides containing it can never be built.
    acids = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131,
             137, 147, 156, 163, 186]
    if not spectrum:
        return []
    parent_mass = max(spectrum)
    observed = Counter(spectrum)

    def shared_masses(theoretical):
        # Number of masses common to both spectra, counted with multiplicity.
        return sum((Counter(theoretical) & observed).values())

    def cyclo_spectrum(peptide):
        # 0, the total mass, and every wrap-around window of length 1..n-1.
        n = len(peptide)
        doubled = peptide + peptide
        masses = [0, sum(peptide)]
        for start in range(n):
            for length in range(1, n):
                masses.append(sum(doubled[start:start + length]))
        return masses

    def linear_spectrum(peptide):
        # 0 plus the sum of every contiguous run of the peptide.
        n = len(peptide)
        masses = [0]
        for length in range(1, n + 1):
            for start in range(n - length + 1):
                masses.append(sum(peptide[start:start + length]))
        return masses

    def score(peptide):
        # Cyclic score used to pick the leader; the empty peptide scores 0.
        if not peptide:
            return 0
        return shared_masses(cyclo_spectrum(peptide))

    def trim(leader_board):
        # Keep the N best linear scores and everything tied with the N-th.
        ranked = sorted(
            ((shared_masses(linear_spectrum(p)), p) for p in leader_board),
            key=lambda pair: pair[0],
            reverse=True,
        )
        if len(ranked) > N:
            cutoff = ranked[N - 1][0]
            ranked = [pair for pair in ranked if pair[0] >= cutoff]
        return [p for _, p in ranked]

    leader_board = [[]]
    leader_peptide = []
    leader_score = 0
    while leader_board:
        leader_board = [p + [m] for p in leader_board for m in acids]
        survivors = []
        for peptide in leader_board:
            mass = sum(peptide)
            if mass > parent_mass:
                continue
            if mass == parent_mass:
                candidate_score = score(peptide)
                # Strict comparison: the first peptide at a score stays leader.
                if candidate_score > leader_score:
                    leader_peptide = list(peptide)
                    leader_score = candidate_score
            survivors.append(peptide)
        leader_board = trim(survivors)
    return leader_peptide

In [0]:
# Example: run leaderboard_seq with N=10 on a sample spectrum and print the
# returned masses joined by '-'.
peptide = leaderboard_seq(10,[0 ,71 ,113 ,129 ,147 ,200 ,218 ,260 ,313 ,331 ,347 ,389 ,460])
print('-'.join([str(acid) for acid in peptide]))
    

71-147-113-129


# Implement ConvolutionCyclopeptideSequencing

In [0]:
def convolution_seq(M, N, spectrum):
    """Sequence a cyclic peptide using an alphabet taken from the spectrum.

    The residue alphabet is the ``M`` most frequent pairwise differences of
    ``spectrum`` (ties included) that lie between 57 and 200. Candidate
    peptides are then grown one residue at a time. After each round,
    candidates heavier than the parent mass are dropped and the board is
    cut back to the ``N`` best linear scores, ties included. Every
    candidate whose mass equals the parent mass is scored as a cyclic
    peptide; the first one to reach the highest score wins.

    Args:
        M: number of most frequent convolution masses (plus ties) to use
            as the alphabet; expected to be at least 1.
        N: number of top-scoring candidates (plus ties) kept per round;
            expected to be at least 1.
        spectrum: experimental spectrum as a list of integer masses; its
            largest entry is used as the parent mass.

    Returns:
        The leader peptide as a list of masses, or ``[]`` when ``spectrum``
        is empty or no parent-mass candidate scores above zero.
    """
    if not spectrum:
        return []
    parent_mass = max(spectrum)
    observed = Counter(spectrum)

    def shared_masses(theoretical):
        # Number of masses common to both spectra, counted with multiplicity.
        return sum((Counter(theoretical) & observed).values())

    def cyclo_spectrum(peptide):
        # 0, the total mass, and every wrap-around window of length 1..n-1.
        n = len(peptide)
        doubled = peptide + peptide
        masses = [0, sum(peptide)]
        for start in range(n):
            for length in range(1, n):
                masses.append(sum(doubled[start:start + length]))
        return masses

    def linear_spectrum(peptide):
        # 0 plus the sum of every contiguous run of the peptide.
        n = len(peptide)
        masses = [0]
        for length in range(1, n + 1):
            for start in range(n - length + 1):
                masses.append(sum(peptide[start:start + length]))
        return masses

    def score(peptide):
        # Cyclic score used to pick the leader; the empty peptide scores 0.
        if not peptide:
            return 0
        return shared_masses(cyclo_spectrum(peptide))

    def trim(leader_board):
        # Keep the N best linear scores and everything tied with the N-th.
        ranked = sorted(
            ((shared_masses(linear_spectrum(p)), p) for p in leader_board),
            key=lambda pair: pair[0],
            reverse=True,
        )
        if len(ranked) > N:
            cutoff = ranked[N - 1][0]
            ranked = [pair for pair in ranked if pair[0] >= cutoff]
        return [p for _, p in ranked]

    def convolution():
        # Count the pairwise differences in [57, 200]; keep the M most
        # frequent ones and everything tied with the M-th.
        counts = Counter()
        for heavier in spectrum:
            for lighter in spectrum:
                diff = heavier - lighter
                if 57 <= diff <= 200:
                    counts[diff] += 1
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        if len(ranked) > M:
            cutoff = ranked[M - 1][1]
            ranked = [item for item in ranked if item[1] >= cutoff]
        return [mass for mass, _ in ranked]

    acids = convolution()
    leader_board = [[]]
    leader_peptide = []
    leader_score = 0
    while leader_board:
        leader_board = [p + [m] for p in leader_board for m in acids]
        survivors = []
        for peptide in leader_board:
            mass = sum(peptide)
            if mass > parent_mass:
                continue
            if mass == parent_mass:
                candidate_score = score(peptide)
                # Strict comparison: the first peptide at a score stays leader.
                if candidate_score > leader_score:
                    leader_peptide = list(peptide)
                    leader_score = candidate_score
            survivors.append(peptide)
        leader_board = trim(survivors)
    return leader_peptide

In [0]:
# Example: run convolution_seq with M=20 and N=60 on a sample spectrum and
# print the returned masses joined by '-'.
peptide = convolution_seq(20,60,[57 ,57 ,71 ,99 ,129 ,137 ,170 ,186 ,194 ,208 ,228 ,265 ,285 ,299 ,307 ,323 ,356 ,364 ,394 ,422 ,493])
print('-'.join([str(m) for m in peptide]))

99-71-137-57-58-71


# Generate the Convolution of a Spectrum

In [0]:
def convolution_spectrum(spectrum):
  """Return the positive pairwise differences of ``spectrum``.

  Each difference appears as many times as it occurs. Differences are
  ordered by decreasing multiplicity and, among equal multiplicities, by
  increasing value.

  Args:
    spectrum: list of masses.

  Returns:
    Flat list of differences in the order described above.
  """
  tally = Counter(
      a - b for a in spectrum for b in spectrum if (a - b) > 0
  )
  ordered = sorted(tally.items(), key=lambda item: (-item[1], item[0]))
  flattened = []
  for difference, times in ordered:
    flattened += [difference] * times
  return flattened

In [0]:
# Example: print the space-separated convolution of a four-mass spectrum.
print(*convolution_spectrum([0,137,186,323]))

137 137 186 186 49 323


# Generate the Theoretical Spectrum of a Linear Peptide

In [0]:
def linear_spectrum(peptide):
  """Return the sorted theoretical spectrum of a linear peptide.

  The spectrum holds 0 and the mass of every contiguous subpeptide,
  including the whole peptide.

  Args:
    peptide: string of one-letter amino-acid codes (keys of amino_acid).

  Returns:
    Sorted list of integer masses; ``[0]`` for an empty peptide.
  """
  masses = [amino_acid[ch] for ch in peptide]
  # running[i] is the total mass of the first i residues, so any contiguous
  # subpeptide mass is a difference of two entries.
  running = [0]
  for m in masses:
    running.append(running[-1] + m)
  count = len(masses)
  spectrum = [0]
  for begin in range(count):
    for end in range(begin + 1, count + 1):
      spectrum.append(running[end] - running[begin])
  spectrum.sort()
  return spectrum

In [0]:
# Example: print the space-separated linear spectrum of the peptide 'NQEL'.
print(*linear_spectrum('NQEL'))

0 113 114 128 129 242 242 257 370 371 484


# Compute the Score of a Linear Peptide

In [0]:
def linear_spectrum_score(peptide, experiment):
  """Score a linear peptide against an experimental spectrum.

  The score is the number of masses shared, with multiplicity, between
  linear_spectrum(peptide) and ``experiment``.

  Args:
    peptide: string of one-letter amino-acid codes.
    experiment: list of masses of the experimental spectrum.

  Returns:
    The number of shared masses.
  """
  expected = Counter(linear_spectrum(peptide))
  observed = Counter(experiment)
  # Multiset intersection keeps the smaller count of each shared mass.
  return sum((expected & observed).values())

In [0]:
# Example: score the linear peptide 'NQEL' against a sample spectrum.
linear_spectrum_score('NQEL',[0, 99, 113, 114, 128, 227, 257, 299, 355, 356, 370, 371, 484])

8

# Trim a Peptide Leaderboard

In [0]:
def trim(peptides, spectrum, k):
  """Keep the ``k`` highest-scoring peptides of a leaderboard, ties included.

  Peptides are scored with linear_spectrum_score and ranked by decreasing
  score. The result is the top ``k`` plus every further peptide whose score
  equals that of the k-th. A peptide listed more than once in ``peptides``
  is ranked once.

  Args:
    peptides: iterable of peptide strings.
    spectrum: list of masses of the experimental spectrum.
    k: number of peptides to keep before ties are added.

  Returns:
    List of kept peptides in decreasing score order. The whole ranked
    leaderboard when it has at most ``k`` entries, and ``[]`` when ``k`` is
    not positive.
  """
  if k <= 0:
    return []
  scores = {}
  for peptide in peptides:
    scores[peptide] = linear_spectrum_score(peptide, spectrum)
  ranked = sorted(scores.items(), key=lambda item: -item[1])
  if k >= len(ranked):
    # Fewer candidates than requested: nothing to cut off.
    return [peptide for peptide, _ in ranked]
  cutoff = ranked[k-1][1]
  # The ranking is in decreasing order, so this keeps a prefix of it.
  return [peptide for peptide, score in ranked if score >= cutoff]


In [0]:
# Example: trim a four-peptide leaderboard to its top 2 (with ties) and
# print the kept peptides separated by spaces.
print(*trim(['LAST','ALST','TLLT','TQAS'], [0,71,87,101,113,158,184,188,259,271,372],2))

LAST ALST
