Reconstruct a string from a sequence of (k,d)-mers corresponding to a path in a paired de Bruijn graph.

Given: A sequence of (k, d)-mers $(a_1|b_1), \dots, (a_n|b_n)$ such that $\text{Suffix}(a_i|b_i) = \text{Prefix}(a_{i+1}|b_{i+1})$ for all $i$ from 1 to $n-1$.

Return: A string Text whose i-th (k, d)-mer is equal to $(a_i|b_i)$ for all $i$ from 1 to $n$, if such a string exists.

In [None]:
def PairedDeBruijnGraphNodesToPairedRead(first_node,second_node,k):
  """Rebuild the pair of k-mers for the edge joining two adjacent nodes.

  Each node is treated as two (k-1)-character halves stored back to back.
  A k-mer is formed by taking one half of first_node and appending the
  last character of the matching half of second_node.

  Args:
    first_node: node the edge leaves; sliced at index k-1.
    second_node: node the edge enters; only the characters at index k-2
      and at the last position are read.
    k: length of each k-mer to rebuild.

  Returns:
    A two-element list [first_kmer, second_kmer].
  """
  half = k - 1
  first_kmer = first_node[:half] + second_node[half - 1]
  second_kmer = first_node[half:] + second_node[-1]
  return [first_kmer, second_kmer]

num_of_read_pairs = |Text| - (2*k + d) + 1

|Text| = num_of_read_pairs + (2*k + d) - 1

In [None]:
def AssembleStringFromNodes(eulerian_path,k,d,num_of_read_pairs):
  """Spell a string from a path of paired de Bruijn graph nodes.

  Every pair of consecutive nodes is converted to a read pair with
  PairedDeBruijnGraphNodesToPairedRead. For the pair starting at path
  index i, the first k-mer is written at position i and the second at
  position i + k + d.

  Args:
    eulerian_path: sequence of nodes, in path order.
    k: length of each k-mer.
    d: gap between the two k-mers of a read pair.
    num_of_read_pairs: used only to size the output buffer, which has
      num_of_read_pairs + 2*k + d - 1 slots.

  Returns:
    The assembled string; slots that were never written remain spaces.
  """
  chars = [' '] * (num_of_read_pairs + 2*k + d - 1)
  for start in range(len(eulerian_path) - 1):
    first_kmer, second_kmer = PairedDeBruijnGraphNodesToPairedRead(
        eulerian_path[start], eulerian_path[start + 1], k)
    second_start = start + k + d
    chars[start:start + k] = list(first_kmer)
    chars[second_start:second_start + k] = list(second_kmer)
  return ''.join(chars)

In [98]:
def AssembleStringFromNodes(eulerian_path,k,d,num_of_read_pairs):
  """Spell a string from an ordered list of read pairs.

  The read pair at index i contributes its first k-mer at position i and
  its second k-mer at position i + k + d. Later pairs overwrite whatever
  earlier pairs wrote to the same positions; overlaps are not checked for
  consistency.

  Args:
    eulerian_path: sequence of read pairs, each indexable as
      (first_kmer, second_kmer).
    k: length of each k-mer.
    d: gap between the two k-mers of a read pair.
    num_of_read_pairs: used only to size the output buffer, which has
      num_of_read_pairs + 2*k + d - 1 slots.

  Returns:
    The assembled string; slots that were never written remain spaces.
  """
  chars = [' '] * (num_of_read_pairs + 2*k + d - 1)
  for offset, read_pair in enumerate(eulerian_path):
    second_start = offset + k + d
    chars[offset:offset + k] = read_pair[0]
    chars[second_start:second_start + k] = read_pair[1]
  return ''.join(chars)

In [75]:
def PatternPrefix(pattern):
  """Return pattern without its last element (empty input stays empty)."""
  return pattern[:-1]

In [76]:
def PatternSuffix(pattern):
  """Return pattern without its first element (empty input stays empty)."""
  return pattern[1:]

In [77]:
def ReadPairPrefix(read_pair):
  """Concatenate the prefixes of both k-mers in a read pair.

  read_pair is indexed at 0 and 1; each entry is shortened with
  PatternPrefix and the two results are joined with no separator.
  """
  first_prefix = PatternPrefix(read_pair[0])
  second_prefix = PatternPrefix(read_pair[1])
  return first_prefix + second_prefix

In [78]:
def ReadPairSuffix(read_pair):
  """Concatenate the suffixes of both k-mers in a read pair.

  read_pair is indexed at 0 and 1; each entry is shortened with
  PatternSuffix and the two results are joined with no separator.
  """
  first_suffix = PatternSuffix(read_pair[0])
  second_suffix = PatternSuffix(read_pair[1])
  return first_suffix + second_suffix

In [79]:
def FormatReadPairs(read_pairs):
  """Split 'a|b' read-pair strings into [a, b] lists.

  A new list is returned and the caller's list is left untouched, so the
  same raw input can be formatted (or assembled) more than once.

  Args:
    read_pairs: iterable of strings, each holding two k-mers separated
      by '|'.

  Returns:
    A new list with one str.split('|') result per input string.
  """
  return [read_pair.split('|') for read_pair in read_pairs]

In [111]:
def GappedGenomePathString(read_pairs,k,d):
  """Spell the string described by an ordered list of 'a|b' read pairs.

  The raw strings are parsed with FormatReadPairs and the parsed pairs
  are handed to AssembleStringFromNodes, using the number of pairs to
  size the result.

  Args:
    read_pairs: list of 'a|b' strings in path order.
    k: length of each k-mer.
    d: gap between the two k-mers of a read pair.

  Returns:
    The string produced by AssembleStringFromNodes.
  """
  parsed_pairs = FormatReadPairs(read_pairs)
  return AssembleStringFromNodes(parsed_pairs, k, d, len(parsed_pairs))

In [100]:
# Sample dataset: each entry is one read pair written as 'first_kmer|second_kmer',
# listed in path order.
read_pairs = [ #sequence of (k,d)-mers corresponding to a path in a paired de Bruijn graph
'GACC|GCGC',
'ACCG|CGCC',
'CCGA|GCCG',
'CGAG|CCGG',
'GAGC|CGGA']

In [101]:
# Length of each k-mer in the sample read pairs (e.g. 'GACC').
k = 4

In [102]:
# Gap between the two k-mers of a read pair: the second k-mer starts k + d
# positions after the first.
d = 2

In [103]:
# Run the assembly on the sample dataset; the recorded notebook output is GACCGAGCGCCGGA.
GappedGenomePathString(read_pairs,k,d)

GACCGAGCGCCGGA


In [112]:
# Load the dataset file as a list of lines with trailing whitespace
# (including the newline) stripped from each one.
with open('/content/rosalind_ba3l.txt') as task_file:
  task_arguments = list(map(str.rstrip, task_file))

In [113]:
# The header line is "k d". Take the first whitespace-separated field rather
# than a fixed two-character slice, so k with any number of digits parses.
k = int(task_arguments[0].split()[0])

In [114]:
# The header line is "k d". Take the second whitespace-separated field rather
# than slicing from column 3, which yields '' (and a ValueError) when k has a
# single digit, as in the header "4 2".
d = int(task_arguments[0].split()[1])

In [115]:
# Everything after the header line is one read pair per line.
read_pairs = task_arguments[1:]

In [116]:
# Write the assembled string through a context manager so the file handle is
# closed even if the assembly or the write raises.
with open("task_result.txt","w") as f:
  f.write(GappedGenomePathString(read_pairs,k,d))