MaximalNonBranchingPaths(Graph)

        Paths ← empty list

        for each node v in Graph

            if v is not a 1-in-1-out node

                if out(v) > 0

                    for each outgoing edge (v, w) from v

                        NonBranchingPath ← the path consisting of the single edge (v, w)

                        while w is a 1-in-1-out node

                            extend NonBranchingPath by the outgoing edge (w, u) from w 

                            w ← u

                        add NonBranchingPath to the set Paths

        for each isolated cycle Cycle in Graph

            add Cycle to Paths
            
        return Paths

In [1]:
def NodeInDegree(graph_dict,node):
  """Return the in-degree of `node` in the adjacency-list graph `graph_dict`.

  Every occurrence of `node` in any adjacency list counts as one incoming
  edge, so parallel edges are counted with their multiplicity.
  """
  return sum(successors.count(node) for successors in graph_dict.values())

In [2]:
def NodeOutDegree(graph_dict,node):
  """Return the out-degree of `node` in the adjacency-list graph `graph_dict`.

  The out-degree is the length of the node's adjacency list; a node that is
  not a key of `graph_dict` has no outgoing edges and yields 0.
  """
  return len(graph_dict.get(node, ()))

A node v in a directed graph Graph is called a 1-in-1-out node if its indegree and outdegree are both equal to 1, i.e., in(v) = out(v) = 1.  We can rephrase the definition of a "maximal non-branching path" from the main text as a path whose internal nodes are 1-in-1-out nodes and whose initial and final nodes are not 1-in-1-out nodes.  Also, note that the definition from the main text does not handle the special case when Graph has a connected component that is an isolated cycle, in which all nodes are 1-in-1-out nodes.

In [56]:
def OneInOneOutNode(graph_dict,node):
  """Tell whether `node` is a 1-in-1-out node, i.e. in(node) = out(node) = 1."""
  has_single_predecessor = NodeInDegree(graph_dict,node) == 1
  return has_single_predecessor and NodeOutDegree(graph_dict,node) == 1

In [57]:
def NonBranchingPaths(graph_dict,de_bruijn_graph_nodes):
  """Collect non-branching paths (not necessarily maximal) of the graph.

  graph_dict            -- adjacency lists: node -> list of successor nodes
  de_bruijn_graph_nodes -- iterable of all nodes to use as starting points

  Returns a list of paths, each path being a list of nodes. The list holds
  the paths that start at every node which is not 1-in-1-out, followed by the
  paths that start at every 1-in-1-out node. The second group contains
  suffixes of paths from the first group and one copy of each isolated cycle
  per node on that cycle.
  """
  collected = []

  # Pass 1: start at every node that is not 1-in-1-out and has outgoing
  # edges; follow each outgoing edge through the chain of 1-in-1-out nodes.
  for start in de_bruijn_graph_nodes:
    if OneInOneOutNode(graph_dict,start):
      continue
    if NodeOutDegree(graph_dict,start) <= 0:
      continue
    for successor in graph_dict[start]:
      walk = [start,successor]
      tip = successor
      while OneInOneOutNode(graph_dict,tip):
        tip = graph_dict[tip][0]
        walk.append(tip)
      collected.append(walk)

  # Pass 2: the textbook definition does not handle a connected component
  # that is an isolated cycle (all of its nodes are 1-in-1-out), so also walk
  # from every 1-in-1-out node, stopping once the walk returns to its start.
  for start in de_bruijn_graph_nodes:
    if not OneInOneOutNode(graph_dict,start):
      continue
    for successor in graph_dict[start]:
      walk = [start,successor]
      tip = successor
      while tip != start and OneInOneOutNode(graph_dict,tip):
        tip = graph_dict[tip][0]
        walk.append(tip)
      collected.append(walk)

  return collected

In [58]:
def FindAllKmers(non_branching_path,k):
  """Return every contiguous window of length `k` of `non_branching_path`.

  Windows are produced left to right by slicing, so each one has the same
  sequence type as the input. The result is empty when the input is shorter
  than `k`.
  """
  window_count = len(non_branching_path) - k + 1
  return [non_branching_path[start:start+k] for start in range(window_count)]

A maximal non-branching path is a non-branching path that cannot be extended into a longer non-branching path. Consequently, if we have a list of non-branching paths and sort it by increasing path length, a given non-branching path can only be contained in paths that are longer than it: a path is non-maximal exactly when it appears as a contiguous sub-path of a longer path in the list.

In [59]:
from numpy import delete

In [104]:
def MaximalNonBranchingPaths(paths):
  """Keep only the maximal paths from a list of non-branching paths.

  paths -- list of paths, each path being a list of nodes

  A path is dropped when
    * it occurs as a contiguous sub-path of a strictly longer path, or
    * it is a closed walk (first node == last node) that is a rotation of
      another closed walk of the same length which has not been dropped
      itself; this leaves exactly one representative per cycle.
  Equal paths (e.g. produced by parallel edges) are all kept.

  Returns a new list holding the surviving paths in their original order.

  The filtering is done in plain Python: numpy.delete flattens its input
  when no axis is given, which destroys the paths whenever they all have the
  same length, and building an array from paths of different lengths is not
  supported by recent NumPy versions.
  """
  dropped = set()
  for i, candidate in enumerate(paths):
    for j, other in enumerate(paths):
      if i == j:
        continue
      if len(candidate) < len(other):
        if _IsSubPath(other,candidate):
          dropped.add(i)
          break
      elif len(candidate) == len(other):
        # Only closed walks can be the same cycle written from different
        # start nodes; open paths over the same nodes (e.g. [a,b] and [b,a])
        # are distinct paths and must both survive.
        if j not in dropped and candidate != other and _IsRotation(other,candidate):
          dropped.add(i)
          break
  return [path for i, path in enumerate(paths) if i not in dropped]


def _IsSubPath(longer,shorter):
  """Tell whether `shorter` occurs as a contiguous run inside `longer`."""
  width = len(shorter)
  return any(longer[start:start+width] == shorter
             for start in range(len(longer) - width + 1))


def _IsRotation(first,second):
  """Tell whether two closed walks describe the same cycle from different starts."""
  if len(first) != len(second) or len(first) < 2:
    return False
  if first[0] != first[-1] or second[0] != second[-1]:
    return False
  # Drop the repeated end node and look for the second cycle in the doubled
  # node sequence of the first one.
  ring = first[:-1]
  return _IsSubPath(ring + ring,second[:-1])

In [124]:
def PrintResult(de_bruijn_graph_nodes,paths):
  """Print one contig per line for the given paths.

  de_bruijn_graph_nodes -- sequence mapping node number -> node label
  paths                 -- iterable of paths, each a sequence of node numbers

  A contig is the label of the path's first node followed by the last
  character of the label of every subsequent node.
  """
  for path in paths:
    leading_label = str(de_bruijn_graph_nodes[path[0]])
    trailing_chars = "".join(de_bruijn_graph_nodes[number][-1] for number in path[1:])
    print(leading_label + trailing_chars)

In [128]:
def PrintResultToFile(de_bruijn_graph_nodes,paths,file_name="task_result.txt"):
  """Write the contigs spelled by `paths` to a text file.

  de_bruijn_graph_nodes -- sequence mapping node number -> node label
  paths                 -- iterable of paths, each a sequence of node numbers
  file_name             -- output file, overwritten if it exists
                           (defaults to "task_result.txt")

  A contig is the label of the path's first node followed by the last
  character of the label of every subsequent node. Every contig is written
  followed by a single space, all on one line.
  """
  # The context manager closes the file even if spelling a contig raises
  # (e.g. a node number that is out of range).
  with open(file_name,"w") as f:
    for path in paths:
      string_to_print = str(de_bruijn_graph_nodes[path[0]])
      string_to_print += "".join(de_bruijn_graph_nodes[number][-1] for number in path[1:])
      f.write(string_to_print + ' ')

In [107]:
def PatternPrefix(pattern):
  """Return `pattern` without its last symbol (the (k-1)-mer prefix of a k-mer)."""
  return pattern[:-1]

In [108]:
def PatternSuffix(pattern):
  """Return `pattern` without its first symbol (the (k-1)-mer suffix of a k-mer)."""
  return pattern[1:]

In [109]:
def DeBruijnGraphDict(patterns):
  """Build the de Bruijn graph of `patterns` as adjacency lists.

  Every pattern contributes one edge from its prefix to its suffix, so a
  pattern that occurs several times yields parallel edges. Returns a dict
  mapping each prefix to the list of suffixes, in input order.
  """
  adjacency = {}
  for kmer in patterns:
    source = PatternPrefix(kmer)
    if source not in adjacency:
      adjacency[source] = []
    adjacency[source].append(PatternSuffix(kmer))
  return adjacency

In [110]:
def DeBruijnGraphNodes(de_bruijn_graph_dict):
  """Return the distinct nodes of an adjacency-list graph as a list.

  A node is anything that appears as a key or inside an adjacency list. The
  position of a node in the returned list serves as its number: the list is
  the mapping from index to node. The order comes from iterating a set.
  """
  distinct_nodes = set()
  for source, targets in de_bruijn_graph_dict.items():
    distinct_nodes.add(source)
    distinct_nodes.update(targets)
  return list(distinct_nodes)

In [111]:
def GraphDict(de_bruijn_graph_dict,de_bruijn_graph_nodes):
  """Relabel an adjacency-list graph with node numbers.

  de_bruijn_graph_dict  -- adjacency lists keyed by node label
  de_bruijn_graph_nodes -- list of node labels; a label's number is the index
                           of its first occurrence in this list

  Returns a dict mapping the number of each source node to the list of
  numbers of its successors (order and multiplicity preserved).

  Raises ValueError if a label of the graph is missing from
  `de_bruijn_graph_nodes`.
  """
  # Build the label -> number table once instead of scanning the node list
  # with list.index for every key and every edge.
  number_of = {}
  for number,label in enumerate(de_bruijn_graph_nodes):
    number_of.setdefault(label,number)  # first occurrence wins, like list.index

  def _Number(label):
    try:
      return number_of[label]
    except KeyError:
      # Same exception type list.index raises for a missing element.
      raise ValueError(f"{label!r} is not in list") from None

  graph_dict = {}
  for key,adjacent_nodes in de_bruijn_graph_dict.items():
    successors = graph_dict.setdefault(_Number(key),[])
    successors.extend(_Number(node) for node in adjacent_nodes)
  return graph_dict

Contig Generation Problem

Generate the contigs from a collection of reads (with imperfect coverage).

Given: A collection of k-mers Patterns.

Return: All contigs in DeBruijn(Patterns). (You may return the strings in any order.)

In [134]:
def ContigGeneration(patterns):
  """Generate the contigs of DeBruijn(patterns) and write them to a file.

  The de Bruijn graph is built from the k-mers, its nodes are renumbered,
  the non-branching paths are collected and reduced to the maximal ones, and
  the contigs they spell are written out by PrintResultToFile.
  """
  labelled_graph = DeBruijnGraphDict(patterns)
  node_labels = DeBruijnGraphNodes(labelled_graph)  # index -> label
  numbered_graph = GraphDict(labelled_graph,node_labels)
  node_numbers = DeBruijnGraphNodes(numbered_graph)
  candidate_paths = NonBranchingPaths(numbered_graph,node_numbers)
  contig_paths = MaximalNonBranchingPaths(candidate_paths)
  # PrintResult(node_labels,contig_paths) prints the same contigs to stdout.
  PrintResultToFile(node_labels,contig_paths)

In [135]:
# Sample input: a small collection of 3-mers ('ATG' occurs twice, which
# produces two parallel edges in the de Bruijn graph).
patterns = [
'ATG',
'ATG',
'TGT',
'TGG',
'CAT',
'GGA',
'GAT',
'AGA']

In [136]:
# Run the contig generation on the sample k-mers defined above.
ContigGeneration(patterns)

ATG
ATG
CAT
TGT
TGGA
AGA
GAT


  arr = asarray(arr)


In [130]:
# Read the k-mers from the task file, one per line, with trailing whitespace
# (including the newline) stripped.
# NOTE(review): the '/content/' path presumably refers to a hosted notebook
# environment -- adjust when running elsewhere.
with open('/content/rosalind_ba3k.txt') as task_file:
  patterns = [line.rstrip() for line in task_file]

In [None]:
# Run the contig generation on the k-mers loaded from the task file.
ContigGeneration(patterns)