In [17]:
def FindAllKmers(dna_string,k):
  """Return every length-k window of dna_string, in left-to-right order.

  One window is produced for each start offset 0 .. len(dna_string) - k;
  when k exceeds the string length the result is an empty list.
  """
  window_count = len(dna_string) - k + 1
  return [dna_string[start:start+k] for start in range(window_count)]

In [18]:
def Prefix(pattern):
  """Return pattern without its last element (empty input stays empty)."""
  return pattern[:-1]

In [19]:
def Suffix(pattern):
  """Return pattern without its first element (empty input stays empty)."""
  return pattern[1:]

In [20]:
def DeBruijnGraphK(patterns,k):
  """Build a De Bruijn adjacency dict from a collection of patterns.

  Each pattern contributes one edge Prefix(pattern) -> Suffix(pattern).
  Repeated patterns produce repeated entries in the adjacency list, so
  parallel edges are preserved.

  k is accepted for interface reasons but is not used by this function.
  """
  adjacency = {}
  for kmer in patterns:
    source = Prefix(kmer)
    adjacency.setdefault(source, []).append(Suffix(kmer))
  return adjacency

U De Bruijnovom grafu identični čvorovi su zalijepljeni (spojeni u jedan čvor) --> dakle, u listi čvorova ne možemo imati duplikate, možemo imati samo duplikatne veze, tj. readove koji se ponavljaju --> ako zbrojimo sve liste susjedstva onda možemo imati duplikate

In [21]:
def DeBruijnGraphNodes(de_bruijn_graph_dict):
  """Return the distinct nodes of the graph as a list.

  Both the dict keys and every entry of the adjacency lists are collected.
  The position of a node in the returned list serves as its integer id
  (index -> node); the order comes from set iteration and is arbitrary.
  """
  unique_nodes = set()
  for source, targets in de_bruijn_graph_dict.items():
    unique_nodes.add(source)
    unique_nodes.update(targets)
  return list(unique_nodes)

In [22]:
def DefineGraphDict(de_bruijn_graph_dict,de_bruijn_graph_nodes):
  """Translate a label-keyed adjacency dict into an index-keyed one.

  Args:
    de_bruijn_graph_dict: mapping node label -> list of adjacent node labels
      (parallel edges appear as repeated labels).
    de_bruijn_graph_nodes: list of node labels; a label's position in this
      list is its integer id.

  Returns:
    dict with one key per position 0 .. len(de_bruijn_graph_nodes) - 1, each
    mapped to the list of adjacent positions (empty for nodes with no
    outgoing edge). Parallel edges are preserved.

  Raises:
    ValueError: if an edge endpoint is not present in de_bruijn_graph_nodes.
  """
  # Build the label -> position table once so every edge lookup is O(1)
  # instead of a linear list.index scan. setdefault keeps the FIRST position
  # of a label, which is what list.index would have returned.
  position_by_label = {}
  for position, label in enumerate(de_bruijn_graph_nodes):
    position_by_label.setdefault(label, position)

  def position_of(label):
    try:
      return position_by_label[label]
    except KeyError:
      # Preserve the exception type list.index raised for unknown labels.
      raise ValueError('%r is not in list' % (label,)) from None

  graph_dict = {position: [] for position in range(len(de_bruijn_graph_nodes))}
  for key, values_list in de_bruijn_graph_dict.items():
    for value in values_list:
      graph_dict[position_of(key)].append(position_of(value))
  return graph_dict

U listi susjedstva možemo imati duplikatne čvorove, npr.

AT --> TG, TG, TG (u genomu imamo 3 puta kmer ATG)

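U ovom slučaju bismo u rječniku morali imati 3 ključa --> (AT, TG), (AT, TG), (AT, TG) --> ne možemo jer ključ mora biti jedinstven --> zato pod jednim ključem (AT, TG) čuvamo listu od 3 oznake posjećenosti, po jednu za svaku paralelnu vezu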

In [23]:
def VisitedEdges(graph_dict):
  """Create the visited-flag table for every edge of graph_dict.

  The result maps each (source, target) pair to a list of zeros, one zero
  per parallel edge between that pair; a zero means "not visited yet".
  """
  flags_by_edge = {}
  for source, targets in graph_dict.items():
    for target in targets:
      flags_by_edge.setdefault((source, target), []).append(0)
  return flags_by_edge

In [24]:
def NodeInDegree(de_bruijn_graph_dict,node):
  """Return the number of edges that end at node.

  Args:
    de_bruijn_graph_dict: mapping node -> list of adjacent nodes; a node
      repeated in a list stands for parallel edges.
    node: the node whose incoming edges are counted.

  Returns:
    Total count of occurrences of node across all adjacency lists, so
    parallel edges (e.g. AT -> TG, TG, TG) each contribute to the degree.
  """
  # list.count (rather than a membership test) is what makes parallel edges
  # count individually.
  return sum(values_list.count(node) for values_list in de_bruijn_graph_dict.values())

In [25]:
def NodeOutDegree(graph_dict,node):
  """Return the number of edges that leave node.

  Args:
    graph_dict: mapping node -> list of adjacent nodes (parallel edges are
      repeated entries).
    node: the node whose outgoing edges are counted.

  Returns:
    Length of node's adjacency list, or 0 when node has no entry in
    graph_dict (a node that only appears as an edge target).
  """
  return len(graph_dict.get(node, ()))

In [26]:
import numpy as np
from random import randint, randrange

In [27]:
def VisitedEdgesDictSum(visited_edges_dict):
  """Return the sum of all flags stored in the visited-edge table.

  Each value of visited_edges_dict is a list of flags; the per-list sums are
  added together.
  """
  return sum(sum(flags) for flags in visited_edges_dict.values())

In [28]:
def VisitedEdgesDictValueCount(visited_edges_dict):
  """Return how many flags the visited-edge table holds in total.

  Each value of visited_edges_dict is a list of flags; the list lengths are
  added together.
  """
  return sum(map(len, visited_edges_dict.values()))

In [29]:
def EulerianCycle(graph_dict):
  """Search for an Eulerian cycle by random walking and re-rooting partial cycles.

  The walk starts at a randomly chosen node and keeps following unvisited
  edges. When the walk cannot continue while unvisited edges remain, the
  partial cycle is rotated so that it starts (and ends) at a node that, by
  the occurrence count below, still has unused outgoing edges, and the walk
  resumes from there.

  Args:
    graph_dict: mapping node -> list of adjacent nodes (parallel edges are
      repeated entries). The starting node is drawn with randint between the
      smallest and largest key, so keys are presumably consecutive integers
      -- TODO confirm against callers. The walk is written for a graph that
      actually has an Eulerian cycle.

  Returns:
    List of nodes in visiting order, or [] when the walk is stuck and no node
    of the current cycle qualifies as a new starting node.
  """
  # one list of 0 flags per (source, target) pair, one flag per parallel edge
  visited_edges_dict = VisitedEdges(graph_dict)
  # randint includes both the lower and the upper bound
  starting_node = randint(min(graph_dict.keys()),max(graph_dict.keys()))
  cycle = [starting_node]
  # total number of edges; the walk is complete once this many flags are set to 1
  visited_edges_dict_value_count = VisitedEdgesDictValueCount(visited_edges_dict)
  # loop while fewer flags are set than there are edges, i.e. unvisited edges remain
  while VisitedEdgesDictSum(visited_edges_dict) < visited_edges_dict_value_count:
    # targets reachable from the last node of cycle through a pair that still holds a 0 flag
    # (one entry per distinct pair, not one per parallel edge)
    possible_adjacent_nodes = [key[1] for key in visited_edges_dict.keys() if key[0] == cycle[len(cycle)-1] and visited_edges_dict[key].count(0) > 0]
    # stuck: nothing left to follow from the last node although unvisited edges remain
    # (the second test repeats the loop condition)
    if len(possible_adjacent_nodes) == 0 and VisitedEdgesDictSum(visited_edges_dict) < visited_edges_dict_value_count:
      # a node whose out-degree exceeds its number of occurrences in cycle is taken to have
      # unused outgoing edges; the list holds one entry per occurrence of such a node
      possible_starting_nodes_list = [node for node in cycle if NodeOutDegree(graph_dict,node) > cycle.count(node)]
      # no candidate on the current cycle: give up and report failure with an empty list
      if len(possible_starting_nodes_list) == 0:
        return []
      if len(possible_starting_nodes_list) == 1:
        # only one candidate is available
        starting_node = possible_starting_nodes_list[0]
      else:
        # pick uniformly among the candidate entries
        starting_node = possible_starting_nodes_list[randrange(0,len(possible_starting_nodes_list))]
      # rotate: from the first occurrence of starting_node to the end, then from index 1
      # back up to and including that first occurrence
      cycle = cycle[cycle.index(starting_node):len(cycle)] + cycle[1:cycle.index(starting_node)+1]
    else:
      # at least one unvisited edge leaves the last node: extend the walk by one edge
      if len(possible_adjacent_nodes) == 1:
        next_node = possible_adjacent_nodes[0]
        visited_edges_dict_value_list = visited_edges_dict[(cycle[len(cycle)-1], next_node)]
        # mark one parallel edge as visited by flipping the first remaining 0
        visited_edges_dict_value_list[visited_edges_dict_value_list.index(0)] = 1
        cycle.append(next_node)
      else:
        # several candidates: choose one at random (randint includes both bounds)
        next_node = possible_adjacent_nodes[randint(0,len(possible_adjacent_nodes)-1)]
        visited_edges_dict_value_list = visited_edges_dict[(cycle[len(cycle)-1], next_node)]
        # mark one parallel edge as visited by flipping the first remaining 0
        visited_edges_dict_value_list[visited_edges_dict_value_list.index(0)] = 1
        cycle.append(next_node)
  return cycle

U grafu s Eulerovim putom imamo 2 nebalansirana čvora --> ne možemo imati samo jedan nebalansirani čvor jer njegovim balansiranjem (dodavanjem ulazne ili izlazne veze) neki drugi čvor ćemo učiniti nebalansiranim. Dakle, zbroj razlika (ulazni stupanj − izlazni stupanj) po svim čvorovima uvijek je 0, pa graf ne može imati točno jedan nebalansirani čvor.

Thus, a nearly balanced graph has an Eulerian path if and only if
adding an edge between its unbalanced nodes makes the graph balanced and strongly
connected.

In [30]:
def FindUnbalancedNodes(graph_dict):
  """Return the nodes whose in-degree differs from their out-degree.

  Args:
    graph_dict: mapping node -> list of adjacent nodes; repeated entries in
      a list are parallel edges and each one counts towards both degrees.

  Returns:
    List of unbalanced nodes without duplicates: first the unbalanced keys of
    graph_dict in key order, then every node that appears only as an edge
    target (such a node has out-degree 0 and in-degree >= 1), in order of
    first appearance.
  """
  from collections import Counter  # local import keeps this block self-contained

  # One pass over all adjacency lists gives every in-degree, parallel edges
  # included; a missing Counter entry reads as 0.
  in_degree = Counter()
  for adjacent_nodes_list in graph_dict.values():
    in_degree.update(adjacent_nodes_list)

  unbalanced_nodes = [
    node
    for node, adjacent_nodes_list in graph_dict.items()
    if in_degree[node] != len(adjacent_nodes_list)
  ]
  # Counter keeps first-insertion order, so target-only nodes are listed once
  # each in the order they were first seen.
  unbalanced_nodes.extend(node for node in in_degree if node not in graph_dict)
  return unbalanced_nodes

In [31]:
def UnbalancedNodesOrder(graph_dict,unbalanced_nodes):
  """Order the first two unbalanced nodes as [starting_node, ending_node].

  Only the second node's degrees are inspected: if it has fewer incoming
  than outgoing edges it is treated as the start of the Eulerian path and
  placed first; otherwise it is placed second.
  """
  first_node = unbalanced_nodes[0]
  second_node = unbalanced_nodes[1]
  incoming = NodeInDegree(graph_dict, second_node)
  outgoing = NodeOutDegree(graph_dict, second_node)
  if incoming < outgoing:
    # second_node lacks an incoming edge, so the path starts there
    return [second_node, first_node]
  # otherwise second_node is kept as the ending node
  return [first_node, second_node]

In [32]:
def BalanceUnbalancedNodes(graph_dict, ordered_unbalanced_nodes):
  """Add the edge ending_node -> starting_node to graph_dict.

  Args:
    graph_dict: mapping node -> list of adjacent nodes; modified in place.
    ordered_unbalanced_nodes: [starting_node, ending_node].

  Returns:
    The same graph_dict object, with starting_node appended to ending_node's
    adjacency list (the list is created when ending_node has no entry yet).
  """
  starting_node = ordered_unbalanced_nodes[0]
  ending_node = ordered_unbalanced_nodes[1]
  # Append instead of dict.update: replacing the whole list would delete any
  # outgoing edges the ending node already has.
  graph_dict.setdefault(ending_node, []).append(starting_node)
  return graph_dict

In [33]:
def FindEulerianPathInEulerianCycle(ordered_unbalanced_nodes,eulerian_cycle):
  """Cut an Eulerian cycle into a path running from starting_node to ending_node.

  ordered_unbalanced_nodes is [starting_node, ending_node]. The cycle is
  searched for a position where ending_node is immediately followed by
  starting_node; the cycle is then rotated to begin at that starting_node
  occurrence and end at the ending_node before it. If starting_node sits at
  the first index and ending_node at the last, the cycle is returned as is.
  Returns None when no such position exists.
  """
  cycle_array = np.array(eulerian_cycle)
  start_positions = np.where(cycle_array == ordered_unbalanced_nodes[0])[0]
  end_positions = np.where(cycle_array == ordered_unbalanced_nodes[1])[0]
  cycle_nodes = list(cycle_array)
  cycle_length = len(cycle_nodes)
  for start_position in start_positions:
    for end_position in end_positions:
      if end_position < start_position:
        # wrapping from start to the end of the list and on to end_position covers
        # the whole cycle exactly when ending_node directly precedes starting_node
        if end_position + 1 == start_position:
          return cycle_nodes[start_position:] + cycle_nodes[1:end_position+1]
      elif end_position - start_position + 1 == cycle_length:
        # the span start..end already covers every entry of the cycle
        return cycle_nodes[start_position:end_position+1]
  return None

In [34]:
def PrintResult(eulerian_cycle):
  """Print the nodes of eulerian_cycle on one line, separated by '->'."""
  print('->'.join(str(node) for node in eulerian_cycle))

In [35]:
def AssembleStringFromNodes(nodes):
  """Spell the string described by a path of overlapping nodes.

  Args:
    nodes: sequence of node labels in path order; each label after the first
      contributes only its last character.

  Returns:
    The first label followed by the last character of every later label, or
    an empty string when nodes is empty.
  """
  if not nodes:
    return ''
  # Take each node's own last character instead of indexing with a
  # module-level k, so the function does not depend on global state.
  return str(nodes[0]) + ''.join(str(node[-1]) for node in nodes[1:])

In [36]:
def StringReconstruction(reads,k):
  """Reconstruct a string from its reads through an Eulerian path.

  Steps: build the De Bruijn graph of the reads (k is forwarded to
  DeBruijnGraphK), renumber its nodes as list indices, find and order the
  two unbalanced nodes, add the edge that balances them, compute an Eulerian
  cycle, cut it into the Eulerian path, map indices back to node labels and
  spell the string.
  """
  labelled_graph = DeBruijnGraphK(reads,k)
  node_labels = DeBruijnGraphNodes(labelled_graph)
  indexed_graph = DefineGraphDict(labelled_graph,node_labels)
  # [starting_node, ending_node] of the Eulerian path, as indices
  path_ends = UnbalancedNodesOrder(indexed_graph,FindUnbalancedNodes(indexed_graph))
  indexed_graph = BalanceUnbalancedNodes(indexed_graph,path_ends)
  index_path = FindEulerianPathInEulerianCycle(path_ends,EulerianCycle(indexed_graph))
  # translate node indices back to their labels before spelling the string
  label_path = [node_labels[node_index] for node_index in index_path]
  return AssembleStringFromNodes(label_path)

In [39]:
k = 4  # read length (k-mer size) of the small sample; passed to StringReconstruction

In [40]:
# Sample 4-mer reads for a small worked example.
reads = [
'CTTA',
'ACCA',
'TACC',
'GGCT',
'GCTT',
'TTAC']

In [41]:
# Reconstruct the string spelled by the sample reads; the notebook recorded 'GGCTTACCA' as the output.
StringReconstruction(reads,k)

'GGCTTACCA'

In [38]:
# Load one read per line from the dataset file.
# NOTE(review): every non-blank line is treated as a read -- if the file
# starts with a line holding k, it has to be removed first; confirm the
# file layout.
with open('/content/rosalind_ba3h_1_dataset.txt') as task_file:
  stripped_lines = (line.rstrip() for line in task_file)
  # Skip blank lines: an empty read would add a bogus '' -> '' edge to the graph.
  reads = [read for read in stripped_lines if read]

In [39]:
k = 25  # k-mer size for the dataset run; presumably equals the read length in the file -- verify

In [40]:
# Write the reconstructed string to task_result.txt; the with-block closes
# the file even when StringReconstruction raises.
with open("task_result.txt", "w") as f:
  f.write(StringReconstruction(reads,k))