In [None]:
def FindAllKmers(dna_string,k):
  """Return every length-k substring of dna_string, in left-to-right order.

  Overlapping k-mers are all included and duplicates are kept. When k is
  larger than the string length the result is an empty list.
  """
  window_count = len(dna_string) - k + 1  # number of valid start positions
  return [dna_string[start:start + k] for start in range(window_count)]

In [None]:
def Prefix(pattern):
  """Return pattern without its last element (empty input stays empty)."""
  return pattern[:-1]

In [None]:
def Suffix(pattern):
  """Return pattern without its first element (empty input stays empty)."""
  return pattern[1:]

In [None]:
def DeBruijnGraphK(patterns,k):
  """Build a De Bruijn adjacency dict from a collection of k-mers.

  Each k-mer contributes one edge Prefix(kmer) -> Suffix(kmer). A repeated
  k-mer yields a repeated entry in the adjacency list, so edge multiplicity
  is preserved. The k argument is accepted but not used by the body.
  """
  adjacency = {}
  for kmer in patterns:
    source, target = Prefix(kmer), Suffix(kmer)
    adjacency.setdefault(source, []).append(target)
  return adjacency

U DeBruijn grafu su identični čvorovi zalijepljeni zajedno --> dakle, u listi čvorova ne možemo imati duplikate, možemo imati samo duplikatne veze, tj. readove koji se ponavljaju --> ako zbrojimo sve liste susjedstva, onda možemo imati duplikate




In [None]:
def DeBruijnGraphNodes(de_bruijn_graph_dict):
  """Return the distinct nodes of the graph as a list.

  Both the keys (edge sources) and every entry of the adjacency lists (edge
  targets) are collected. The position of a node in the returned list serves
  as its integer id; the order itself is whatever the set iteration yields.
  """
  unique_nodes = set()
  for source, targets in de_bruijn_graph_dict.items():
    unique_nodes.add(source)
    unique_nodes.update(targets)
  return list(unique_nodes)

In [None]:
def DefineGraphDict(de_bruijn_graph_dict,de_bruijn_graph_nodes):
  """Translate a label-based adjacency dict into an index-based one.

  Every node label is replaced by its position in de_bruijn_graph_nodes
  (the first position if the label occurs more than once). Adjacency lists
  keep their order and their duplicates.

  Args:
    de_bruijn_graph_dict: mapping label -> list of neighbour labels.
    de_bruijn_graph_nodes: list of hashable node labels defining the ids.

  Returns:
    dict mapping node index -> list of neighbour indices.

  Raises:
    ValueError: if a label used in the graph is missing from the node list.
  """
  # Build the label -> index table once instead of calling list.index for
  # every edge, which made the conversion O(edges * nodes).
  index_by_label = {}
  for position, label in enumerate(de_bruijn_graph_nodes):
    index_by_label.setdefault(label, position)  # keep the first occurrence

  def index_of(label):
    try:
      return index_by_label[label]
    except KeyError:
      # Same exception type list.index raised for an unknown label.
      raise ValueError(f"{label!r} is not in list") from None

  graph_dict = {}
  for key, nodes_list in de_bruijn_graph_dict.items():
    neighbours = graph_dict.setdefault(index_of(key), [])
    neighbours.extend(index_of(node) for node in nodes_list)
  return graph_dict

U listi susjedstva možemo imati duplikatne čvorove, npr.

AT --> TG, TG, TG (u genomu imamo 3 puta kmer ATG)

U ovom slučaju u rječniku moramo imati 3 ključa --> (AT, TG), (AT, TG), (AT, TG) --> ne možemo jer ključ mora biti jedinstven

In [None]:
def VisitedEdges(graph_dict):
  """Create the visited-flag table for every edge of the graph.

  The result maps each (source, target) pair to a list holding one 0 per
  parallel edge between the two nodes; 0 means "not visited yet". A list is
  used because a dict key must be unique while the same edge may repeat.
  """
  flags_by_edge = {}
  for source, targets in graph_dict.items():
    for target in targets:
      flags_by_edge.setdefault((source, target), []).append(0)
  return flags_by_edge

In [None]:
def NodeInDegree(graph_dict,node):
  """Count how many edges point at node, parallel edges included."""
  return sum(list(targets).count(node) for targets in graph_dict.values())

In [None]:
def NodeOutDegree(graph_dict,node):
  """Return the number of outgoing edges of node, parallel edges included.

  A node without an adjacency entry (it only appears as a neighbour, or not
  at all) has no outgoing edges, so 0 is returned instead of raising
  KeyError. This mirrors NodeInDegree, which yields 0 for such nodes too.
  """
  return len(graph_dict.get(node, ()))

In [None]:
import numpy as np
from random import randint, randrange

In [None]:
def VisitedEdgesDictSum(visited_edges_dict):
  """Return the total of all flags stored in the visited-edge table.

  With 0/1 flags this is the number of edges already marked as visited.
  """
  return sum(sum(flags) for flags in visited_edges_dict.values())

In [None]:
def VisitedEdgesDictValueCount(visited_edges_dict):
  """Return how many flags the visited-edge table holds in total.

  Each flag stands for one edge, so this is the edge count of the graph.
  """
  return sum(len(flags) for flags in visited_edges_dict.values())

In [None]:
def EulerianCycle(graph_dict):
  """Find an Eulerian cycle by random walks that are merged by rotation.

  The walk starts at a randomly chosen key of graph_dict and follows
  randomly chosen unused edges. Whenever it returns to its start with edges
  still unused, the closed walk is rotated so that it starts (and ends) at a
  node that still has unused outgoing edges, and the walk continues.

  Args:
    graph_dict: mapping node -> iterable of neighbour nodes. Parallel edges
      are expressed by repeating a neighbour. Node labels may be any
      hashable values; they need not be contiguous integers. The mapping
      is not modified.

  Returns:
    A list of nodes with one entry more than there are edges, each
    consecutive pair being one edge of the graph. For a graph that has an
    Eulerian cycle the first and last entries are equal. An empty list is
    returned when graph_dict is empty or when the walk gets stuck with edges
    left that cannot be reached from it.
  """
  if not graph_dict:
    return []

  # Private copy of the adjacency lists; an edge is removed once walked, so
  # "has unused edges" is a plain truthiness test instead of a full rescan.
  unused_edges = {node: list(neighbours) for node, neighbours in graph_dict.items()}
  edges_left = sum(len(neighbours) for neighbours in unused_edges.values())

  # Pick the start among real nodes only (labels may have gaps or be strings).
  start_candidates = list(unused_edges)
  cycle = [start_candidates[randrange(len(start_candidates))]]

  while edges_left > 0:
    outgoing = unused_edges.get(cycle[-1])
    if outgoing:
      # Extend the walk along a random unused edge.
      cycle.append(outgoing.pop(randrange(len(outgoing))))
      edges_left -= 1
      continue

    # Stuck. In a balanced graph this can only happen back at the start;
    # anywhere else the walk is not closed and cannot be rotated.
    if cycle[0] != cycle[-1]:
      return []

    # Positions in the closed walk whose node still has unused outgoing edges.
    restart_positions = [position for position, node in enumerate(cycle)
                         if unused_edges.get(node)]
    if not restart_positions:
      # Remaining edges are unreachable from the walk: no Eulerian cycle.
      return []
    pivot = restart_positions[randrange(len(restart_positions))]
    # Rotate the closed walk: cycle[0] equals cycle[-1], so skip index 0 in
    # the second part to avoid listing that node twice in a row.
    cycle = cycle[pivot:] + cycle[1:pivot + 1]
  return cycle

In [None]:
def AssembleStringFromEdges(nodes, k=None):
  """Spell the circular string described by consecutive overlapping k-mers.

  The result is the first k-mer followed by the last symbol of each of the
  next len(nodes) - k k-mers. The final k - 1 k-mers are skipped because,
  in a circular string, their last symbols wrap around onto the beginning
  of the first k-mer.

  Args:
    nodes: sequence of k-mers (the edges of the Eulerian cycle) in order.
    k: k-mer length. When omitted it is taken from the first k-mer, so the
      function no longer depends on a module-level variable named k.

  Returns:
    The assembled string; '' when nodes is empty.
  """
  if not nodes:
    return ''
  if k is None:
    k = len(nodes[0])
  tail_symbols = [str(nodes[i][-1]) for i in range(1, len(nodes) - (k - 1))]
  return str(nodes[0]) + ''.join(tail_symbols)

Example for the comment above:

000 --> 001 --> 011 --> 111 --> 110 --> 101 --> 010 --> 100 --> 000

010 --> 100 --> 000

Last number of 010 is the same as the second number of 100, that is the same as the first number of 000 --> last number of 010 is the same as the first number of 000, i.e., the first number of the first kmer of the assembled genome

Second number of 100 is the same as the first number of 000, i.e., the same as the first number of the first kmer in the assembled genome

Last number of 100 is the same as the second number of 000, i.e., the same as the second number of the first kmer of the assembled genome

Same can be applied for following 4-mers:

0111 is the start of the assembled genome, i.e., the first 4-mer of the assembled genome

0010 --> 0101 --> 1011 --> 0111

In [None]:
def ReplaceNodesWithEdges(eulerian_cycle):
  """Convert a walk over (k-1)-mer nodes into the list of k-mer edges.

  Every pair of consecutive nodes becomes one edge label: the first node
  followed by the last symbol of the second node. A walk with fewer than
  two nodes has no edges and gives an empty list.
  """
  return [current + following[-1]
          for current, following in zip(eulerian_cycle, eulerian_cycle[1:])]

In [None]:
def ReplaceNumbersWithBinaryStrings(eulerian_cycle,de_bruijn_graph_nodes):
  """Swap each node index in the cycle for its label, in place.

  The label of index i is de_bruijn_graph_nodes[i]. The list passed in is
  modified and that same list object is returned.
  """
  for position, node_index in enumerate(eulerian_cycle):
    eulerian_cycle[position] = de_bruijn_graph_nodes[node_index]
  return eulerian_cycle

Let BinaryStrings_k be the set of all 2^k binary k-mers. The only thing we need
to do to solve the k-Universal Circular String Problem is to find an Eulerian cycle in DEBRUIJN(BinaryStrings_k).

In [None]:
def StringReconstruction(k):
  """Return a k-universal circular binary string.

  All binary k-mers are used as edges of a De Bruijn graph whose nodes are
  (k-1)-mers. An Eulerian cycle of that graph visits every k-mer once, and
  spelling the cycle's edges yields the circular string.
  """
  adjacency = DeBruijnGraphK(BinaryStringsK(k), k)
  node_labels = DeBruijnGraphNodes(adjacency)
  numbered_graph = DefineGraphDict(adjacency, node_labels)
  # The cycle is found on integer ids and mapped back to labels afterwards.
  numbered_cycle = EulerianCycle(numbered_graph)
  labelled_cycle = ReplaceNumbersWithBinaryStrings(numbered_cycle, node_labels)
  # Edges (not nodes) correspond to the binary k-mers being assembled.
  kmer_edges = ReplaceNodesWithEdges(labelled_cycle)
  return AssembleStringFromEdges(kmer_edges)

Let BinaryStrings_k be the set of all 2^k binary k-mers. The only thing we need
to do to solve the k-Universal Circular String Problem is to find an Eulerian cycle
in DEBRUIJN(BinaryStrings_k). --> dakle, binarni kmeri su veze, a binarni (k-1)meri su čvorovi

In [None]:
from math import pow

In [None]:
def BinaryStringsK(k):
  """List all binary strings of length k in increasing numeric order.

  The integers 0 .. 2^k - 1 are written in base 2 and left-padded with '0'
  up to k characters.
  """
  total = int(pow(2, k))  # number of distinct binary k-mers
  return [bin(number)[2:].zfill(k) for number in range(total)]

In [None]:
k = 4  # k-mer length used for the example run

In [None]:
# The returned string can differ between runs because EulerianCycle makes random choices.
StringReconstruction(k)

'1110010100001101'

In [None]:
# Read k from the first line of the task file.
with open('/content/rosalind_ba3i.txt') as task_file:
  stripped_lines = []
  for raw_line in task_file:
    stripped_lines.append(raw_line.rstrip())
  k = int(stripped_lines[0])

In [None]:
# Write the assembled string; the context manager closes the file even if
# the computation or the write raises.
with open("task_result.txt","w") as f:
  f.write(StringReconstruction(k))