In [29]:
def FindAllKmers(dna_string, k):
  """Return every length-k window of dna_string, in left-to-right order.

  Windows overlap (consecutive start offsets differ by one) and a k-mer that
  occurs several times is listed once per occurrence. When k is larger than
  len(dna_string) there is no valid start offset and the result is empty.
  """
  window_count = len(dna_string) - k + 1  # number of valid start offsets
  return [dna_string[start:start + k] for start in range(window_count)]

In [30]:
def DeBruijnGraphK(patterns, k):
  """Build the de Bruijn graph of a collection of k-mers as an adjacency dict.

  Every pattern contributes one edge from its prefix (the pattern without
  its last character) to its suffix (the pattern without its first
  character). A pattern that occurs several times contributes one edge per
  occurrence, so the successor lists may contain duplicates.

  The k argument is not consulted; prefix and suffix are derived from each
  pattern's own length.
  """
  graph = {}
  for pattern in patterns:
    prefix, suffix = pattern[:-1], pattern[1:]
    graph.setdefault(prefix, []).append(suffix)
  return graph

U DeBruijn grafu su identični čvorovi zalijepljeni zajedno --> dakle, u listi čvorova ne možemo imati duplikate, možemo imati samo duplikatne veze, tj. readove koji se ponavljaju

In [31]:
def DeBruijnGraphNodes(de_bruijn_graph_dict):
  """Return the distinct nodes of a de Bruijn adjacency dict as a list.

  A node is any key of the dict or any element of one of its successor
  lists. Each node appears exactly once, in the order it is first
  encountered while walking the dict (key first, then its successors), so
  the result is identical on every run for the same input.

  The position of a node in the returned list serves as its integer id:
  index -> node label.
  """
  # A dict is used as an insertion-ordered set. Converting a plain set to a
  # list would make the numbering depend on string hash randomization.
  first_seen = {}
  for key, values_list in de_bruijn_graph_dict.items():
    first_seen.setdefault(key, None)
    for value in values_list:
      first_seen.setdefault(value, None)
  return list(first_seen)

In [33]:
def DefineGraphDict(de_bruijn_graph_dict, de_bruijn_graph_nodes):
  """Translate a label-keyed adjacency dict into an integer-keyed one.

  Parameters:
    de_bruijn_graph_dict: maps a node label to the list of its successor
      labels (duplicates allowed; each one is a separate edge).
    de_bruijn_graph_nodes: list of node labels; a label's position in this
      list is its integer id.

  Returns:
    A dict with one key per position in de_bruijn_graph_nodes, mapping each
    node id to the list of successor ids (empty when the node has no
    outgoing edges).

  Raises:
    ValueError: if an edge endpoint is not present in de_bruijn_graph_nodes.
  """
  # Label -> id lookup table built once, instead of a linear list.index()
  # scan for every edge endpoint. setdefault keeps the first position of a
  # repeated label, which is what list.index() would return.
  position_of = {}
  for position, label in enumerate(de_bruijn_graph_nodes):
    position_of.setdefault(label, position)

  def node_id(label):
    try:
      return position_of[label]
    except KeyError:
      # Same exception type list.index() raises for a missing element.
      raise ValueError("%r is not in list" % (label,)) from None

  graph_dict = {position: [] for position in range(len(de_bruijn_graph_nodes))}
  for key, values_list in de_bruijn_graph_dict.items():
    for value in values_list:
      graph_dict[node_id(key)].append(node_id(value))
  return graph_dict

In [34]:
def VisitedEdges(graph_dict):
  """Create the edge bookkeeping table for a graph, with every edge unvisited.

  The result maps each (source, target) pair found in graph_dict to 0.
  Because the pair itself is the dict key, a target that is listed several
  times for the same source yields a single entry.
  """
  return {
    (source, target): 0
    for source, targets in graph_dict.items()
    for target in targets
  }

In [35]:
def NodeInDegree(graph_dict,node):
  """Return the number of edges of graph_dict that end at node.

  graph_dict maps each node to the list of its successors. Every occurrence
  of node in a successor list is one incoming edge, so parallel edges
  (the same successor listed more than once) are each counted. This matches
  NodeOutDegree, which counts every entry of a node's successor list.
  """
  return sum(
    adjacent_nodes_list.count(node)
    for adjacent_nodes_list in graph_dict.values()
  )

In [36]:
def NodeOutDegree(graph_dict,node):
  """Return the number of outgoing edges of node.

  This is the length of node's successor list in graph_dict, so a successor
  that is listed several times is counted once per listing. Looking up a
  node that is not a key of graph_dict raises KeyError.
  """
  successors = graph_dict[node]
  return len(successors)

In [37]:
import numpy as np
from random import randint, randrange

In [38]:
def EulerianCycle(graph_dict):
  """Return a randomly chosen Eulerian cycle of a directed graph.

  Parameters:
    graph_dict: maps each node to the list of its successors. A successor
      listed more than once is a parallel edge and is traversed once per
      listing. The dict and its lists are not modified.

  Returns:
    A list of nodes whose first and last elements are equal and in which
    every edge of the graph is used by exactly one consecutive pair. If the
    graph has nodes but no edges, a one-element list holding a randomly
    chosen node is returned.

  Raises:
    ValueError: if graph_dict is empty, if some node's in-degree differs
      from its out-degree, or if not every edge can be reached from the
      start node (i.e. the graph has no Eulerian cycle).
  """
  if not graph_dict:
    raise ValueError("graph_dict is empty")

  # Private copy of the adjacency lists; an edge is removed when traversed.
  # Keeping one list entry per edge (rather than a dict keyed by the
  # (source, target) pair) is what lets parallel edges be tracked separately.
  unused_edges = {node: list(successors) for node, successors in graph_dict.items()}

  # A directed graph can only have an Eulerian cycle if every node is balanced.
  in_degree = {}
  for successors in unused_edges.values():
    for successor in successors:
      in_degree[successor] = in_degree.get(successor, 0) + 1
  for node in set(unused_edges) | set(in_degree):
    if len(unused_edges.get(node, ())) != in_degree.get(node, 0):
      raise ValueError(
        "graph is not Eulerian: node %r has unequal in- and out-degree" % (node,))

  # Start only from a node that actually has an outgoing edge.
  nodes_with_edges = [node for node, successors in unused_edges.items() if successors]
  if not nodes_with_edges:
    all_nodes = list(unused_edges)
    return [all_nodes[randrange(len(all_nodes))]]
  starting_node = nodes_with_edges[randrange(len(nodes_with_edges))]

  # Iterative Hierholzer traversal: walk along unused edges until stuck,
  # then back up, emitting nodes; side tours found while backing up are
  # spliced in automatically by the stack discipline.
  walk_stack = [starting_node]
  reversed_cycle = []
  while walk_stack:
    current = walk_stack[-1]
    candidates = unused_edges.get(current)
    if candidates:
      # Pick a random unused edge; swap it to the end so removal is O(1).
      pick = randrange(len(candidates))
      candidates[pick], candidates[-1] = candidates[-1], candidates[pick]
      walk_stack.append(candidates.pop())
    else:
      reversed_cycle.append(walk_stack.pop())

  # Leftover edges belong to a part of the graph the walk could not reach.
  if any(unused_edges.values()):
    raise ValueError(
      "graph is not Eulerian: some edges are unreachable from the start node")

  reversed_cycle.reverse()
  return reversed_cycle

In [142]:
def AssembleStringFromEdges(nodes, k=None):
  """Spell the circular string described by a cyclic sequence of k-mer edges.

  Parameters:
    nodes: the k-mers (edges) in the order they are traversed by the cycle.
    k: length of each k-mer. When omitted it is taken from the first
      element of nodes, so the function does not depend on a global k.

  Returns:
    The first k-mer followed by the last character of each of the next
    len(nodes) - k elements. The final k - 1 elements are skipped because,
    the sequence being cyclic, their last characters repeat the beginning of
    the first k-mer. An empty input yields an empty string.
  """
  if not nodes:
    return ""
  if k is None:
    k = len(nodes[0])
  # Exclusive upper bound of the indices that contribute a character;
  # range() is empty when this is <= 1, leaving just the first k-mer.
  stop = len(nodes) - (k - 1)
  return str(nodes[0]) + "".join(str(nodes[i][-1]) for i in range(1, stop))

Example for the comment above:

000 --> 001 --> 011 --> 111 --> 110 --> 101 --> 010 --> 100 --> 000

010 --> 100 --> 000

Last number of 010 is the same as the second number of 100, that is the same as the first number of 000 --> last number of 010 is the same as the first number of 000, i.e., the first number of the first kmer of the assembled genome

Second number of 100 is the same as the first number of 000, i.e., the same as the first number of the first kmer in the assembled genome

Last number of 100 is the same as the second number of 000, i.e., the same as the second number of the first kmer of the assembled genome

The same reasoning applies to the following 4-mers:

0111 is the start of the assembled genome, i.e., the first 4-mer of the assembled genome

0010 --> 0101 --> 1011 --> 0111

In [134]:
def ReplaceNodesWithEdges(eulerian_cycle):
  """Convert a walk over node labels into the list of edge labels it uses.

  For each pair of consecutive nodes the edge label is the first node
  followed by the last character of the second node. A walk of n nodes
  yields n - 1 edges; fewer than two nodes yield an empty list.
  """
  following_nodes = eulerian_cycle[1:]
  return [
    node + successor[-1]
    for node, successor in zip(eulerian_cycle, following_nodes)
  ]

In [135]:
def StringReconstruction(reads,k):
  """Assemble a string from k-mer reads via an Eulerian cycle.

  Pipeline: build the de Bruijn graph of the reads, number its nodes, find
  an Eulerian cycle over the numbered graph, map the node ids back to their
  labels, turn consecutive nodes into edge labels, and spell the string
  from those edges. k is forwarded to DeBruijnGraphK.
  """
  adjacency = DeBruijnGraphK(reads,k)
  node_labels = DeBruijnGraphNodes(adjacency)
  numbered_graph = DefineGraphDict(adjacency,node_labels)
  # The cycle is found over integer ids; translate it back to node labels.
  labelled_cycle = [node_labels[node_id] for node_id in EulerianCycle(numbered_graph)]
  cycle_edges = ReplaceNodesWithEdges(labelled_cycle)
  return AssembleStringFromEdges(cycle_edges)

In [106]:
from math import pow

Let BinaryStrings_k be the set of all 2^k binary k-mers. The only thing we need
to do to solve the k-Universal Circular String Problem is to find an Eulerian cycle
in DEBRUIJN(BinaryStrings_k). --> dakle, binarni k-meri su veze, a binarni (k-1)-meri su čvorovi

In [107]:
def BinaryStringsK(k):
  """Return all binary strings of length k in increasing numeric order.

  The integers 0 .. 2^k - 1 are written in base 2 and left-padded with
  zeros up to k characters.
  """
  total = int(pow(2,k))
  strings = []
  for value in range(total):
    digits = bin(value)[2:]  # drop the '0b' prefix
    missing = k - len(digits)  # leading zeros still needed
    if missing > 0:
      digits = '0' * missing + digits
    strings.append(digits)
  return strings

In [121]:
# k-mer length for the first run; read by the BinaryStringsK and StringReconstruction calls below.
k = 4

In [122]:
# Every binary string of length k, used as the set of reads to assemble.
reads = BinaryStringsK(k)

In [None]:
# Bare expression: shows the generated reads when run as a notebook cell.
reads

In [None]:
# Assemble a string from the binary k-mers via an Eulerian cycle of their de Bruijn graph.
StringReconstruction(reads,k)

In [138]:
# Second run with a larger k-mer length.
k = 9

In [139]:
# Regenerate the reads for the new k (all binary strings of length k).
reads = BinaryStringsK(k)

In [None]:
# Assemble a string from the binary k-mers generated for the current k.
StringReconstruction(reads,k)