### Generate the k-mer Composition of a String

In [0]:
def generate_composition(k, text):
  """Return every length-k substring of text, in left-to-right order.

  The result is empty when text is shorter than k.
  """
  composition = []
  window_count = len(text) - k + 1
  for start in range(window_count):
    composition.append(text[start:start + k])
  return composition

In [121]:
# Print the 5-mer composition of the sample string, one k-mer per line.
ans = generate_composition(5,'CAATCCAAC')
print('\n'.join(ans))

CAATC
AATCC
ATCCA
TCCAA
CCAAC


### Reconstruct a String from its Genome Path

In [0]:
def construct(path):
  """Spell the string described by a genome path.

  The first k-mer is taken whole; each following k-mer contributes only its
  final character. An empty path yields the empty string.
  """
  k_mers = iter(path)
  genome = next(k_mers, '')
  return genome + ''.join(k_mer[-1] for k_mer in k_mers)


In [123]:
# Spell the string for a sample path of five overlapping 5-mers.
print(construct(['ACCGA','CCGAA','CGAAG','GAAGC','AAGCT']))

ACCGAAGCT


### Construct the Overlap Graph of a Collection of k-mers

In [0]:
def overlap_grap(k_mers):
  """Build the overlap graph of a collection of k-mers over the alphabet ACGT.

  An edge ``a -> b`` exists when the suffix of ``a`` (all but its first
  character) equals the prefix of ``b`` (all but its last character).

  Args:
    k_mers: Iterable of k-mer strings.

  Returns:
    A list of ``'a -> b'`` strings, ordered by the position of ``a`` in
    ``k_mers`` and, for one ``a``, by the final base of ``b`` in A, C, G, T
    order. A repeated source k-mer repeats its edges.
  """
  k_mers = list(k_mers)
  # Build the lookup set once so each membership test is O(1) rather than a
  # scan over the whole list.
  known = set(k_mers)
  edges = []
  for k_mer in k_mers:
    suffix = k_mer[1:]
    for base in 'ACGT':
      candidate = suffix + base
      if candidate in known:
        edges.append(k_mer + ' -> ' + candidate)
  return edges


# Correctly spelled alias; the original name is kept for existing callers.
overlap_graph = overlap_grap


In [125]:
# Print the overlap graph of the sample 5-mers, one edge per line.
adj_list = overlap_grap(['ATGCG','GCATG','CATGC','AGGCA','GGCAT'])
print('\n'.join(adj_list))

GCATG -> CATGC
CATGC -> ATGCG
AGGCA -> GGCAT
GGCAT -> GCATG


# Construct the De Bruijn Graph of a String

In [0]:
from collections import defaultdict

In [0]:
def de_bruijn(k, seq):
  """Build the De Bruijn graph of the k-mers of seq.

  Every k-mer of seq, read left to right, adds one edge from its prefix
  (all but the last character) to its suffix (all but the first character).
  Returns a defaultdict mapping each prefix to the list of its suffixes.
  """
  graph = defaultdict(list)
  for start in range(len(seq) - k + 1):
    k_mer = seq[start:start + k]
    graph[k_mer[:-1]].append(k_mer[1:])
  return graph

In [128]:
# Print the De Bruijn graph of the sample string as an adjacency list.
ans = de_bruijn(4,'AAGATTCTCTAC')

for node, successors in ans.items():
  print('{} -> {}'.format(node, ','.join(successors)))

AAG -> AGA
AGA -> GAT
GAT -> ATT
ATT -> TTC
TTC -> TCT
TCT -> CTC,CTA
CTC -> TCT
CTA -> TAC


# Construct the De Bruijn Graph of a Collection of k-mers

In [0]:
def de_bruijn_from_k_mers(k_mers):
  """Build the De Bruijn graph of a collection of k-mers.

  Each k-mer adds one edge from its prefix (all but the last character) to
  its suffix (all but the first character); repeated k-mers add repeated
  edges. Returns a defaultdict mapping each prefix to its list of suffixes.
  """
  graph = defaultdict(list)
  edges = ((k_mer[:-1], k_mer[1:]) for k_mer in k_mers)
  for prefix, suffix in edges:
    graph[prefix].append(suffix)
  return graph

In [130]:
# Print the De Bruijn graph of the sample 4-mers as an adjacency list.
ans = de_bruijn_from_k_mers(['GAGG','CAGG','GGGG','GGGA','CAGG','AGGG','GGAG'])

for node, successors in ans.items():
  print('{} -> {}'.format(node, ','.join(successors)))

GAG -> AGG
CAG -> AGG,AGG
GGG -> GGG,GGA
AGG -> GGG
GGA -> GAG


# Find an Eulerian Cycle in a Graph

In [0]:
def heirholzer(adj, start=None):
  """Trace an Eulerian circuit through a directed graph (Hierholzer's algorithm).

  Args:
    adj: Mapping from node to a list of successor nodes. The lists are
      consumed in place: every edge that is walked is popped from its list.
    start: Node at which the walk begins. When omitted, node 6 is used if it
      has outgoing edges (earlier versions of this function always started
      at 6); otherwise the first node in ``adj`` with an outgoing edge is
      used.

  Returns:
    The list of visited nodes in walking order. Returns an empty list when
    no start is given and ``adj`` contains no edges.
  """
  if start is None:
    if adj.get(6):
      start = 6  # legacy default, kept so existing results do not change
    else:
      start = next((node for node, targets in adj.items() if targets), None)
    if start is None:
      return []
  stack, circuit = [start], []
  while stack:
    # .get keeps plain dicts usable when a node has no adjacency entry.
    targets = adj.get(stack[-1])
    if targets:
      stack.append(targets.pop())
    else:
      # Dead end: this node is final in the circuit (collected in reverse).
      circuit.append(stack.pop())
  return circuit[::-1]

In [0]:
def euler_cycle(edges):
  """Parse 'u -> v1,v2,...' adjacency lines and return heirholzer's circuit.

  Node labels are converted to int. A source that appears on more than one
  line keeps only the targets of its last line.
  """
  adj = defaultdict(list)
  for line in edges:
    source, targets = (part.strip() for part in line.split('->'))
    adj[int(source)] = list(map(int, targets.split(',')))
  return heirholzer(adj)

In [0]:
# Find an Eulerian cycle of the sample graph, given as adjacency lines.
ans = euler_cycle(['0 -> 3','1 -> 0','2 -> 1,6','3 -> 2','4 -> 2','5 -> 4','6 -> 5,8','7 -> 9','8 -> 7','9 -> 6'])

In [134]:
# Render the node sequence held in `ans` as node->node->...
print('->'.join([str(node) for node in ans]))

6->8->7->9->6->5->4->2->1->0->3->2->6


# Find an Eulerian Path in a Graph

In [0]:
import numpy as np

In [0]:
def heirholzer_with_start_node(adj, start):
  """Walk the graph from start with Hierholzer's algorithm.

  adj maps each node to a list of successors and is consumed in place: every
  edge that is walked is popped from its list. Returns the visited nodes in
  walking order.
  """
  stack = [start]
  tour = []
  while stack:
    remaining = adj[stack[-1]]
    if not remaining:
      # Dead end: the node on top is final in the tour (built in reverse).
      tour.append(stack.pop())
      continue
    stack.append(remaining.pop())
  tour.reverse()
  return tour

In [0]:
def euler_path(edges):
  """Find an Eulerian path in a directed graph given as adjacency lines.

  Args:
    edges: Iterable of strings of the form ``'u -> v1,v2,...'`` whose node
      labels are integers. Whitespace around ``->`` is optional, and a source
      may appear on several lines.

  Returns:
    The nodes of the path in walking order, followed by one extra copy of
    the start node. The trailing copy is kept from the earlier
    implementation, which closed the path into a cycle; callers remove it
    with ``pop()``. Returns an empty list when ``edges`` is empty.

  The walk begins at the node with one more outgoing than incoming edge, or,
  when every node is balanced, at the first source node. The input is assumed
  to admit an Eulerian path.
  """
  adj = defaultdict(list)
  balance = defaultdict(int)  # out-degree minus in-degree of each node
  for edge in edges:
    source, targets = edge.split('->')
    source = int(source)
    for target in targets.split(','):
      target = int(target)
      adj[source].append(target)
      balance[source] += 1
      balance[target] -= 1
  if not adj:
    return []

  start = next((node for node, diff in balance.items() if diff == 1), None)
  if start is None:
    start = next(iter(adj))

  # Hierholzer's walk from the start node. No artificial end -> start edge is
  # added: it could be traversed in the middle of the tour, and assigning it
  # would discard the end node's real outgoing edges.
  stack, tour = [start], []
  while stack:
    remaining = adj[stack[-1]]
    if remaining:
      stack.append(remaining.pop())
    else:
      tour.append(stack.pop())
  tour.reverse()
  tour.append(start)  # legacy result shape: callers drop this last node
  return tour


In [138]:
ans = euler_path(['0 -> 2','1 -> 3','2 -> 1','3 -> 0,4','6 -> 3,7','7 -> 8','8 -> 9','9 -> 6'])
# The result ends with a repeated start node; drop it before printing.
del ans[-1]
print('->'.join(map(str, ans)))

6->7->8->9->6->3->0->2->1->3->4


# Reconstruct a String from its k-mer Composition

In [0]:
def reconstruct_string(k, k_mers):
  """Reconstruct a string from its k-mer composition.

  Builds the De Bruijn graph of the k-mers (one edge per k-mer, from its
  prefix to its suffix), walks an Eulerian path through it with Hierholzer's
  algorithm, and spells the string along that path.

  Args:
    k: Length of the k-mers. Not used by the computation (prefixes and
      suffixes are sliced from the k-mers themselves); kept so existing
      callers keep working.
    k_mers: Iterable of k-mer strings. Assumed to admit an Eulerian path.

  Returns:
    A string whose k-mer composition is ``k_mers``, or ``''`` when ``k_mers``
    is empty.
  """
  k_mers = list(k_mers)
  if not k_mers:
    return ''

  adj = defaultdict(list)      # prefix node -> k-mers leaving that node
  balance = defaultdict(int)   # out-degree minus in-degree of each node
  for k_mer in k_mers:
    prefix, suffix = k_mer[:-1], k_mer[1:]
    adj[prefix].append(k_mer)
    balance[prefix] += 1
    balance[suffix] -= 1

  # Start where one more edge leaves than enters; a fully balanced graph is
  # a cycle, so any node works and the first k-mer's prefix is used.
  start = next((node for node, diff in balance.items() if diff == 1),
               k_mers[0][:-1])

  # Each stack entry is (node, text contributed on arrival): the start node
  # contributes itself, every later node the last character of the k-mer
  # that led to it. No artificial end -> start edge is added, so the end
  # node's own outgoing edges are preserved.
  stack = [(start, start)]
  pieces = []
  while stack:
    outgoing = adj[stack[-1][0]]
    if outgoing:
      k_mer = outgoing.pop()
      stack.append((k_mer[1:], k_mer[-1]))
    else:
      pieces.append(stack.pop()[1])
  return ''.join(reversed(pieces))

In [140]:
# Reconstruct a string from a sample set of six 4-mers.
print(reconstruct_string(4, ['CTTA','ACCA','TACC','GGCT','GCTT','TTAC']))

GGCTTACCA


# Find a k-Universal Circular String

In [0]:
from itertools import product

In [0]:
def k_universal_string(k):
  """Return a binary string that contains every binary k-mer exactly once.

  The string is spelled along an Eulerian circuit of the De Bruijn graph on
  all binary k-mers. It is returned in linear form, ``2**k + k - 1``
  characters long: its last ``k - 1`` characters repeat its first ``k - 1``,
  so dropping them gives the circular form of length ``2**k``.

  Args:
    k: Length of the k-mers; must be at least 1.

  Returns:
    The k-universal string.

  Raises:
    ValueError: If ``k`` is smaller than 1.
  """
  if k < 1:
    raise ValueError('k must be a positive integer')

  adj = defaultdict(list)  # (k-1)-bit node -> k-mers leaving that node
  for bits in product('01', repeat=k):
    k_mer = ''.join(bits)
    adj[k_mer[:-1]].append(k_mer)

  # Earlier versions always started at the node for binary 110, which exists
  # only for k >= 4; keep it there so results are unchanged, and use the
  # all-zero node for smaller k.
  start = '110'.zfill(k - 1) if k >= 4 else '0' * (k - 1)

  # Each stack entry is (node, text contributed on arrival): the start node
  # contributes itself, every later node the last bit of the k-mer that led
  # to it.
  stack = [(start, start)]
  pieces = []
  while stack:
    outgoing = adj[stack[-1][0]]
    if outgoing:
      k_mer = outgoing.pop()
      stack.append((k_mer[1:], k_mer[-1]))
    else:
      pieces.append(stack.pop()[1])
  return ''.join(reversed(pieces))

In [144]:
# Print a string containing every binary 4-mer.
print(k_universal_string(4))

1101111001010000110
