In [0]:
from collections import defaultdict
import numpy as np

#Generate the k-mer Composition of a String

In [0]:
def k_mer_composition(k, text):
  n = len(text)
  return sorted([text[idx:idx+k] for idx in range(n-k+1)])

In [0]:
ans = k_mer_composition(5,'CAATCCAAC')
for s in ans:
  print(s) 

AATCC
ATCCA
CAATC
CCAAC
TCCAA


#Reconstruct a String from its Genome Path

In [0]:
def reconstruct_string(paths):
  return paths[0] + ''.join([s[-1] for s in paths[1:]])

In [0]:
with open('rosalind_ba3b.txt','r') as file:
  paths = file.read().splitlines()
  print(reconstruct_string(paths))

GGCTGATAAGCCAATCCAAAGAAACCCAGTATTTGCTTTACGCGTTACGTGATGAATAAGAAGGCATCTTTCATTGCTAACGATCTCGCCAAAGCATAGCCCTAATAGTACACCCTAGGGAGAGTTAACATGTTGATGCTCACCACGAAAGCACCTGTCTTGCTGTAAAAAAGCGCTACAGTATTCTGCCGCCTTAACAACCCCAAGGGTGGGGAGCAGGTCACTGCAGCTCGGAGGGGTAATAGGCGGTCGGCTACACAGAATACGCTCCGGGAGGTAGGAGAGCTTCTCCGTGAAACGTGTTGTCCCTCTAAATACTCCTACCAGTTCGTTCTTTTTGCGCAAGCCGCGCCGACCCTCGGTTCCGTCCTCGCTCGCCAGAGCTGTCCATGCTACTGCCAGATTTTAGTGCAGATGCGAATCCAGCTGTACTGCCACCTAGTCCCTCACAAAAGTATAATCTGATACTCGATTGCGTTCCTCTTCGCCCTGACTCAGAGCCAACCACACGCTACTTGGTCCCAAATCCGCCCTTAATTTCGGTGGCACATCCTGATACGCGATGGAAGCATAACTACATAGGGATGTAGCTAGGAGAAATTTAGTTCGACCCCAGGTGCACTGCTCGTCCCATACAATCACTGCCACAAGTATCTCAGGTCATACCCTCTCGTCACAACTCTAGCCAAACGGTTTCTCACTTGGACGGTCCCGATTAATTCGTGTCGAGCACATAAGAAAGAGGAAGCGCACTGGCGTTGAGGATAATTTCTGCTTACGAAGGCAAGGCGAAAAGAGTCGCAGTTGGCATGAAACCATGTCGAATCCGGATATGAGGTAGAAGAAGCCACATTAGGGACTTCTCTTGCGAATCTATGACATCTCTAGGCCCTAGATTCACAGTTATCATCGGGACGGCAGATAAACTAAGGAATTAGATACAGGTACGCAATTAGCGCAAACTCAGTATCGTCGTCTGGAGAGCCTTCACTCAACCTTA

# Construct the Overlap Graph of a Collection of k-mers

In [0]:
def construct_overlap_grap(nodes):
  ans = []
  for u in nodes:
    for v in nodes:
      if v.startswith(u[1:]):
        ans.append(u + ' -> ' + v)
  return ans

In [0]:
%%writefile input.txt
ATGCG
GCATG
CATGC
AGGCA
GGCAT

Overwriting input.txt


#Construct the De Bruijn Graph of a String

In [0]:
def construct_de_bruijn_graph(k, text):
  mp = defaultdict(list)
  n, l = len(text), k-1
  for idx in range(n-k+1):
    mp[text[idx:idx+l]].append(text[idx+1:idx+l+1])
  return mp

In [0]:
mp = construct_de_bruijn_graph(4,'AAGATTCTCTAC')
for key,val in mp.items():
  print(key + ' -> ' + ','.join(val))

AAG -> AGA
AGA -> GAT
GAT -> ATT
ATT -> TTC
TTC -> TCT
TCT -> CTC,CTA
CTC -> TCT
CTA -> TAC


#Find an Eulerian Cycle in a Graph 

In [0]:
def hierholzer(adj, start = 0):
  path, cycle = [start] , []
  while path:
    u = path[-1]
    if adj[u]:
      v = adj[u].pop()
      path.append(v)
    else:
      cycle.append(path.pop())
  return cycle[::-1]

In [0]:
with open('rosalind_ba3f.txt','r') as file:
  data = file.read().splitlines()
  adj = defaultdict(list)
  for line in data:
    u , v = line.split(' -> ')
    adj[int(u)] = [int(node) for node in v.split(',')]
  ans = hierholzer(adj)
  cycle = '->'.join([str(node) for node in ans])
  print(cycle)

0->3->8->1216->1217->1218->2052->2051->2495->2496->2494->2051->2050->1218->8->7->2960->2959->2961->7->9->25->46->47->1678->1680->1679->47->48->171->170->417->1198->2148->2147->2146->2864->2865->2863->2146->1198->1200->1289->2151->2150->2149->1289->1288->1572->1571->2085->2084->2083->1571->1570->1288->1551->1550->1549->1288->1290->2645->2646->2644->1290->2198->2197->2199->1290->1200->1199->417->415->664->1067->1068->1066->2722->2724->2723->1066->664->666->665->415->416->170->169->48->25->26->258->256->257->1016->2670->2669->2668->1016->1015->1017->257->26->2058->2057->2056->26->27->625->626->2964->2962->2963->626->1323->1321->2891->2892->2890->1321->1926->1924->1925->1321->1525->1526->1527->1321->1322->626->627->27->224->822->821->820->224->225->223->310->912->910->1623->1622->1621->910->911->1117->1118->1119->911->310->311->312->378->377->376->719->720->804->1065->1064->1063->804->802->2486->2485->2487->802->1370->1371->1369->802->803->720->718->376->312->223->27->9->15->1425->1424->14

#Find an Eulerian Path in a Graph

In [0]:
def euler_path(adj):
  nodes = []
  for key, val in adj.items():
    nodes.append(key)
    nodes.extend(val)
  N = max(nodes)
  indeg, outdeg = np.zeros(N+5, dtype=int) , np.zeros(N+5, dtype=int)
  for key, val in adj.items():
    outdeg[key] += len(val)
    for node in val:
      indeg[node] += 1
  start, finish = None, None
  for node in set(nodes):
    if indeg[node] + 1 == outdeg[node]:
      start = node
      #print('s',node)
    if indeg[node] == outdeg[node]+1:
      #print('e',node)
      finish = node
  #print(start, finish)
  adj[finish].append(start)
  ans = hierholzer(adj, start)
  idx = None
  for i in range(len(ans)-1):
    if ans[i] == finish and ans[i+1] == start:
      idx = i
      break
  #print(ans[idx+1])
  new_list = ans[idx+1:]
  new_list.pop()
  final_ans = new_list + ans[0:idx+1]
  # print(ans.index(1587), ans.index(1630))
  path = '->'.join([str(node) for node in final_ans])
  print(path)

In [0]:
with open('rosalind_ba3g.txt','r') as file:
  data = file.read().splitlines()
  adj = defaultdict(list)
  for line in data:
    u , v = line.split(' -> ')
    adj[int(u)] = [int(node) for node in v.split(',')]
  euler_path(adj)

2150->648->593->970->971->1300->1319->1318->1320->1375->1377->1376->1320->1300->1302->1631->1630->1632->1302->1301->971->972->593->254->962->963->2755->2757->2756->963->961->2345->2344->2346->961->254->229->969->1785->1783->1784->2774->2775->2773->1784->969->1705->1706->1707->969->968->2295->2293->2294->968->967->229->230->213->726->724->1207->1209->1208->724->725->2715->2713->2714->725->213->211->2642->2643->2641->2736->2734->2735->2641->211->1985->1984->1986->211->1779->1777->1778->211->1282->1284->1283->211->212->2->8->7->1381->1383->1382->7->10->13->120->119->118->193->511->513->2729->2730->2728->513->512->2520->2519->2518->512->193->397->418->1480->1481->1482->418->420->571->2203->2205->2204->571->572->573->1593->2724->2723->2722->1593->1600->1602->1601->2353->2354->2355->2669->2668->2670->2355->1601->1638->1637->1636->1601->1593->1591->1592->573->420->419->397->1693->1695->1694->397->398->399->684->682->683->399->193->1303->1304->1722->1721->1720->1304->1305->193->194->195->736->

#Reconstruct a String from its k-mer Composition

In [0]:
def string_from_composition(k, patterns):
  adj = defaultdict(list)
  for p in patterns:
    adj[p[0:-1]].append(p[1:])
  




In [0]:
with open('input.txt','r') as file:
  data = file.read().splitlines()
  k = int(data[0])
  string_from_composition(k, data[1:])

defaultdict(<class 'list'>, {'CTT': ['TTA'], 'ACC': ['CCA'], 'TAC': ['ACC'], 'GGC': ['GCT'], 'GCT': ['CTT'], 'TTA': ['TAC']})
