In [None]:
def debruijnize(reads):
    nodes = set()
    not_starts = set()
    edges = []
    for r in reads:
        r1 = r[:-1]
        r2 = r[1:]
        nodes.add(r1)
        nodes.add(r2)
        edges.append((r1, r2))
        not_starts.add(r2)
    return (nodes, edges, list(nodes - not_starts))


def build_k_mer(str, k):
    return [str[i : k + i] for i in range(0, len(str) - k + 1)]


def make_node_edge_map(edges):
    node_edge_map = {}
    for e in edges:
        n = e[0]
        if n in node_edge_map:
            node_edge_map[n].append(e[1])
        else:
            node_edge_map[n] = [e[1]]
    return node_edge_map


def eulerian_trail(m, v):
    nemap = m
    result_trail = []
    start = v
    result_trail.append(start)
    while True:
        trail = []
        previous = start
        while True:

            if previous not in nemap:
                break
            next = nemap[previous].pop()
            if len(nemap[previous]) == 0:
                nemap.pop(previous, None)
            trail.append(next)
            if next == start:
                break
            previous = next
        # completed one trail
        print(trail)
        index = result_trail.index(start)
        result_trail = (
            result_trail[0 : index + 1]
            + trail
            + result_trail[index + 1 : len(result_trail)]
        )
        # choose new start
        if len(nemap) == 0:
            break
        found_new_start = False
        for n in result_trail:
            if n in nemap:
                start = n
                found_new_start = True
                break  # from for loop
        if not found_new_start:
            print("error")
            print("result_trail", result_trail)
            print(nemap)
            break
    return result_trail


def visualize_debruijn(G):
    nodes = G[0]
    edges = G[1]
    dot_str = 'digraph "DeBruijn graph" {\n '
    for node in nodes:
        dot_str += '    %s [label="%s"] ;\n' % (node, node)
    for src, dst in edges:
        dot_str += "    %s->%s;\n" % (src, dst)
    return dot_str + "}\n"


def assemble_trail(trail):
    if len(trail) == 0:
        return ""
    result = trail[0][:-1]
    for node in trail:
        result += node[-1]
    return result


def test_assembly_debruijn(t, k):
    reads = build_k_mer(t, k)
    G = debruijnize(reads)
    v = visualize_debruijn(G)
    nemap = make_node_edge_map(G[1])
    print(G)
    print(v)
    start = next(iter(G[2])) if (len(G[2]) > 0) else next(iter(G[0]))
    trail = eulerian_trail(nemap, start)
    return assemble_trail(trail)

In [3]:
# Parsing the file content and identifying potential motifs across all sequences.

sequences = {
    "test1": "acaaccatatatagtagccactgaat",
    "test2": "ccaccccatatatagtagtgcgggtggtg",
    "test3": "ccataaatagataggcagactgtcgctgt",
    "test4": "gtaaacataccataaatagga",
    "test5": "ttcaagaaactgccataaatagcgat",
    "test6": "tagaggtttttgtgccataaataggt",
    "test7": "ccccataaataggaatatcgggatga",
    "test8": "ttgccattaatagattataccatatatgg",
    "test9": "tatcaacaacgataccaacccatatatgg",
    "test10": "tttccaaatatagaaggtgtggaaag",
    "test11": "tccaaatatagtaaaatcgagtcgcggat",
    "test12": "gactggggcccaaatatagcatgttc",
    "test13": "atcattagcttttacttaccataaatgg",
    "test14": "attcttttgccataaatggtaactcg",
    "test15": "ccataaatggcaagtctgtcgaataacgg",
    "test16": "cccataaatggcagggtattagcacg",
    "test17": "ccaaaaatagatatgtgtcgtaacagctt",
    "test18": "ccaaaaatagggggacaatggaagtgggg",
    "test19": "ccaaaaataggccagacgtgtttacaacg",
    "test20": "ccaaaaatagttaaataatgtcatacatt",
    "test21": "ctacaccttccaaaaatagtaatct",
    "test22": "ttgccaaatatggggttagagtgttc",
    "test23": "gtctttaccaaaaatggtgatcctgt",
    "test24": "ttgccaaaaatggagcgtttaccaat",
    "test25": "atccaccatttatagaaagttcaggaggc",
    "test26": "gcataagagaacattccatttatagg",
    "test27": "tcaaccccatttatagccacgtcagt",
    "test28": "catccattaatagtagccataatggcg",
    "test29": "ggagtaggcccattaatagtatcttt",
    "test30": "ccattaatagcatacaaaatcgactcaag",
    "test31": "ccaattatagaaagctgtggctggtcgtc",
    "test32": "aactattatttctcacattccattaatgg",
    "test33": "atgctttaccaataatagagcgcaa",
    "test34": "ggtcagttagatccaattatggaatg",
    "test35": "gcattccaaaattagtaacgatatct",
    "test36": "cctcctttccaaaattagttgagaag",
    "test37": "ctttgccaaaattagctattctgac",
    "test38": "acgcatgcaccacatatagtaacgtg",
    "test39": "agcgccccatttttagggtttaagct",
    "test40": "ccatttttagtaatttaatacaacgccgc",
    "test41": "ccatttttagtatggaaccgccgtgagt",
    "test42": "aaattaccataagtggtaatgcacacac",
    "test43": "caattactatatatagcgtgtttgtc",
    "test44": "ccatacatggtaaaatgtaccgaaacact",
    "test45": "ccgaaagtactataaatagtaatcca",
    "test46": "gttttcgcctttctataaatagtacc",
    "test47": "ggtacctaatatagtaatcagctctg",
    "test48": "tctttaattacttgcctaatatagct",
    "test49": "ggatgcatccctaatatagttaataa",
    "test50": "atctttaccaaaactagttaattcga",
    "test51": "tgctaaatatagaacatctccaaata",
    "test52": "ccttctaactaaatatagaagtgata",
    "test53": "attaccataaacagaaatcagtggat",
    "test54": "cctaaaaatagatcgaatgtgtgctc",
    "test55": "tactaaaaatagatcatgagctacga",
    "test56": "tagtcacttgatttccatacctaaaatgg",
    "test57": "atcaacagtctacaatatcctaaaatgg",
    "test58": "caaacattccagatttagaatggtta",
    "test59": "acttctttccttttatagctgagtgc",
    "test60": "ccgctaagccttgccttttatagca",
    "test61": "tgcccaaaaaaagaaagttgtcagac",
    "test62": "cccaaataaggaaagctctctggac",
    "test63": "aaccacacacccagaattagtaag",
    "test64": "tggttcatttaagactttaccatttctgg",
    "test65": "tgcgatggaatagtactaaatttagg",
    "test66": "tgcggaagagggttcccgatatagata",
    "test67": "tcccaaaaatactcatatgtcgggct",
    "test68": "ccatattgggtaaagattgctttttagca",
    "test69": "tagtggttaactacccgtttttagta",
    "test70": "ccaatttatagttcactttcgtgatgagaa",
    "test71": "tactcgcttttcttactaaaagtaga",
    "test72": "tactaaaagtagtagtttgtctgca",
    "test73": "ccattttaaggaatttacgatctagtgaa",
    "test74": "tcccattttaagaccaacttctcatt",
    "test75": "gatccagtagattccatttatgtacg",
    "test76": "cctgtaaatagtaacaaggtgcatcg",
    "test77": "cgatcaatatgttaccattttggggt",
    "test78": "gagcgagaccttcccaataattagtaa",
    "test79": "aagtgactatattactcaaaatagaa",
    "test80": "ttaccaaattgccaatttaggctaaa",
    "test81": "ttaccaaattgccaatttaggctaaa",
    "test82": "gaagatacctaatacggaaattttcc",
    "test83": "gcagacttgctatattaagctaatat",
    "test84": "cgctttcttactataaatgtttacta",
    "test85": "ggttcaagatttttccttttatgtac",
    "test86": "tttatagtgtccctttttcggtaagt",
    "test87": "atcctacgatgctttctaaaagaagg",
    "test88": "ccctagcaattttttactatatttgt",
    "test89": "ggaagaatttactataattgtacatg",
    "test90": "ctctaatggccttactacttaaagca",
    "test91": "gccgagtccggaaaatttccgaaaaatg",
    "test92": "atttatttcctaaagtgtaactaac",
    "test93": "gacctgtatccttttctacttttgtc",
    "test94": "atatgcccctcacaagttaccaatta",
    "test95": "aaaatggtaatttctcgggacagg",
    "test96": "ctcagtgcacacagacattccaaata",
    "test97": "gattaggattcgtttgtttccaaata",
}

# Common motif analysis to find any recurring subsequences.
from Bio import motifs
from Bio.Seq import Seq

# Convert sequences into Seq objects for motif analysis.
seqs = [Seq(sequence) for sequence in sequences.values()]
m = motifs.create(seqs)
motifs_found = m.consensus

motifs_found

ValueError: sequences must have the same length if coordinates is None