In [1]:
import collections

def de_bruijn_from_kmers(patterns):
    """Build the de Bruijn graph of a collection of k-mers as an adjacency list.

    Every k-mer contributes one edge going from its prefix (all characters
    except the last) to its suffix (all characters except the first).
    Repeated k-mers produce repeated edges.

    Args:
        patterns: Iterable of k-mer strings.

    Returns:
        A ``collections.defaultdict(list)`` mapping each prefix node to the
        sorted list of suffix nodes it points to. Because it is a
        defaultdict, looking up a missing node inserts an empty list for it.
    """
    graph = collections.defaultdict(list)

    for pattern in patterns:
        source, target = pattern[:-1], pattern[1:]
        graph[source].append(target)

    # Sort each neighbor list in place so the output is deterministic.
    for targets in graph.values():
        targets.sort()

    return graph

def format_adjacency_list_for_printing(adj_list):
    """Render an adjacency list as text, one ``node -> a,b,c`` line per node.

    Args:
        adj_list: Mapping from a node string to a list of neighbor strings.

    Returns:
        The lines for all nodes, in sorted node order, joined with newlines.
        Neighbors of a node are comma-separated in their stored order.
    """
    return "\n".join(
        f"{source} -> {','.join(adj_list[source])}"
        for source in sorted(adj_list)
    )

if __name__ == "__main__":
    # Script entry point: read k-mers (one per line), build the de Bruijn
    # graph, and print it as an adjacency list.
    file_path = "../data/rosalind_ba3e.txt"

    # Blank lines (for example a trailing empty line at the end of the file)
    # are skipped: an empty k-mer would otherwise add a spurious "" -> ""
    # edge that shows up as a stray " -> " line in the output.
    with open(file_path, 'r') as f:
        patterns_input = [kmer for kmer in (line.strip() for line in f) if kmer]

    de_bruijn_graph_adj_list = de_bruijn_from_kmers(patterns_input)
    # Match the expected "node -> neighbor1,neighbor2" output format.
    result_string = format_adjacency_list_for_printing(de_bruijn_graph_adj_list)

    print(result_string)

AAAAAGACGTAAGCAAATT -> AAAAGACGTAAGCAAATTA
AAAAAGAGGGAAGAGGGTG -> AAAAGAGGGAAGAGGGTGA
AAAACCCAAGGAGGCTTAC -> AAACCCAAGGAGGCTTACT
AAAACCTTTAGGTTCATTT -> AAACCTTTAGGTTCATTTG
AAAACGTTGCCCACCGTGG -> AAACGTTGCCCACCGTGGA
AAAACTTCTCGGCCCGCTT -> AAACTTCTCGGCCCGCTTA
AAAAGACGTAAGCAAATTA -> AAAGACGTAAGCAAATTAG
AAAAGAGGGAAGAGGGTGA -> AAAGAGGGAAGAGGGTGAT
AAACCAACATTCCGTTGTG -> AACCAACATTCCGTTGTGG
AAACCACGGGTCCGGGTCA -> AACCACGGGTCCGGGTCAT
AAACCCAAGGAGGCTTACT -> AACCCAAGGAGGCTTACTG
AAACCCCTAGTAGCAGTTC -> AACCCCTAGTAGCAGTTCG
AAACCTCATGCTTAACCGG -> AACCTCATGCTTAACCGGC
AAACCTCGCCCGTCTCGGA -> AACCTCGCCCGTCTCGGAC
AAACCTGGTCAGTCTCAAC -> AACCTGGTCAGTCTCAACA
AAACCTTTAGGTTCATTTG -> AACCTTTAGGTTCATTTGG
AAACGTTGCCCACCGTGGA -> AACGTTGCCCACCGTGGAA
AAACTTCGCGCGGGTCCGC -> AACTTCGCGCGGGTCCGCT
AAACTTCTCGGCCCGCTTA -> AACTTCTCGGCCCGCTTAT
AAAGACGTAAGCAAATTAG -> AAGACGTAAGCAAATTAGC
AAAGAGGGAAGAGGGTGAT -> AAGAGGGAAGAGGGTGATA
AAAGGGTAAATCGTCGTTC -> AAGGGTAAATCGTCGTTCA
AAAGGTGCTATTCCGAGAG -> AAGGTGCTATTCCGAGAGA
AAAGTCATTCA