In [1]:
import collections

def construct_de_bruijn_graph(text, k):
    """Build the de Bruijn graph of the k-mers of ``text`` as an adjacency list.

    Every k-mer found by sliding a window of length ``k`` over ``text``
    contributes one edge from its first k-1 characters (prefix) to its last
    k-1 characters (suffix). A k-mer that occurs several times contributes
    the same edge several times.

    Args:
        text: The string to slide the k-mer window over.
        k: The k-mer length; must be at least 1.

    Returns:
        A ``collections.defaultdict(list)`` mapping each prefix node to the
        sorted list of its successor nodes. It is empty when ``text`` is
        shorter than ``k``. Because it is a defaultdict, indexing a missing
        node with ``[]`` inserts that node with an empty list.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        # With k <= 0 the window slices below are empty or use negative
        # indices, which would silently produce a meaningless graph.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # defaultdict creates the key with an empty list when accessed (if not defined).
    adj_list = collections.defaultdict(list)

    for i in range(len(text) - k + 1):  # Sliding window over every k-mer
        kmer = text[i:i + k]
        # Edge: first k-1 characters -> last k-1 characters.
        adj_list[kmer[:-1]].append(kmer[1:])

    # Sort each successor list in place so the output order is deterministic.
    for neighbors in adj_list.values():
        neighbors.sort()

    return adj_list

def format_adjacency_list_for_printing(adj_list):
    """Render an adjacency list as text, one ``node -> a,b,c`` line per node.

    Nodes are emitted in sorted order. Each node's neighbors are joined with
    commas in the order they are stored (they are not re-sorted here).
    Returns an empty string for an empty adjacency list.
    """
    rendered = (
        f"{source} -> {','.join(adj_list[source])}"
        for source in sorted(adj_list)
    )
    return "\n".join(rendered)

if __name__ == "__main__":
    # The input file holds k on its first line and the text on its second.
    with open("../data/rosalind_ba3d.txt", 'r') as dataset:
        first_line = dataset.readline()
        second_line = dataset.readline()

    text = second_line.strip()
    k = int(first_line.strip())

    # Build the graph, then print it in the "node -> a,b" format that the
    # formatter produces.
    graph = construct_de_bruijn_graph(text, k)
    print(format_adjacency_list_for_printing(graph))


AAAACGGTGGC -> AAACGGTGGCC
AAAATTTAAAA -> AAATTTAAAAC
AAAATTTTAAC -> AAATTTTAACT
AAACCACGAGC -> AACCACGAGCC
AAACCCTTCTG -> AACCCTTCTGG
AAACGGTGGCC -> AACGGTGGCCG
AAACGTGATGG -> AACGTGATGGG
AAACTAATCCG -> AACTAATCCGT
AAACTATAAGC -> AACTATAAGCC
AAACTTAGATA -> AACTTAGATAA
AAAGATAATTA -> AAGATAATTAT
AAAGATATTGT -> AAGATATTGTC
AAAGCACCCAT -> AAGCACCCATA
AAAGCAGTGAG -> AAGCAGTGAGC
AAAGCCTTACA -> AAGCCTTACAG
AAAGTCCTTGG -> AAGTCCTTGGA
AAATCACGCTC -> AATCACGCTCC
AAATCTCGCGC -> AATCTCGCGCG
AAATTATACAA -> AATTATACAAT
AAATTTAAAAC -> AATTTAAAACG
AAATTTTAACT -> AATTTTAACTA
AACAAAGCACC -> ACAAAGCACCC
AACAAGGCCAG -> ACAAGGCCAGG
AACAAGGCCGT -> ACAAGGCCGTC
AACACCGTTCT -> ACACCGTTCTC
AACAGTGAAAT -> ACAGTGAAATC
AACATACTTAC -> ACATACTTACT
AACATGTCCGG -> ACATGTCCGGT
AACCACGAGCC -> ACCACGAGCCT
AACCCATAACA -> ACCCATAACAG
AACCCCCAATT -> ACCCCCAATTG
AACCCTATCTG -> ACCCTATCTGC
AACCCTTCTGG -> ACCCTTCTGGC
AACCGAAACCA -> ACCGAAACCAC
AACCGACCAAG -> ACCGACCAAGT
AACCGTCCGAT -> ACCGTCCGATC
AACCTTCGTAA -> ACCTTCGTAAC
A