**Method 1:**


*   STEP 1: READ SEQUENCE FROM FILE
*   STEP 2: GENERATE K-MERS
*   STEP 3: ASSEMBLY


---

**OUTPUT:**
 * DE BRUIJN GRAPH
 * PATH
 * FINAL ASSEMBLED STRING








In [None]:
import graphviz  
def clean_sequence(sequence):
    """
    Return only the nucleotide characters of ``sequence``.

    Every character other than uppercase ``A``, ``T``, ``G`` or ``C``
    (whitespace, newlines, lowercase letters, other symbols) is dropped;
    the characters that remain keep their original order.
    """
    allowed = 'ATGC'
    kept = [base for base in sequence if base in allowed]
    return ''.join(kept)

def read_sequence_from_file(file_path):
    """
    Load a DNA sequence from ``file_path`` and return it cleaned.

    The whole file is read as text and passed through ``clean_sequence``.
    If the file does not exist, an error message is printed and ``None``
    is returned instead.
    """
    try:
        with open(file_path, 'r') as handle:
            raw_text = handle.read()
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    return clean_sequence(raw_text)

def build_k_mer(seq, k):
    """
    Return every length-``k`` window of ``seq``, in left-to-right order.

    Consecutive windows overlap by ``k - 1`` characters. When ``k`` is
    larger than ``len(seq)`` the result is an empty list.
    """
    window_count = len(seq) - k + 1
    k_mers = []
    for offset in range(window_count):
        k_mers.append(seq[offset:offset + k])
    return k_mers

def debruijnize(reads):
    """
    Build a De Bruijn graph from k-mer reads.

    Each read contributes one directed edge from its prefix (all but the
    last character) to its suffix (all but the first character).

    Args:
        reads: iterable of k-mer strings. Reads shorter than two characters
            cannot form an edge between non-empty nodes and are skipped.

    Returns:
        A tuple ``(nodes, edges, starts)`` where ``nodes`` is the set of
        (k-1)-mers, ``edges`` is the list of ``(prefix, suffix)`` pairs in
        read order (duplicates kept), and ``starts`` is the sorted list of
        nodes that have more outgoing than incoming edges.
    """
    nodes = set()
    edges = []
    # Out-degree minus in-degree per node. A trail that uses every edge must
    # begin at a node with surplus outgoing edges, even if that node also has
    # incoming edges (e.g. when the first (k-1)-mer recurs later on).
    balance = {}
    for read in reads:
        if len(read) < 2:
            continue
        prefix = read[:-1]
        suffix = read[1:]
        nodes.add(prefix)
        nodes.add(suffix)
        edges.append((prefix, suffix))
        balance[prefix] = balance.get(prefix, 0) + 1
        balance[suffix] = balance.get(suffix, 0) - 1
    # Sorted so the candidate order does not depend on set/hash ordering.
    starts = sorted(node for node, surplus in balance.items() if surplus > 0)
    return (nodes, edges, starts)

def make_node_edge_map(edges):
    """
    Group edges by their source node.

    Returns a dict mapping each source node to the list of its target
    nodes, in the order the edges were given (duplicates kept).
    """
    successors = {}
    for edge in edges:
        successors.setdefault(edge[0], []).append(edge[1])
    return successors

def eulerian_trail(m, v):
    """
    Return a trail that starts at ``v`` and uses every reachable edge once.

    Implemented as Hierholzer's algorithm with an explicit stack: edges are
    followed until the current node has none left, and nodes are emitted
    while backtracking. This keeps the resulting node sequence contiguous
    even when a walk hits the dead-end node before all cycles through the
    start node have been taken.

    Args:
        m: dict mapping each node to the list of its successor nodes, one
            entry per edge. The dict is not modified.
        v: node to start from.

    Returns:
        The list of visited nodes, beginning with ``v``. If some edges
        cannot be reached from ``v``, diagnostics are printed and the trail
        over the reachable edges is returned.
    """
    # Work on a copy so the caller's map is left intact; empty successor
    # lists carry no edges and are dropped up front.
    remaining = {node: list(targets) for node, targets in m.items() if targets}
    stack = [v]
    reversed_trail = []
    while stack:
        current = stack[-1]
        targets = remaining.get(current)
        if targets:
            stack.append(targets.pop())
        else:
            # No unused edge leaves this node: it is final in the trail.
            remaining.pop(current, None)
            reversed_trail.append(stack.pop())
    result_trail = reversed_trail[::-1]
    # Keep showing the path on stdout, as the walk was printed before.
    print(result_trail)
    if remaining:
        print("error")
        print("result_trail", result_trail)
        print(remaining)
    return result_trail

def visualize_debruijn(G):
    """
    Render a De Bruijn graph as Graphviz DOT source.

    Args:
        G: tuple whose first item is the collection of node names and whose
            second item is the list of ``(source, target)`` edges.

    Returns:
        DOT text for a directed graph with one line per node (sorted by
        name, so the output is reproducible) followed by one line per edge
        in the given order.
    """
    def quoted(name):
        # Quoted DOT identifiers stay valid for empty names and names with
        # characters that a bare identifier does not allow.
        escaped = str(name).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    nodes = G[0]
    edges = G[1]
    lines = ['digraph "DeBruijn graph" {']
    for node in sorted(nodes):
        lines.append(f'    {quoted(node)} [label={quoted(node)}] ;')
    for src, dst in edges:
        lines.append(f'    {quoted(src)} -> {quoted(dst)} ;')
    lines.append('}')
    return '\n'.join(lines) + '\n'

def assemble_trail(trail):
    """
    Spell out the sequence described by a trail of overlapping nodes.

    The result is the first node without its last character, followed by
    the last character of every node in the trail. An empty trail gives
    an empty string.
    """
    if not trail:
        return ""
    prefix = trail[0][:-1]
    last_chars = [node[-1] for node in trail]
    return prefix + ''.join(last_chars)

def test_assembly_debruijn(t, k):
    """
    Split ``t`` into k-mers, build the De Bruijn graph and reassemble it.

    The graph tuple and its DOT source are printed along the way.

    Args:
        t: the sequence to split and reassemble.
        k: k-mer length.

    Returns:
        The string assembled from the trail through the graph, or an empty
        string when ``t`` yields no k-mers (``k`` longer than ``t``).
    """
    reads = build_k_mer(t, k)
    if not reads:
        # Nothing to assemble; avoids picking a start from an empty graph.
        return ""
    G = debruijnize(reads)
    v = visualize_debruijn(G)
    nemap = make_node_edge_map(G[1])
    print(G)
    print(v)
    # The k-mers are in sequence order, so when the graph reports no start
    # candidate the prefix of the first k-mer is where the sequence begins.
    start = G[2][0] if G[2] else reads[0][:-1]
    trail = eulerian_trail(nemap, start)
    return assemble_trail(trail)


# Method 1 driver: read a sequence from a file, split it into k-mers,
# build the De Bruijn graph, render it and print the assembled sequence.
file_path = input("Enter the file path containing the DNA sequence: ")
k = int(input("Enter the K-mer size: "))
sequence = read_sequence_from_file(file_path)

if sequence:
    print("Cleaned Sequence:", sequence)

    if k < 2 or k > len(sequence):
        # k-mers shorter than 2 give empty graph nodes, and k longer than
        # the sequence gives no k-mers at all.
        print(f"Error: K-mer size must be between 2 and {len(sequence)}.")
    else:
        # Generate K-mers
        reads = build_k_mer(sequence, k)
        print("K-mers:", reads)

        # De Bruijn Graph
        G = debruijnize(reads)

        # Node-Edge Map
        m = make_node_edge_map(G[1])

        # Starting point: G[0] is a set and cannot be indexed, so when no
        # start candidate is reported fall back to the prefix of the first
        # k-mer, which is where the sequence begins.
        start = G[2][0] if G[2] else reads[0][:-1]

        # Eulerian Trail
        t = eulerian_trail(m, start)

        # Visualize the graph with graphviz
        dot_str = visualize_debruijn(G)
        graph = graphviz.Source(dot_str)
        graph.render('debruijn_graph', format='png', view=True)

        # Assemble the sequence
        assembled_sequence = assemble_trail(t)
        print("Assembled sequence:", assembled_sequence)
else:
    print("No valid sequence found. Please check the file.")

**Method 2:**


*   STEP 1: READ K-MERS FROM FILE
*   STEP 2: ASSEMBLY


---

**OUTPUT:**
 * DE BRUIJN GRAPH
 * PATH
 * FINAL ASSEMBLED STRING








In [None]:
import graphviz  

def clean_sequence(sequence):
    """
    Return only the nucleotide characters of ``sequence``.

    Every character other than uppercase ``A``, ``T``, ``G`` or ``C``
    (whitespace, newlines, lowercase letters, other symbols) is dropped;
    the characters that remain keep their original order.
    """
    allowed = 'ATGC'
    kept = [base for base in sequence if base in allowed]
    return ''.join(kept)

def read_reads_from_file(file_path):
    """
    Read DNA reads from a file, one read per line.

    Surrounding whitespace is stripped from every line and lines that are
    blank after stripping are skipped, so empty lines in the file do not
    turn into empty reads. No other cleaning is applied to the reads.

    Args:
        file_path: path of the text file to read.

    Returns:
        The list of reads in file order, or ``None`` (after printing an
        error message) when the file does not exist.
    """
    try:
        with open(file_path, 'r') as file:
            stripped_lines = (line.strip() for line in file)
            reads = [read for read in stripped_lines if read]
        return reads
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None

def debruijnize(reads):
    """
    Build a De Bruijn graph from k-mer reads.

    Each read contributes one directed edge from its prefix (all but the
    last character) to its suffix (all but the first character).

    Args:
        reads: iterable of k-mer strings. Reads shorter than two characters
            cannot form an edge between non-empty nodes and are skipped.

    Returns:
        A tuple ``(nodes, edges, starts)`` where ``nodes`` is the set of
        (k-1)-mers, ``edges`` is the list of ``(prefix, suffix)`` pairs in
        read order (duplicates kept), and ``starts`` is the sorted list of
        nodes that have more outgoing than incoming edges.
    """
    nodes = set()
    edges = []
    # Out-degree minus in-degree per node. A trail that uses every edge must
    # begin at a node with surplus outgoing edges, even if that node also has
    # incoming edges (e.g. when the first (k-1)-mer recurs later on).
    balance = {}
    for read in reads:
        if len(read) < 2:
            continue
        prefix = read[:-1]
        suffix = read[1:]
        nodes.add(prefix)
        nodes.add(suffix)
        edges.append((prefix, suffix))
        balance[prefix] = balance.get(prefix, 0) + 1
        balance[suffix] = balance.get(suffix, 0) - 1
    # Sorted so the candidate order does not depend on set/hash ordering.
    starts = sorted(node for node, surplus in balance.items() if surplus > 0)
    return (nodes, edges, starts)

def make_node_edge_map(edges):
    """
    Group edges by their source node.

    Returns a dict mapping each source node to the list of its target
    nodes, in the order the edges were given (duplicates kept).
    """
    successors = {}
    for edge in edges:
        successors.setdefault(edge[0], []).append(edge[1])
    return successors

def eulerian_trail(m, v):
    """
    Return a trail that starts at ``v`` and uses every reachable edge once.

    Implemented as Hierholzer's algorithm with an explicit stack: edges are
    followed until the current node has none left, and nodes are emitted
    while backtracking. This keeps the resulting node sequence contiguous
    even when a walk hits the dead-end node before all cycles through the
    start node have been taken (which happens when reads arrive in
    arbitrary order).

    Args:
        m: dict mapping each node to the list of its successor nodes, one
            entry per edge. The dict is not modified.
        v: node to start from.

    Returns:
        The list of visited nodes, beginning with ``v``. If some edges
        cannot be reached from ``v``, diagnostics are printed and the trail
        over the reachable edges is returned.
    """
    # Work on a copy so the caller's map is left intact; empty successor
    # lists carry no edges and are dropped up front.
    remaining = {node: list(targets) for node, targets in m.items() if targets}
    stack = [v]
    reversed_trail = []
    while stack:
        current = stack[-1]
        targets = remaining.get(current)
        if targets:
            stack.append(targets.pop())
        else:
            # No unused edge leaves this node: it is final in the trail.
            remaining.pop(current, None)
            reversed_trail.append(stack.pop())
    result_trail = reversed_trail[::-1]
    # Keep showing the path on stdout, as the walk was printed before.
    print(result_trail)
    if remaining:
        print("error")
        print("result_trail", result_trail)
        print(remaining)
    return result_trail

def visualize_debruijn(G):
    """
    Render a De Bruijn graph as Graphviz DOT source.

    Args:
        G: tuple whose first item is the collection of node names and whose
            second item is the list of ``(source, target)`` edges.

    Returns:
        DOT text for a directed graph with one line per node (sorted by
        name, so the output is reproducible) followed by one line per edge
        in the given order.
    """
    def quoted(name):
        # Reads in this cell are only whitespace-stripped, so node names may
        # hold characters a bare DOT identifier does not allow; quoting
        # keeps the output valid for those and for empty names.
        escaped = str(name).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    nodes = G[0]
    edges = G[1]
    lines = ['digraph "DeBruijn graph" {']
    for node in sorted(nodes):
        lines.append(f'    {quoted(node)} [label={quoted(node)}] ;')
    for src, dst in edges:
        lines.append(f'    {quoted(src)} -> {quoted(dst)} ;')
    lines.append('}')
    return '\n'.join(lines) + '\n'

def assemble_trail(trail):
    """
    Spell out the sequence described by a trail of overlapping nodes.

    The result is the first node without its last character, followed by
    the last character of every node in the trail. An empty trail gives
    an empty string.
    """
    if not trail:
        return ""
    prefix = trail[0][:-1]
    last_chars = [node[-1] for node in trail]
    return prefix + ''.join(last_chars)

def test_assembly_debruijn(reads):
    """
    Assemble a sequence directly from k-mer reads.

    Builds the De Bruijn graph, walks a trail through it, renders the graph
    to ``debruijn_graph.png`` with graphviz and prints the assembled
    sequence.

    Args:
        reads: list of k-mer strings.

    Returns:
        The assembled sequence, or an empty string when the reads produce
        no edges.
    """
    # Build De Bruijn graph from reads directly
    G = debruijnize(reads)
    edges = G[1]
    if not edges:
        print("No edges could be built from the reads; nothing to assemble.")
        return ""

    # Node-Edge Map
    m = make_node_edge_map(edges)

    # Starting point. G[0] is a set and cannot be indexed, so when the graph
    # reports no start candidate, look for a node with more outgoing than
    # incoming edges; if every node is balanced the trail is a cycle and the
    # source of the first edge is as good a start as any.
    if G[2]:
        start = G[2][0]
    else:
        balance = {}
        for src, dst in edges:
            balance[src] = balance.get(src, 0) + 1
            balance[dst] = balance.get(dst, 0) - 1
        start = next(
            (node for node, surplus in balance.items() if surplus > 0),
            edges[0][0],
        )

    # Eulerian Trail
    t = eulerian_trail(m, start)

    # Visualize the graph with graphviz
    dot_str = visualize_debruijn(G)
    graph = graphviz.Source(dot_str)
    graph.render('debruijn_graph', format='png', view=True)

    # Assemble the sequence
    assembled_sequence = assemble_trail(t)
    print("Assembled sequence:", assembled_sequence)
    return assembled_sequence


# Method 2 driver: ask for a file of reads and assemble them.
file_path = input("Enter the file path containing the DNA reads: ")
reads = read_reads_from_file(file_path)

if not reads:
    # Covers both a missing file (None) and a file with no reads.
    print("No valid reads found. Please check the file.")
else:
    print("Reads:", reads)

    # Run the assembly on the loaded reads
    test_assembly_debruijn(reads)
