In [None]:
import graphviz

def clean_sequence(sequence):
    """
    Return the sequence with every character other than A, T, G or C removed.

    The comparison is case-sensitive, so lowercase bases are removed too.
    """
    allowed_bases = 'ATGC'
    return ''.join(base for base in sequence if base in allowed_bases)

def read_reads_from_file(file_path):
    """
    Read DNA reads from a text file, one read per line.

    Every line is stripped of surrounding whitespace. Header lines (those
    starting with '>') and blank lines are skipped, so the returned list
    never contains empty reads.

    Parameters:
        file_path (str): Path of the file to read.

    Returns:
        list or None: The reads in file order, or None when the file does
        not exist (an error message is printed in that case).
    """
    try:
        with open(file_path, 'r') as file:
            # Strip before testing so whitespace-only lines and indented
            # headers are recognised and dropped.
            stripped_lines = (line.strip() for line in file)
            reads = [line for line in stripped_lines
                     if line and not line.startswith('>')]
        return reads
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None

def generate_kmers(reads, k):
    """
    Collect every length-k window of every read.

    Windows are listed read by read, left to right; reads shorter than k
    contribute nothing.
    """
    return [
        read[offset:offset + k]
        for read in reads
        for offset in range(len(read) - k + 1)
    ]

def debruijnize(kmers):
    """
    Build a De Bruijn graph from a list of k-mers.

    Each k-mer contributes one edge from its prefix (all but the last
    character) to its suffix (all but the first character).

    Returns a tuple (nodes, edges, starts): the set of nodes, the list of
    (prefix, suffix) edges in k-mer order, and the list of nodes that never
    appear as an edge destination.
    """
    nodes, targets = set(), set()
    edges = []
    for kmer in kmers:
        prefix, suffix = kmer[:-1], kmer[1:]
        nodes.update((prefix, suffix))
        edges.append((prefix, suffix))
        targets.add(suffix)
    starts = nodes.difference(targets)
    return nodes, edges, list(starts)

def make_node_edge_map(edges):
    """
    Group edge destinations by their source node.

    Returns a dict mapping each source node to the list of its destination
    nodes, in the order the edges were given.
    """
    successors = {}
    for edge in edges:
        successors.setdefault(edge[0], []).append(edge[1])
    return successors

def eulerian_trail(m, v):
    """
    Find an Eulerian trail in the graph.

    Starting at v, the function walks along unused edges until the walk
    returns to the node it started from or reaches a node with no unused
    outgoing edge. That walk is spliced into the result right after the
    first occurrence of the node it started from. The process repeats from
    the first node already on the trail that still has unused edges, until
    no edge is left.

    Parameters:
        m (dict): Maps each node to the list of its successor nodes, one
            entry per edge. The mapping is not modified.
        v: Node at which the trail starts.

    Returns:
        list: The nodes in visiting order. When leftover edges cannot be
        reached from the trail, a diagnostic is printed and the partial
        trail is returned.
    """
    # Work on a private copy so the caller's adjacency map stays intact.
    # Nodes with an empty successor list are skipped: membership in nemap
    # must mean "still has an unused outgoing edge".
    nemap = {node: list(successors) for node, successors in m.items() if successors}
    result_trail = [v]
    start = v
    while True:
        trail = []
        previous = start
        while previous in nemap:
            successor = nemap[previous].pop()
            if not nemap[previous]:
                del nemap[previous]
            trail.append(successor)
            if successor == start:
                # The walk closed a cycle; stop so it can be spliced in.
                break
            previous = successor
        # Show the sub-trail found in this pass.
        print(trail)
        index = result_trail.index(start)
        result_trail[index + 1:index + 1] = trail
        if not nemap:
            break
        # Continue from the first node on the trail that still has edges.
        for n in result_trail:
            if n in nemap:
                start = n
                break
        else:
            print("Error: Eulerian path issue")
            print("Result Trail:", result_trail)
            print("Remaining Edges:", nemap)
            break
    return result_trail

def visualize_debruijn(G):
    """
    Generate a Graphviz (DOT) representation of the De Bruijn graph.

    Node identifiers are written as double-quoted DOT strings, so nodes
    containing characters that are not valid in a bare DOT identifier
    (for example '-' or '*'), or empty nodes, still produce valid DOT.

    Parameters:
        G (tuple): Graph whose first item is the collection of nodes and
            whose second item is the list of (source, destination) edges.

    Returns:
        str: DOT source with one statement per node followed by one
        statement per edge.
    """
    def quote(name):
        # Inside a DOT quoted string only the double quote needs escaping.
        return '"' + str(name).replace('"', '\\"') + '"'

    nodes = G[0]
    edges = G[1]
    # Collect the pieces and join once instead of repeated concatenation.
    lines = ['digraph "DeBruijn graph" {\n']
    lines.extend(f'    {quote(node)} [label={quote(node)}] ;\n' for node in nodes)
    lines.extend(f'    {quote(src)} -> {quote(dst)} ;\n' for src, dst in edges)
    lines.append('}\n')
    return ''.join(lines)

def assemble_trail(trail):
    """
    Spell out the sequence described by a trail of overlapping nodes.

    The result is the first node without its last character, followed by
    the last character of every node on the trail. An empty trail gives "".
    """
    if len(trail) > 0:
        prefix = trail[0][:-1]
        return prefix + ''.join(node[-1] for node in trail)
    return ""

def test_assembly_debruijn(reads, k):
    """
    Run the DNA sequence assembly using a De Bruijn graph approach.

    Builds the graph from the k-mers of the reads, walks an Eulerian trail
    through it, renders the graph with Graphviz (PNG, output name
    'debruijn_graph', viewer opened) and prints the assembled sequence.

    Parameters:
        reads (list): DNA reads as strings.
        k (int): k-mer size.

    Returns:
        str: The assembled sequence, or "" when the reads yield no k-mers.
    """
    # Generate k-mers from reads
    kmers = generate_kmers(reads, k)
    if not kmers:
        # An empty graph has no node to start a trail from.
        print("No k-mers could be generated. Check the reads and the k-mer size.")
        return ""

    # Build De Bruijn graph from k-mers
    G = debruijnize(kmers)
    nodes, edges, sources = G

    # Node-Edge Map
    m = make_node_edge_map(edges)

    # Starting point: prefer a node without incoming edges. Otherwise an
    # Eulerian path must begin at the node with one more outgoing than
    # incoming edge; if every node is balanced (a cycle) any node works.
    if sources:
        start = sources[0]
    else:
        in_degree = {}
        for _, dst in edges:
            in_degree[dst] = in_degree.get(dst, 0) + 1
        start = next(
            (node for node in nodes
             if len(m.get(node, ())) - in_degree.get(node, 0) == 1),
            next(iter(nodes)),
        )

    # Eulerian Trail
    t = eulerian_trail(m, start)

    # Visualize the graph with Graphviz
    dot_str = visualize_debruijn(G)
    graph = graphviz.Source(dot_str)
    graph.render('debruijn_graph', format='png', view=True)

    # Assemble the sequence
    assembled_sequence = assemble_trail(t)
    print("Assembled sequence:", assembled_sequence)
    return assembled_sequence

# Main Execution
# Prompts for the reads file and the k-mer size, then runs the assembly.
file_path = input("Enter the file path containing the DNA reads: ")
try:
    k = int(input("Enter the k-mer size: "))
except ValueError:
    # Non-numeric input: report it below instead of crashing with a traceback.
    k = None

if k is None or k < 2:
    # A De Bruijn graph needs k >= 2 so that nodes ((k-1)-mers) are non-empty.
    print("Error: the k-mer size must be an integer of at least 2.")
else:
    reads = read_reads_from_file(file_path)

    if reads:
        test_assembly_debruijn(reads, k)
    else:
        print("No valid reads found. Please check the file.")


Enter the file path containing the DNA reads: /content/seq.fa
Enter the k-mer size: 4
['AGC', 'GCT', 'CTA', 'TAG']
Assembled sequence: TAGCTAG


In [None]:
import graphviz

def clean_sequence(sequence):
    """
    Return the sequence with every character other than A, T, G or C removed.

    The comparison is case-sensitive, so lowercase bases are removed too.
    """
    allowed_bases = 'ATGC'
    return ''.join(base for base in sequence if base in allowed_bases)

def read_reads_from_file(file_path):
    """
    Read DNA reads from a text file, one read per line.

    Every line is stripped of surrounding whitespace. Header lines (those
    starting with '>') and blank lines are skipped, so the returned list
    never contains empty reads.

    Parameters:
        file_path (str): Path of the file to read.

    Returns:
        list or None: The reads in file order, or None when the file does
        not exist (an error message is printed in that case).
    """
    try:
        with open(file_path, 'r') as file:
            # Strip before testing so whitespace-only lines and indented
            # headers are recognised and dropped.
            stripped_lines = (line.strip() for line in file)
            reads = [line for line in stripped_lines
                     if line and not line.startswith('>')]
        return reads
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None

def generate_kmers(reads, k):
    """
    Collect every length-k window of every read.

    Windows are listed read by read, left to right; reads shorter than k
    contribute nothing.
    """
    return [
        read[offset:offset + k]
        for read in reads
        for offset in range(len(read) - k + 1)
    ]

def debruijnize(kmers):
    """
    Build a De Bruijn graph from a list of k-mers.

    Each k-mer contributes one edge from its prefix (all but the last
    character) to its suffix (all but the first character).

    Returns a tuple (nodes, edges, starts): the set of nodes, the list of
    (prefix, suffix) edges in k-mer order, and the list of nodes that never
    appear as an edge destination.
    """
    nodes, targets = set(), set()
    edges = []
    for kmer in kmers:
        prefix, suffix = kmer[:-1], kmer[1:]
        nodes.update((prefix, suffix))
        edges.append((prefix, suffix))
        targets.add(suffix)
    starts = nodes.difference(targets)
    return nodes, edges, list(starts)

def make_node_edge_map(edges):
    """
    Group edge destinations by their source node.

    Returns a dict mapping each source node to the list of its destination
    nodes, in the order the edges were given.
    """
    successors = {}
    for edge in edges:
        successors.setdefault(edge[0], []).append(edge[1])
    return successors

def eulerian_trail(graph, start):
    """
    Find an Eulerian path in a directed graph using Hierholzer's algorithm.

    Parameters:
        graph (dict): Maps each node to the list of its successor nodes,
            one entry per edge. The mapping is not modified.
        start (str): The starting node for traversal.

    Returns:
        list: The path as a list of nodes, beginning with start. A start
        node without outgoing edges yields [start].
    """
    # Edges are consumed during the walk; use a private copy so the
    # caller's adjacency lists are left untouched.
    remaining = {node: list(successors) for node, successors in graph.items()}
    stack = [start]  # Nodes of the walk currently being extended
    path = []  # Finished nodes, collected in reverse order

    while stack:
        node = stack[-1]
        successors = remaining.get(node)
        if successors:
            # Follow an unused edge out of the current node.
            stack.append(successors.pop())
        else:
            # Dead end: the node is complete, so emit it and backtrack.
            path.append(stack.pop())

    return path[::-1]

def visualize_debruijn(G):
    """
    Generate a Graphviz (DOT) representation of the De Bruijn graph.

    Node identifiers are written as double-quoted DOT strings, so nodes
    containing characters that are not valid in a bare DOT identifier
    (for example '-' or '*'), or empty nodes, still produce valid DOT.

    Parameters:
        G (tuple): Graph whose first item is the collection of nodes and
            whose second item is the list of (source, destination) edges.

    Returns:
        str: DOT source with one statement per node followed by one
        statement per edge.
    """
    def quote(name):
        # Inside a DOT quoted string only the double quote needs escaping.
        return '"' + str(name).replace('"', '\\"') + '"'

    nodes = G[0]
    edges = G[1]
    # Collect the pieces and join once instead of repeated concatenation.
    lines = ['digraph "DeBruijn graph" {\n']
    lines.extend(f'    {quote(node)} [label={quote(node)}] ;\n' for node in nodes)
    lines.extend(f'    {quote(src)} -> {quote(dst)} ;\n' for src, dst in edges)
    lines.append('}\n')
    return ''.join(lines)

def assemble_trail(trail):
    """
    Spell out the sequence described by a trail of overlapping nodes.

    The result is the first node without its last character, followed by
    the last character of every node on the trail. An empty trail gives "".
    """
    if len(trail) > 0:
        prefix = trail[0][:-1]
        return prefix + ''.join(node[-1] for node in trail)
    return ""

def test_assembly_debruijn(reads, k):
    """
    Run the DNA sequence assembly using a De Bruijn graph approach.

    Builds the graph from the k-mers of the reads, walks an Eulerian trail
    through it, renders the graph with Graphviz (PNG, output name
    'debruijn_graph', viewer opened) and prints the assembled sequence.

    Parameters:
        reads (list): DNA reads as strings.
        k (int): k-mer size.

    Returns:
        str: The assembled sequence, or "" when the reads yield no k-mers.
    """
    # Generate k-mers from reads
    kmers = generate_kmers(reads, k)
    if not kmers:
        # An empty graph has no node to start a trail from.
        print("No k-mers could be generated. Check the reads and the k-mer size.")
        return ""

    # Build De Bruijn graph from k-mers
    G = debruijnize(kmers)
    nodes, edges, sources = G

    # Node-Edge Map
    m = make_node_edge_map(edges)

    # Starting point: prefer a node without incoming edges. Otherwise an
    # Eulerian path must begin at the node with one more outgoing than
    # incoming edge; if every node is balanced (a cycle) any node works.
    if sources:
        start = sources[0]
    else:
        in_degree = {}
        for _, dst in edges:
            in_degree[dst] = in_degree.get(dst, 0) + 1
        start = next(
            (node for node in nodes
             if len(m.get(node, ())) - in_degree.get(node, 0) == 1),
            next(iter(nodes)),
        )

    # Eulerian Trail
    t = eulerian_trail(m, start)

    # Visualize the graph with Graphviz
    dot_str = visualize_debruijn(G)
    graph = graphviz.Source(dot_str)
    graph.render('debruijn_graph', format='png', view=True)

    # Assemble the sequence
    assembled_sequence = assemble_trail(t)
    print("Assembled sequence:", assembled_sequence)
    return assembled_sequence

# Main Execution
# Prompts for the reads file and the k-mer size, then runs the assembly.
file_path = input("Enter the file path containing the DNA reads: ")
try:
    k = int(input("Enter the k-mer size: "))
except ValueError:
    # Non-numeric input: report it below instead of crashing with a traceback.
    k = None

if k is None or k < 2:
    # A De Bruijn graph needs k >= 2 so that nodes ((k-1)-mers) are non-empty.
    print("Error: the k-mer size must be an integer of at least 2.")
else:
    reads = read_reads_from_file(file_path)

    if reads:
        test_assembly_debruijn(reads, k)
    else:
        print("No valid reads found. Please check the file.")


Enter the file path containing the DNA reads: /content/seq.fa
Enter the k-mer size: 4
Assembled sequence: TAGCTAG
