## Chapter 7: Combinatorics Algo (Genome Rearrangement)



Synteny blocks are regions of chromosomes that share a common order of homologous genes.
Genome hotspots: genome loci that are more susceptible to genome rearrangement.

N reversals give 2N reversal cuts and 2N+1 synteny blocks.
Random breakage model: Genome rearrangement in simulation (freq of synteny block size histogram) shows exponential dist., suggesting that genome hotspots are random


parsimonious evolutionary scenario: shortest path to evolution between two species (Occam's razor principle) -> minimum number of reversals

Each synteny block given x number such that abs(x)>0, depending on direction of block. Since 0 isn't neg or pos, 1-based indexing

Reversal distance between permutations P and Q, denoted $d_{rev}(P, Q)$: the minimum number of reversals required to transform P into Q.

Identity permutation: the permutation with every block positive and in increasing order, (+1 +2 ... +n).


In [2]:
import math

### Greedy Sorting

`Greedy Sorting`

In [4]:
from typing import List, Dict, Iterable, Tuple

In [65]:
def k_sorting_reversal(P: List[int], k) -> List[int]:
    """Apply the reversal that moves block k+1 to index k.

    Args:
        P: Signed permutation.
        k: Zero-based index that should receive the block whose absolute
            value is ``k + 1``.

    Returns:
        A new list made of ``P[:k]`` unchanged, then ``P[k..j]`` in reversed
        order with every sign flipped, then ``P[j+1:]`` unchanged, where
        ``j`` is the index of the first element with absolute value
        ``k + 1``.

    Raises:
        ValueError: If no element of P has absolute value ``k + 1``.
    """
    magnitudes = [abs(block) for block in P]
    j = magnitudes.index(k + 1)  # current position of block k+1
    flipped = [-block for block in reversed(P[k:j + 1])]
    return P[:k] + flipped + P[j + 1:]
# P = arr till k + neg of curr P[k->j] in flipped order + rest arr after j

In [64]:
def GreedySorting(P: List[int]) -> List[List[int]]:
    """Sort a signed permutation greedily, recording every intermediate step.

    Positions are fixed from left to right. For index k, one reversal brings
    the block with absolute value k+1 into place; if it lands with a negative
    sign, a second reversal of that single element flips it.

    Args:
        P: Signed permutation to sort.

    Returns:
        The permutation after each reversal, in the order the reversals were
        applied. Empty when no reversal was needed.
    """
    steps = []
    current = P
    for k in range(len(current)):
        target = k + 1
        if current[k] != target:
            # Bring +/-(k+1) to index k.
            current = k_sorting_reversal(current, k)
            steps.append(current)
        if current[k] == -target:
            # Right block, wrong orientation: flip it on its own.
            current = k_sorting_reversal(current, k)
            steps.append(current)
    return steps

In [66]:
P = [-3,+4,+1,+5,-2]
GreedySorting(P)

[[-1, -4, 3, 5, -2],
 [1, -4, 3, 5, -2],
 [1, 2, -5, -3, 4],
 [1, 2, 3, 5, 4],
 [1, 2, 3, -4, -5],
 [1, 2, 3, 4, -5],
 [1, 2, 3, 4, 5]]

### Breakpoints
adjacency pair if p_(i+1) − p_i = 1 for instance (+3,+4) or (-10,-9) -> they are desirable because they are sorted ish
if pair is not adjacency pair then it is counted as a breakpoint

In [5]:
def BreakpointCount(P: List[int]) -> int:
    """Count the breakpoints of a signed permutation.

    The permutation is framed with 0 on the left and ``len(P) + 1`` on the
    right. Every consecutive pair whose difference is not exactly 1 is a
    breakpoint.

    Args:
        P: Signed permutation.

    Returns:
        Number of consecutive pairs in the framed permutation that are not
        adjacencies.
    """
    framed = [0] + P + [len(P) + 1]
    return sum(1 for left, right in zip(framed, framed[1:]) if right - left != 1)

In [6]:
P = [+3,+4,+5,-12,-8,-7,-6,+1,+2,+10,+9,-11,+13,+14]
BreakpointCount(P)

8

### Genome Graph rearrangement

Chromosome = [+1,+2,-3] representation
cycle = [1,2,3,4,6,5] representation

In [10]:
def ChromosomeToCycle(Chromosome: List[int]) -> List[int]:
    """Expand a chromosome of signed blocks into its sequence of node labels.

    A positive block ``i`` contributes the nodes ``2i-1, 2i``; any other
    block contributes ``-2i, -2i-1`` (i.e. ``2|i|, 2|i|-1`` for a negative
    block).

    Args:
        Chromosome: Signed synteny blocks in chromosome order.

    Returns:
        A list of ``2 * len(Chromosome)`` node labels.
    """
    nodes = []
    for block in Chromosome:
        if block > 0:  # block points to the right
            nodes += [2 * block - 1, 2 * block]
        else:  # block points to the left
            nodes += [-2 * block, -2 * block - 1]
    return nodes

In [122]:
ChromosomeToCycle([1,-2,-3,4])

1 1
[0, 1, 2, 0, 0, 0, 0, 0, 0]
2 -2
[0, 1, 2, 4, 3, 0, 0, 0, 0]
3 -3
[0, 1, 2, 4, 3, 6, 5, 0, 0]
4 4
[0, 1, 2, 4, 3, 6, 5, 7, 8]


[1, 2, 4, 3, 6, 5, 7, 8]

In [9]:
def CycleToChromosome(Nodes: List[int]) -> List[int]:
    """Collapse a sequence of node labels back into signed synteny blocks.

    Nodes are read in consecutive pairs. When the first node of a pair is
    smaller than the second, the block points right and equals half the
    second node; otherwise it points left and equals minus half the first
    node.

    Halving uses integer division so the result stays exact for arbitrarily
    large labels (float division loses precision above 2**53).

    Args:
        Nodes: Positive node labels; expected to have even length.

    Returns:
        One signed block per pair of nodes.

    Raises:
        IndexError: If Nodes has odd length, so the last pair is incomplete.
    """
    chromosome = []
    for start in range(0, len(Nodes), 2):
        first, second = Nodes[start], Nodes[start + 1]
        if first < second:  # pointing to the right, e.g. (3, 4)
            chromosome.append(second // 2)
        else:  # pointing to the left, e.g. (4, 3)
            chromosome.append(-(first // 2))
    return chromosome

In [296]:
CycleToChromosome([7,9,10,12,11,8])

[0, 4, 0, 0]
[0, 4, 6, 0]
[0, 4, 6, -5]


[4, 6, -5]

ColoredEdges(P)
     Edges ← an empty set
     for each chromosome Chromosome in P
          Nodes ← ChromosomeToCycle(Chromosome)
          for j ← 1 to |Chromosome|
               add the edge (Nodes_{2j}, Nodes_{2j+1}) to Edges
     return Edges

In [16]:
                    #list of list
def ColoredEdges(P: List[List[int]]) -> List[Tuple[int, int]]:
    """Collect the colored edges of a genome with circular chromosomes.

    For each chromosome, the node sequence from ChromosomeToCycle is read
    with 1-based indexing and the edge (node 2j, node 2j+1) is emitted for
    every block j; the last block's edge wraps around to node 1. The
    chromosome and its padded node list are printed along the way.

    Args:
        P: Genome given as a list of chromosomes of signed blocks.

    Returns:
        Colored edges as (node, node) tuples, chromosome by chromosome.
    """
    edges = []
    for chromosome in P:
        print(chromosome)
        padded = [0] + ChromosomeToCycle(chromosome)  # dummy 0 gives 1-based indexing
        print('Nodes',padded)
        size = len(padded)
        for j in range(1, len(chromosome) + 1):
            following = 2 * j + 1
            # Past the end means the chromosome closes back onto its first node.
            partner = padded[following] if following < size else padded[1]
            edges.append((padded[2 * j], partner))
    return edges

In [17]:
ColoredEdges([[+1,-2,-3],[+4,+5,-6]])

[1, -2, -3]
Nodes [0, 1, 2, 4, 3, 6, 5]
[4, 5, -6]
Nodes [0, 7, 8, 9, 10, 12, 11]


[(2, 4), (3, 6), (5, 1), (8, 9), (10, 12), (11, 7)]

#### Two break distance

a 2-break applied to a multichromosomal genome P can increase Cycles(P, Q) by at most 1
The 2-break distance between genomes P and Q is equal to Blocks(P, Q)− Cycles(P, Q)

In [236]:
def find_disconnected_components(edges):
    """Find the connected components of an undirected graph.

    Args:
        edges: Iterable of (u, v) pairs. Repeated edges are allowed.

    Returns:
        A tuple ``(component_count, components)`` where ``components`` is a
        list with one list of vertices per connected component, in the order
        the vertices were reached by the depth-first search.
    """
    # Adjacency list; each edge is stored in both directions (undirected).
    graph = {}
    for u, v in edges:
        graph.setdefault(u, []).append(v)
        graph.setdefault(v, []).append(u)

    visited = set()
    components = []

    for start in graph:
        if start in visited:
            continue
        # Each unvisited start vertex opens exactly one new component.
        component = []
        stack = [start]
        while stack:
            vertex = stack.pop()
            if vertex in visited:
                continue
            visited.add(vertex)
            component.append(vertex)
            stack.extend(
                neighbor for neighbor in graph[vertex] if neighbor not in visited
            )
        components.append(component)

    return len(components), components


In [202]:
def blocks(P: List[List[int]], Q: List[List[int]])-> int:
    """Return the number of synteny blocks in genome P.

    The per-chromosome block counts of P are printed before being summed.

    Args:
        P: Genome given as a list of chromosomes.
        Q: Second genome; not read by this function.

    Returns:
        Total number of blocks over all chromosomes of P.
    """
    sizes = [len(chromosome) for chromosome in P]
    print(sizes)
    return sum(sizes)

In [208]:
def cycles(P: List[List[int]], Q: List[List[int]])-> int:
    """Count the cycles of the breakpoint graph formed by genomes P and Q.

    The colored edges of both genomes are merged into one undirected graph;
    each connected component of that graph is one alternating cycle. The two
    edge lists are printed before counting.

    Args:
        P: First genome, a list of circular chromosomes.
        Q: Second genome on the same set of synteny blocks.

    Returns:
        Number of connected components of the merged edge set.
    """
    P_edges = ColoredEdges(P)
    Q_edges = ColoredEdges(Q)
    print(P_edges,Q_edges)
    # find_disconnected_components returns (count, components); only the
    # count is the integer that TwoBreakDistance can subtract from blocks().
    component_count, _ = find_disconnected_components(P_edges + Q_edges)
    return component_count


In [190]:
def TwoBreakDistance(P: List[List[int]], Q: List[List[int]]) -> int:
    """Return the 2-break distance, Blocks(P, Q) - Cycles(P, Q).

    Args:
        P: First genome, a list of circular chromosomes.
        Q: Second genome on the same set of synteny blocks.

    Returns:
        The result of ``blocks(P, Q)`` minus the result of ``cycles(P, Q)``.
    """
    block_total = blocks(P, Q)
    cycle_total = cycles(P, Q)
    return block_total - cycle_total

In [209]:
P = [[1,-2,-3,4]]
Q = [[1,3,2,-4]]
TwoBreakDistance(P,Q)

[4]
[1, -2, -3, 4]
Nodes [0, 1, 2, 4, 3, 6, 5, 7, 8]
[1, 3, 2, -4]
Nodes [0, 1, 2, 5, 6, 3, 4, 8, 7]
[(2, 4), (3, 6), (5, 7), (8, 1)] [(2, 5), (6, 3), (4, 8), (7, 1)]


2

In [None]:
def detectGraphCycles(genomeEdges: List[Tuple[int, int]], blackEdges: List[Tuple[int, int]]):
    """Find cycles that alternate between colored and black edges.

    The search is an explicit-stack depth-first traversal, so its depth is
    not bounded by the interpreter's recursion limit. Nodes are expanded in
    the same order as a recursive DFS that visits neighbours left to right.

    Args:
        genomeEdges: Colored edges as (node, node) pairs.
        blackEdges: Black edges as (node, node) pairs.

    Returns:
        A list of cycles. Each cycle is the list of nodes walked from its
        start node, without the repeated start node at the end.

    Note:
        ``createEdgeMaps`` is not defined in this block. It is used here as
        returning four mappings (colored, black, reverse colored, reverse
        black) from a node to an iterable of neighbour nodes.
        NOTE(review): confirm against its definition.
    """
    # Direct and inverse lookups for both edge types.
    coloredEdgesMap, blackEdgesMap, reverseColoredEdgesMap, reverseBlackEdgesMap = createEdgeMaps(genomeEdges, blackEdges)

    visitedNodes = set()  # nodes already expanded, shared across all searches
    detectedCycles = []

    allNodes = set(coloredEdgesMap.keys()) | set(blackEdgesMap.keys())
    for startNode in allNodes:
        if startNode in visitedNodes:
            continue
        # Each entry: (node, path walked to reach it, follow colored edges next?).
        # Every search starts by following colored edges.
        pending = [(startNode, [startNode], True)]
        while pending:
            currentNode, currentPath, isFollowingColoredEdges = pending.pop()
            if currentNode in visitedNodes:
                if currentNode == startNode:
                    # Back at the start: record the path minus the repeated node.
                    detectedCycles.append(currentPath[:-1])
                continue
            visitedNodes.add(currentNode)

            edgeMaps = (coloredEdgesMap, reverseColoredEdgesMap) if isFollowingColoredEdges else (blackEdgesMap, reverseBlackEdgesMap)
            nextNodes = []
            for edgeMap in edgeMaps:
                nextNodes.extend(edgeMap.get(currentNode, []))

            # Push in reverse so the first neighbour is popped, and therefore
            # fully explored, first.
            for nextNode in reversed(nextNodes):
                pending.append((nextNode, currentPath + [nextNode], not isFollowingColoredEdges))

    return detectedCycles


In [212]:
P = [[+1,+2,+3,+4,+5,+6]]
Q = [[+1,-3,-6,-5],[2,-4]]
TwoBreakDistance(P,Q)

[6]
[1, 2, 3, 4, 5, 6]
Nodes [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
[1, -3, -6, -5]
Nodes [0, 1, 2, 6, 5, 12, 11, 10, 9]
[2, -4]
Nodes [0, 3, 4, 8, 7]
[(2, 3), (4, 5), (6, 7), (8, 9), (10, 11), (12, 1)] [(2, 6), (5, 12), (11, 10), (9, 1), (4, 8), (7, 3)]


3

2-Break Sorting Problem: Find a shortest transformation of one genome into another by 2-breaks.

Input: Two genomes with circular chromosomes on the same set of synteny blocks.
Output: The sequence of genomes resulting from applying a shortest sequence of 2-breaks transforming one genome into the other.

In [349]:
from typing import List, Dict, Iterable, Tuple

def TwoBreakOnGenome(P: List[List[int]],
                     i1: int, i2: int, i3: int, i4: int) -> List[List[int]]:
    """Apply a 2-break to genome P and return the resulting genome.

    The colored edges (i1, i2) and (i3, i4) are replaced by (i1, i3) and
    (i2, i4), and the genome is rebuilt from the new colored edges together
    with the unchanged black edges. The edge lists are printed before and
    after the 2-break.

    Args:
        P: Genome as a list of chromosomes of signed blocks (it is handed to
            ColoredEdges and BlackEdges, which iterate over chromosomes).
        i1: First node of the first colored edge to break.
        i2: Second node of the first colored edge to break.
        i3: First node of the second colored edge to break.
        i4: Second node of the second colored edge to break.

    Returns:
        The genome produced by GraphToGenome, one chromosome per cycle.
    """
    GenomeGraph = ColoredEdges(P)
    black_edges = BlackEdges(P)

    print('GenomeGraph',GenomeGraph)
    GenomeGraph = TwoBreakOnGenomeGraph(GenomeGraph, i1, i2, i3, i4)

    print('new GenomeGraph', GenomeGraph)
    print('BlackEdges',black_edges)
    return GraphToGenome(GenomeGraph, black_edges)

genome_graph_t = List[Tuple[int, int]]
def TwoBreakOnGenomeGraph(GenomeGraph: genome_graph_t,
                          i1: int, i2: int, i3: int, i4: int) -> genome_graph_t:
    """Replace colored edges (i1, i2) and (i3, i4) by (i1, i3) and (i2, i4).

    The edges to break are matched in either orientation. The list passed in
    is modified in place and also returned.

    Args:
        GenomeGraph: Colored edges of the genome graph.
        i1: First node of the first edge to remove.
        i2: Second node of the first edge to remove.
        i3: First node of the second edge to remove.
        i4: Second node of the second edge to remove.

    Returns:
        The same list object, with the matching edges removed and the two
        new edges appended at the end.
    """
    broken = ((i1, i2), (i2, i1), (i3, i4), (i4, i3))
    # Slice assignment keeps the update on the caller's list object.
    GenomeGraph[:] = [edge for edge in GenomeGraph if edge not in broken]
    GenomeGraph.extend([(i1, i3), (i2, i4)])
    return GenomeGraph

def CycleToChromosome(Nodes: List[int]) -> List[int]:
    """Convert a sequence of node labels into signed synteny blocks.

    Nodes are consumed two at a time. A pair in increasing order is a block
    pointing right (half of the second node); any other pair is a block
    pointing left (minus half of the first node).

    Integer division is used for the halving, so large labels are not
    rounded through a float.

    Args:
        Nodes: Positive node labels; expected to have even length.

    Returns:
        One signed block per pair of nodes.

    Raises:
        IndexError: If Nodes has odd length, so the last pair is incomplete.
    """
    chromosome = []
    for start in range(0, len(Nodes), 2):
        first, second = Nodes[start], Nodes[start + 1]
        if first < second:
            chromosome.append(second // 2)
        else:
            chromosome.append(-(first // 2))
    return chromosome
def ColoredEdges(P: List[List[int]]) -> List[Tuple[int, int]]:
    """Collect the colored edges of a genome with circular chromosomes.

    Within each chromosome's node sequence, every second node (the end of a
    block) is joined to the node that follows it; the end of the last block
    is joined back to the first node.

    Args:
        P: Genome given as a list of chromosomes of signed blocks.

    Returns:
        Colored edges as (node, node) tuples, chromosome by chromosome.
    """
    edges = []
    for chromosome in P:
        cycle = ChromosomeToCycle(chromosome)
        block_ends = cycle[1::2]
        # Start of the next block; the first node closes the circle.
        next_starts = cycle[2::2] + cycle[:1]
        edges.extend(zip(block_ends, next_starts))
    return edges
def BlackEdges(P: List[List[int]]) -> List[Tuple[int, int]]:
    """Collect the black edges of a genome: one edge per synteny block.

    Each chromosome's node sequence is split into consecutive pairs, and
    every pair becomes one edge.

    Args:
        P: Genome given as a list of chromosomes of signed blocks.

    Returns:
        Black edges as (node, node) tuples, chromosome by chromosome.
    """
    edges = []
    for chromosome in P:
        cycle = ChromosomeToCycle(chromosome)
        edges.extend(zip(cycle[0::2], cycle[1::2]))
    return edges

def ChromosomeToCycle(Chromosome: List[int]) -> List[int]:
    """Expand a chromosome of signed blocks into its sequence of node labels.

    A positive block ``i`` yields the nodes ``2i-1, 2i``; any other block
    yields ``2|i|, 2|i|-1``. The function is a pure conversion and writes
    nothing to stdout.

    Args:
        Chromosome: Signed synteny blocks in chromosome order.

    Returns:
        A list of ``2 * len(Chromosome)`` node labels.
    """
    nodes = []
    for block in Chromosome:
        head = 2 * abs(block)
        pair = (head - 1, head) if block > 0 else (head, head - 1)
        nodes.extend(pair)
    return nodes
def GraphToGenome(GenomeGraph: List[Tuple[int, int]],black_edges:List[Tuple[int, int]]) -> List[List[int]]:
    """Rebuild a genome from its colored and black edges.

    Every cycle returned by findCyclesInGraph is printed and then converted
    into one chromosome with CycleToChromosome.

    Args:
        GenomeGraph: Colored edges of the genome graph.
        black_edges: Black edges (one per synteny block).

    Returns:
        One chromosome per cycle, in the order the cycles were found.
    """
    def to_chromosome(cycle):
        print(cycle)
        return CycleToChromosome(cycle)

    return [to_chromosome(cycle) for cycle in findCyclesInGraph(GenomeGraph, black_edges)]
def findCyclesInGraph(GenomeGraph: List[Tuple[int, int]], black_edges: List[Tuple[int, int]]):
    """Find the cycles that alternate between black and colored edges.

    Each search starts at an unvisited node and first follows its black
    edge, so the first two nodes of a reported cycle belong to one synteny
    block. The traversal uses an explicit stack instead of recursion, so a
    chromosome with many blocks cannot exhaust the interpreter's recursion
    limit. Nodes are expanded in the same order as a recursive DFS that
    visits neighbours left to right.

    Args:
        GenomeGraph: Colored edges as (node, node) pairs.
        black_edges: Black edges as (node, node) pairs.

    Returns:
        A list of cycles. Each cycle is the list of nodes walked from its
        start node, without the repeated start node at the end.
    """
    # Forward and reverse lookups so an edge can be followed from either end.
    Colored_dict, Black_dict, R_Black_dict, R_Colored_dict = Genome_dict(GenomeGraph, black_edges)

    visited = set()  # nodes already expanded, shared across all searches
    cycles = []

    for start in list(Black_dict.keys()) + list(Colored_dict.keys()):
        if start in visited:
            continue
        # Each entry: (node, path walked to reach it, follow a colored edge next?).
        pending = [(start, [start], False)]
        while pending:
            curr, path, is_colored_edge = pending.pop()
            if curr in visited:
                if curr == start:
                    # Closed the loop: drop the repeated start node.
                    cycles.append(path[:-1])
                continue
            visited.add(curr)

            if is_colored_edge:
                lookups = (Colored_dict, R_Colored_dict)
            else:
                lookups = (Black_dict, R_Black_dict)
            next_nodes = [lookup[curr] for lookup in lookups if curr in lookup]

            # Push in reverse so the first neighbour is popped, and therefore
            # fully explored, first.
            for next_node in reversed(next_nodes):
                pending.append((next_node, path + [next_node], not is_colored_edge))

    return cycles
def Genome_dict(GenomeGraph: List[Tuple[int, int]],black_edges:List[Tuple[int, int]]):
    """Build forward and reverse lookup tables for colored and black edges.

    For an edge (a, b) the forward table maps a -> b and the reverse table
    maps b -> a. If a key occurs more than once, the last edge wins.

    Args:
        GenomeGraph: Colored edges as (node, node) pairs.
        black_edges: Black edges as (node, node) pairs.

    Returns:
        A 4-tuple in this order: colored forward, black forward, black
        reverse, colored reverse.
    """
    def index_edges(edges):
        forward, backward = {}, {}
        for a, b in edges:
            forward[a] = b
            backward[b] = a
        return forward, backward

    colored, colored_back = index_edges(GenomeGraph)
    black, black_back = index_edges(black_edges)
    return colored, black, black_back, colored_back

## Shared K mers (creating synteny blocks)

In [None]:
def reverse_complement(pattern: str) -> str:
    """Return the reverse complement of a DNA string.

    Args:
        pattern: String over the uppercase alphabet A, C, G, T.

    Returns:
        The complemented characters of pattern in reverse order.

    Raises:
        KeyError: If pattern contains a character other than A, C, G or T.
    """
    pairing = dict(zip("ATGC", "TACG"))
    return ''.join(map(pairing.__getitem__, pattern[::-1]))

def SharedKMers(k: int, s: str, t: str) -> List[Tuple[int, int]]:
    """Find all positions where s and t share a k-mer.

    A pair (i, j) is reported when ``s[i:i+k]`` equals ``t[j:j+k]`` (a
    direct, "red" match) or equals the reverse complement of ``t[j:j+k]``
    (a "blue" match). Each pair is reported once: when a k-mer of t is its
    own reverse complement, the blue match is the same pair as the red one
    and is not added again.

    Args:
        k: Length of the k-mers.
        s: First DNA string.
        t: Second DNA string (uppercase A, C, G, T).

    Returns:
        All direct matches in order of increasing j, followed by all
        reverse-complement matches in order of increasing j.
    """
    # Index every k-mer of s by its start positions.
    s_kmers = {}
    for i in range(len(s) - k + 1):
        s_kmers.setdefault(s[i:i + k], []).append(i)

    # Reverse complement of the whole of t, computed once; the reverse
    # complement of t[j:j+k] is the slice t_rev[len(t)-k-j : len(t)-j].
    t_rev = reverse_complement(t)

    red_coordinates = []
    blue_coordinates = []

    for j in range(len(t) - k + 1):
        t_kmer = t[j:j + k]
        t_kmer_rev = t_rev[len(t) - k - j: len(t) - j]

        # Direct matches.
        for i in s_kmers.get(t_kmer, ()):
            red_coordinates.append((i, j))

        # Reverse-complement matches; skipped when the k-mer is its own
        # reverse complement, since those pairs are already in the red list.
        if t_kmer_rev != t_kmer:
            for i in s_kmers.get(t_kmer_rev, ()):
                blue_coordinates.append((i, j))

    return red_coordinates + blue_coordinates

hamming distance with shared kmers

In [19]:
def hamming_distance(p: str, q: str) -> int: #1.8.1
    """Count the positions at which two equal-length strings differ.

    Args:
        p: First string.
        q: Second string.

    Returns:
        The number of mismatching positions, or -1 when the strings have
        different lengths.
    """
    if len(p) != len(q):
        return -1
    return sum(1 for a, b in zip(p, q) if a != b)

In [21]:
def kmer_approx_match(t_kmer:str,s_kmers:List[str],d:int)->List[str]:
    """Return the k-mers of s_kmers within Hamming distance d of t_kmer.

    Args:
        t_kmer: Query k-mer.
        s_kmers: Candidate k-mers to compare against the query.
        d: Maximum number of mismatches allowed.

    Returns:
        The candidates that match, in iteration order. Candidates whose
        length differs from the query never match.
    """
    match_kmers = []
    for s_kmer in s_kmers:
        distance = hamming_distance(t_kmer, s_kmer)
        # hamming_distance reports a length mismatch as -1, which must not
        # be mistaken for "within d mismatches".
        if 0 <= distance <= d:
            match_kmers.append(s_kmer)
    return match_kmers


In [None]:
def Shared_Approx_Kmers(k: int, s: str, t: str, d: int = 0):
    """Find positions where s and t share a k-mer with up to d mismatches.

    A pair (i, j) is reported when ``s[i:i+k]`` is within Hamming distance d
    of ``t[j:j+k]`` (a direct, "red" match) or, failing that, within
    distance d of the reverse complement of ``t[j:j+k]`` (a "blue" match).
    Each pair is reported once; a direct match takes priority.

    Every k-mer of t is compared against every distinct k-mer of s, so the
    running time grows with the product of the two string lengths.

    Args:
        k: Length of the k-mers.
        s: First DNA string.
        t: Second DNA string (uppercase A, C, G, T).
        d: Maximum number of mismatches allowed. The default 0 means exact
            matching.

    Returns:
        A list of (i, j) tuples: all direct matches followed by all
        reverse-complement matches.
    """
    # Index every k-mer of s by its start positions.
    s_kmers = {}
    for i in range(len(s) - k + 1):
        s_kmers.setdefault(s[i:i + k], []).append(i)

    # Reverse complement of the whole of t, computed once; the reverse
    # complement of t[j:j+k] is the slice t_rev[len(t)-k-j : len(t)-j].
    t_rev = reverse_complement(t)

    red_coordinates = []
    blue_coordinates = []

    for j in range(len(t) - k + 1):
        t_kmer = t[j:j + k]
        t_kmer_rev = t_rev[len(t) - k - j: len(t) - j]

        for s_kmer, positions in s_kmers.items():
            if hamming_distance(t_kmer, s_kmer) <= d:
                red_coordinates.extend((i, j) for i in positions)
            elif hamming_distance(t_kmer_rev, s_kmer) <= d:
                blue_coordinates.extend((i, j) for i in positions)

    return red_coordinates + blue_coordinates