Problem
Consider the following substitution matrix for DNA sequences:

   A  C  G  T    
A 10  2  5  2    
C  2 10  2  5    
G  5  2 10  2    
T  2  5  2 10  




Question 1: What is the optimal (here maximal) cost of an alignment of AATAAT and AAGG using the above substitution matrix and gap cost -5?

In [25]:
import numpy as np

# Substitution scores for pairs of DNA bases, looked up as
# substitution_matrix[base_a][base_b]. The table is symmetric: 10 on the
# diagonal (identical bases), 5 for A<->G and C<->T, 2 for every other pair.
substitution_matrix = {
    "A": {"A": 10, "C": 2, "G": 5, "T": 2},
    "C": {"A": 2, "C": 10, "G": 2, "T": 5},
    "G": {"A": 5, "C": 2, "G": 10, "T": 2},
    "T": {"A": 2, "C": 5, "G": 2, "T": 10},
}

def linear_gap_cost_matrix(seq1:str, seq2:str, gap_cost:int, substitution_matrix:dict):
    """Build the global-alignment score table for two sequences.

    Cell (r, c) of the returned table holds the maximal score of aligning the
    first r characters of seq1 with the first c characters of seq2, where each
    gap column contributes gap_cost and each aligned pair contributes
    substitution_matrix[char_of_seq1][char_of_seq2].

    Args:
        seq1: Sequence indexing the rows of the table.
        seq2: Sequence indexing the columns of the table.
        gap_cost: Score added for every gap column (linear gap cost).
        substitution_matrix: Nested mapping of pairwise substitution scores.

    Returns:
        A float NumPy array of shape (len(seq1) + 1, len(seq2) + 1); its
        bottom-right entry is the score of an optimal alignment of the two
        full sequences.
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1
    table = np.zeros((rows, cols))

    # Borders: aligning a prefix against the empty string costs one gap per
    # character. Cell (0, 0) keeps its initial zero.
    table[1:, 0] = gap_cost * np.arange(1, rows)
    table[0, 1:] = gap_cost * np.arange(1, cols)

    for r, char1 in enumerate(seq1, start=1):
        for c, char2 in enumerate(seq2, start=1):
            # Three ways to reach (r, c): pair the two characters, or end the
            # alignment with a gap in seq1 / a gap in seq2.
            paired = table[r - 1][c - 1] + substitution_matrix[char1][char2]
            gap_in_seq1 = table[r][c - 1] + gap_cost
            gap_in_seq2 = table[r - 1][c] + gap_cost
            table[r][c] = max(paired, gap_in_seq1, gap_in_seq2)
    return table



# Question 1: print the full score table for AATAAT vs AAGG with gap cost -5.
# The optimal alignment score is the bottom-right entry of the printed table.
print(linear_gap_cost_matrix('AATAAT', 'AAGG', -5, substitution_matrix))

[[  0.  -5. -10. -15. -20.]
 [ -5.  10.   5.   0.  -5.]
 [-10.   5.  20.  15.  10.]
 [-15.   0.  15.  22.  17.]
 [-20.  -5.  10.  20.  27.]
 [-25. -10.   5.  15.  25.]
 [-30. -15.   0.  10.  20.]]


Question 2: What is the optimal (here maximal) cost of an alignment of seq1.fasta and seq2.fasta using the same substitution matrix and gap cost? (You probably want to implement the algorithm for computing the cost of an optimal alignment.)

In [26]:
from Bio import SeqIO

def fasta_seq(input_file):
    """Read a FASTA file and return a sequence from it as a plain string.

    If the file holds several records, the sequence of the last one is
    returned (earlier records are read and discarded).

    Args:
        input_file: Path of the FASTA file to read.

    Returns:
        The sequence of the last record in the file, converted to ``str``.

    Raises:
        ValueError: If the file contains no FASTA records.
    """
    sequence = None
    with open(input_file, 'r') as f:
        for record in SeqIO.parse(f, 'fasta'):
            sequence = record.seq
    if sequence is None:
        # Without this guard an empty file would fail with an opaque
        # UnboundLocalError on the return statement.
        raise ValueError("no FASTA records found in {!r}".format(input_file))
    return str(sequence)

# Question 2: load the two sequences to align from their FASTA files.
a = fasta_seq('seq1.fasta')
b = fasta_seq('seq2.fasta')

# Print the score table (same substitution matrix, gap cost -5); the optimal
# alignment score is the bottom-right entry of the printed table.
print(linear_gap_cost_matrix(a, b, -5, substitution_matrix))

[[   0.   -5.  -10. ... -980. -985. -990.]
 [  -5.   10.    5. ... -965. -970. -975.]
 [ -10.    5.   20. ... -950. -955. -960.]
 ...
 [-980. -965. -950. ... 1326. 1321. 1316.]
 [-985. -970. -955. ... 1321. 1336. 1331.]
 [-990. -975. -960. ... 1321. 1331. 1346.]]


In [33]:
def back_tracking(seq1, seq2, substitution_matrix, matrix, gap_cost):
    """Recover one optimal global alignment from a filled score table.

    Starting at the bottom-right cell of ``matrix``, repeatedly step to the
    first predecessor cell (diagonal, then left, then up) whose score plus the
    cost of that step reproduces the current cell's score, until the top-left
    cell is reached.

    Args:
        seq1: First sequence; its characters index the rows of ``matrix``.
        seq2: Second sequence; its characters index the columns of ``matrix``.
        substitution_matrix: Nested mapping of pairwise substitution scores,
            looked up as ``substitution_matrix[char_of_seq1][char_of_seq2]``.
        matrix: Score table with ``len(seq1) + 1`` rows and ``len(seq2) + 1``
            columns, indexable as ``matrix[i][j]``. It must have been computed
            with the same substitution scores and gap cost.
        gap_cost: Score added for every gap column.

    Returns:
        A tuple ``(aligned_seq1, aligned_seq2)`` of equal-length strings in
        which ``'-'`` marks a gap.

    Raises:
        ValueError: If some cell on the path cannot be explained by any of the
            three moves, i.e. ``matrix`` is inconsistent with the given scores
            and gap cost.
    """
    i, j = len(seq1), len(seq2)
    # Columns are collected back-to-front in lists and joined once at the end,
    # which avoids repeated string concatenation on long sequences.
    aligned1, aligned2 = [], []

    while i > 0 and j > 0:
        score = matrix[i][j]
        if score == matrix[i-1][j-1] + substitution_matrix[seq1[i-1]][seq2[j-1]]:
            aligned1.append(seq1[i-1])
            aligned2.append(seq2[j-1])
            i -= 1
            j -= 1
        elif score == matrix[i][j-1] + gap_cost:
            aligned1.append('-')
            aligned2.append(seq2[j-1])
            j -= 1
        elif score == matrix[i-1][j] + gap_cost:
            aligned1.append(seq1[i-1])
            aligned2.append('-')
            i -= 1
        else:
            # No move matches: without this guard neither index would change
            # and the loop would never terminate.
            raise ValueError(
                "matrix[%d][%d] cannot be reached from any neighbouring cell; "
                "was the matrix built with the same scores and gap cost?" % (i, j)
            )

    # At most one of i, j is still positive here: the remaining prefix of that
    # sequence is aligned against gaps.
    aligned1.extend(reversed(seq1[:i]))
    aligned2.extend('-' * i)
    aligned1.extend('-' * j)
    aligned2.extend(reversed(seq2[:j]))

    return (''.join(reversed(aligned1)), ''.join(reversed(aligned2)))


# Substitution scores for pairs of DNA bases, looked up as
# substitution_matrix[base_a][base_b]: 10 for identical bases, 5 for A<->G
# and C<->T, 2 for every other pair.
substitution_matrix = {
    "A": {"A": 10, "C": 2, "G": 5, "T": 2},
    "C": {"A": 2, "C": 10, "G": 2, "T": 5},
    "G": {"A": 5, "C": 2, "G": 10, "T": 2},
    "T": {"A": 2, "C": 5, "G": 2, "T": 10},
}

# Build the score table for AATAAT vs AAGG (gap cost -5), then trace back
# through it to print one alignment that achieves the optimal score.
m1 = linear_gap_cost_matrix('AATAAT', 'AAGG', -5, substitution_matrix)
print(back_tracking('AATAAT', 'AAGG', substitution_matrix, m1, -5))


('AATAAT', 'AA-GG-')
