# Pairwise Alignment

### 1. Global Alignment: The Needleman-Wunsch Algorithm

In [4]:
def score_pos (c1, c2, sm, g):
    """Score one alignment column.

    Parameters
    ----------
    c1, c2 : str
        The two symbols placed in the column.
    sm : dict
        Substitution scores keyed by the two symbols concatenated (c1 + c2).
    g : number
        Score returned when either symbol is a gap.

    Returns
    -------
    g if c1 or c2 is a gap symbol, otherwise sm[c1 + c2].

    Raises
    ------
    KeyError
        If neither symbol is a gap and the pair is missing from sm.
    """
    # Accept the ASCII hyphen as well as the Unicode minus sign (U+2212):
    # previously only U+2212 was treated as a gap, so a "-" gap fell through
    # to the matrix lookup and raised KeyError.
    gap_symbols = ("-", "\u2212")
    if c1 in gap_symbols or c2 in gap_symbols:
        return g
    return sm[c1 + c2]

def Needleman_Wunsch(seq1, seq2, sm, g):
    """Fill the score and traceback matrices for a global alignment.

    Both matrices have len(seq1) + 1 rows and len(seq2) + 1 columns.
    Traceback codes: 1 = diagonal predecessor, 2 = predecessor in the row
    above, 3 = predecessor in the same row to the left; the origin holds 0.

    Parameters
    ----------
    seq1, seq2 : sequences being aligned (seq1 along rows, seq2 along columns).
    sm : substitution scores, passed through to score_pos.
    g : score added for every gap.

    Returns
    -------
    (S, T) : the score matrix and the traceback matrix, as lists of lists.
    """
    n_cols = len(seq2)

    # First row: only gaps so far, every cell is reached from the left.
    scores = [[0] + [g * col for col in range(1, n_cols + 1)]]
    moves = [[0] + [3] * n_cols]

    for row, residue1 in enumerate(seq1, start=1):
        above = scores[row - 1]
        # First column: only gaps so far, reached from the row above.
        row_scores = [g * row]
        row_moves = [2]
        for col, residue2 in enumerate(seq2, start=1):
            diagonal = above[col - 1] + score_pos(residue1, residue2, sm, g)
            vertical = above[col] + g
            horizontal = row_scores[col - 1] + g
            row_scores.append(max(diagonal, vertical, horizontal))
            row_moves.append(max3t(diagonal, vertical, horizontal))
        scores.append(row_scores)
        moves.append(row_moves)

    return (scores, moves)
    
def max3t(v1, v2, v3):
    """Return the traceback code (1, 2 or 3) of the largest of three values.

    1 is returned only when v1 is strictly greater than both others, and
    2 only when v1 does not exceed v2 and v2 is strictly greater than v3;
    every other case, ties with v3 included, yields 3.
    """
    if v1 > v2:
        return 1 if v1 > v3 else 3
    return 2 if v2 > v3 else 3

In [5]:
def recover_align(T, seq1, seq2):
    """Rebuild the aligned strings by walking the traceback matrix T.

    The walk starts at cell (len(seq1), len(seq2)) and continues until both
    indices reach zero. Code 1 consumes a symbol from each sequence, code 3
    puts a gap in the first row and consumes a symbol of seq2, and any other
    code consumes a symbol of seq1 and puts a gap in the second row.

    Returns
    -------
    A two-element list: [aligned seq1, aligned seq2].
    """
    gap = "−"
    top, bottom = "", ""
    row, col = len(seq1), len(seq2)

    while row > 0 or col > 0:
        move = T[row][col]
        if move == 1:
            row -= 1
            col -= 1
            top = seq1[row] + top
            bottom = seq2[col] + bottom
        elif move == 3:
            col -= 1
            top = gap + top
            bottom = seq2[col] + bottom
        else:
            row -= 1
            top = seq1[row] + top
            bottom = gap + bottom

    return [top, bottom]

In [6]:
def read_submat_file (filename):
    """Read a substitution matrix from a whitespace-separated text file.

    Layout read by this function: the first line is skipped, the second line
    lists the alphabet (the first character of each token is used), and each
    following line holds a leading label token followed by one integer score
    per alphabet symbol. Rows are matched to symbols by position; the label
    token itself is not inspected.

    Parameters
    ----------
    filename : path of the matrix file.

    Returns
    -------
    dict mapping two-character keys (row symbol + column symbol) to int scores.

    Notes
    -----
    The parsed alphabet is printed to stdout, as in the original version.
    The file is now opened in a ``with`` block so the handle is closed even
    when parsing fails (it was previously never closed).
    """
    sm = {}
    with open(filename, "r") as f:
        f.readline()  # ignore the first line
        header_tokens = f.readline().split()  # the second line is the alphabet
        alphabet = [token[0] for token in header_tokens]

        print(alphabet)

        for row_symbol in alphabet:
            tokens = f.readline().split()
            # tokens[0] is the row label; scores start at index 1.
            for j in range(1, len(tokens)):
                sm[row_symbol + alphabet[j - 1]] = int(tokens[j])
    return sm

def test_prot():
    """Load the substitution matrix from "blosum62.mat" and print the dictionary."""
    print(read_submat_file("blosum62.mat"))

test_prot()

['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*']
{'AA': 4, 'AR': -1, 'AN': -2, 'AD': -2, 'AC': 0, 'AQ': -1, 'AE': -1, 'AG': 0, 'AH': -2, 'AI': -1, 'AL': -1, 'AK': -1, 'AM': -1, 'AF': -2, 'AP': -1, 'AS': 1, 'AT': 0, 'AW': -3, 'AY': -2, 'AV': 0, 'AB': -2, 'AZ': -1, 'AX': 0, 'A*': -4, 'RA': -1, 'RR': 5, 'RN': 0, 'RD': -2, 'RC': -3, 'RQ': 1, 'RE': 0, 'RG': -2, 'RH': 0, 'RI': -3, 'RL': -2, 'RK': 2, 'RM': -1, 'RF': -3, 'RP': -2, 'RS': -1, 'RT': -1, 'RW': -3, 'RY': -2, 'RV': -3, 'RB': -1, 'RZ': 0, 'RX': -1, 'R*': -4, 'NA': -2, 'NR': 0, 'NN': 6, 'ND': 1, 'NC': -3, 'NQ': 0, 'NE': 0, 'NG': 0, 'NH': 1, 'NI': -3, 'NL': -3, 'NK': 0, 'NM': -2, 'NF': -3, 'NP': -2, 'NS': 1, 'NT': 0, 'NW': -4, 'NY': -2, 'NV': -3, 'NB': 3, 'NZ': 0, 'NX': -1, 'N*': -4, 'DA': -2, 'DR': -2, 'DN': 1, 'DD': 6, 'DC': -3, 'DQ': 0, 'DE': 2, 'DG': -1, 'DH': -1, 'DI': -3, 'DL': -4, 'DK': -1, 'DM': -3, 'DF': -3, 'DP': -1, 'DS': 0, 'DT': -1, 'DW': -4, 'DY': -3,

In [7]:
def print_mat(mat):
    """Print a matrix one row per line, top to bottom."""
    for row in mat:
        print(row)
    
def test_global_alig():
    """Run a global alignment of two short peptides and print every result.

    Prints the optimal score (bottom-right cell of S), both matrices, and the
    two rows of the recovered alignment. The gap score is -8 and the
    substitution scores are read from "blosum62.mat".
    """
    sm = read_submat_file("blosum62.mat")
    seq1, seq2 = "PHSWG", "HGWAG"

    S, T = Needleman_Wunsch(seq1, seq2, sm, -8)
    print("Score of optimal alignment:", S[len(seq1)][len(seq2)])

    print('Matrix S:')
    print_mat(S)
    print('Matrix T:')
    print_mat(T)

    aligned1, aligned2 = recover_align(T, seq1, seq2)
    print(aligned1)
    print(aligned2)

test_global_alig()

['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*']
Score of optimal alignment: 9
Matrix S:
[0, -8, -16, -24, -32, -40]
[-8, -2, -10, -18, -25, -33]
[-16, 0, -4, -12, -20, -27]
[-24, -8, 0, -7, -11, -19]
[-32, -16, -8, 11, 3, -5]
[-40, -24, -10, 3, 11, 9]
Matrix T:
[0, 3, 3, 3, 3, 3]
[2, 1, 3, 3, 1, 3]
[2, 1, 1, 3, 3, 1]
[2, 2, 1, 1, 1, 3]
[2, 2, 2, 1, 3, 3]
[2, 2, 1, 2, 1, 1]
PHSW−G
−HGWAG


### 2. Local Alignment: The Smith-Waterman Algorithm

In [None]:
def Smith_Waterman(seq1, seq2, sm, g):
    """Fill the score and traceback matrices for a local alignment.

    Row 0 and column 0 are all zeros. A cell whose best candidate score is
    not positive is stored as score 0 with traceback code 0; otherwise the
    score is kept and the traceback code comes from max3t (1 = diagonal,
    2 = row above, 3 = left).

    Parameters
    ----------
    seq1, seq2 : sequences being aligned (seq1 along rows, seq2 along columns).
    sm : substitution scores, passed through to score_pos.
    g : score added for every gap.

    Returns
    -------
    (S, T, maxscore) : both matrices and the highest score stored in S.
    """
    width = len(seq2) + 1
    scores = [[0] * width]
    moves = [[0] * width]
    best = 0

    for residue1 in seq1:
        above = scores[-1]
        row_scores = [0]
        row_moves = [0]
        for col, residue2 in enumerate(seq2, start=1):
            diagonal = above[col - 1] + score_pos(residue1, residue2, sm, g)
            vertical = above[col] + g
            horizontal = row_scores[col - 1] + g
            cell = max(diagonal, vertical, horizontal)
            if cell <= 0:
                # Non-positive score: the local alignment restarts here.
                row_scores.append(0)
                row_moves.append(0)
                continue
            row_scores.append(cell)
            row_moves.append(max3t(diagonal, vertical, horizontal))
            if cell > best:
                best = cell
        scores.append(row_scores)
        moves.append(row_moves)

    return (scores, moves, best)

In [None]:
def recover_align_local(S, T, seq1, seq2):
    """Rebuild the local alignment ending at the highest-scoring cell of S.

    The start cell is taken from max_mat(S); the walk follows the traceback
    codes in T (1 = diagonal, 3 = left, 2 = up) and stops at the first cell
    whose code is not positive.

    Returns
    -------
    A two-element list: [aligned part of seq1, aligned part of seq2].
    """
    gap = "−"
    top, bottom = "", ""
    row, col = max_mat(S)

    while T[row][col] > 0:
        move = T[row][col]
        if move == 1:
            row -= 1
            col -= 1
            top = seq1[row] + top
            bottom = seq2[col] + bottom
        elif move == 3:
            col -= 1
            top = gap + top
            bottom = seq2[col] + bottom
        elif move == 2:
            row -= 1
            top = seq1[row] + top
            bottom = gap + bottom

    return [top, bottom]

def max_mat(mat):
    """Return (row, column) of the largest value in a list-of-lists matrix.

    Cells are scanned row by row; only a strictly larger value replaces the
    current best, so the first occurrence of the maximum is reported.
    """
    best_value = mat[0][0]
    best_cell = (0, 0)
    for row_idx, row in enumerate(mat):
        for col_idx, value in enumerate(row):
            if value > best_value:
                best_value = value
                best_cell = (row_idx, col_idx)
    return best_cell

In [None]:
def test_local_alig():
    """Run a local alignment of two short peptides and print every result.

    Prints the best score, the score matrix, the traceback matrix and the two
    rows of the recovered local alignment. The gap score is -8 and the
    substitution scores are read from "blosum62.mat".
    """
    sm = read_submat_file("blosum62.mat")
    seq1, seq2 = "HGWAG", "PHSWG"

    S, T, best_score = Smith_Waterman(seq1, seq2, sm, -8)
    print("Score of optimal alignment:", best_score)
    print_mat(S)
    print_mat(T)

    aligned1, aligned2 = recover_align_local(S, T, seq1, seq2)
    print(aligned1)
    print(aligned2)

test_local_alig()

# Multiple Sequence Alignment

In [3]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq

# Three equal-length protein strings in which "-" marks a gap.
seq1, seq2, seq3 = (
    Seq(residues)
    for residues in (
        "MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW",
        "MH--IFIYQIGYALKSGYIQSIRSPEY-NW",
        "MHQAIFI-QIGYALKSGY-QSIRSPEYDNW",
    )
)

# Wrap each sequence in a record; every record gets its own annotations dict.
seqr1, seqr2, seqr3 = (
    SeqRecord(sequence, id=label, annotations={"molecule_type": "protein"})
    for label, sequence in (("seq1", seq1), ("seq2", seq2), ("seq3", seq3))
)

# Assemble the records into a single alignment object and display it.
alin = MultipleSeqAlignment([seqr1, seqr2, seqr3])
print(alin)

Alignment with 3 rows and 30 columns
MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW seq1
MH--IFIYQIGYALKSGYIQSIRSPEY-NW seq2
MHQAIFI-QIGYALKSGY-QSIRSPEYDNW seq3
