# Find a Highest-Scoring Alignment of Two Strings

CMSC701-0101 Spring 2019 Computational Genomics   
University of Maryland   
Reginald Carey   
for Dr. Mihai Pop

In [1]:
import numpy as np
from collections import Counter
import re
from collections import defaultdict

**BLOSUM62** Scoring Matrix

Build the blosum62 scoring matrix for use in the s(x,y) function

In [2]:
# Amino-acid alphabet; its order labels both the rows and the columns of the text matrix below.
AminoAcids = "ACDEFGHIKLMNPQRSTVWY"
blosum62_str = """
 4  0 -2 -1 -2  0 -2 -1 -1 -1 -1 -2 -1 -1 -1  1  0  0 -3 -2
 0  9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2
-2 -3  6  2 -3 -1 -1 -3 -1 -4 -3  1 -1  0 -2  0 -1 -3 -4 -3
-1 -4  2  5 -3 -2  0 -3  1 -3 -2  0 -1  2  0  0 -1 -2 -3 -2
-2 -2 -3 -3  6 -3 -1  0 -3  0  0 -3 -4 -3 -3 -2 -2 -1  1  3
 0 -3 -1 -2 -3  6 -2 -4 -2 -4 -3  0 -2 -2 -2  0 -2 -3 -2 -3
-2 -3 -1  0 -1 -2  8 -3 -1 -3 -2  1 -2  0  0 -1 -2 -3 -2  2
-1 -1 -3 -3  0 -4 -3  4 -3  2  1 -3 -3 -3 -3 -2 -1  3 -3 -1
-1 -3 -1  1 -3 -2 -1 -3  5 -2 -1  0 -1  1  2  0 -1 -2 -3 -2
-1 -1 -4 -3  0 -4 -3  2 -2  4  2 -3 -3 -2 -2 -2 -1  1 -2 -1
-1 -1 -3 -2  0 -3 -2  1 -1  2  5 -2 -2  0 -1 -1 -1  1 -1 -1
-2 -3  1  0 -3  0  1 -3  0 -3 -2  6 -2  0  0  1  0 -3 -4 -2
-1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2  7 -1 -2 -1 -1 -2 -4 -3
-1 -3  0  2 -3 -2  0 -3  1 -2  0  0 -1  5  1  0 -1 -2 -2 -1
-1 -3 -2  0 -3 -2  0 -3  2 -2 -1  0 -2  1  5 -1 -1 -3 -3 -2
 1 -1  0  0 -2  0 -1 -2  0 -2 -1  1 -1  0 -1  4  1 -2 -3 -2
 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1  0 -1 -1 -1  1  5  0 -2 -2
 0 -1 -3 -2 -1 -3 -3  3 -2  1  1 -3 -2 -2 -3 -2  0  4 -3 -1
-3 -2 -4 -3  1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11  2
-2 -2 -3 -2  3 -3  2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1  2  7
"""
# Nested lookup table: BLOSUM62[a][b] is the integer found on row a, column b of the text above.
BLOSUM62 = defaultdict(dict)
matrix_rows = blosum62_str.strip().splitlines()
for row_symbol, row_text in zip(AminoAcids, matrix_rows):
    # Fields are separated by runs of spaces, so a plain whitespace split recovers them.
    row_scores = row_text.split()
    BLOSUM62[row_symbol].update(
        (col_symbol, int(score)) for col_symbol, score in zip(AminoAcids, row_scores)
    )

Return the match/mismatch values for symbols x and y.

Provide two implementations: one that looks the score up in the BLOSUM62 table, and one that uses fixed match and mismatch values.

In [3]:
def blosum62_similarity(y, x):
    """
    Score the pairing of symbols y and x with the BLOSUM62 table.

    The score is read from the module-level BLOSUM62 mapping, indexed
    first by y and then by x.
    """
    scores_for_y = BLOSUM62[y]
    return scores_for_y[x]

def simple_similarity(y, x, match=1, mismatch=0):
    """
    Return the match/mismatch value for symbols x and y.

    Parameters:
        y, x: the two symbols being compared (compared with ``==``).
        match: score returned when the symbols are equal.
        mismatch: score returned when the symbols differ.

    Returns:
        ``match`` if ``x == y``, otherwise ``mismatch``.
    """
    # Select the score instead of computing match*flag + mismatch*flag: the
    # arithmetic form multiplies the unused score by 0, which turns an
    # infinite score (e.g. mismatch=-inf to forbid mismatches) into NaN.
    return match if x == y else mismatch

Compute the similarity between two sequences X and Y.  We return a populated dynamic-programming matrix:
each cell holds the best score reaching it together with the direction(s) used to recover an optimal alignment.

In [4]:
def similarity(X, Y, indel=-1, s=simple_similarity):
    """
    Fill the alignment scoring table for sequences X and Y.

    X is laid out along the columns and Y along the rows, so the table has
    len(Y) + 1 rows and len(X) + 1 columns. Every cell holds a tuple
    (directions, score):

    * score is the largest of the three candidate values for the cell:
      left neighbour + indel ("h"), upper-left neighbour + s(Y[i-1], X[j-1])
      ("d"), and upper neighbour + indel ("v");
    * directions is the concatenation, in the order h, d, v, of the tags of
      every candidate that attains that largest value.

    Parameters:
        X, Y: the sequences to compare (indexable, with a length).
        indel: score added for a horizontal or vertical step.
        s: callable s(y_symbol, x_symbol) giving the diagonal-step score.

    Returns:
        A numpy object array of (directions, score) tuples.
    """
    row_count = len(Y) + 1
    col_count = len(X) + 1
    mat = np.empty((row_count, col_count), dtype=tuple)

    # Borders: the first column can only be reached vertically and the first
    # row only horizontally, each step costing one indel.
    for row in range(row_count):
        mat[row, 0] = ("v", row * indel)
    for col in range(1, col_count):
        mat[0, col] = ("h", col * indel)

    for row in range(1, row_count):
        for col in range(1, col_count):
            # Candidate order (h, d, v) fixes the order of tags in the cell.
            candidates = (
                ("h", mat[row, col - 1][1] + indel),
                ("d", mat[row - 1, col - 1][1] + s(Y[row - 1], X[col - 1])),
                ("v", mat[row - 1, col][1] + indel),
            )
            best = max(value for _, value in candidates)
            tags = "".join(tag for tag, value in candidates if value == best)
            mat[row, col] = (tags, best)
    return mat

Traceback returns one of potentially many optimal alignments.  At each cell that has more than
one maximal direction the path bifurcates, which can double the number of optimal alignments.

In [5]:
def traceback(n, m, X, Y, matrix=None):
    """
    Recover one optimal alignment by walking the scoring table backwards.

    Starting at cell (m, n) the walk follows the direction tags stored in each
    cell until it reaches (0, 0). When a cell lists several directions, the
    first applicable one in the order "h", "d", "v" is taken, so exactly one
    of the possibly many optimal alignments is returned.

    Parameters:
        n: starting column index (len(X) for a full alignment).
        m: starting row index (len(Y) for a full alignment).
        X: sequence laid out along the columns of the table.
        Y: sequence laid out along the rows of the table.
        matrix: table indexable as matrix[row, col] whose entries are
            (directions, score) tuples. When omitted, the module-level
            ``mat`` is used, which keeps existing call sites working.

    Returns:
        A tuple (aligned_X, aligned_Y) of equal-length strings in which "-"
        marks a gap.

    Raises:
        ValueError: if a visited cell contains none of "h", "d" or "v"
            (otherwise the walk could never make progress).
    """
    # Only fall back to the global table when the caller did not supply one.
    table = mat if matrix is None else matrix

    # Collect symbols back-to-front and reverse once at the end; prepending to
    # a string on every step would copy the whole alignment each time.
    x_symbols, y_symbols = [], []
    while m > 0 or n > 0:
        directions = table[m, n][0]
        if 'h' in directions:
            # Horizontal step: consume a symbol of X against a gap.
            x_symbols.append(X[n - 1])
            y_symbols.append("-")
            n -= 1
        elif 'd' in directions:
            # Diagonal step: pair a symbol of X with a symbol of Y.
            x_symbols.append(X[n - 1])
            y_symbols.append(Y[m - 1])
            n -= 1
            m -= 1
        elif 'v' in directions:
            # Vertical step: consume a symbol of Y against a gap.
            x_symbols.append("-")
            y_symbols.append(Y[m - 1])
            m -= 1
        else:
            raise ValueError(
                f"cell ({m}, {n}) has no traceback direction: {directions!r}"
            )
    return ("".join(reversed(x_symbols)), "".join(reversed(y_symbols)))

Print a simple result as per Rosalind requirements

In [6]:
def print_simple():
    """
    Print the result in the plain layout required by Rosalind.

    Reads the module-level X, Y and mat: first prints the score stored in
    the bottom-right cell of mat, then the two aligned strings returned by
    traceback, one per line.
    """
    last_row, last_col = len(Y), len(X)
    final_score = mat[last_row, last_col][1]
    print(final_score)
    aligned_pair = traceback(last_col, last_row, X, Y)
    for aligned in aligned_pair:
        print(aligned)

Print a formatted result: the aligned sequences are wrapped at `width` characters, and each line is labelled with its start and end offsets.

In [7]:
def print_formatted(width=100):
    """
    Print the score and one optimal alignment, wrapped at <width> characters.

    Reads the module-level X, Y and mat. The first output line is the score
    stored in the bottom-right cell of mat. The aligned X and Y strings from
    traceback are then printed in interleaved rows of at most <width>
    characters, each row prefixed with its starting offset and suffixed with
    its ending offset. Offsets count alignment columns, so gap characters
    ("-") are included in the count.

    Parameters:
        width: maximum number of alignment columns per printed row.

    Raises:
        ValueError: if width is smaller than 1 (no row could hold a column,
            and splitting the alignment would never finish).
    """
    if width < 1:
        raise ValueError(f"width must be a positive integer, got {width!r}")

    def chunk(s, w):
        """
        Break s into w width chunks and return a list. Used to print results
        """
        return [s[start:start + w] for start in range(0, len(s), w)]

    print(mat[len(Y), len(X)][1])
    alignedX, alignedY = traceback(len(X), len(Y), X, Y)

    # Both aligned strings have the same length, so one running offset
    # describes the position of both rows.
    offset = 0
    for (A, B) in zip(chunk(alignedX, width), chunk(alignedY, width)):
        # Pad short (final) rows so the trailing offset column stays aligned.
        print(f'{offset:>5} : {A.ljust(width)} : {(offset + len(A) - 1):<5}')
        print(f'{offset:>5} : {B.ljust(width)} : {(offset + len(B) - 1):<5}')
        offset += len(A)

Try data from the class notes. Don't use BLOSUM62; set indel=-1, match=1, mismatch=0.

In [8]:
X, Y = "ACGC", "GACTAC"
mat = similarity(X, Y)
print_formatted()

1
    0 : -AC-GC                                                                                               : 5    
    0 : GACTAC                                                                                               : 5    


Try data from Rosalind Question.

In [9]:
X, Y = "MEANLY", "PLEASANTLY"
mat = similarity(X, Y, indel=-5, s=blosum62_similarity)
print_formatted()

8
    0 : -ME--AN-LY                                                                                           : 9    
    0 : PLEASANTLY                                                                                           : 9    


Try another data set from Rosalind Question.

In [10]:
X, Y = """\
ILYPRQSMICMSFCFWDMWKKDVPVVLMMFLERRQMQSVFSWLVTVKTDCGKGIYNHRKYLGLPTMTAGDWHWIKKQNDPHEW\
FQGRLETAWLHSTFLYWKYFECDAVKVCMDTFGLFGHCDWDQQIHTCTHENEPAIAFLDLYCRHSPMCDKLYPVWDMACQTCH\
FHHSWFCRNQEMWMKGDVDDWQWGYHYHTINSAQCNQWFKEICKDMGWDSVFPPRHNCQRHKKCMPALYAGIWMATDHACTFM\
VRLIYTENIAEWHQVYCYRSMNMFTCGNVCLRCKSWIFVKNYMMAPVVNDPMIEAFYKRCCILGKAWYDMWGICPVERKSHWE\
IYAKDLLSFESCCSQKKQNCYTDNWGLEYRLFFQSIQMNTDPHYCQTHVCWISAMFPIYSPFYTSGPKEFYMWLQARIDQNMH\
GHANHYVTSGNWDSVYTPEKRAGVFPVVVPVWYPPQMCNDYIKLTYECERFHVEGTFGCNRWDLGCRRYIIFQCPYCDTMKIC\
YVDQWRSIKEGQFRMSGYPNHGYWFVHDDHTNEWCNQPVLAKFVRSKIVAICKKSQTVFHYAYTPGYNATWPQTNVCERMYGP\
HDNLLNNQQNVTFWWKMVPNCGMQILISCHNKMKWPTSHYVFMRLKCMHVLMQMEYLDHFTGPGEGDFCRNMQPYMHQDLHWE\
GSMRAILEYQAEHHRRAFRAELCAQYDQEIILWSGGWGVQDCGFHANYDGSLQVVSGEPCSMWCTTVMQYYADCWEKCMFA""", """\
ILIPRQQMGCFPFPWHFDFCFWSAHHSLVVPLNPQMQTVFQNRGLDRVTVKTDCHDHRWKWIYNLGLPTMTAGDWHFIKKHVV\
RANNPHQWFQGRLTTAWLHSTFLYKKTEYCLVRHSNCCHCDWDQIIHTCAFIAFLDLYQRHWPMCDKLYCHFHHSWFCRNQEM\
SMDWNQWFPWDSVPRANCLEEGALIALYAGIWANSMKRDMKTDHACTVRLIYVCELHAWLKYCYTSINMLCGNVCLRCKSWIF\
VKLFYMYAPVVNTIEANSPHYYKRCCILGQGICPVERKSHCEIYAKDLLSFESCCSQKQNCYTDNWGLEYRLFFQHIQMECTD\
PHANRGWTSCQTAKYWHFNLDDRPPKEFYMWLQATPTDLCMYQHCLMFKIVKQNFRKQHGHANPAASTSGNWDSVYTPEKMAY\
KDWYVSHPPVDMRRNGSKMVPVWYPPGIWHWKQSYKLTYECFFTVPGRFHVEGTFGCNRWDHQPGTRRDRQANHQFQCPYSDT\
MAIWEHAYTYVDQWRSIKEGQMPMSGYPNHGQWNVHDDHTNEQERSPICNQPVLAKFVRSKNVSNHEICKKSQTVFHWACEAQ\
TNVCERMLNNQHVAVKRNVTFWWQMVPNCLWSCHNKMTWPTRPEQHRLFFVKMRLKCMHEYLDVAPSDFCRNMQAYMHSMRAI\
LEYQADFDLKRRLRAIAPMDLCAQYDQEIILWSGGYIYDQSLQVVSCEGCSYYADCYVKCINVKEKCMFA"""
mat = similarity(X, Y, indel=-5, s=blosum62_similarity)
print_formatted()

1555
    0 : ILYPRQSMICMSFCF-WD--MWKKDVPVVLMMFLERRQMQSVF-S-WL--VTVKTDCGKGIYNHR-K--Y-LGLPTMTAGDWHWIKK---Q-NDPHEWFQ : 99   
    0 : ILIPRQQMGCFPFPWHFDFCFWSAHHSLVVP--LNP-QMQTVFQNRGLDRVTVKTDC----HDHRWKWIYNLGLPTMTAGDWHFIKKHVVRANNPHQWFQ : 99   
  100 : GRLETAWLHSTFLYWKYFE-CDAVKVCMDTFGLFGHCDWDQQIHTCTHENEPAIAFLDLYCRHSPMCDKLYPVWDMACQTCHFHHSWFCRNQEMWMKGDV : 199  
  100 : GRLTTAWLHSTFLY-KKTEYC-LVR---HS-NCC-HCDWDQIIHTCAF-----IAFLDLYQRHWPMCDKLY------C---HFHHSWFCRNQEMSM--D- : 199  
  200 : DDWQWGYHYHTINSAQCNQWFKEICKDMGWDSVFPPRHNCQRHKKCMPALYAGIW-------MATDHACTFMVRLIYTENIAEWHQVYCYRSMNMFTCGN : 299  
  200 : --W---------N--Q---WFP-------WDSV-P-RANCLE-EGALIALYAGIWANSMKRDMKTDHACT--VRLIYVCELHAWLK-YCYTSINML-CGN : 299  
  300 : VCLRCKSWIFVK-NYMMAPVVN--DPMIEAFYKRCCILGKAWYDMWGICPVERKSHWEIYAKDLLSFESCCSQKKQNCYTDNWGLEYRLFFQSIQMN-TD : 399  
  300 : VCLRCKSWIFVKLFYMYAPVVNTIEANSPHYYKRCCILGQ------GICPVERKSHCEIYAKDLLSFESCCSQK-QNCYTDNWGLEYRLFFQHIQMECTD : 399  
  400 : PH----Y--CQTHVCW-ISA-MFPIYSPFY--TSG-PKE--FYM-WLQARI

In [11]:
[X, Y, *_] = open("rosalind_ba5e.txt").read().split("\n")
mat = similarity(X, Y, indel=-5, s=blosum62_similarity)
print_simple()

11229
RYYAPL---SR--FAYHSHIQGHDCIWLKFLRHFCPEDRVTL-A------MHMAAVARVMADR-NIK-FRKF-D--DLNGCHTVLKNPDLVSMMFYFIHREGTFLMMVRGEKWIKSPFWFQHEGKICE------Y--NCFLHTNGAFYTIAACELKFLTQNNVPQKWSLDDGSKD-------MFQQDKNIHCPQFM-PWL--VVMTV-FGGKDAFGHTM----Y-NP-EKIELIHFLDWQFWWWVEQAWKEVHKCDSWWIHPNMSDNLAQFPMGKKWSACHQKGRERRSTCYK---FKDWNENFVTYGYIENWADFQWWCADLNITDWEICNPSDEMGINAIGAEYWNKWYAETRWFNVVMETWINGSARMSS-HDFCFDGPWDQNPNPERPVYYGMRPQWQGEMENRTPVSYMNAIKVMCNTSRLLLPLLSTIADNCVTSKQITPFIKIERVNHKHLDDKGRHAFIYFKNQLGMQDQPPEPVIQNR----EHIFVPVDP-DKH-DMHDAKACERDMEHHNFSLCNTLSTHIYTMSCCPRHPVFQLMRE-H-PVNH---PAWYNAHLMKGMCCRLTRCQQDNKHLVWKYGWYPPNQF-N-----TPWG------I-H-SISH-L---MWEDLCQVT--TDQCECHQPCQMG-KVFDEWVQDQFAHGWDCLHHVVPASYHHTYHNHMYYVYEYELQHSLDVCYLGYTEIKA-D-----D-F------FR-QKYYYISMSGYG----H--HVGPDDAAHARRVENWYEEMMDKIMNS------T-GLDEHRKN--S----CCPK--IRTDKEDAIRPYPHYYKHLNSLSMHHLRCRPNVYNKNLDECNETYIGMPDKIMLRFWINGTYA----CVKR-Y--NKMGSRKI-SKFRMYNPVFNYNCCH---K--A-CSG------H-WIHWNWFYRPGCGWDK---I-EE--DREGLN-FKCYDIQEFDHDRKIPWELWLTANREDCAYEFRYLMQKM

In [12]:
print_formatted()

11229
    0 : RYYAPL---SR--FAYHSHIQGHDCIWLKFLRHFCPEDRVTL-A------MHMAAVARVMADR-NIK-FRKF-D--DLNGCHTVLKNPDLVSMMFYFIHR : 99   
    0 : RYYAPLAPCNQNAFAYHSHAQGHDNIWLKFLRHFCPEDRVTLCAGMPWMKMHMAAHMYPQPEAWTMKQIRKFWQHKNLNGCHTVLKNPD------YF-HR : 99   
  100 : EGTFLMMVRGEKWIKSPFWFQHEGKICE------Y--NCFLHTNGAFYTIAACELKFLTQNNVPQKWSLDDGSKD-------MFQQDKNIHCPQFM-PWL : 199  
  100 : EQTFLVPCRG-KY-DAPFWFQHEGKICQPRAVMSYHGNCFLHTNTAFYTI---ELKF---N--PQKWSLDDGSKDFFEPPANMFQQDKNIHCPQFMINHC : 199  
  200 : --VVMTV-FGGKDAFGHTM----Y-NP-EKIELIHFLDWQFWWWVEQAWKEVHKCDSWWIHPNMSDNLAQFPMGKKWSACHQKGRERRSTCYK---FKDW : 299  
  200 : QQEENTFNFGGKDAFGHTMLVVKQTNPLIKIERIHFLDWQFDWWEEFDECDIQSFTYWWIHPNMSDNLAQFPMGKK---CHQCGRERRSTCYKIIDMKDW : 299  
  300 : NENFVTYGYIENWADFQWWCADLNITDWEICNPSDEMGINAIGAEYWNKWYAETRWFNVVMETWINGSARMSS-HDFCFDGPWDQNPNPERPVYYGMRPQ : 399  
  300 : HENFRAYGYIFQ-SD-AW-CADLNITDWEICNPSDEMGITAIGAGYWNKWYAETLQERTHFTGEFN-VV-METWHDFCPDGPWDQNPNPERPVYYGMRPQ : 399  
  400 : WQGEMENRTPVSYMNAIKVMCNTSRLLLPLLSTIADNCVTSKQITPFIKI