# Global And Local Alignment

In [1]:
import sys
import numpy as np
from collections import Counter
import re
from collections import defaultdict
from functools import partial

The score_function returns a function that scores a pair of characters.  The characters may be the same, different, or one may be missing (a gap).  The code supports three scoring frameworks: one where all match/mismatch/indel scores have fixed values, one based on BLOSUM62, and one based on PAM250.  Other scoring matrices can be incorporated.

In [2]:
def score_function(score_name="BASIC", A=1, B=0, C=1):
    """
    Build a scoring callable for pairs of alignment symbols.

    Arguments
    ---------
    score_name: "BASIC" for fixed scores, or "BLOSUM62" / "PAM250" to use the
                substitution matrices embedded below
    A: score for two identical symbols (BASIC only)
    B: mismatch penalty, returned as -B (BASIC only)
    C: indel penalty, returned as -C whenever either symbol is '-'

    Returns a callable taking two symbols (a, b) and returning their score.
    Raises ValueError when score_name is not one of the three supported names.
    """
    if score_name == "BASIC":
        def fixed_score(a, b):
            """Score a pair with the fixed match / mismatch / indel values."""
            if '-' in (a, b):
                return -C
            return A if a == b else -B
        return fixed_score

    if score_name == "BLOSUM62":
        table_text = """
               A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y
            A  4  0 -2 -1 -2  0 -2 -1 -1 -1 -1 -2 -1 -1 -1  1  0  0 -3 -2
            C  0  9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2
            D -2 -3  6  2 -3 -1 -1 -3 -1 -4 -3  1 -1  0 -2  0 -1 -3 -4 -3
            E -1 -4  2  5 -3 -2  0 -3  1 -3 -2  0 -1  2  0  0 -1 -2 -3 -2
            F -2 -2 -3 -3  6 -3 -1  0 -3  0  0 -3 -4 -3 -3 -2 -2 -1  1  3
            G  0 -3 -1 -2 -3  6 -2 -4 -2 -4 -3  0 -2 -2 -2  0 -2 -3 -2 -3
            H -2 -3 -1  0 -1 -2  8 -3 -1 -3 -2  1 -2  0  0 -1 -2 -3 -2  2
            I -1 -1 -3 -3  0 -4 -3  4 -3  2  1 -3 -3 -3 -3 -2 -1  3 -3 -1
            K -1 -3 -1  1 -3 -2 -1 -3  5 -2 -1  0 -1  1  2  0 -1 -2 -3 -2
            L -1 -1 -4 -3  0 -4 -3  2 -2  4  2 -3 -3 -2 -2 -2 -1  1 -2 -1
            M -1 -1 -3 -2  0 -3 -2  1 -1  2  5 -2 -2  0 -1 -1 -1  1 -1 -1
            N -2 -3  1  0 -3  0  1 -3  0 -3 -2  6 -2  0  0  1  0 -3 -4 -2
            P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2  7 -1 -2 -1 -1 -2 -4 -3
            Q -1 -3  0  2 -3 -2  0 -3  1 -2  0  0 -1  5  1  0 -1 -2 -2 -1
            R -1 -3 -2  0 -3 -2  0 -3  2 -2 -1  0 -2  1  5 -1 -1 -3 -3 -2
            S  1 -1  0  0 -2  0 -1 -2  0 -2 -1  1 -1  0 -1  4  1 -2 -3 -2
            T  0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1  0 -1 -1 -1  1  5  0 -2 -2
            V  0 -1 -3 -2 -1 -3 -3  3 -2  1  1 -3 -2 -2 -3 -2  0  4 -3 -1
            W -3 -2 -4 -3  1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11  2
            Y -2 -2 -3 -2  3 -3  2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1  2  7
            """
    elif score_name == "PAM250":
        table_text = """
               A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y
            A  2 -2  0  0 -3  1 -1 -1 -1 -2 -1  0  1  0 -2  1  1  0 -6 -3
            C -2 12 -5 -5 -4 -3 -3 -2 -5 -6 -5 -4 -3 -5 -4  0 -2 -2 -8  0
            D  0 -5  4  3 -6  1  1 -2  0 -4 -3  2 -1  2 -1  0  0 -2 -7 -4
            E  0 -5  3  4 -5  0  1 -2  0 -3 -2  1 -1  2 -1  0  0 -2 -7 -4
            F -3 -4 -6 -5  9 -5 -2  1 -5  2  0 -3 -5 -5 -4 -3 -3 -1  0  7
            G  1 -3  1  0 -5  5 -2 -3 -2 -4 -3  0  0 -1 -3  1  0 -1 -7 -5
            H -1 -3  1  1 -2 -2  6 -2  0 -2 -2  2  0  3  2 -1 -1 -2 -3  0
            I -1 -2 -2 -2  1 -3 -2  5 -2  2  2 -2 -2 -2 -2 -1  0  4 -5 -1
            K -1 -5  0  0 -5 -2  0 -2  5 -3  0  1 -1  1  3  0  0 -2 -3 -4
            L -2 -6 -4 -3  2 -4 -2  2 -3  6  4 -3 -3 -2 -3 -3 -2  2 -2 -1
            M -1 -5 -3 -2  0 -3 -2  2  0  4  6 -2 -2 -1  0 -2 -1  2 -4 -2
            N  0 -4  2  1 -3  0  2 -2  1 -3 -2  2  0  1  0  1  0 -2 -4 -2
            P  1 -3 -1 -1 -5  0  0 -2 -1 -3 -2  0  6  0  0  1  0 -1 -6 -5
            Q  0 -5  2  2 -5 -1  3 -2  1 -2 -1  1  0  4  1 -1 -1 -2 -5 -4
            R -2 -4 -1 -1 -4 -3  2 -2  3 -3  0  0  0  1  6  0 -1 -2  2 -4
            S  1  0  0  0 -3  1 -1 -1  0 -3 -2  1  1 -1  0  2  1 -1 -2 -3
            T  1 -2  0  0 -3  0 -1  0  0 -2 -1  0  0 -1 -1  1  3  0 -5 -3
            V  0 -2 -2 -2 -1 -1 -2  4 -2  2  2 -2 -1 -2 -2 -1  0  4 -6 -2
            W -6 -8 -7 -7  0 -7 -3 -5 -3 -2 -4 -4 -6 -5  2 -2 -5 -6 17  0
            Y -3  0 -4 -4  7 -5  0 -1 -4 -1 -2 -2 -5 -4 -4 -3 -3 -2  0 10
            """
    else:
        raise ValueError("score_name constrained to ['BASIC', 'BLOSUM62', 'PAM250']")

    def matrix_score(a, b, indel, s):
        """Return indel when either symbol is a gap, otherwise the matrix entry s[a][b]."""
        if '-' in (a, b):
            return indel
        return s[a][b]

    # The first non-blank line names the columns; each later line is
    # "<row symbol> <score> <score> ...".
    header, *rows = table_text.strip().splitlines()
    column_ids = header.split()

    # Dictionary of dictionaries: target[row symbol][column symbol] -> int score.
    target = defaultdict(dict)
    for row in rows:
        row_id, *values = row.split()
        target[row_id].update(zip(column_ids, map(int, values)))

    # Bind the matrix and the indel score (-C) so callers only pass the two symbols.
    return partial(matrix_score, indel=-C, s=target)

An internal function, _sequence_similarity, computes the sequence similarity between sequences X and Y.  It can compute either a global alignment or a local alignment.

In [3]:
def _sequence_similarity(X, Y, s, global_seq):
    """
    Fill the dynamic-programming matrix used to align sequences X and Y.

    With global_seq true this is the Needleman-Wunsch global alignment; with
    global_seq false every cell is floored at zero, giving a local alignment.

    Arguments
    ---------
    X: sequence laid out along the columns of the matrix
    Y: sequence laid out along the rows of the matrix
    s: a function taking a pair of symbols and returning their score;
       s('-', '-') is used as the per-symbol indel score on the borders
    global_seq: True for a global alignment, False for a local alignment

    Returns
    -------
    (mat, maxval, index).  mat is a (len(Y)+1, len(X)+1) object array whose
    cells are (moves, score) tuples; moves is a string made of 'h', 'd' and
    'v' naming every predecessor that attains the cell's score (the floor
    contributes no letter).  maxval and index give the score and position a
    traceback should start from: the bottom-right cell for a global alignment,
    otherwise the first cell (in fill order) holding the highest score.
    """
    def best_of(*candidates):
        """
        Given (label, value) tuples, return (labels, top) where top is the
        largest value and labels concatenates, in order, the label of every
        candidate attaining it.  E.G.
        best_of(('a',5),('b',3),('c',6),('d',3),('e',6)) = ('ce',6)
        """
        top = max(value for _, value in candidates)
        labels = "".join(label for label, value in candidates if value == top)
        return (labels, top)

    row_count = len(Y) + 1
    col_count = len(X) + 1

    # Lower bound offered to every cell: 0 for local alignment (global_seq is
    # False), effectively minus infinity for global alignment.
    floor = -sys.maxsize * global_seq

    best_value = -sys.maxsize
    best_index = (-1, -1)
    mat = np.empty((row_count, col_count), dtype=tuple)

    # First column: only vertical (indel) moves are possible.
    for i in range(row_count):
        cell = best_of(("", floor), ("v", i * s('-', '-')))
        mat[i, 0] = cell
        if cell[1] > best_value:
            best_value, best_index = cell[1], (i, 0)

    # First row: only horizontal (indel) moves are possible.
    for j in range(col_count):
        cell = best_of(("", floor), ("h", j * s('-', '-')))
        mat[0, j] = cell
        if cell[1] > best_value:
            best_value, best_index = cell[1], (0, j)

    # Interior: best of the floor and the horizontal, diagonal and vertical predecessors.
    for i in range(1, row_count):
        y_sym = Y[i-1]
        for j in range(1, col_count):
            x_sym = X[j-1]
            cell = best_of(
                ("", floor),
                ("h", mat[i, j-1][1] + s('-', x_sym)),
                ("d", mat[i-1, j-1][1] + s(y_sym, x_sym)),
                ("v", mat[i-1, j][1] + s(y_sym, '-')),
            )
            mat[i, j] = cell
            if cell[1] > best_value:
                best_value, best_index = cell[1], (i, j)

    if global_seq:
        # A global alignment always ends in the bottom-right corner.
        best_index = (row_count - 1, col_count - 1)
        best_value = mat[row_count - 1, col_count - 1][1]

    return (mat, best_value, best_index)

def global_sequence_similarity():
    """Return _sequence_similarity with global_seq fixed to True (global alignment)."""
    global_aligner = partial(_sequence_similarity, global_seq=True)
    return global_aligner

def local_sequence_similarity():
    """Return _sequence_similarity with global_seq fixed to False (local alignment)."""
    local_aligner = partial(_sequence_similarity, global_seq=False)
    return local_aligner

Traceback returns one of potentially many optimal alignments.  At each point where a cell has more than
one maximal value, we have a bifurcation or trifurcation, leading to a doubling/tripling in the number of equivalent alignments observed along a particular path.

In [4]:
def global_traceback(mat, X, Y):
    """
    Walk back from the bottom-right cell of mat and return one optimal alignment.

    mat holds (moves, score) cells; X runs along the columns and Y along the
    rows.  Where a cell records several moves, 'h' is preferred over 'd' and
    'd' over 'v', so only one of the possibly many optimal alignments is
    produced.  Returns (aligned X, aligned Y) with '-' marking gaps.
    """
    row = mat.shape[0] - 1
    col = mat.shape[1] - 1

    # Columns are collected back to front and reversed once at the end.
    x_columns = []
    y_columns = []
    while row > 0 or col > 0:
        moves = mat[row, col][0]
        if 'h' in moves:
            x_columns.append(X[col - 1])
            y_columns.append("-")
            col -= 1
        elif 'd' in moves:
            x_columns.append(X[col - 1])
            y_columns.append(Y[row - 1])
            row -= 1
            col -= 1
        elif 'v' in moves:
            x_columns.append("-")
            y_columns.append(Y[row - 1])
            row -= 1
        else:
            # No recorded predecessor: nothing further to follow.
            break

    return ("".join(reversed(x_columns)), "".join(reversed(y_columns)))

In [5]:
def local_traceback(mat, X, Y):
    """
    Walk back from the bottom-right cell of mat until the score is no longer
    positive, and return one optimal local alignment.

    mat holds (moves, score) cells; X runs along the columns and Y along the
    rows.  Callers pass the matrix sliced so that its bottom-right cell is the
    best-scoring one.  Where a cell records several moves, 'h' is preferred
    over 'd' and 'd' over 'v'.  Returns (aligned X, aligned Y) with '-'
    marking gaps.
    """
    row = mat.shape[0] - 1
    col = mat.shape[1] - 1

    # Columns are collected back to front and reversed once at the end.
    x_columns = []
    y_columns = []
    while (row > 0 or col > 0) and mat[row, col][1] > 0:
        moves = mat[row, col][0]
        if 'h' in moves:
            x_columns.append(X[col - 1])
            y_columns.append("-")
            col -= 1
        elif 'd' in moves:
            x_columns.append(X[col - 1])
            y_columns.append(Y[row - 1])
            row -= 1
            col -= 1
        elif 'v' in moves:
            x_columns.append("-")
            y_columns.append(Y[row - 1])
            row -= 1
        else:
            # No recorded predecessor: nothing further to follow.
            break

    return ("".join(reversed(x_columns)), "".join(reversed(y_columns)))

Print a formatted result that wraps sequences at <width> characters and identifies offsets.

In [6]:
def print_formatted(traceback_results, width=100):
    """
    Print each alignment wrapped at <width> characters, with offsets.

    Arguments
    ---------
    traceback_results: iterable of (alignedX, alignedY) string pairs, as
                       returned by global_traceback / local_traceback
    width: maximum number of alignment columns printed per line

    For every chunk three lines are printed: the X row, a row repeating the
    characters that are identical in both sequences, and the Y row.  The
    numbers to the left and right of a row are the first and last positions
    of that chunk within the aligned string (gap characters included).

    Raises ValueError when width is smaller than 1; a non-positive width
    cannot make progress through the sequences.
    """
    if width < 1:
        raise ValueError("width must be a positive integer")

    def chunk(s, w):
        """
        Break s into w width chunks and return a list. Used to print results
        """
        return [s[start:start + w] for start in range(0, len(s), w)]

    for (alignedX, alignedY) in traceback_results:
        offset_x = 0
        offset_y = 0
        print()
        for (top, bottom) in zip(chunk(alignedX, width), chunk(alignedY, width)):
            # Pad short (final) chunks so the right-hand offsets stay in one column.
            top_pad = " " * (width - len(top))
            bottom_pad = " " * (width - len(bottom))
            matches = ''.join(a if a == b else ' ' for a, b in zip(top, bottom))
            print(f'{offset_x:>5} {top}{top_pad} {(offset_x + len(top) - 1):<5}')
            print(f"      {matches}{top_pad}")
            print(f'{offset_y:>5} {bottom}{bottom_pad} {(offset_y + len(bottom) - 1):<5}')
            print()
            offset_x += len(top)
            offset_y += len(bottom)

Try data from the class notes. Don't use BLOSUM62; set indel=-1, match=1, mismatch=0.

In [7]:
def trial_1():
    """Globally align the class-notes example with the BASIC scores and print the result."""
    X, Y = "ACGC", "GACTAC"
    align = global_sequence_similarity()
    mat, maxval, index = align(X, Y, s=score_function("BASIC", C=1))
    print(maxval, index)
    # Trace back from the cell reported in index (the bottom-right corner for a global alignment).
    last_row, last_col = index
    alignment = global_traceback(mat[0:last_row+1, 0:last_col+1], X, Y)
    print_formatted([alignment])

trial_1()

1 (6, 4)

    0 -AC-GC                                                                                               5    
       AC  C                                                                                              
    0 GACTAC                                                                                               5    



Try data from Rosalind Question.

In [8]:
def trial_2():
    """Globally align the Rosalind sample pair with BLOSUM62 and an indel penalty of 5."""
    X, Y = "MEANLY", "PLEASANTLY"
    align = global_sequence_similarity()
    mat, maxval, index = align(X, Y, s=score_function("BLOSUM62", C=5))
    print(maxval, index)
    # Trace back from the cell reported in index (the bottom-right corner for a global alignment).
    last_row, last_col = index
    alignment = global_traceback(mat[0:last_row+1, 0:last_col+1], X, Y)
    print_formatted([alignment])

trial_2()

8 (10, 6)

    0 -ME--AN-LY                                                                                           9    
        E  AN LY                                                                                          
    0 PLEASANTLY                                                                                           9    



Try another data set from Rosalind Question.

In [9]:
def trial_3():
    """Globally align a longer Rosalind protein pair with BLOSUM62 and an indel penalty of 5."""
    # Each literal is one sequence; the trailing backslashes join the physical lines.
    X, Y = """\
ILYPRQSMICMSFCFWDMWKKDVPVVLMMFLERRQMQSVFSWLVTVKTDCGKGIYNHRKYLGLPTMTAGDWHWIKKQNDPHEW\
FQGRLETAWLHSTFLYWKYFECDAVKVCMDTFGLFGHCDWDQQIHTCTHENEPAIAFLDLYCRHSPMCDKLYPVWDMACQTCH\
FHHSWFCRNQEMWMKGDVDDWQWGYHYHTINSAQCNQWFKEICKDMGWDSVFPPRHNCQRHKKCMPALYAGIWMATDHACTFM\
VRLIYTENIAEWHQVYCYRSMNMFTCGNVCLRCKSWIFVKNYMMAPVVNDPMIEAFYKRCCILGKAWYDMWGICPVERKSHWE\
IYAKDLLSFESCCSQKKQNCYTDNWGLEYRLFFQSIQMNTDPHYCQTHVCWISAMFPIYSPFYTSGPKEFYMWLQARIDQNMH\
GHANHYVTSGNWDSVYTPEKRAGVFPVVVPVWYPPQMCNDYIKLTYECERFHVEGTFGCNRWDLGCRRYIIFQCPYCDTMKIC\
YVDQWRSIKEGQFRMSGYPNHGYWFVHDDHTNEWCNQPVLAKFVRSKIVAICKKSQTVFHYAYTPGYNATWPQTNVCERMYGP\
HDNLLNNQQNVTFWWKMVPNCGMQILISCHNKMKWPTSHYVFMRLKCMHVLMQMEYLDHFTGPGEGDFCRNMQPYMHQDLHWE\
GSMRAILEYQAEHHRRAFRAELCAQYDQEIILWSGGWGVQDCGFHANYDGSLQVVSGEPCSMWCTTVMQYYADCWEKCMFA""", """\
ILIPRQQMGCFPFPWHFDFCFWSAHHSLVVPLNPQMQTVFQNRGLDRVTVKTDCHDHRWKWIYNLGLPTMTAGDWHFIKKHVV\
RANNPHQWFQGRLTTAWLHSTFLYKKTEYCLVRHSNCCHCDWDQIIHTCAFIAFLDLYQRHWPMCDKLYCHFHHSWFCRNQEM\
SMDWNQWFPWDSVPRANCLEEGALIALYAGIWANSMKRDMKTDHACTVRLIYVCELHAWLKYCYTSINMLCGNVCLRCKSWIF\
VKLFYMYAPVVNTIEANSPHYYKRCCILGQGICPVERKSHCEIYAKDLLSFESCCSQKQNCYTDNWGLEYRLFFQHIQMECTD\
PHANRGWTSCQTAKYWHFNLDDRPPKEFYMWLQATPTDLCMYQHCLMFKIVKQNFRKQHGHANPAASTSGNWDSVYTPEKMAY\
KDWYVSHPPVDMRRNGSKMVPVWYPPGIWHWKQSYKLTYECFFTVPGRFHVEGTFGCNRWDHQPGTRRDRQANHQFQCPYSDT\
MAIWEHAYTYVDQWRSIKEGQMPMSGYPNHGQWNVHDDHTNEQERSPICNQPVLAKFVRSKNVSNHEICKKSQTVFHWACEAQ\
TNVCERMLNNQHVAVKRNVTFWWQMVPNCLWSCHNKMTWPTRPEQHRLFFVKMRLKCMHEYLDVAPSDFCRNMQAYMHSMRAI\
LEYQADFDLKRRLRAIAPMDLCAQYDQEIILWSGGYIYDQSLQVVSCEGCSYYADCYVKCINVKEKCMFA"""
    align = global_sequence_similarity()
    mat, maxval, index = align(X, Y, s=score_function("BLOSUM62", C=5))
    print(maxval, index)
    # Trace back from the cell reported in index (the bottom-right corner for a global alignment).
    last_row, last_col = index
    alignment = global_traceback(mat[0:last_row+1, 0:last_col+1], X, Y)
    print_formatted([alignment])

trial_3()

1555 (734, 745)

    0 ILYPRQSMICMSFCF-WD--MWKKDVPVVLMMFLERRQMQSVF-S-WL--VTVKTDCGKGIYNHR-K--Y-LGLPTMTAGDWHWIKK---Q-NDPHEWFQ 99   
      IL PRQ M C  F    D   W      V    L   QMQ VF    L  VTVKTDC      HR K  Y LGLPTMTAGDWH IKK     N PH WFQ
    0 ILIPRQQMGCFPFPWHFDFCFWSAHHSLVVP--LNP-QMQTVFQNRGLDRVTVKTDC----HDHRWKWIYNLGLPTMTAGDWHFIKKHVVRANNPHQWFQ 99   

  100 GRLETAWLHSTFLYWKYFE-CDAVKVCMDTFGLFGHCDWDQQIHTCTHENEPAIAFLDLYCRHSPMCDKLYPVWDMACQTCHFHHSWFCRNQEMWMKGDV 199  
      GRL TAWLHSTFLY K  E C  V           HCDWDQ IHTC       IAFLDLY RH PMCDKLY      C   HFHHSWFCRNQEM M  D 
  100 GRLTTAWLHSTFLY-KKTEYC-LVR---HS-NCC-HCDWDQIIHTCAF-----IAFLDLYQRHWPMCDKLY------C---HFHHSWFCRNQEMSM--D- 199  

  200 DDWQWGYHYHTINSAQCNQWFKEICKDMGWDSVFPPRHNCQRHKKCMPALYAGIW-------MATDHACTFMVRLIYTENIAEWHQVYCYRSMNMFTCGN 299  
        W         N  Q   WF        WDSV P R NC        ALYAGIW       M TDHACT  VRLIY      W   YCY S NM  CGN
  200 --W---------N--Q---WFP-------WDSV-P-RANCLE-EGALIALYAGIWANSMKRDMKTDHACT--VRLIYVCELHAWLK-YC

#### Process a BIG dataset - warning: this can take some time to execute

In [10]:
def trial_4():
    """
    Globally align the two sequences stored in rosalind_ba5e.txt using
    BLOSUM62 and an indel penalty of 5, then print the score and alignment.

    The first two lines of the file are taken as X and Y; any further lines
    are ignored.
    """
    # Read inside a with-block so the file handle is closed even if reading fails.
    with open("rosalind_ba5e.txt") as handle:
        [X, Y, *_] = handle.read().split("\n")
    similarity = global_sequence_similarity()
    mat, maxval, index = similarity(X, Y, s=score_function("BLOSUM62", C=5))
    print(maxval)
    print_formatted([global_traceback(mat[0:index[0]+1, 0:index[1]+1], X, Y)])

trial_4()

11229

    0 RYYAPL---SR--FAYHSHIQGHDCIWLKFLRHFCPEDRVTL-A------MHMAAVARVMADR-NIK-FRKF-D--DLNGCHTVLKNPDLVSMMFYFIHR 99   
      RYYAPL       FAYHSH QGHD IWLKFLRHFCPEDRVTL A      MHMAA           K  RKF     LNGCHTVLKNPD      YF HR
    0 RYYAPLAPCNQNAFAYHSHAQGHDNIWLKFLRHFCPEDRVTLCAGMPWMKMHMAAHMYPQPEAWTMKQIRKFWQHKNLNGCHTVLKNPD------YF-HR 99   

  100 EGTFLMMVRGEKWIKSPFWFQHEGKICE------Y--NCFLHTNGAFYTIAACELKFLTQNNVPQKWSLDDGSKD-------MFQQDKNIHCPQFM-PWL 199  
      E TFL   RG K    PFWFQHEGKIC       Y  NCFLHTN AFYTI   ELKF   N  PQKWSLDDGSKD       MFQQDKNIHCPQFM    
  100 EQTFLVPCRG-KY-DAPFWFQHEGKICQPRAVMSYHGNCFLHTNTAFYTI---ELKF---N--PQKWSLDDGSKDFFEPPANMFQQDKNIHCPQFMINHC 199  

  200 --VVMTV-FGGKDAFGHTM----Y-NP-EKIELIHFLDWQFWWWVEQAWKEVHKCDSWWIHPNMSDNLAQFPMGKKWSACHQKGRERRSTCYK---FKDW 299  
           T  FGGKDAFGHTM      NP  KIE IHFLDWQF WW E           WWIHPNMSDNLAQFPMGKK   CHQ GRERRSTCYK    KDW
  200 QQEENTFNFGGKDAFGHTMLVVKQTNPLIKIERIHFLDWQFDWWEEFDECDIQSFTYWWIHPNMSDNLAQFPMGKK---CHQCGRERRSTCYKIIDMKD

In [11]:
def printmat(mat, X, Y):
    """
    Print the scores held in mat as a text table.

    mat is indexed mat[y, x] and holds (moves, score) tuples; only the score
    is shown.  The table is printed transposed: one printed row per column of
    mat (labelled with the symbols of X) and one printed column per row of
    mat (labelled with the symbols of Y).

    mat may be a top-left slice of the full matrix, so the header is limited
    to the symbols of Y that actually have a column in the table.
    """
    ylen, xlen = mat.shape

    # The first printed column is the border (no symbol); the remaining
    # ylen - 1 columns correspond to Y[0], Y[1], ...
    shown = Y[:max(ylen - 1, 0)]
    print("         " + "     ".join(shown))

    for x in range(xlen):
        cells = " | ".join(f"{mat[y, x][1]:3}" for y in range(ylen))
        print("   | " + cells + " |")
        # The separator carries the label of the NEXT printed row, so none follows the last row.
        if x + 1 < xlen:
            print(f"{X[x]:1}  -", "-" * (ylen * 6), sep="")

In [12]:
def trial_5():
    """Locally align a short DNA pair with fixed scores, showing the matrix and the alignment."""
    X, Y = "ACGC", "GATTGA"
    align = local_sequence_similarity()
    mat, maxval, index = align(X, Y, s=score_function("BASIC", A=4, B=1, C=2))
    printmat(mat, X, Y)
    print("Value is", maxval, "at", index)
    # Keep only the part of the matrix up to and including the best-scoring cell.
    best_row, best_col = index
    window = mat[0:best_row+1, 0:best_col+1]
    printmat(window, X, Y)
    print()
    print_formatted([local_traceback(window, X, Y)])

trial_5()

         G     A     T     T     G     A
   |   0 |   0 |   0 |   0 |   0 |   0 |   0 |
A  -------------------------------------------
   |   0 |   0 |   4 |   2 |   0 |   0 |   4 |
C  -------------------------------------------
   |   0 |   0 |   2 |   3 |   1 |   0 |   2 |
G  -------------------------------------------
   |   0 |   4 |   2 |   1 |   2 |   5 |   3 |
C  -------------------------------------------
   |   0 |   2 |   3 |   1 |   0 |   3 |   4 |
Value is 5 at (5, 3)
         G     A     T     T     G     A
   |   0 |   0 |   0 |   0 |   0 |   0 |
A  -------------------------------------
   |   0 |   0 |   4 |   2 |   0 |   0 |
C  -------------------------------------
   |   0 |   0 |   2 |   3 |   1 |   0 |
G  -------------------------------------
   |   0 |   4 |   2 |   1 |   2 |   5 |


    0 A-CG                                                                                                 3    
      A  G                                                              

In [13]:
def trial_6():
    """Locally align a short protein pair with PAM250, showing the matrix and the alignment."""
    X, Y = "MEANLY", "PENALTY"
    align = local_sequence_similarity()
    mat, maxval, index = align(X, Y, s=score_function("PAM250", C=5))
    printmat(mat, X, Y)
    print("Value is", maxval, "at", index)
    # Keep only the part of the matrix up to and including the best-scoring cell.
    best_row, best_col = index
    window = mat[0:best_row+1, 0:best_col+1]
    printmat(window, X, Y)
    print()
    print_formatted([local_traceback(window, X, Y)])

trial_6()

         P     E     N     A     L     T     Y
   |   0 |   0 |   0 |   0 |   0 |   0 |   0 |   0 |
M  -------------------------------------------------
   |   0 |   0 |   0 |   0 |   0 |   4 |   0 |   0 |
E  -------------------------------------------------
   |   0 |   0 |   4 |   1 |   0 |   0 |   4 |   0 |
A  -------------------------------------------------
   |   0 |   1 |   0 |   4 |   3 |   0 |   1 |   1 |
N  -------------------------------------------------
   |   0 |   0 |   2 |   2 |   4 |   0 |   0 |   0 |
L  -------------------------------------------------
   |   0 |   0 |   0 |   0 |   0 |  10 |   5 |   0 |
Y  -------------------------------------------------
   |   0 |   0 |   0 |   0 |   0 |   5 |   7 |  15 |
Value is 15 at (7, 6)
         P     E     N     A     L     T     Y
   |   0 |   0 |   0 |   0 |   0 |   0 |   0 |   0 |
M  -------------------------------------------------
   |   0 |   0 |   0 |   0 |   0 |   4 |   0 |   0 |
E  ---------------------------------

In [14]:
def trial_7():
    """
    Locally align the two sequences stored in local_alignment.txt using
    PAM250 and an indel penalty of 5, then print the score and alignment.

    The first two lines of the file are taken as X and Y; any further lines
    are ignored.
    """
    # Read inside a with-block so the file handle is closed even if reading fails.
    with open("local_alignment.txt") as handle:
        [X, Y, *_] = handle.read().split("\n")
    similarity = local_sequence_similarity()
    mat, maxval, index = similarity(X, Y, s=score_function("PAM250", C=5))
    print("Value is", maxval, "at", index)
    print_formatted([local_traceback(mat[0:index[0]+1, 0:index[1]+1], X, Y)])

trial_7()

Value is 1062 at (862, 881)

    0 YQAGIIRQPPRGD-RGVSDRNYSQCGKQ-NQ-AQLDNNPTWTKYEIEWRVQI-LPPGAGVFEGDNGQNQCLCPNW--A-W-EQPCQW----GALHS-NEQ 99   
      Y     R           D     C       A L    T       W      P G GV           C  W  A   E P  W      L   N Q
    0 Y-P-MSRKTAKSQFIEWCDW-F--CFNHWTNWAPLSIVRTSVAFAV-W-GHCWYPCG-GVCKTNRCKDD-FCGRWRKALFAEGPRDWKCCKNDLQNWNPQ 99   

  100 YPNRIHLWAPMSKLHIKIEKSSYN-RNAQ-FPNRCMYECE-FPSY-REQVDSCHYENVQIAF-TIFSGAEQKRKFCSCHFWSNFIDQAVFSTGLI-PWCY 199  
      Y                             F       C     Y           N   AF  I      K              Q   ST    P   
  100 YSQGTR--NTK-RMVATTNQTMIEWKQSHIFETW-LF-CHVIIEYNWSAF-W-MWMNRNEAFNSIIKSGYPKLLL-T-QY-P-L-SQG--STPIVKPL-I 199  

  200 RRDDHSAFFMPNWNKQ--YKHPQLQFRVAGEGTQCRPFYTREMFTKVSAWRIAGRFAGPYERHHDAHLELWY-QHHKVRT-GQQLGIIWNNRDKTRNPCP 299  
      RRD    F    W        P      A     C                   R  GP E       E WY      RT GQQLGIIWNNR KTRNPCP
  200 RRD-QGKFW-A-WAQMWWFREPT-NIPTA-D-Y-CHSW--WQ--SR-ADLQ-NDRDMGP-EADASFYVEFWYWVRCA

```
1062
YQAGIIRQPPRGD-RGVSDRNYSQCGKQ-NQ-AQLDNNPTWTKYEIEWRVQI-LPPGAGVFEGDNGQNQCLCPNW--A-W-EQPCQW----GALHS-NEQYPNRIHLWAPMSKLHIKIEKSSYN-RNAQ-FPNRCMYECE-FPSY-REQVDSCHYENVQIAF-TIFSGAEQKRKFCSCHFWSNFIDQAVFSTGLI-PWCYRRDDHSAFFMPNWNKQ--YKHPQLQFRVAGEGTQCRPFYTREMFTKVSAWRIAGRFAGPYERHHDAHLELWY-QHHKVRT-GQQLGIIWNNRDKTRNPCPFSAY-Y-NK--LP-WWK-I-NQ-N-AFYNCLQNIAHSTHDETHEFNPVKCIDWLQGTMV-P------TECKKGFVHEKCECYRNPGPPLHDMYHQMEDIFGVRFDCLTGWKHLS------D---YNPC-QERRNINDFYIFAYEIAPAVKNLVLSPQPLADATKKCAFNYTPLDQSPVVIACK---WYIHQPI-CMLL----IVLIC-AMDKYNAHMIVIRTTEGQQPMHACRMTEGPGMCMKEPLVTFTLPAQWQWPNHEFKYVYMYVLNYHLSQYTYTDEGHAGGQHYSFNVAVDVGMAWGHNRCYCQPACYSQQETQTRTIDYEKWQYMKHQAFKWGLWFCEQER-HA--WFKGQNRCEMFTAKMTRMGADSNLDQYKLMLAQNYEEQWEQPIMECGMSEIIEIDPPYRSELIFTFWPFCTYSPWQNLIKCRCNNVIEEMDQCVP-LTF-IGFGVKQAGGIQA-WAFYKE--EWTSTYYLMCQCMKSDKAQYPYEIILFWMQ--P-MDTGE--QEPPQQNMWIFLPHSWFFDWCCNAPWSEICSSRHD--H---GQ-CQDAFYPCELFTVF
Y-P-MSRKTAKSQFIEWCDW-F--CFNHWTNWAPLSIVRTSVAFAV-W-GHCWYPCG-GVCKTNRCKDD-FCGRWRKALFAEGPRDWKCCKNDLQNWNPQYSQGTR--NTK-RMVATTNQTMIEWKQSHIFETW-LF-CHVIIEYNWSAF-W-MWMNRNEAFNSIIKSGYPKLLL-T-QY-P-L-SQG--STPIVKPL-IRRD-QGKFW-A-WAQMWWFREPT-NIPTA-D-Y-CHSW--WQ--SR-ADLQ-NDRDMGP-EADASFYVEFWYWVRCAARTYGQQLGIIWNNRLKTRNPCPYSADGIQNKENYVFWWKNMCTKSHIAFYYCLQNVAHYTHDVTAEFNPVKCIDWLQGHMVLSSWFKYNTECKKLFVHEKCECYRM----FCGV---VEDIFGVRFH--TGWKHLSTAKPVPHVCVYNPSVQERRNINDFYIF-YEIAPAVKNLVLSAQPLHDYTKKCAFNYTPITITRIISTRNQIIW-AHVVIACQFYSPHQMLLIELAMDKYCADMNVRRSTEGHQPMHACRSTFGPGMAAKEPLVTFTLVAFWQWPNHEFQYVYMYTED-KIIQIG-PHLSN-GCEMVEYCVDC-YAK-RPCYRAYSAEAQYWRMITEAEDYSYKTRNAIAATATVRGQ-YCHPFRWLGIVWM-AHHDC-FFANECGTICI-PQMAEMRPPETTPYEI--DIIFMMF-WKE--HMSTTIL-DVVGMYRP-ATFSHWHDAHH-QCEPYLTPL-MCQSKLVFDAAFT--QVG-VKGVW-YHTEKLELMAGFNHM-K-FKKEEAQ---QSCFYWFQDCPDYDPPDAVRKTDEKHIRAHGEIWWLMRYYCMYHILHI-ASRHEWMHLRWDQACTNPGY--ELFE-F
```

In [44]:
# Expected number of high-scoring alignments: E = K * m * n * e^(-lamda * S)
from math import exp, log

K, lamda = 0.050, 0.25
S = 35
m, n = 250, 10**9
E = K*m*n*exp(-lamda*S)
print(f"E value associated with S>={S} =", E)

# The count of high-scoring alignments is taken to be Poisson distributed with
# mean E, so the chance of seeing none with score >= S is e^-E.

p_no_alignments = exp(-E)
print(f"probability of no alignments with score >= {S} is", p_no_alignments)
print("probability of finding at least one alignment (p-value) is", 1 - p_no_alignments)

E value associated with S>=35 = 1980766.5639468906
probability of no alignments with score >= 35 is 0.0
probability of finding at least one alignment (p-value) is 1.0


In [45]:
# Normalized Scores
# S' = (lamda*S - ln K) / ln 2, using the lamda, S and K (and log) defined in
# the expected-alignments cell above.

s_prime = (lamda*S - log(K)) / log(2)
print("Normalized score S' =",s_prime)

Normalized score S' = 16.94550970266579


In [46]:
# E-value recomputed from the normalized score: E = N / 2**S' with N = n*m.
# This rebinds the module-level E set by the earlier cell.
N = n*m
E = N/2**s_prime
print("E-value",E)

E-value 1980766.5639468913


# Question 3
Assume the background frequencies for all four nucleotides are equal, and consider the DNA substitution matrix which gives all matches a score of +1 and all mismatches a score of -1.

What is the expected score for this matrix? Is this a valid matrix for local alignment? Why?

In [67]:
# NOTE(review): genomics is a project module not visible in this notebook;
# add_codon is presumably provided by this star import -- confirm.
from genomics import *

# The two add_codon results are discarded: X and Y are rebound to fixed
# strings on the last line of this cell.
X=add_codon(10)
Y=add_codon(1000)
X, Y = "MEANLY", "PENALTY"

In [69]:
# Locally align the module-level X and Y with PAM250 and an indel penalty of 5,
# then print the best score, its cell, both inputs and one optimal alignment.
similarity = local_sequence_similarity()
mat,maxval,index = similarity(X, Y, s=score_function("PAM250", C=5))
# printmat(mat, X, Y)
print("Value is",maxval,"at",index)
# printmat(mat[0:index[0]+1,0:index[1]+1], X, Y)
print(X)
print(Y)
# Slice the matrix so the best-scoring cell is its bottom-right corner, which
# is where local_traceback starts.
print_formatted([local_traceback(mat[0:index[0]+1, 0:index[1]+1], X, Y)])


Value is 15 at (7, 6)
MEANLY
PENALTY

    0 EANL-Y                                                                                               5    
      E  L Y                                                                                              
    0 ENALTY                                                                                               5    



In [56]:
# Show whatever Y is currently bound to at module level.
print(Y)

TGTTGTGTCTTAGCATGATTGACGCAATGTGTACAAATACCAACGTGTCGAGCGGTTGTGACTCTCCGCCGTAACACTCAGAGTCACTGTATTCATTAATGCGGGCCACTTTAACAACCGATTCGATCCGTCCCTTTCCAGTTTCTCGGTCCCGTACGTTGGACCACACCTTGTACTTAAGGCCATACGTTATCAGGTACAGGAATGCGGTTAATGCTGCGGACAAAAAGACTCTATAAAGATAGGAAGAGATATCCCCCAGGCCCACACGGCAGCTGAGGTATTTTCTGTTGAACGTTCCAGTCGACATACATGCGCGCTCTGCGAAACGCAGACAATCTCCAACAGCCCTGTAGTAGCGAGAGACAAGCAACTCCCGACGGACACCATTGATAAAGCACGCATCAACACCCCGGCTCCAGAGAGGTGGATTGTTGGACCTAAATACAGTCTATGACCGTTCTTGGTGTGCGGCCACTTAATAACCTATGCGACTTATCGCGGCAACCTGCTTATTATCAGATTACGGCCCGCCCAGGACGTATGCGGACGTGCCGCGCGGACGTGCTATCCCCACATCACGATGCACACCTGGAACAATCTACCTGTTACAGAACGACAATACGTATGTCCACGGAACTTACCCGAGCGGATGATGTCAACCACGCGGATAATTAGAGTCGTGCAAGACTTGGGAATGGTCACTCGGAGTACCTGGTCTTCTCGTTTACTTGCGTCTAAAATTCCGTGGTGAAAGAACCACATGAGGTGAACCCGCGATTACTAGTGCTGCAGTACAAACGCTGTATATTGTACTTATGTCTCACCGAATTGAATGATAGTAAACACCGATGCAGACGGTATATCGTATAGTAGGCTGTTCTAACAGCGAATTCGTCGGACTATGAACACTTCGATGCGTGTTTCACACACAGCCCCGTGTGGCTCTCGAGTCTTGACAAACAGCGGGGCGTCATCGGAACCACCCCTGCGCAGGGCT

In [61]:
# Dump the raw alignment matrix; each cell is a (moves, score) tuple.
print(mat)

[[('h', 0) ('', 0) ('', 0) ... ('', 0) ('', 0) ('', 0)]
 [('', 0) ('d', 0) ('d', 0) ... ('d', 0) ('d', 4) ('d', 0)]
 [('', 0) ('d', 6) ('h', 1) ... ('d', 6) ('h', 1) ('d', 2)]
 ...
 [('', 0) ('', 0) ('d', 5) ... ('d', 13) ('d', 10) ('d', 12)]
 [('', 0) ('d', 6) ('h', 1) ... ('d', 16) ('d', 13) ('hd', 8)]
 [('', 0) ('v', 1) ('d', 5) ... ('v', 11) ('d', 16) ('d', 12)]]
