### Ex. 2 Calculating scoring matrices

In [14]:
from collections import Counter
from itertools import combinations
from math import log

# Toy alignment: five sequences of equal length, one per row.
S1 = 'TSVKTYAKFVTH'
S2 = 'TSVKTYAKFSTH'
S3 = 'TSVKTYAKFVTH'
S4 = 'LSVKKYPKYVVQ'
S5 = 'SSVKKYPKYSVL'
alignment = [S1, S2, S3, S4, S5]

# Single-residue counts f_a and relative frequencies p_a.
# Keys appear in order of first occurrence, which later fixes the column order
# of the summary tables.
f_a = Counter("".join(alignment))
total = sum(f_a.values())
p_a = {residue: count / total for residue, count in f_a.items()}

# Counts of aligned residue pairs over every pair of rows. The two letters of a
# pair are sorted so that e.g. ('T', 'K') and ('K', 'T') share the key 'KT'.
f_ab = Counter(
    "".join(sorted(column))
    for row_a, row_b in combinations(alignment, 2)
    for column in zip(row_a, row_b)
)
total_pairs = sum(f_ab.values())
p_ab = {pair: count / total_pairs for pair, count in f_ab.items()}

# Expected pair frequency if residues were paired independently: p_a * p_b,
# doubled for a != b because the unordered pair can occur in two orders.
e_ab = {}
for a, b in f_ab:
    multiplicity = 1 if a == b else 2
    e_ab[a + b] = multiplicity * p_a[a] * p_a[b]

# Log-odds score: 2 * log2(observed / expected), left unrounded here.
s_ab = {pair: 2 * log(p_ab[pair] / e_ab[pair]) / log(2) for pair in p_ab}


In [15]:
import pandas as pd

# One column per residue (same order as p_a), one row per statistic.
# The frame is created empty and filled cell by cell.
aminoacid = pd.DataFrame(index=['$F_a$', '$P_a$'], columns=p_a.keys())
for residue, frequency in p_a.items():
    aminoacid.loc['$F_a$', residue] = f_a[residue]
    aminoacid.loc['$P_a$', residue] = frequency

aminoacid

Unnamed: 0,T,S,V,K,Y,A,F,H,L,P,Q
$F_a$,9.0,8.0,10.0,12.0,7.0,3.0,3.0,3.0,2.0,2.0,1.0
$P_a$,0.15,0.133333,0.166667,0.2,0.116667,0.05,0.05,0.05,0.0333333,0.0333333,0.0166667


In [16]:
# One column per residue pair (same order as e_ab); rows hold the observed
# count, observed frequency, expected frequency and the rounded score.
pair_rows = ['$F_{ab}$', '$P_{ab}$', '$E_{ab}$', '$S_{ab}$']
aminoacid_pair = pd.DataFrame(index=pair_rows, columns=e_ab)

for pair, expected in e_ab.items():
    aminoacid_pair.loc['$F_{ab}$', pair] = f_ab[pair]
    aminoacid_pair.loc['$P_{ab}$', pair] = p_ab[pair]
    aminoacid_pair.loc['$E_{ab}$', pair] = expected
    # Scores are reported as the nearest integer.
    aminoacid_pair.loc['$S_{ab}$', pair] = round(s_ab[pair])

aminoacid_pair

Unnamed: 0,TT,SS,VV,KK,YY,AA,FF,SV,HH,LT,KT,AP,FY,TV,HQ,ST,HL,LS,PP,LQ
$F_{ab}$,9.0,11.0,14.0,21.0,11.0,3.0,3.0,6.0,3.0,3.0,6.0,6.0,6.0,6.0,3.0,3.0,3.0,1.0,1.0,1.0
$P_{ab}$,0.075,0.0916667,0.116667,0.175,0.0916667,0.025,0.025,0.05,0.025,0.025,0.05,0.05,0.05,0.05,0.025,0.025,0.025,0.00833333,0.00833333,0.00833333
$E_{ab}$,0.0225,0.0177778,0.0277778,0.04,0.0136111,0.0025,0.0025,0.0444444,0.0025,0.01,0.06,0.00333333,0.0116667,0.05,0.00166667,0.04,0.00333333,0.00888889,0.00111111,0.00111111
$S_{ab}$,3.0,5.0,4.0,4.0,6.0,7.0,7.0,0.0,7.0,3.0,-1.0,8.0,4.0,0.0,8.0,-1.0,6.0,0.0,6.0,6.0


In [17]:
import pickle

# Persist both summary tables together as a single (aminoacid, aminoacid_pair) tuple.
answer = (aminoacid, aminoacid_pair)
with open("answer_ho3ex1.pickle", "wb") as answer_file:
    pickle.dump(answer, answer_file)


In [18]:
from collections import Counter,defaultdict
from itertools import combinations,product
from math import log

# The 20 standard amino acids (one-letter codes); fixes the row/column order of
# the score matrix and the set of pairs that receive a pseudocount.
aa = "ARNDCQEGHILKMFPSTWYV"

def get_scores(alignment,with_pseudocount=False):
    """Compute rounded log-odds substitution scores from an alignment.

    Parameters
    ----------
    alignment : iterable of str
        Aligned sequences. Every character is counted as a residue symbol;
        sequences are compared position by position (``zip`` stops at the
        shorter of two sequences).
    with_pseudocount : bool, default False
        If true, every unordered pair of letters from ``aa`` starts with a
        count of 1 before the observed pairs are added.

    Returns
    -------
    dict
        Maps a two-letter key (letters in sorted order, e.g. ``"AC"``) to
        ``round(2 * log2(p_ab / e_ab))``, where ``p_ab`` is the observed pair
        frequency and ``e_ab`` the frequency expected from the single-residue
        frequencies. Pairs containing a residue that never occurs in
        ``alignment`` have no defined expected frequency and are omitted.
    """
    # Frequencies of individual amino acids
    f_a = Counter()
    for s in alignment:
        f_a.update(s)
    total = sum(f_a.values())
    p_a = {k: v/total for k, v in f_a.items()}

    # Frequencies of aa-pairs
    f_ab = Counter()
    if with_pseudocount:
        for ab in product(aa, aa):
            # Assignment (not +=) is deliberate: product() visits each
            # off-diagonal pair twice, and every unordered pair must end up
            # with exactly one pseudocount.
            f_ab["".join(sorted(ab))] = 1
    for seq_a, seq_b in combinations(alignment, 2):
        f_ab.update("".join(sorted(ab)) for ab in zip(seq_a, seq_b))

    total_pairs = sum(f_ab.values())

    s_ab = {}
    for ab, count in f_ab.items():
        a, b = ab
        if a not in p_a or b not in p_a:
            # Only reachable through pseudocounts: the residue never occurs in
            # the alignment, so its expected frequency is zero and the
            # log-odds ratio is undefined. Skip instead of raising KeyError.
            continue
        observed = count/total_pairs
        # Factor 2 for a != b: the unordered pair can arise in two orders.
        expected = (2-(a==b))*p_a[a]*p_a[b]
        s_ab[ab] = round(2*log(observed/expected)/log(2))
    return s_ab
    
def scores2matrix(scores):
    """Expand pair scores into a full square lookup ``matrix[a][b]``.

    ``scores`` is keyed by two-letter strings. For every ordered pair of
    letters from ``aa`` the score is looked up under ``a+b`` first, then under
    ``b+a``; pairs found under neither key get the sentinel value -99.
    Returns a ``defaultdict`` of dicts.
    """
    matrix = defaultdict(dict)
    for a in aa:
        row = matrix[a]
        for b in aa:
            reversed_score = scores.get(b + a, -99)
            row[b] = scores.get(a + b, reversed_score)
    return matrix

def print_matrix(matrix,fh):
    """Write ``matrix`` to the file-like object ``fh`` as a fixed-width table.

    Each field is right-aligned in 5 characters. The first line holds an empty
    corner field followed by the letters of ``aa``; every following line holds
    a row label and the values ``matrix[a][b]`` in ``aa`` order.
    """
    cell = "{:>5}".format
    header_fields = [""] + list(aa)
    print("".join(cell(field) for field in header_fields), file=fh)
    for a in aa:
        row_fields = [a] + [matrix[a][b] for b in aa]
        print("".join(cell(field) for field in row_fields), file=fh)
    


In [19]:
import sys

# Read the whitespace-separated sequences; the context manager closes the
# input file instead of leaving the handle to the garbage collector.
with open("alignment.dat") as alignment_file:
    alignment = alignment_file.read().split()

scores = get_scores(alignment)
matrix = scores2matrix(scores)
print_matrix(matrix, sys.stdout)

# Closing the output file explicitly guarantees the matrix is flushed to disk
# before any later cell or external tool reads score_matrix.txt.
with open("score_matrix.txt", "w") as matrix_file:
    print_matrix(matrix, matrix_file)

         A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V
    A    5   -3   -2   -2   -2   -1   -1   -3   -1   -2   -2   -1   -2   -4    0    1    0   -7   -6    0
    R   -3    6   -1   -4    0   -1   -3    0    1   -3   -2    2   -2   -3   -5   -3   -1   -4   -2   -3
    N   -2   -1    7    1   -3   -1    0   -2   -1   -1   -7   -2   -4   -4   -3    0    0   -6   -3   -1
    D   -2   -4    1    6   -3   -6    1   -2   -3   -2   -3   -3   -5   -3   -6   -4   -5   -8   -4   -2
    C   -2    0   -3   -3    8    1   -2   -1  -99   -2   -1   -4    1    0   -1   -2    0   -3    0   -1
    Q   -1   -1   -1   -6    1    7   -1   -4    3    0   -2    1   -4   -3   -4   -3   -5   -5   -6   -4
    E   -1   -3    0    1   -2   -1    5   -3   -6   -3   -3   -1    0   -7   -1   -3   -3   -3   -3   -3
    G   -3    0   -2   -2   -1   -4   -3    7   -4   -4   -2   -3   -3   -7   -4    1   -2    1   -5   -2
    H   -1    1   -1   -3  -99    3   -6   -4 

In [7]:
import sys

# Read the whitespace-separated sequences; the context manager closes the
# input file instead of leaving the handle to the garbage collector.
with open("alignment.dat") as alignment_file:
    alignment = alignment_file.read().split()

# Same pipeline as the plain matrix, but with one pseudocount per residue pair.
scores = get_scores(alignment, with_pseudocount=True)
matrix = scores2matrix(scores)
print_matrix(matrix, sys.stdout)

# Closing the output file explicitly guarantees the matrix is flushed to disk
# before any later cell or external tool reads score_matrix_laplace.txt.
with open("score_matrix_laplace.txt", "w") as matrix_file:
    print_matrix(matrix, matrix_file)

         A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V
    A    5   -2   -2   -2   -2   -1   -1   -3   -1   -2   -2   -1   -1   -4    0    1    0   -6   -6    0
    R   -2    6   -1   -4    0   -1   -3    0    1   -3   -2    1   -2   -3   -5   -3   -1   -4   -1   -3
    N   -2   -1    7    1   -3   -1    0   -2   -1   -1   -7   -2   -3   -4   -3    0    0   -5   -3   -1
    D   -2   -4    1    6   -3   -5    1   -2   -3   -2   -3   -3   -5   -3   -5   -4   -5   -7   -4   -2
    C   -2    0   -3   -3    8    1   -2   -1   -7   -1   -1   -3    1    1   -1   -1    0   -3    0   -1
    Q   -1   -1   -1   -5    1    7   -1   -4    3    0   -2    1   -3   -3   -4   -3   -4   -5   -5   -4
    E   -1   -3    0    1   -2   -1    5   -3   -5   -3   -3   -1    0   -6   -1   -3   -3   -3   -3   -3
    G   -3    0   -2   -2   -1   -4   -3    7   -3   -4   -2   -3   -3   -6   -4    1   -2    1   -4   -2
    H   -1    1   -1   -3   -7    3   -5   -3 

### Ex. 3 Global alignment

In [6]:
# Run the external script needle.py on ray.fasta and throat.fasta (global alignment exercise).
# NOTE(review): needle.py is not shown in this notebook; presumably -5 is the gap
# penalty and blosum62.txt the substitution matrix -- confirm against the script.
%run needle.py -5 blosum62.txt ray.fasta throat.fasta

Score= -16.0

--R-AYMQNDLVKVRYYACHT
  | |  |  | ::       
THRQATWQPPLERMANGRQVE



### Ex. 3 RNAS1 --- Three-way comparison

In [7]:
# RNAS1 comparison 1 of 3: horse vs. minke whale, via the external script needle.py.
# NOTE(review): presumably -8 is the gap penalty and blosum50.txt the substitution
# matrix -- confirm against needle.py, which is not shown here.
%run needle.py -8 blosum50.txt RNAS1_horse.fasta RNAS1_minke-whale.fasta

Score= 683.0

KESPAMKFERQHMDSGSTSSSNPTYCNQMMKRRNMTQGWCKPVNTFVHEPLADVQAICLQKNITCKNGQSNCYQSSSSMH
:|||||||:|||||||::  :|| |||||| || |||| |||||||||| | ||:|:| |||: ||||::|||:|:|:||
RESPAMKFQRQHMDSGNSPGNNPNYCNQMMMRRKMTQGRCKPVNTFVHESLEDVKAVCSQKNVLCKNGRTNCYESNSTMH

ITDCRLTSGSKYPNCAYQTSQKERHIIVACEGNPYVPVHFDASVEVST
||||| |  ||||||||:|||||:||||||||||||||||| |  |  
ITDCRQTGSSKYPNCAYKTSQKEKHIIVACEGNPYVPVHFDNS--V--



In [8]:
# RNAS1 comparison 2 of 3: horse vs. red kangaroo, via the external script needle.py.
# NOTE(review): presumably -8 is the gap penalty and blosum50.txt the substitution
# matrix -- confirm against needle.py, which is not shown here.
%run needle.py -8 blosum50.txt RNAS1_horse.fasta RNAS1_red-kangaroo.fasta

Score= 521.0

KESPAMKFERQHMDSGSTSSSNPTYCNQMMKRRNMTQGWCKPVNTFVHEPLADVQAICLQKNITCKNGQSNCYQSSSSMH
 |:|| ||:|||||:  :::|:  ||| ||| |:|| | |||:|||:||| : | |:| |:|:|||||::|||:|:| : 
-ETPAEKFQRQHMDTEHSTASSSNYCNLMMKARDMTSGRCKPLNTFIHEPKSVVDAVCHQENVTCKNGRTNCYKSNSRLS

ITDCRLTSGSKYPNCAYQTSQKERHIIVACEGNPYVPVHFDASVEVST
||:|| |  |||||| |:||   ::|||||||  ||||||||   |  
ITNCRQTGASKYPNCQYETSNLNKQIIVACEGQ-YVPVHFDA-Y-V--



In [9]:
# RNAS1 comparison 3 of 3: minke whale vs. red kangaroo, via the external script needle.py.
# NOTE(review): presumably -8 is the gap penalty and blosum50.txt the substitution
# matrix -- confirm against needle.py, which is not shown here.
%run needle.py -8 blosum50.txt RNAS1_minke-whale.fasta RNAS1_red-kangaroo.fasta

Score= 572.0

RESPAMKFQRQHMDSGNSPGNNPNYCNQMMMRRKMTQGRCKPVNTFVHESLEDVKAVCSQKNVLCKNGRTNCYESNSTMH
 |:|| ||||||||: :|  :: |||| ||  | || |||||:|||:||    | ||| |:|| |||||||||:||| : 
-ETPAEKFQRQHMDTEHSTASSSNYCNLMMKARDMTSGRCKPLNTFIHEPKSVVDAVCHQENVTCKNGRTNCYKSNSRLS

ITDCRQTGSSKYPNCAYKTSQKEKHIIVACEGNPYVPVHFDNSV
||:|||||:|||||| |:||   |:|||||||  |||||||  |
ITNCRQTGASKYPNCQYETSNLNKQIIVACEGQ-YVPVHFDAYV



### Ex. 3 Global vs. local alignment

In [10]:
# First half of the global-vs-local comparison: needle.py on halodurans.fasta and lentus.fasta.
# NOTE(review): presumably needle.py is the global aligner, -8 the gap penalty and
# blosum62.txt the substitution matrix -- confirm against the script.
%run needle.py -8 blosum62.txt halodurans.fasta lentus.fasta

Score= 187.0

MRQSLKVMVLSTVALLFMANPAAASEEKKEYLIVVEPEEVSAQSVEESYDVDVIHEFEEIPVIHAELTKKELKKLKKDPN
  |       | |       |         :             :  |                            :   
-AQ-------S-V-------P---------W------------GI--S----------------------------R---

VKAIEKNAEVTISQTVPWGISFINTQQAHNRGIFGNGARVAVLDTGIASHPDLRIAGGASFISSEPSYHDNNGHGTHVAG
|   :  |        |   :      |||||: |:| :||||||||::|||| | |||||:  |||  | |||||||||
V---Q--A--------P---A------AHNRGLTGSGVKVAVLDTGISTHPDLNIRGGASFVPGEPSTQDGNGHGTHVAG

TIAALNNSIGVLGVAPSADLYAVKVLDRNGSGSLASVAQGIEWAINNNMHIINMSLGSTSGSSTLELAVNRANNAGILLV
||||||||||||||||||:|||||||  :||||::|:|||:||| || ||: |:|||| | |:||| ||| | : |:|:|
TIAALNNSIGVLGVAPSAELYAVKVLGASGSGSVSSIAQGLEWAGNNGMHVANLSLGSPSPSATLEQAVNSATSRGVLVV

GAAGNTGRQGVNYPARYSGVMAVAAVDQNGQRASFSTYGPEIEISAPGVNVNSTYTGNRYVSLSGTSMATPHVAGVAALV
 |:||:|   ::|||||:  ||| | |||  ||||| ||  ::| |||||| ||| |: | ||:||||||||||| ||||
AASGNSGAGSISYPARYANAMAVGATDQNNNRASFSQYGAGLDIVAPGVNVQSTYPGSTYASLNGTSMATPHVAGAAALV

KSRYPSYTNN

In [11]:
# Second half of the global-vs-local comparison: same inputs and arguments, run through waterman.py.
# NOTE(review): presumably waterman.py is the local aligner -- confirm against the
# script, which is not shown in this notebook.
%run waterman.py -8 blosum62.txt halodurans.fasta lentus.fasta

Score= 916.0

SQTVPWGISFINTQQAHNRGIFGNGARVAVLDTGIASHPDLRIAGGASFISSEPSYHDNNGHGTHVAGTIAALNNSIGVL
:|:|||||| :    |||||: |:| :||||||||::|||| | |||||:  |||  | |||||||||||||||||||||
AQSVPWGISRVQAPAAHNRGLTGSGVKVAVLDTGISTHPDLNIRGGASFVPGEPSTQDGNGHGTHVAGTIAALNNSIGVL

GVAPSADLYAVKVLDRNGSGSLASVAQGIEWAINNNMHIINMSLGSTSGSSTLELAVNRANNAGILLVGAAGNTGRQGVN
||||||:|||||||  :||||::|:|||:||| || ||: |:|||| | |:||| ||| | : |:|:| |:||:|   ::
GVAPSAELYAVKVLGASGSGSVSSIAQGLEWAGNNGMHVANLSLGSPSPSATLEQAVNSATSRGVLVVAASGNSGAGSIS

YPARYSGVMAVAAVDQNGQRASFSTYGPEIEISAPGVNVNSTYTGNRYVSLSGTSMATPHVAGVAALVKSRYPSYTNNQI
|||||:  ||| | |||  ||||| ||  ::| |||||| ||| |: | ||:||||||||||| ||||| : ||::| ||
YPARYANAMAVGATDQNNNRASFSQYGAGLDIVAPGVNVQSTYPGSTYASLNGTSMATPHVAGAAALVKQKNPSWSNVQI

RQRINQTATYLGSPSLYGNGLVHAGRATQ
|  :  ||| ||| :|||:|||:|  ||:
RNHLKNTATSLGSTNLYGSGLVNAEAATR



### Ex. 4 Affine gaps

In [1]:
# Affine-gap exercise: run the external script affine.py on the two CHITH globin sequences.
# NOTE(review): presumably -8 is the gap-open and -2 the gap-extension penalty --
# confirm the argument order against affine.py, which is not shown here.
%run affine.py -8 -2 blosum62.txt GLB7A_CHITH.fasta GLBE_CHITH.fasta

Score= 358.0

MKFFAVLALCIVGAIASPLSADQAALVKSTWAQVRNSEVEILAAVFTAYPDIQARFPQFAGKDVASIKDTGAFATHAGRI
|||  :||||:  | || || ||  ||:||: :|:   | || ||| | | ||| |||| |||: :||    |:||||||
MKFI-ILALCV--AAASALSGDQIGLVQSTYGKVKGDSVGILYAVFKADPTIQAAFPQFVGKDLDAIKGGAEFSTHAGRI

VGFVSEIIALIGNESNAPAVQTLVGQLAASHKARGISQAQFNEFRAGLVSYVSSNVAWNAAAESAWTAGLDNIFGLLFAA
|||:  :|       : | :   |  | |:|| ||:: |||| |||  ::|:  :| : || |:|| |  |  || :|| 
VGFLGGVI------DDLPNIGKHVDALVATHKPRGVTHAQFNNFRAAFIAYLKGHVDYTAAVEAAWGATFDAFFGAVFAK

L
:
M

