In [None]:
#Information, Inference Networks: Tutorial 7.  S.C., R.M., F.Z.
#Perceptron algorithm for PDZ binding to peptides
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
import numpy.matlib
import numpy.linalg as LA
from numpy.linalg import inv

In [None]:
def letter2number(a):
    """Map a one-letter amino-acid code (or the gap symbol '-') to an integer in 0..20.

    The gap '-' maps to 0 and the letters ACDEFGHIKLMNPQRSTVWY map to 1..20 in that
    order. Any other key (lower-case letter, unknown symbol, ...) falls back to 0,
    i.e. it is indistinguishable from a gap.
    """
    alphabet = '-ACDEFGHIKLMNPQRSTVWY'
    # The position of each symbol in the alphabet string is its integer code.
    code_of = {symbol: code for code, symbol in enumerate(alphabet)}
    return code_of.get(a, 0)

def seq2number(a):
    """Return the list of integer codes obtained by applying letter2number to each element of `a`.

    `a` is accessed by index, so it must support len() and integer indexing
    (a string or a list of letters).
    """
    return [letter2number(a[pos]) for pos in range(len(a))]

In [None]:
# Load the interaction table; the first spreadsheet column supplies the row labels.
int_matrix = pd.read_excel('fp_interaction_matrix.xlsx', index_col=0)
# Row labels of the table as a NumPy array, and how many there are.
PDZ = np.array(int_matrix.index)
NPDZ = PDZ.shape[0]
print(NPDZ)

In [None]:
# ATTENTION: the peptides in peptides.free are not in the same order as in the interaction matrix
# Each non-blank line is split on whitespace into a list of fields; the rest of the
# notebook reads field 0 as the peptide name and field 1 as its sequence.
pep = []
with open('peptides.free') as f:
    for line in f:
        x = line.split()
        # Skip blank lines (e.g. a trailing newline): they would add an empty entry,
        # inflate Npep and make pep[i][0] raise IndexError below.
        if x:
            pep.append(x)
Npep=len(pep)
print(Npep)
# Check that every peptide name is a column of the interaction matrix;
# print the entries whose name is missing.
for entry in pep:
    if entry[0] not in int_matrix.columns:
        print(entry)

In [None]:
# Show the interaction table as the cell output for visual inspection.
int_matrix

In [None]:
# Check that the binding is read correctly
i=11   # position in PDZ (row labels of int_matrix)
j=6    # position in pep
# DataFrame.get_value was removed in pandas 1.0; .at is the label-based scalar accessor.
print('PDZ: ',PDZ[i],'\nPeptide: ',pep[j][0],'with sequence',pep[j][1],
      '\nInteraction:',int_matrix.at[PDZ[i],pep[j][0]])

In [None]:
# Build the integer matrix of peptide sequences: one row per entry of pep,
# obtained by encoding its sequence field (index 1) with seq2number.
pep_seq = np.asarray([seq2number(entry[1]) for entry in pep])
print(pep_seq.shape)
# Number of columns of pep_seq, i.e. the number of encoded positions per peptide.
Nbase = pep_seq.shape[1]

In [None]:
#Expand pep_seq into a binary (Npep, Nbase*(q-1)+1) array X by one-hot encoding
#gauge: the last a.a. symbol (code q) gets no column, so each position uses q-1 columns
#the last column is all ones, to have a constant term in dot(X,J)
q=20
#X=-np.ones((Npep,Nbase*(q-1)+1))    ### USE {-1,1} CONVENTION FOR INPUT
X=np.zeros((Npep,Nbase*(q-1)+1))   ### USE {0,1} CONVENTION FOR INPUT
for m in range(Npep):
    X[m,Nbase*(q-1)]=1
    for i in range(Nbase):
        aa=pep_seq[m,i]
        # Only codes 1..q-1 own a column. Code q is the gauge reference. Code 0
        # (gap or unknown letter) must be skipped as well: i*(q-1)+0-1 would
        # otherwise address the last column of the previous position, or wrap
        # around to the constant column when i==0. Such positions stay all-zero.
        if 0<aa<q:
            X[m,i*(q-1)+aa-1]=1
print(np.shape(X))

In [None]:
#Get for a given PDZ the label vector Y
def getY(iPDZ):
    """Return the +/-1 label vector over all peptides for one row of int_matrix.

    iPDZ: integer position into PDZ selecting the row.
    Returns a float array of length Npep, ordered like pep, with +1 where the
    table entry Kd satisfies 0 < Kd < 100000 and -1 everywhere else (NaN entries
    fail both comparisons and therefore stay at -1).
    NOTE(review): the 100000 cutoff presumably marks "no measurable binding";
    its unit is not visible here - confirm against the data source.
    """
    Y = -np.ones(Npep)
    for j in range(Npep):
        # DataFrame.get_value was removed in pandas 1.0; .at is the label-based scalar accessor.
        Kd=int_matrix.at[PDZ[iPDZ],pep[j][0]]
        if (Kd>0 and Kd<100000):
            Y[j]=1
    return Y

In [None]:
# Count, for each PDZ index, how many entries of its label vector are positive.
binding = [sum(getY(k) > 0) for k in range(NPDZ)]
print(binding)