In [2]:
import numpy as np
import tensorflow as tf

from tensorflow import keras
from scipy.io import loadmat, savemat

from tensorflow.keras import layers

from sklearn.model_selection import KFold
from sklearn.metrics import label_ranking_average_precision_score as avgprec, coverage_error, label_ranking_loss

In [2]:
# Turn on memory growth for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # The setting has to be identical across all GPUs, so apply it to each one.
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Memory growth can only be configured before the GPUs are initialised.
    print(err)

In [3]:
Y_4802 = loadmat('Y_4802.mat')['Y_4802']

In [4]:
X_4802_feature = loadmat('feature_4802.mat')

In [5]:
X_4802_feature.keys()

dict_keys(['__header__', '__version__', '__globals__', 'ppab', 'ppdwt', 'pppse', 'pssmab', 'pssmdwt', 'pssmpse', 'label'])

In [6]:
FEATURE_NAMES = ['ppab', 'ppdwt', 'pppse', 'pssmab', 'pssmdwt', 'pssmpse']


def _conv_branch(branch_input):
    """Apply the conv stack used by every feature branch and return the flattened result."""
    h = layers.Conv2D(256, (4, 3), activation='relu')(branch_input)
    h = layers.Conv2D(128, 3, activation='relu')(h)
    h = layers.MaxPooling2D()(h)
    h = layers.Conv2D(64, 3, activation='relu')(h)
    h = layers.Conv2D(32, 3, activation='relu')(h)
    return layers.Flatten()(h)


# One sub-model per feature array; `input_list` keeps the arrays in the same
# order as the model inputs so it can be passed straight to `fit`.
model_list = []
input_list = []
for fname in FEATURE_NAMES:
    fdata = X_4802_feature[fname]
    input_list.append(fdata)
    # The leading axis is the sample axis, so it is dropped from the input shape.
    branch_in = keras.Input(shape=fdata.shape[1:], dtype="float32")
    model_list.append(keras.Model(branch_in, _conv_branch(branch_in)))

# Join all flattened branches and predict the 37 sigmoid outputs.
merged = layers.concatenate([m.output for m in model_list])
predictions = layers.Dense(37, activation='sigmoid')(merged)
model = keras.Model(inputs=[m.input for m in model_list], outputs=predictions)
model.summary()

Model: "functional_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 38, 30, 2)]  0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 57, 52, 2)]  0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 31, 57, 2)]  0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 20, 20, 2)]  0                                            
______________________________________________________________________________________

In [7]:
# Train the multi-input model; `input_list` supplies one array per model input.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["binary_accuracy"])
model.fit(x=input_list, y=Y_4802, batch_size=8, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fae2834a1c0>

In [5]:
pssmpse = X_4802_feature['pssmpse']

In [6]:
# Single-input CNN over the `pssmpse` feature only.
input_shape = pssmpse.shape
pssm = keras.Input(shape=input_shape[1:], dtype="float32")

# Layers are instantiated in the order they are applied.
layer_stack = [
    layers.Conv2D(256, (4, 3), activation='relu'),
    layers.Conv2D(128, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation='relu'),
    layers.Conv2D(32, 3, activation='relu'),
    layers.Flatten(),
    layers.Dense(37, activation='sigmoid'),
]
outputs = pssm
for layer in layer_stack:
    outputs = layer(outputs)

model = keras.Model(pssm, outputs)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 31, 20, 2)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 28, 18, 256)       6400      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 26, 16, 128)       295040    
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 8, 128)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 6, 64)         73792     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 9, 4, 32)          18464     
_________________________________________________________________
flatten (Flatten)            (None, 1152)             

In [7]:
# 5-fold cross-validation of the single-input `pssmpse` model.
# Collects average precision, ranking loss and coverage for every fold.
kf = KFold(5)
ap_list = []
rl_list = []
ce_list = []

for train_index, test_index in kf.split(pssmpse):
    train_x, train_y = pssmpse[train_index], Y_4802[train_index]
    test_x, test_y = pssmpse[test_index], Y_4802[test_index]

    # compile() does not reset weights. Re-using `model` directly would let
    # every fold after the first start from a network that has already been
    # trained on that fold's test samples, inflating the scores.
    # clone_model() rebuilds the same architecture with freshly initialised weights.
    fold_model = keras.models.clone_model(model)
    fold_model.compile("adam", "binary_crossentropy", metrics=["binary_accuracy"])
    fold_model.fit(train_x, train_y, batch_size=8, epochs=50)

    pred_y = fold_model.predict(test_x)
    ap_list.append(avgprec(test_y, pred_y))
    rl_list.append(label_ranking_loss(test_y, pred_y))
    # NOTE(review): the "- 1" presumably converts sklearn's coverage_error to a
    # zero-based coverage value - confirm against the metric definition in use.
    ce_list.append(coverage_error(test_y, pred_y) - 1)

print('the ap score is:', ap_list)
print('average is:', sum(ap_list)/len(ap_list))

print('the rl score is:', rl_list)
print('average is:', sum(rl_list)/len(rl_list))

print('the ce score is:', ce_list)
print('average is:', sum(ce_list)/len(ce_list))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [42]:
tmp = np.array([1,2,3,4,5,6])

In [43]:
tmp = tmp.reshape((2,3))

In [45]:
str(tmp)

'[[1 2 3]\n [4 5 6]]'

In [35]:
'abc {}'.format(np.array([1,2,3]))

'abc [1 2 3]'

In [41]:
tmp[[0,1]][:, [0,1]]

array([[1, 2],
       [4, 5]])

In [38]:
tmp 

array([[1, 2, 3],
       [4, 5, 6]])

In [26]:
tmp.flatten()

array([1, 4])

In [28]:
tmp

array([1, 2, 3, 4, 5, 6])

In [32]:
np.expand_dims(tmp, axis = 1).shape

(6, 1)

In [46]:
from scipy.io import loadmat, savemat

In [47]:
sequence = loadmat('dataset_4802.mat')['Sequence']

In [62]:
# Number of sequences with fewer than 100 characters.
count = sum(1 for entry in sequence if len(entry[0][0]) < 100)

In [63]:
count

164

In [64]:
sequence[0][0][0]

'MDEQEALNSIMNDLVALQMNRRHRMPGYETMKNKDTGHSNRQKKHNSSSSALLNSPTVTTSSCAGASEKKKFLSDVRIKFEHNGERRIIAFSRPVKYEDVEHKVTTVFGQPLDLHYMNNELSILLKNQDDLDKAIDILDRSSSMKSLRILLLSQDRNHNSSSPHSGVSRQVRIKASQSAGDINTIYQPPEPRSRHLSVSSQNPGRSSPPPGYVPERQQHIARQGSYTSINSEGEFIPETSEQCMLDPLSSAENSLSGSCQSLDRSADSPSFRKSRMSRAQSFPDNRQEYSDRETQLYDKGVKGGTYPRRYHVSVHHKDYSDGRRTFPRIRRHQGNLFTLVPSSRSLSTNGENMGLAVQYLDPRGRLRSADSENALSVQERNVPTKSPSAPINWRRGKLLGQGAFGRVYLCYDVDTGRELASKQVQFDPDSPETSKEVSALECEIQLLKNLQHERIVQYYGCLRDRAEKTLTIFMEYMPGGSVKDQLKAYGALTESVTRKYTRQILEGMSYLHSNMIVHRDIKGANILRDSAGNVKLGDFGASKRLQTICMSGTGMRSVTGTPYWMSPEVISGEGYGRKADVWSLGCTVVEMLTEKPPWAEYEAMAAIFKIATQPTNPQLPSHISEHGRDFLRRIFVEARQRPSAEELLTHHFAQLMY'

In [65]:
Y_4802 = loadmat('Y_4802.mat')['Y_4802']

In [72]:
Y_4802

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [67]:
sequence = loadmat('dataset_4802.mat')['Sequence']

In [152]:
# Cut every sequence into pieces of at most 100 characters, one-hot encode each
# piece, and repeat the sequence's label row once per piece.
uniform_one_encode = []
uniform_label = []
for entry, label_row in zip(sequence, Y_4802):
    seq = entry[0][0]
    pieces = p_split(seq, 100) if len(seq) > 100 else [seq]
    uniform_one_encode.extend(preprocess(piece, 100) for piece in pieces)
    uniform_label.extend([label_row] * len(pieces))

In [153]:
np.array(uniform_one_encode).shape

(55950, 100, 22)

In [154]:
np.array(uniform_label).shape

(55950, 37)

In [None]:
uniform_one_encode

In [133]:
def p_split(p_seq, l):
    """Split a sequence into overlapping windows of length ``l``.

    Windows start every ``l // 2`` positions (at least 1). The final window is
    always the last ``l`` elements of the sequence, so the tail is covered
    without emitting a window shorter than ``l``.

    Args:
        p_seq: Sliceable sequence (e.g. a protein string).
        l: Window length; must be >= 1.

    Returns:
        List of windows in left-to-right order. A sequence no longer than ``l``
        is returned as a single, unsplit piece; an empty sequence yields ``[]``.

    Raises:
        ValueError: If ``l`` is smaller than 1.
    """
    if l < 1:
        raise ValueError("window length l must be >= 1, got {}".format(l))

    n = len(p_seq)
    if n == 0:
        return []
    if n <= l:
        # Previously sequences of length <= l // 2 were silently dropped.
        return [p_seq]

    # l // 2 is 0 for l == 1, which used to make the loop spin forever.
    stride = max(1, l // 2)

    result = []
    start = 0
    while start + l < n:
        result.append(p_seq[start:start + l])
        start += stride
    # The remaining tail is covered by one window aligned to the end. Stopping
    # here also avoids the duplicated last window the old loop produced for odd l.
    result.append(p_seq[n - l:])
    return result

In [75]:
np.arange(2, 10)

array([2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# One-hot encode every sequence that fits into a single 100-row canvas,
# centred and zero-padded on both sides.
# Fixes versus the earlier draft of this cell: `seq_num.size` is already an int
# (no len()), the canvas has 100 rows rather than one row per residue, and the
# result is appended to `uniform_one_encode` (the list created here).
uniform_one_encode = []
for entry in sequence:
    p_seq = entry[0][0]
    if len(p_seq) > 100:
        # Does not fit in one canvas; a negative start offset would wrap around.
        # Long sequences are handled by the p_split + preprocess cell instead.
        continue
    seq_num = np.array([amino_code[x] for x in p_seq], dtype=int)
    tmp = np.zeros((100, 22))
    start = (100 - seq_num.size) // 2
    tmp[np.arange(start, start + seq_num.size), seq_num] = 1
    uniform_one_encode.append(tmp)

In [89]:
def preprocess(p_seq, l, code=None):
    """One-hot encode a sequence, centred inside an ``l`` x 22 zero matrix.

    Args:
        p_seq: Iterable of symbols (e.g. a protein string) of length <= ``l``.
        l: Number of rows of the output matrix.
        code: Optional mapping from symbol to column index (0..21). Defaults to
            the notebook-level ``amino_code`` table.

    Returns:
        ``numpy.ndarray`` of shape ``(l, 22)`` with exactly one 1 per encoded
        symbol; rows before and after the sequence stay all-zero.

    Raises:
        ValueError: If the sequence is longer than ``l``.
        KeyError: If a symbol is missing from the code table.
    """
    if code is None:
        code = amino_code

    # dtype=int keeps the index array valid even for an empty sequence.
    seq_num = np.array([code[x] for x in p_seq], dtype=int)
    if seq_num.size > l:
        # A negative start offset would silently wrap to the end of the matrix.
        raise ValueError(
            "sequence of length {} does not fit into {} rows".format(seq_num.size, l)
        )

    tmp = np.zeros((l, 22))
    # Centre on the requested length `l` (this used to be hard-coded to 100).
    start = (l - seq_num.size) // 2
    tmp[np.arange(start, start + seq_num.size), seq_num] = 1
    return tmp

In [78]:
# Column index of each one-letter sequence symbol in the one-hot encoding
# (22 symbols, indices 0-21). Symbols missing from this table raise KeyError
# wherever the table is indexed.
amino_code = {'A':0, 'C':1, 'D':2, 'E':3, 'F':4, 'G':5, 'H':6,
             'I':7, 'K':8, 'L':9, 'M':10, 'N':11, 'P':12, 
             'Q':13, 'R':14, 'S':15, 'T':16, 'U':17, 'V':18,
             'W':19, 'X':20, 'Y':21 }

In [176]:
import numpy as np
from sklearn.model_selection import KFold
# Toy example showing which indices KFold assigns to train/test in each split.
# NOTE: this rebinds the notebook-level name `kf`.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2, shuffle = False)
# Return value is discarded here (not the last expression of the cell).
kf.get_n_splits(X)

print(kf)

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Index arrays select the corresponding rows of X and entries of y.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

KFold(n_splits=2, random_state=None, shuffle=False)
TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]


In [25]:
oridata = loadmat('dataset_4802.mat')

In [26]:
oridata.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Header', 'Sequence', 'Y_4802', 'Y_4802_label'])

In [29]:
oridata['Sequence']

array([[array(['MDEQEALNSIMNDLVALQMNRRHRMPGYETMKNKDTGHSNRQKKHNSSSSALLNSPTVTTSSCAGASEKKKFLSDVRIKFEHNGERRIIAFSRPVKYEDVEHKVTTVFGQPLDLHYMNNELSILLKNQDDLDKAIDILDRSSSMKSLRILLLSQDRNHNSSSPHSGVSRQVRIKASQSAGDINTIYQPPEPRSRHLSVSSQNPGRSSPPPGYVPERQQHIARQGSYTSINSEGEFIPETSEQCMLDPLSSAENSLSGSCQSLDRSADSPSFRKSRMSRAQSFPDNRQEYSDRETQLYDKGVKGGTYPRRYHVSVHHKDYSDGRRTFPRIRRHQGNLFTLVPSSRSLSTNGENMGLAVQYLDPRGRLRSADSENALSVQERNVPTKSPSAPINWRRGKLLGQGAFGRVYLCYDVDTGRELASKQVQFDPDSPETSKEVSALECEIQLLKNLQHERIVQYYGCLRDRAEKTLTIFMEYMPGGSVKDQLKAYGALTESVTRKYTRQILEGMSYLHSNMIVHRDIKGANILRDSAGNVKLGDFGASKRLQTICMSGTGMRSVTGTPYWMSPEVISGEGYGRKADVWSLGCTVVEMLTEKPPWAEYEAMAAIFKIATQPTNPQLPSHISEHGRDFLRRIFVEARQRPSAEELLTHHFAQLMY'],
      dtype='<U657')],
       [array(['MDTEGFGELLQQAEQLAAETEGISELPHVERNLQEIQQAGERLRSRTLTRTSQETADVKASVLLGSRGLDISHISQRLESLSAATTFEPLEPVKDTDIQGFLKNEKDNALLSAIEESRKRTFGMAEEYHRESMLVEWEQVKQRILHTLLASGEDALDFTQESEPSYISDVGPPGRSSLDNIEMAYARQIYIYNEKIVNGHLQPNLVDLCASVAELDDKSISDMWTMVKQMTDVLLTPATDALKNRSSVEVRMEFVRQALAYLEQSYKNYTLVTVFGNLHQAQLG

In [6]:
X_4802_feature = loadmat('feature_4802.mat')

In [12]:
X_4802_feature.keys()

dict_keys(['__header__', '__version__', '__globals__', 'ppab', 'ppdwt', 'pppse', 'pssmab', 'pssmdwt', 'pssmpse', 'label'])

In [18]:
np.amax(X_4802_feature['pppse'])

1.0

In [19]:
np.amax(X_4802_feature['ppdwt'])

12510.368390131456

In [20]:
# Min-max scale the whole `ppdwt` array using its global minimum and maximum.
ppdwt_raw = X_4802_feature['ppdwt']
lo, hi = np.amin(ppdwt_raw), np.amax(ppdwt_raw)
tmp = (ppdwt_raw - lo) / (hi - lo)

In [24]:
np.amax(tmp * 250)

250.0

In [86]:
d4 = loadmat('dataset_3106.mat')

In [87]:
d4.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y_3106', 'label_name', 'protein_list', 'sequence_3106'])

In [89]:
d4['sequence_3106']

array([[array(['MFRRKLTALDYHNPAGFNCKDETEFRNFIVWLEDQKIRHYKIEDRGNLRNIHSSDWPKFFEKYLRDVNCPFKIQDRQEAIDWLLGLAVRLEYGDNAEKYKDLVPDNSKTADNATKNAEPLINLDVNNPDFKAGVMALANLLQIQRHDDYLVMLKAIRILVQERLTQDAVAKANQTKEGLPVALDKHILGFDTGDAVLNEAAQILRLLHIEELRELQTKINEAIVAVQAIIADPKTDHRLGKVGR'],
      dtype='<U244')],
       [array(['MIDSVKLRRDSAADFFSHYEYLCALQNSVPLPAVRACLREGVLDFNADRLRGVDWAPLLSTLKINKDLPLVSIKSFFQPWLGDTGSDMNKFCRSRVPAIRYKDVTFQLCKALKGCLSISSVLKNLELNGLILRERDLTILAKGLNKSASLVHLSLANCPIGDGGLEIICQGIKSSITLKTVNFTGCNLTWQGADHMAKILKYQTMRRHEETWAESLRYRRPDLDCMAGLRRITLNCNTLIGDLGACAFADSLSEDLWLRALDLQQCGLTNEGAKALLEALETNTTLVVLDIRKNPLIDHSMMKAVIKKVLQNGRSAKSEYQWITSPSVKEPSKTAKQKRRTIILGSGHKGKATIRIGLATKKPVSSGRKHSLGKEYYAPAPLPPGVSGFLPWRTAERAKRHRGFPLIKTRDICNQLQQPGFPVTVTVESPSSSEVEEVDDSSESVHEVPEKTSIEQEALQEKLEECLKQLKEERVIRLKVDKRVSELEHENAQLRNINFSLSEALHAQSLTNMILDDEGVLGSIENSFQKFHAFLDLLKDAGLGQLATMAGIDQSDFQLLGHPQMTSTVSNPPKEEKKALEDEKPEPKQNALGQMQNIQFQKITGDARIPLPLDSFPVPVSTPEGLGTSSNNLGVPATEQRQESFEGFIARMCSPSPDATSGTGSQRKEEELSRNSRSSSEKKTKTESH'],
    

In [34]:
# `d4` is loaded from dataset_3106.mat, which has no 'Y_4802_label' key;
# `oridata` (dataset_4802.mat) is the mapping that actually contains it.
oridata['Y_4802_label'].shape

(37, 1)

In [75]:
# Read the label names from `oridata` (dataset_4802.mat); `d4` holds
# dataset_3106.mat, whose keys do not include 'Y_4802_label'.
label = oridata['Y_4802_label']

In [76]:
for l in label:
    print(l)

[array(['Apical Plasma Membrane'], dtype='<U22')]
[array(['Basolateral Plasma Membrane'], dtype='<U27')]
[array(['Cellular Component Unknown'], dtype='<U26')]
[array(['Centrosome'], dtype='<U10')]
[array(['Cytoplasm'], dtype='<U9')]
[array(['Cytoplasmic Vesicles'], dtype='<U20')]
[array(['Cytoskeleton'], dtype='<U12')]
[array(['ERGIC'], dtype='<U5')]
[array(['Early Endosomes'], dtype='<U15')]
[array(['Endoplasmic Reticulum'], dtype='<U21')]
[array(['Endosomes'], dtype='<U9')]
[array(['Extracellular'], dtype='<U13')]
[array(['Golgi Apparatus'], dtype='<U15')]
[array(['Golgi Cis Cisterna'], dtype='<U18')]
[array(['Golgi Trans Cisterna'], dtype='<U20')]
[array(['Golgi Trans Face'], dtype='<U16')]
[array(['Inner Mitochondrial Membrane'], dtype='<U28')]
[array(['Late Endosomes'], dtype='<U14')]
[array(['Lipid Particles'], dtype='<U15')]
[array(['Lysosomes'], dtype='<U9')]
[array(['Medial-Golgi'], dtype='<U12')]
[array(['Melanosome'], dtype='<U10')]
[array(['Microtubule'], dtype='<U11')]
[ar

In [38]:
# Read the 4802 label matrix from `oridata` (dataset_4802.mat); `d4` holds
# dataset_3106.mat, whose keys do not include 'Y_4802'.
l4 = oridata['Y_4802']

In [51]:
l4[[1,2]] 

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]], dtype=uint8)

0

In [62]:
# Upper-triangular label co-occurrence counts for the 37 labels:
# res[i, j] (i < j) is the number of rows where columns i and j are both 1.
res = np.zeros((37, 37))
for i in range(37):
    has_i = l4[:, i] == 1
    for j in range(i + 1, 37):
        res[i, j] = np.count_nonzero(has_i & (l4[:, j] == 1))

In [74]:
# Print the matrix one row per line as space-separated integers.
for row in res:
    print(*(int(v) for v in row))

0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 28 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0
0 0 0 0 23 0 6 0 0 1 1 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 12 0 0 0 0 0 0 0 0 0
0 0 0 0 0 3 16 0 1 1 17 8 18 0 0 0 0 0 0 1 0 0 2 0 24 0 0 401 0 7 41 0 0 0 2 0 0
0 0 0 0 0 0 0 0 1 0 0 1 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 6 0 3 0 0 1 1
0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 1 1 0 0 0 9 0 0 11 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 2 3 20 2 0 0 0 0 1 1 0 0 0 0 0 2 0 2 0 0 10 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 2 28 0 0 0 0 8 0 10 0 1 11 0 6 0 0 13 0 1 56 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 12 0 0 0 0 0 0 7 0 0 0 0 1 0 0 3 0 0 31 0 4 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 5 3 11 0 0 0 2 7 1 0 0 0 0 0 5 0 0 29 0 1 0 1 0 0
0 0 0 0 0 0 0

In [77]:
d3 = loadmat('dataset_3106.mat')

In [78]:
d3.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y_3106', 'label_name', 'protein_list', 'sequence_3106'])

In [79]:
d3['label_name']

array([[array(['Centrosome'], dtype='<U10')],
       [array(['Cytoplasm'], dtype='<U9')],
       [array(['Cytoskeleton'], dtype='<U12')],
       [array(['Endosome'], dtype='<U8')],
       [array(['Endoplasmic-Reticulum'], dtype='<U21')],
       [array(['Extracellular'], dtype='<U13')],
       [array(['Golgi-Apparatus'], dtype='<U15')],
       [array(['Lysosome'], dtype='<U8')],
       [array(['Microsome'], dtype='<U9')],
       [array(['Mitochondrion'], dtype='<U13')],
       [array(['Nucleus'], dtype='<U7')],
       [array(['Peroxisome'], dtype='<U10')],
       [array(['Plasma-Membrane'], dtype='<U15')],
       [array(['Synapse'], dtype='<U7')]], dtype=object)

In [80]:
l3 = d3['Y_3106']

In [81]:
# Upper-triangular label co-occurrence counts for the 14 labels:
# res[i, j] (i < j) is the number of rows where columns i and j are both 1.
res = np.zeros((14, 14))
for i in range(14):
    has_i = l3[:, i] == 1
    for j in range(i + 1, 14):
        res[i, j] = np.count_nonzero(has_i & (l3[:, j] == 1))

In [83]:
# Print the matrix one row per line as space-separated integers.
for row in res:
    print(*(int(v) for v in row))

0 15 7 1 1 0 2 0 0 1 12 0 0 0
0 0 12 2 13 8 12 2 1 24 263 8 24 2
0 0 0 0 0 0 3 0 0 0 7 0 9 0
0 0 0 0 2 0 4 3 0 0 1 0 1 0
0 0 0 0 0 3 28 2 11 7 14 1 10 0
0 0 0 0 0 0 7 5 0 1 3 0 16 1
0 0 0 0 0 0 0 1 0 0 4 0 20 1
0 0 0 0 0 0 0 0 0 0 2 0 4 0
0 0 0 0 0 0 0 0 0 2 1 0 1 0
0 0 0 0 0 0 0 0 0 0 15 4 1 0
0 0 0 0 0 0 0 0 0 0 0 0 11 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 10
0 0 0 0 0 0 0 0 0 0 0 0 0 0


In [104]:
from Bio import SeqIO
fafile = SeqIO.parse('pssm/dataset_3106/ff/1.fasta', 'fasta')

In [105]:
t = list(fafile)

In [106]:
str(t[0].seq)

'MIDSVKLRRDSAADFFSHYEYLCALQNSVPLPAVRACLREGVLDFNADRLRGVDWAPLLSTLKINKDLPLVSIKSFFQPWLGDTGSDMNKFCRSRVPAIRYKDVTFQLCKALKGCLSISSVLKNLELNGLILRERDLTILAKGLNKSASLVHLSLANCPIGDGGLEIICQGIKSSITLKTVNFTGCNLTWQGADHMAKILKYQTMRRHEETWAESLRYRRPDLDCMAGLRRITLNCNTLIGDLGACAFADSLSEDLWLRALDLQQCGLTNEGAKALLEALETNTTLVVLDIRKNPLIDHSMMKAVIKKVLQNGRSAKSEYQWITSPSVKEPSKTAKQKRRTIILGSGHKGKATIRIGLATKKPVSSGRKHSLGKEYYAPAPLPPGVSGFLPWRTAERAKRHRGFPLIKTRDICNQLQQPGFPVTVTVESPSSSEVEEVDDSSESVHEVPEKTSIEQEALQEKLEECLKQLKEERVIRLKVDKRVSELEHENAQLRNINFSLSEALHAQSLTNMILDDEGVLGSIENSFQKFHAFLDLLKDAGLGQLATMAGIDQSDFQLLGHPQMTSTVSNPPKEEKKALEDEKPEPKQNALGQMQNIQFQKITGDARIPLPLDSFPVPVSTPEGLGTSSNNLGVPATEQRQESFEGFIARMCSPSPDATSGTGSQRKEEELSRNSRSSSEKKTKTESH'

In [103]:
res = [str(t[0].seq)]

In [107]:
res.append(str(t[0].seq))

In [108]:
res

['MFRRKLTALDYHNPAGFNCKDETEFRNFIVWLEDQKIRHYKIEDRGNLRNIHSSDWPKFFEKYLRDVNCPFKIQDRQEAIDWLLGLAVRLEYGDNAEKYKDLVPDNSKTADNATKNAEPLINLDVNNPDFKAGVMALANLLQIQRHDDYLVMLKAIRILVQERLTQDAVAKANQTKEGLPVALDKHILGFDTGDAVLNEAAQILRLLHIEELRELQTKINEAIVAVQAIIADPKTDHRLGKVGR',
 'MIDSVKLRRDSAADFFSHYEYLCALQNSVPLPAVRACLREGVLDFNADRLRGVDWAPLLSTLKINKDLPLVSIKSFFQPWLGDTGSDMNKFCRSRVPAIRYKDVTFQLCKALKGCLSISSVLKNLELNGLILRERDLTILAKGLNKSASLVHLSLANCPIGDGGLEIICQGIKSSITLKTVNFTGCNLTWQGADHMAKILKYQTMRRHEETWAESLRYRRPDLDCMAGLRRITLNCNTLIGDLGACAFADSLSEDLWLRALDLQQCGLTNEGAKALLEALETNTTLVVLDIRKNPLIDHSMMKAVIKKVLQNGRSAKSEYQWITSPSVKEPSKTAKQKRRTIILGSGHKGKATIRIGLATKKPVSSGRKHSLGKEYYAPAPLPPGVSGFLPWRTAERAKRHRGFPLIKTRDICNQLQQPGFPVTVTVESPSSSEVEEVDDSSESVHEVPEKTSIEQEALQEKLEECLKQLKEERVIRLKVDKRVSELEHENAQLRNINFSLSEALHAQSLTNMILDDEGVLGSIENSFQKFHAFLDLLKDAGLGQLATMAGIDQSDFQLLGHPQMTSTVSNPPKEEKKALEDEKPEPKQNALGQMQNIQFQKITGDARIPLPLDSFPVPVSTPEGLGTSSNNLGVPATEQRQESFEGFIARMCSPSPDATSGTGSQRKEEELSRNSRSSSEKKTKTESH']

In [110]:
np.array(res)[0]

'MFRRKLTALDYHNPAGFNCKDETEFRNFIVWLEDQKIRHYKIEDRGNLRNIHSSDWPKFFEKYLRDVNCPFKIQDRQEAIDWLLGLAVRLEYGDNAEKYKDLVPDNSKTADNATKNAEPLINLDVNNPDFKAGVMALANLLQIQRHDDYLVMLKAIRILVQERLTQDAVAKANQTKEGLPVALDKHILGFDTGDAVLNEAAQILRLLHIEELRELQTKINEAIVAVQAIIADPKTDHRLGKVGR'

In [148]:
# Collect the first record's sequence from each of the files 0.fasta .. 4801.fasta.
fasta_paths = ['pssm/dataset_4802/ff/' + str(i) + '.fasta' for i in range(4802)]
res = [str(list(SeqIO.parse(path, 'fasta'))[0].seq) for path in fasta_paths]

In [149]:
res = np.array(res)

In [150]:
# For each protein, read its .pssm file, drop the first two whitespace-separated
# tokens of every line, skip the first three lines and keep one row per
# sequence position.
# NOTE(review): tokens are kept as strings, so each matrix has a string dtype -
# confirm whether a numeric conversion is wanted before saving.
pssm_res = []
for i in range(4802):
    with open('pssm/dataset_4802/pssm/' + str(i) + '.pssm', 'r') as pfile:
        rows = [line.strip().split()[2:] for line in pfile]
    n_positions = len(res[i])
    pssm_res.append(np.array(rows[3:3 + n_positions]))

In [151]:
# The per-protein matrices have different row counts, so they cannot form a
# regular ndarray. np.array() on such ragged input is deprecated and raises
# ValueError on recent NumPy; fill a 1-D object array element by element instead.
pssm_ragged = np.empty(len(pssm_res), dtype=object)
for idx, matrix in enumerate(pssm_res):
    pssm_ragged[idx] = matrix
pssm_res = pssm_ragged

In [152]:
pssm_res.shape

(4802,)

In [153]:
pssm_res[0].shape

(657, 42)

In [154]:
pssm_res[3105].shape[0]

468

In [155]:
from scipy.io import savemat

In [156]:
# Write the sequences and the per-protein PSSM arrays to one .mat file
# under the keys 'sequence' and 'pssm'.
savemat('dataset_4802_pssm.mat', {'sequence':res, 'pssm':pssm_res})