# Data aggregation

### Load data

In [24]:
import pandas as pd
import numpy as np
import pickle
from tqdm import trange
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

print('Loading train_data and contact matrix')

def load_data(data_name):
    pkl_file = open(data_name + '.pkl', 'rb')
    data = pickle.load(pkl_file)
    pkl_file.close()
    return data

#  Load data and target
data_train = load_data('train_data')
two_matrix = load_data('two_matrix_200')
num_classes = 2
print('Loaded successfully')

data_features = pd.read_csv('pdb_and_features.csv')
data_features = data_features.reset_index(drop=True)
print(len(data_train))
print(len(two_matrix))
print(len(data_features))

protein_len = []
pdb_200 = []

low_border = 15
high_border = 30

s = 0
for i in range(len(two_matrix)):
    if (np.shape(two_matrix[i][2])[0] <= high_border) and (np.shape(two_matrix[i][2])[0] >= low_border):
        s +=1

print('Amount of proteins, which we have after bounding: ', s)

Loading train_data and contact matrix
Loaded successfully
8765
6905
6813
Amount of proteins, which we have after bounding:  403


In [15]:
#  Apply zero-padding to make same size for all train and target samples

target = []
mm = high_border

for i in range(len(two_matrix)):
    
    if (high_border >= np.shape(two_matrix[i][2])[0]) and (np.shape(two_matrix[i][2])[0] >= low_border):
        
        f = np.zeros((mm-np.shape(two_matrix[i][2])[0], np.shape(two_matrix[i][2])[0]))
        f1 = np.zeros((mm, mm-np.shape(two_matrix[i][2])[0]))
        bot = np.concatenate((two_matrix[i][2], f), axis=0)
        target.append([two_matrix[i][0], np.concatenate((bot, f1), axis=1), len(two_matrix[i][2])])
        

train = []
mm = high_border

for i in range(len(data_train)):
    
    if (high_border >= np.shape(data_train[i][1])[0]) and (len(data_features.FASTA[i]) <= high_border) and \
            (len(data_features.FASTA[i]) >= low_border):
        
        f = np.zeros((mm-np.shape(data_train[i][1])[0], np.shape(data_train[i][1])[1]))
        train.append([data_features.pdb_name[i], 
                      np.concatenate((data_train[i][1], f), axis=0), 
                      len(data_features.FASTA[i]),
                      data_features.FASTA[i]])
        
        
        
    elif (len(data_features.FASTA[i]) <= high_border) and (len(data_features.FASTA[i]) >= low_border):
        
        train.append([data_features.pdb_name[i], 
                      data_train[i][1][:mm], 
                      len(data_features.FASTA[i]), 
                      data_features.FASTA[i]])

KeyError: 6813

### train - target

In [3]:
real_train = []
real_target = []

for i in range(len(target)):
    
    for j in range(len(train)):
        
        if train[j][0] == target[i][0]:
            real_train.append(train[j])
            real_target.append(target[i])
            break

In [4]:
np.shape(real_train), np.shape(real_target)

((474, 4), (474, 3))

In [5]:
real_train[2][2], real_target[2][2]

(32, 32)

In [25]:
real_train = load_data('righttrain')
real_target = load_data('righttarget')

### train_test_split

In [26]:
train, train_val_test, train_label, target_val_test = train_test_split(real_train, real_target, test_size=0.3, random_state=13)
test, valid, test_label, valid_label = train_test_split(train_val_test, target_val_test, test_size=0.5, random_state=13)

In [27]:
np.shape(train), np.shape(test)

((273, 3), (58, 3))

## Statistical features

Features: affinity score matrix, length of each protein, distance between every amino acid

### Affinity score matrix

Affinity(a, b) = Contact(A, B) / (Contact(A, B) + NonContact(A, B))

separation distance $\geq$ 6

In [8]:
def affinity_matrix(seq, two_matrix):
    
    # 20 is the number of amino acids
    aa = {'A' : 0, 'C' : 1, 'D' : 2, 'E' : 3, 'F' : 4, 'G' : 5, 'H' : 6, 'I' : 7, 'K' : 8, 'L' : 9, 
          'M' : 10, 'N' : 11, 'P' : 12, 'Q' : 13, 'R' : 14, 'S' : 15, 'T' : 16, 'V' : 17, 'W' : 18, 'Y' : 19}
    
    aff_matrix = [[0] * 20 for i in range(20)]
    matrix = [[0] * 20 for i in range(20)]
        
    if len(seq) != np.shape(two_matrix)[0]:
        return np.array(aff_matrix), np.array(matrix), 0
    
    for i in range(len(seq)):
        for j in range(i + 6, len(seq)):
            
            matrix[aa[seq[i]]][aa[seq[j]]] += 1
            matrix[aa[seq[j]]][aa[seq[i]]] += 1
                
            if two_matrix[i][j] == 1:
                
                aff_matrix[aa[seq[i]]][aa[seq[j]]] += 1
                aff_matrix[aa[seq[j]]][aa[seq[i]]] += 1
                
    return np.array(aff_matrix), np.array(matrix), 1


def get_probability(first, second, aff_matrix):
    
    aa = {'A' : 0, 'C' : 1, 'D' : 2, 'E' : 3, 'F' : 4, 'G' : 5, 'H' : 6, 'I' : 7, 'K' : 8, 'L' : 9, 
      'M' : 10, 'N' : 11, 'P' : 12, 'Q' : 13, 'R' : 14, 'S' : 15, 'T' : 16, 'V' : 17, 'W' : 18, 'Y' : 19}
    
    return aff_matrix[aa[first]][aa[second]]

In [9]:
aff_matrix = [[0] * 20 for i in range(20)]
matrix = [[0] * 20 for i in range(20)]
count = 0

for i in range(len(train)):
    for j in range(len(two_matrix)):
        
        if train[i][0] == two_matrix[j][0]:
            
            aff, m, cnt = affinity_matrix(train[i][3], two_matrix[j][2])
            aff_matrix = np.add(aff_matrix, aff)
            matrix = np.add(matrix, m)
            count += cnt

In [10]:
aff_matrix = np.divide(aff_matrix, matrix)

### Length of each protein

In [11]:
len_matrix = []

for i in range(len(train)):
    
    len_matrix.append([train[i][0], len(train[i][3])])

In [12]:
len_matrix[:5]

[['2KGU', 35], ['1ZWU', 30], ['1HP3', 32], ['1IB9', 34], ['2JPK', 35]]

In [13]:
np.shape(train), np.shape(train_label)

((331, 4), (331, 3))

### Train test transformations

In [28]:
# Some transformations to make train and target data
# For neural networks

def redo(x):
    
    out = []
    for i in range(len(x)):
        out.append(x[i][1])
        
    return np.array(out)


def train_transform(x):
    
    x = redo(np.array(x))
    x = np.expand_dims(x, axis=4)
    
    return x


def reshape_of_labels(x):
    
    lst1 = [[[0 for col in range(x.shape[2])] for col in range(x.shape[0])] for row in range(x.shape[1])]
    for i in range(len(x)):
        for j in range(len(x[i])):
            for k in range(len(x[i][j])):
                lst1[j][i][k] = x[i, j, k]
                
    return lst1


def triangle_target_transform(x):
            
    res = []
    for i in range(x.shape[0]):
        res.append(x[i][1][np.triu_indices(len(x[i][1]), k = 1)])
    
    return np.array(res)

    
def target_transform(x):
    
    new_x = []
    x = triangle_target_transform(x)
    enc = OneHotEncoder(sparse = False)
    
    for i in range(len(x)):

        new_x.append(enc.fit_transform(x[0].reshape(-1, 1)))
        
    x = reshape_of_labels(np.array(new_x))
    
    return np.array(x)


def save_file(data, data_name):
    
    output = open(data_name + '.pkl', 'wb')
    pickle.dump(data, output)
    output.close()
    
    
def machine_train_transform(x, aff_matrix):
    
    out_matrix = []
    out = []
    dist = []
    
    for i in range(len(x)):
        out_matrix = []
        
        for j in range(len(x[i][1])):
            for k in range(j + 1, len(x[i][1])):
                
                if j < x[i][2] and k < x[i][2]:
                    pair = [k - j, get_probability(x[i][3][j], x[i][3][k], aff_matrix), len(x[i][3])]

                    out_matrix.append(np.concatenate([
                        x[i][1][j], 
                        x[i][1][k],
                        pair]))
                else:
                    pair = [0, 0, 0]

                    out_matrix.append(np.concatenate([
                        np.zeros(len(x[i][1][j])), 
                        np.zeros(len(x[i][1][k])),
                        pair]))
                     
        out.append(np.array(out_matrix))
            
    out = np.array(out)
    out = np.expand_dims(out, axis=4)
    
    return np.array(out)


def machine_target_transform(x):
    
    x = triangle_target_transform(x)
    
    return x

### Train data

In [254]:
def batch(iterable, n = 1):
    
    l = len(iterable)
    
    for ndx in range(0, l, n):
        
        yield iterable[ndx : min(ndx + n, l)]

machine_train = []
machine_test = []
machine_valid = []
        
for x in batch(train, 10):
    result = machine_train_transform(x, aff_matrix)
    
    for i in range(len(result)):
        machine_train.append(result[i])

for x in batch(test, 10):
    result = machine_train_transform(x, aff_matrix)
    
    for i in range(len(result)):
        machine_test.append(result[i])
    
for x in batch(valid, 10):
    result = machine_train_transform(x, aff_matrix)
    
    for i in range(len(result)):
        machine_valid.append(result[i])

  out = np.expand_dims(out, axis=4)


In [256]:
machine_train = np.array(machine_train)

### Target data

Test data for neural networks and machine learning models

In [260]:
machine_train_label = target_transform(np.array(train_label))
machine_test_label = target_transform(np.array(test_label))
machine_valid_label = target_transform(np.array(valid_label))

In [261]:
machine_train.shape, machine_train_label.shape

((331, 780, 115, 1), (780, 331, 2))

In [27]:
np.shape(neural_train_label), np.shape(neural_train)

((780, 340, 2), (340, 40, 56, 1))

### Save files

In [262]:
# save_file(neural_train, 'neural_train')
# save_file(neural_train_label, 'neural_train_label')
# save_file(neural_test, 'neural_test')
# save_file(neural_test_label, 'neural_test_label')
# save_file(neural_valid, 'neural_valid')
# save_file(neural_valid_label, 'neural_valid_label')

save_file(machine_train, 'machine_train')
save_file(machine_train_label, 'machine_train_label')
save_file(machine_test, 'machine_test')
save_file(machine_test_label, 'machine_test_label')
save_file(machine_valid, 'machine_valid')
save_file(machine_valid_label, 'machine_valid_label')

In [None]:
a = [0, 1, 1]