# Data aggregation

### Load data

In [11]:
import pandas as pd
import numpy as np
import pickle
from tqdm import trange
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

print('Loading train_data and contact matrix')

def load_data(data_name):
    pkl_file = open(data_name + '.pkl', 'rb')
    data = pickle.load(pkl_file)
    pkl_file.close()
    return data

#  Load data and target
data_train = load_data('train_data')
two_matrix = load_data('two_matrix_200')
num_classes = 2
print('Loaded successfully')

data_features = pd.read_csv('pdb_and_features.csv')
data_features = data_features.reset_index(drop=True)
print(len(data_train))
print(len(two_matrix))

protein_len = []
pdb_200 = []

low_border = 25
high_border = 40

s = 0
for i in range(len(two_matrix)):
    if (np.shape(two_matrix[i][2])[0] <= high_border) and (np.shape(two_matrix[i][2])[0] >= low_border):
        s +=1

print('Amount of proteins, which we have after bounding: ', s)

Loading train_data and contact matrix
Loaded successfully
6812
6905
Amount of proteins, which we have after bounding:  504


### Zero-padding

In [12]:
#  Apply zero-padding to make same size for all train and target samples

target = []
mm = high_border

for i in range(len(two_matrix)):
    
    if (high_border >= np.shape(two_matrix[i][2])[0]) and (np.shape(two_matrix[i][2])[0] >= low_border):
        
        f = np.zeros((mm-np.shape(two_matrix[i][2])[0], np.shape(two_matrix[i][2])[0]))
        f1 = np.zeros((mm, mm-np.shape(two_matrix[i][2])[0]))
        bot = np.concatenate((two_matrix[i][2], f), axis=0)
        target.append([two_matrix[i][0], np.concatenate((bot, f1), axis=1), len(two_matrix[i][2])])
        

train = []
mm = high_border

for i in range(len(data_train)):
    
    if (high_border >= np.shape(data_train[i][1])[0]) and (len(data_features.FASTA[i]) <= high_border) and \
            (len(data_features.FASTA[i]) >= low_border):
        
        f = np.zeros((mm-np.shape(data_train[i][1])[0], np.shape(data_train[i][1])[1]))
        train.append([data_features.pdb_name[i], 
                      np.concatenate((data_train[i][1], f), axis=0), 
                      len(data_features.FASTA[i]),
                      data_features.FASTA[i]])
        
    elif (len(data_features.FASTA[i]) <= high_border) and (len(data_features.FASTA[i]) >= low_border):
        
        train.append([data_features.pdb_name[i], 
                      data_train[i][1][:mm], 
                      len(data_features.FASTA[i]), 
                      data_features.FASTA[i]])

### train - target

In [13]:
real_train = []
real_target = []

for i in range(len(train)):
    
    for j in range(len(target)):
        
        if train[i][0] == target[j][0]:
            if train[i][2] == target[j][2]:
                real_train.append(train[i])
                real_target.append(target[j])
                break

In [14]:
np.shape(real_train), np.shape(real_target)

((474, 4), (474, 3))

In [15]:
real_train[134][2], real_target[134][2]

(39, 39)

### train_test_split

In [18]:
train, train_val_test, train_label, target_val_test = train_test_split(real_train, real_target, test_size=0.3, random_state=13)
test, valid, test_label, valid_label = train_test_split(train_val_test, target_val_test, test_size=0.5, random_state=13)

In [20]:
np.shape(train), np.shape(test)

((331, 4), (71, 4))

In [133]:
train[0][1].shape

(40, 56)

### Train test transformations

In [24]:
# Some transformations to make train and target data
# For neural networks

def redo(x):
    
    out = []
    for i in range(len(x)):
        out.append(x[i][1])
        
    return np.array(out)


def train_transform(x):
    
    x = redo(np.array(x))
    x = np.expand_dims(x, axis=4)
    
    return x


def reshape_of_labels(x):
    
    lst1 = [[[0 for col in range(x.shape[2])] for col in range(x.shape[0])] for row in range(x.shape[1])]
    for i in range(len(x)):
        for j in range(len(x[i])):
            for k in range(len(x[i][j])):
                lst1[j][i][k] = x[i, j, k]
                
    return lst1


def triangle_target_transform(x):
    
    x = redo(np.array(x))
    
    length = x.shape[1]
    width = x.shape[2]
        
    x = np.array(list(map(lambda x: x.reshape(length, width)[np.triu_indices(width, k = 1)], x)))
    
    return x

    
def target_transform(x):
    
    x = triangle_target_transform(x)
    print(x)
    
#     x_l = x.shape[0]
#     x_w = x.shape[1]
#     x = enc.fit_transform(x.reshape(-1, 1))
#     x = x.reshape(x_l, x_w, num_classes)
    
#     print(x)
#     x = reshape_of_labels(x)
    
    return x


# def test_target_transform(x):
    
#     x = redo(np.array(x))
#     x_w = x.shape[0]
#     x_h = x.shape[1]
#     x_l = x.shape[2]
#     x = x.reshape(x_w, x_h*x_l)
#     enc = OneHotEncoder(sparse=False)
#     x = enc.fit_transform(x.reshape(-1, 1))
#     x = x.reshape(x_w, x_h*x_l, num_classes)
#     x = reshape_of_labels(x)
#     return x


def save_file(data, data_name):
    
    output = open(data_name + '.pkl', 'wb')
    pickle.dump(data, output)
    output.close()

In [132]:
# machine learning preprocessing

def machine_train_transform(x):
    
    x = redo(np.array(x))
    out_matrix = []
    out = []
    dist = []
    
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            for k in range(j + 1, x.shape[1]):
                out.append(np.concatenate([x[i][j], x[i][k]]))
                
    return np.array(out)

def machine_target_transform(x):
    
    x = triangle_target_transform(x)
    
    return x.flatten()

Train data for neural networks and machine learning models

In [22]:
neural_train = train_transform(np.array(train))
neural_test = train_transform(np.array(test))
neural_valid = train_transform(np.array(valid))

  x = np.expand_dims(x, axis=4)


Test data for neural networks and machine learning models

In [27]:
neural_train_label = target_transform(np.array(train_label))
neural_test_label = target_transform(np.array(test_label))
neural_valid_label = target_transform(np.array(valid_label))

[[1. 1. 0. ... 1. 1. 1.]
 [1. 1. 0. ... 1. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]]
[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [1. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]]
[[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 1. 1. 1.]
 [1. 1. 1. ... 0. 0. 0.]]


In [26]:
np.shape(neural_train_label)

(331, 780)

In [30]:
np.shape(neural_train_label), np.shape(neural_test)

((331, 780), (71, 40, 56, 1))

### Save files

In [28]:
save_file(neural_train, 'neural_train')
save_file(neural_train_label, 'neural_train_label')
save_file(neural_test, 'neural_test')
save_file(neural_test_label, 'neural_test_label')
save_file(neural_valid, 'neural_valid')
save_file(neural_valid_label, 'neural_valid_label')

# save_file(machine_train, 'machine_train')
# save_file(machine_train_label, 'machine_train_label')
# save_file(machine_test, 'machine_test')
# save_file(machine_test_label, 'machine_test_label')
# save_file(machine_valid, 'machine_valid')
# save_file(machine_valid_label, 'machine_valid_label')

In [None]:
a = [0, 1, 1]