# Import Modules

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp
%matplotlib inline

sys.path.append('../')
from helper.scripts import read_fasta,read_pssm_profile

# Read data files and load PSSM profiles

In [2]:
# Directory holding one '.pssm' profile file per training enzyme.
datadir = '../featEngg/offline/pssmMethods/data/pssmProfiles/trainfiles/'
# os.scandir yields entries in arbitrary order, so sort the file names to keep
# the order of the loaded profiles reproducible between runs.
datafiles = sorted(f.name for f in os.scandir(datadir) if f.name.endswith('.pssm'))
# Parse the files in parallel. The context manager shuts the worker processes
# down once the (blocking) map has finished instead of leaving an idle pool
# alive for the rest of the kernel session. Pool.map already returns a list.
with mp.Pool(mp.cpu_count()) as pool_read:
    all_pssm_profiles = pool_read.map(
        read_pssm_profile,
        [os.path.join(datadir, f) for f in datafiles],
    )

# Make block matrices inspired by AB-PSSM

In [14]:
def parse_pssm_matrix(matrix):
    """Return columns 2 through 21 of ``matrix`` converted to floats.

    ``matrix`` must support 2-D NumPy slicing. The first two columns are
    dropped, the next 20 are kept, and anything from column 22 onwards is
    discarded.
    """
    score_columns = slice(2, 22)
    return matrix[:, score_columns].astype(float)
    
    

def make_block_matrix(matrix, number_of_blocks=20):
    """Collapse a PSSM profile into a fixed number of row blocks.

    The profile is first reduced to its numeric columns with
    ``parse_pssm_matrix``. Its rows are then split into ``number_of_blocks``
    consecutive blocks and every block is summed column-wise.

    Parameters
    ----------
    matrix : 2-D array
        Raw profile, in the form accepted by ``parse_pssm_matrix``.
    number_of_blocks : int, optional
        Number of row blocks to produce (default 20).

    Returns
    -------
    numpy.ndarray
        Array with ``number_of_blocks`` rows; row ``k`` is the column-wise sum
        of the ``k``-th block of rows.

    Raises
    ------
    ValueError
        If ``number_of_blocks`` is smaller than 1.

    Notes
    -----
    The block size is ``round(n_rows / number_of_blocks)``. Because of the
    rounding, the final block absorbs whatever rows are left over, and for
    short profiles some blocks contain no rows and therefore sum to zeros.
    """
    if number_of_blocks < 1:
        raise ValueError('number_of_blocks must be at least 1, got {}'.format(number_of_blocks))

    matrix = parse_pssm_matrix(matrix)

    matrix_length = matrix.shape[0]
    elements_in_blocks = round(matrix_length / number_of_blocks)

    block_matrix = []
    # All blocks except the last have the same nominal size; the bounds are
    # clamped so a block that would run past the end simply comes out shorter
    # (or empty).
    for block_index in range(number_of_blocks - 1):
        block_start = min(matrix_length, block_index * elements_in_blocks)
        block_stop = min(matrix_length, block_start + elements_in_blocks)
        block_matrix.append(np.sum(matrix[block_start:block_stop, :], axis=0))

    # The last block takes every remaining row, so no row is ever dropped.
    last_start = (number_of_blocks - 1) * elements_in_blocks
    block_matrix.append(np.sum(matrix[last_start:matrix_length, :], axis=0))
    return np.array(block_matrix)

In [15]:
# Build the block matrices in parallel. The context manager releases the
# worker processes as soon as the (blocking) map returns, rather than leaving
# a second idle pool alive in the kernel. Pool.map already returns a list.
with mp.Pool(mp.cpu_count()) as pool_makeblock:
    block_pssm_profiles = pool_makeblock.map(make_block_matrix, all_pssm_profiles)

In [20]:
## sanity check: every block matrix must have shape 20 x 20
assert all(block.shape == (20, 20) for block in block_pssm_profiles)

# Make dataset

In [33]:
# Stack the per-enzyme block matrices into a single array.
X_raw = np.asarray(block_pssm_profiles)

In [34]:
# Enzyme name = file name without its trailing '.pssm' extension. Only the
# suffix is removed; str.replace would also delete '.pssm' occurring in the
# middle of a file name.
enz_names = [en[:-len('.pssm')] if en.endswith('.pssm') else en for en in datafiles]

In [38]:
# Load the enzyme labels: each line's first comma-separated field is used as
# the key and its second field as the (still string-valued) label.
label_file = '../data/enz_labels.csv'
label_dict = {}
with open(label_file,'r') as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no label; skip them instead of failing with an IndexError.
        if not line:
            continue
        vals = line.split(',')
        if len(vals) < 2:
            raise ValueError(
                '{}: line {} has no label column: {!r}'.format(label_file, line_number, line)
            )
        label_dict[vals[0]] = vals[1]

In [42]:
# Assemble the dataset from the enzymes that have an entry in label_dict;
# enzymes without a label are left out. The three lists stay index-aligned.
X, y, enzyme_names = [], [], []
for name, features in zip(enz_names, X_raw):
    if name not in label_dict:
        continue
    X.append(features)
    y.append(float(label_dict[name]))
    enzyme_names.append(name)

In [45]:
# Features, labels and names must all have the same number of entries.
assert len({len(X), len(y), len(enzyme_names)}) == 1

# Import PyTorch