# Import Modules

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp
%matplotlib inline

sys.path.append('../')
from helper.scripts import read_fasta,read_pssm_profile

# Read datafiles and get pssm profiles

In [2]:
datadir = '../featEngg/offline/pssmMethods/data/pssmProfiles/trainfiles/'
datafiles = [f.name for f in os.scandir(datadir) if f.name.endswith('.pssm')]
pool_read = mp.Pool(mp.cpu_count())
all_pssm_profiles = list(pool_read.map(read_pssm_profile,[datadir+f for f in datafiles]))

# Make block matrices inspired by AB-PSSM

In [3]:
def parse_pssm_matrix(matrix):
    #keep only the last 20 columns
    #change the object type to float 
    parsed_matrix = matrix[:,2:22].astype(float)
    return parsed_matrix
    
    

def make_block_matrix(matrix):
    
    matrix = parse_pssm_matrix(matrix)
    
    number_of_blocks = 20
    matrix_length = matrix.shape[0]
    elements_in_blocks = int(matrix_length/number_of_blocks)
    i = 0
    N = 1
    block_matrix = []
    while N<number_of_blocks:
        curr_block_min = i
        curr_block_max = min(matrix_length,i + elements_in_blocks)
        curr_block = np.sum(matrix[curr_block_min:curr_block_max,:],axis=0)
        i = curr_block_max
        N+=1
        block_matrix.append(curr_block)
    

    curr_block = np.sum(matrix[(N-1)*elements_in_blocks:matrix_length,:],axis=0)
    block_matrix.append(curr_block)
    
    block_matrix = np.array(block_matrix)    
    return block_matrix

In [4]:
pool_makeblock = mp.Pool(mp.cpu_count()) 
block_pssm_profiles = list(pool_makeblock.map(make_block_matrix,all_pssm_profiles))

In [5]:
## check if all block matrices are of shape 20*20
assert [b.shape for b in block_pssm_profiles] == [(20,20) for i in range(len(block_pssm_profiles))]

# Make dataset

In [6]:
#
X_raw = np.array(block_pssm_profiles)

In [7]:
enz_names = [en.replace('.pssm','') for en in datafiles]

In [8]:
label_file = '../data/enz_labels.csv'
label_dict = {}
with open(label_file,'r') as f:
    for lines in f:
        vals = lines.strip().split(',')
        label_dict[vals[0]] = vals[1]

In [9]:
X = []
y = []
enzyme_names = []
for enz_name,x in zip(enz_names,X_raw):
    if enz_name in label_dict:
        label = label_dict[enz_name]
        X.append(x)
        y.append(float(label))
        enzyme_names.append(enz_name) 
        
X = np.array(X)
y = np.array(y)
enzyme_names = np.array(enzyme_names)

In [10]:
assert len(X)==len(y)==len(enzyme_names)

# Conduct your own train test split

In [11]:
y0_idx = np.argwhere(y==0).flatten()
y1_idx = np.argwhere(y==1).flatten()

In [12]:
train0_idx = np.random.choice(y0_idx,size=int(0.75*len(y0_idx)),replace=False)
train1_idx = np.random.choice(y1_idx,size=int(0.45*len(y1_idx)),replace=False)

valid0_idx = np.array([i for i in y0_idx if i not in train0_idx])
valid1_idx = np.array([i for i in y1_idx if i not in train1_idx])

In [13]:
train_idx = np.append(train0_idx,train1_idx)
valid_idx = np.append(valid0_idx,valid1_idx)

np.random.shuffle(train_idx)
np.random.shuffle(valid_idx)

In [14]:
X_train,X_valid = X[train_idx],X[valid_idx]
y_train,y_valid = y[train_idx],y[valid_idx]
enz_train,enz_valid = enzyme_names[train_idx],enzyme_names[valid_idx]

In [15]:
assert len(enz_train) + len(enz_valid) == len(enzyme_names)
assert len(X_train) + len(X_valid) == len(X)
assert len(y_train) + len(y_valid) == len(y)

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_valid = scaler.transform(X_valid.reshape(-1, X_valid.shape[-1])).reshape(X_valid.shape)

In [17]:
X_train = X_train.reshape(-1,1,20,20).astype('float32')
X_valid = X_valid.reshape(-1,1,20,20).astype('float32')
y_train = y_train.reshape(-1,1).astype('float32')
y_valid = y_valid.reshape(-1,1).astype('float32')

# Import pytorch 

In [18]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# DataLoader

In [19]:
trainset = []
for i in range(len(X_train)):
    trainset.append([X_train[i],y_train[i]])

validset = []
for i in range(len(X_valid)):
    validset.append([X_valid[i],y_valid[i]])


In [20]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1500,
                                          shuffle=True, num_workers=24)
validloader = torch.utils.data.DataLoader(validset, batch_size=500,
                                          shuffle=True, num_workers=24)

# Define model architecture

In [32]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 3, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(3, 1, 5)
        self.fc1 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0),-1)
        x = F.relu(self.fc1(x))
        return torch.sigmoid(x)


net = Net()

# Define Loss function

In [86]:
criterion = nn.BCELoss()
optimizer = optim.Adamax(net.parameters(), lr=0.001, weight_decay=1.15)

# Train Network

In [95]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        print(outputs)

        # print statistics
        running_loss += loss.item()
        if i % 20 == 0:    # print every 20 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0

print('Finished Training')

tensor([[0.5157],
        [0.5157],
        [0.5158],
        ...,
        [0.5157],
        [0.5157],
        [0.5157]], grad_fn=<SigmoidBackward>)
[1,     1] loss: 0.035
tensor([[0.5155],
        [0.5156],
        [0.5156],
        ...,
        [0.5156],
        [0.5155],
        [0.5156]], grad_fn=<SigmoidBackward>)
tensor([[0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5155],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.5154],
        [0.51

In [88]:
num_correct = 0
all_labels = []
all_preds = []

for i,data in enumerate(validloader):
    inputs, labels = data
    
    all_labels.extend(labels.flatten().detach().numpy())
    
    
    outputs = net(inputs)    
    num_correct += (outputs.round() == labels).sum()
    
    all_preds.extend(outputs.round().flatten().detach().numpy())
    

In [90]:
num_correct.item()/(len(validset))

0.7865253595760787

In [91]:
np.unique(all_labels,return_counts=True)

(array([0., 1.], dtype=float32), array([ 564, 2078]))

In [92]:
np.unique(all_preds,return_counts=True)

(array([1.], dtype=float32), array([2642]))

In [93]:
from sklearn.metrics import f1_score,precision_score,recall_score

In [79]:
f1_score(all_labels,all_preds)

0.8805084745762711

In [71]:
precision_score(all_labels,all_preds,pos_label=0)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [61]:
recall_score(all_labels,all_preds,pos_label=0)

0.10106382978723404