In [1]:
import numpy as np
from itertools import starmap

# Get Sequence Files

In [2]:
seqfile = '../data/enz_sequence.csv'

enzyme_names = []
X_raw = []

with open(seqfile,'rt') as f:
    for lines in f:
        val = lines.strip().split(',')
        enzyme_names.append(val[0])
        X_raw.append(val[1])

# Get integer encoded sequence

In [3]:
AAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

AAdict = {aa:i+1 for i,aa in enumerate(AAs)}

In [4]:
def get_integer_features(seq):
    seq_int = []
    for res in seq:
        seq_int.append(AAdict[res])
    return seq_int

In [5]:
X_int_enc = list(map(get_integer_features,X_raw))

# Trim/Pad sequences

In [6]:
def pad_sequence(seq,length):
    '''pads a variable length sequence into a sequence of length {length} with zeros'''
    seq_length = len(seq)
    delta = length - seq_length
    pre_pad_length = delta//2
    post_pad_length = delta - pre_pad_length
    
    pre_pad = np.zeros(pre_pad_length)
    post_pad = np.zeros(post_pad_length)
    
    seq = np.array(seq)
    
    seq = np.insert(seq,0,pre_pad)
    
    seq = np.append(seq,post_pad)
    return seq


def trim_sequence(seq,length):
    '''trims a variable length sequence into a sequence of equal length'''
    seq_length = len(seq)
    delta = seq_length - length
    pre_trim_length = delta//2
    post_trim_length = delta - pre_trim_length
    
    seq = np.array(seq)
    
    return seq[pre_trim_length:seq_length-post_trim_length]


def process_sequence(seq,length):
    '''modifies a sequence of variable length into an user-defined length'''
    seq_length = len(seq)
    if seq_length == length:
        return np.array(seq)
    
    elif seq_length<length:
        return pad_sequence(seq,length)
    
    elif seq_length>length:
        return trim_sequence(seq,length)
    
    return

In [7]:
# get median length of all seq
all_seq_length = list(map(len,X_int_enc))
median_len = np.median(all_seq_length)

In [8]:
argument_iter = zip(X_int_enc,[int(median_len) for i in range(len(X_int_enc))])

In [9]:
X_int_samelen = list(starmap(process_sequence,argument_iter))

In [10]:
X_int_enc = np.array(X_int_samelen)

In [11]:
X_int_enc.shape

(6033, 408)

# One Hot Encode

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
ohe = OneHotEncoder()

In [14]:
ohe.fit(X_int_enc)

X = ohe.transform(X_int_enc).toarray()

# Get Labels

In [15]:
labelfile = '../data/enz_labels.csv'

enzyme_namesy = []
y_raw = []

with open(labelfile,'rt') as f:
    for lines in f:
        val = lines.strip().split(',')
        enzyme_namesy.append(val[0])
        y_raw.append(int(val[1]))        

In [16]:
assert enzyme_names == enzyme_namesy

In [17]:
y = np.array(y_raw)

In [18]:
enzyme_names = np.array(enzyme_names)

# Make train test data

In [19]:
y0_idx = np.argwhere(y==0).flatten()
y1_idx = np.argwhere(y==1).flatten()

train0_idx = np.random.choice(y0_idx,size=int(0.75*len(y0_idx)),replace=False)
train1_idx = np.random.choice(y1_idx,size=int(0.45*len(y1_idx)),replace=False)

valid0_idx = np.array([i for i in y0_idx if i not in train0_idx])
valid1_idx = np.array([i for i in y1_idx if i not in train1_idx])

train_idx = np.append(train0_idx,train1_idx)
valid_idx = np.append(valid0_idx,valid1_idx)

np.random.shuffle(train_idx)
np.random.shuffle(valid_idx)

X_train,X_valid = X[train_idx],X[valid_idx]
y_train,y_valid = y[train_idx],y[valid_idx]
enz_train,enz_valid = enzyme_names[train_idx],enzyme_names[valid_idx]

assert len(enz_train) + len(enz_valid) == len(enzyme_names)
assert len(X_train) + len(X_valid) == len(X)
assert len(y_train) + len(y_valid) == len(y)

X_train = X_train.reshape(-1,1,X.shape[1]).astype('float32')
X_valid = X_valid.reshape(-1,1,X.shape[1]).astype('float32')
y_train = y_train.reshape(-1,1).astype('float32')
y_valid = y_valid.reshape(-1,1).astype('float32')

# Pytorch

In [20]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## DataLoader

In [21]:
trainset = []
for i in range(len(X_train)):
    trainset.append([X_train[i],y_train[i]])

validset = []
for i in range(len(X_valid)):
    validset.append([X_valid[i],y_valid[i]])
    
trainloader = torch.utils.data.DataLoader(trainset, batch_size=100,
                                          shuffle=True, num_workers=4)
validloader = torch.utils.data.DataLoader(validset, batch_size=500,
                                          shuffle=True, num_workers=4)

## Define model architecture

In [22]:
seq_encoded_len = X_train.shape[2]

In [98]:
class RNN(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(RNN, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.LSTM(input_size, hidden_dim, n_layers, batch_first=True,dropout=0.0)   
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x):
        
        batch_size = x.size(0)

        # Initializing hidden state for first input using method defined below
        hidden = self.init_hidden(batch_size)
        hidden_state = self.init_hidden(batch_size)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, (hidden,hidden_state))
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        out = torch.sigmoid(out)
        return out, hidden

    def init_hidden(self, batch_size):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        hidden = torch.randn(self.n_layers, batch_size, self.hidden_dim)
        return hidden

n_hidden = 50
n_categories = 1
rnn = RNN(seq_encoded_len, 1, n_hidden, 2)

In [99]:
# Define hyperparameters
n_epochs = 5
lr=0.001

# Define Loss, Optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)



In [100]:
for epoch in range(0,n_epochs+1):
    
    
    for i,data in enumerate(trainloader,0):
        optimizer.zero_grad()
        
        input_ = data[0]
        label = data[1]


        hidden = rnn.init_hidden(input_.shape[0])
        

        ypred,next_hidden = rnn(input_)
        
        
        loss = criterion(ypred, label)
        loss.backward()
        optimizer.step()
        
    if epoch%10 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.6f}".format(loss.item()))


Epoch: 0/5............. Loss: 0.699357


In [101]:
num_correct = 0
all_labels = []
all_preds = []

for i,data in enumerate(validloader):
    inputs, labels = data
    
    all_labels.extend(labels.flatten().detach().numpy())
    
    
    outputs,_ = rnn(inputs)    
    num_correct += (outputs.round() == labels).sum()
    
    all_preds.extend(outputs.round().flatten().detach().numpy())


In [102]:
num_correct.item()/(len(validset))

0.5639666919000756

In [103]:
np.unique(all_labels,return_counts=True)

(array([0., 1.], dtype=float32), array([ 564, 2078]))

In [104]:
np.unique(all_preds,return_counts=True)

(array([0., 1.], dtype=float32), array([1096, 1546]))

In [105]:
from sklearn.metrics import f1_score,precision_score,recall_score

In [106]:
f1_score(all_labels,all_preds)

0.6821192052980133

In [107]:
precision_score(all_labels,all_preds,pos_label=0)

0.23175182481751824

In [108]:
recall_score(all_labels,all_preds,pos_label=0)

0.450354609929078