In [1]:
import numpy as np
from itertools import starmap

# Get Sequence Files

In [2]:
seqfile = '../data/enz_sequence.csv'

enzyme_names = []
X_raw = []

with open(seqfile,'rt') as f:
    for lines in f:
        val = lines.strip().split(',')
        enzyme_names.append(val[0])
        X_raw.append(val[1])

# Get integer encoded sequence

In [3]:
AAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

AAdict = {aa:i+1 for i,aa in enumerate(AAs)}

In [4]:
def get_integer_features(seq):
    seq_int = []
    for res in seq:
        seq_int.append(AAdict[res])
    return seq_int

In [5]:
X_int_enc = list(map(get_integer_features,X_raw))

# Trim/Pad sequences

In [6]:
def pad_sequence(seq,length):
    '''pads a variable length sequence into a sequence of length {length} with zeros'''
    seq_length = len(seq)
    delta = length - seq_length
    pre_pad_length = delta//2
    post_pad_length = delta - pre_pad_length
    
    pre_pad = np.zeros(pre_pad_length)
    post_pad = np.zeros(post_pad_length)
    
    seq = np.array(seq)
    
    seq = np.insert(seq,0,pre_pad)
    
    seq = np.append(seq,post_pad)
    return seq


def trim_sequence(seq,length):
    '''trims a variable length sequence into a sequence of equal length'''
    seq_length = len(seq)
    delta = seq_length - length
    pre_trim_length = delta//2
    post_trim_length = delta - pre_trim_length
    
    seq = np.array(seq)
    
    return seq[pre_trim_length:seq_length-post_trim_length]


def process_sequence(seq,length):
    '''modifies a sequence of variable length into an user-defined length'''
    seq_length = len(seq)
    if seq_length == length:
        return np.array(seq)
    
    elif seq_length<length:
        return pad_sequence(seq,length)
    
    elif seq_length>length:
        return trim_sequence(seq,length)
    
    return

In [7]:
# get median length of all seq
all_seq_length = list(map(len,X_int_enc))
median_len = np.median(all_seq_length)

In [8]:
argument_iter = zip(X_int_enc,[int(median_len) for i in range(len(X_int_enc))])

In [9]:
X_int_samelen = list(starmap(process_sequence,argument_iter))

In [10]:
X_int_enc = np.array(X_int_samelen)

In [11]:
X_int_enc.shape

(6033, 408)

# Transform to Matrix- OHE

Rows: AminoAcid Type
Cols: 1 if the amino acid corresponding to that row is present in that particular position in the sequence, 0 otherwise

In [12]:
def transform_helper(idx):
    arr = np.zeros((len(AAdict)+1,1))
    arr[int(idx)] = 1
    return arr

In [13]:
def transform_seq(sequence):
    return np.concatenate(tuple(map(transform_helper, sequence)), axis=1)

def transform(sequences):
    return list(map(transform_seq, sequences))

In [14]:
X = np.array(transform(X_int_enc))

# Get Labels

In [15]:
labelfile = '../data/enz_labels.csv'

enzyme_namesy = []
y_raw = []

with open(labelfile,'rt') as f:
    for lines in f:
        val = lines.strip().split(',')
        enzyme_namesy.append(val[0])
        y_raw.append(int(val[1]))        

In [16]:
assert enzyme_names == enzyme_namesy

In [17]:
y = np.array(y_raw)

In [18]:
enzyme_names = np.array(enzyme_names)

# Make train test data

In [19]:
y0_idx = np.argwhere(y==0).flatten()
y1_idx = np.argwhere(y==1).flatten()

train0_idx = np.random.choice(y0_idx,size=int(0.75*len(y0_idx)),replace=False)
train1_idx = np.random.choice(y1_idx,size=int(0.45*len(y1_idx)),replace=False)

valid0_idx = np.array([i for i in y0_idx if i not in train0_idx])
valid1_idx = np.array([i for i in y1_idx if i not in train1_idx])

train_idx = np.append(train0_idx,train1_idx)
valid_idx = np.append(valid0_idx,valid1_idx)

np.random.shuffle(train_idx)
np.random.shuffle(valid_idx)

X_train,X_valid = X[train_idx],X[valid_idx]
y_train,y_valid = y[train_idx],y[valid_idx]
enz_train,enz_valid = enzyme_names[train_idx],enzyme_names[valid_idx]

assert len(enz_train) + len(enz_valid) == len(enzyme_names)
assert len(X_train) + len(X_valid) == len(X)
assert len(y_train) + len(y_valid) == len(y)

len_tr = X_train.shape[0]
len_va = X_valid.shape[0]

X_train = X_train.reshape((len_tr,1,21,408)).astype('float32')
X_valid = X_valid.reshape((len_va,1,21,408)).astype('float32')
y_train = y_train.reshape(-1,1).astype('float32')
y_valid = y_valid.reshape(-1,1).astype('float32')

In [20]:
X_train.shape

(3391, 1, 21, 408)

# Pytorch

In [21]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## DataLoader

In [22]:
trainset = []
for i in range(len(X_train)):
    trainset.append([X_train[i],y_train[i]])

validset = []
for i in range(len(X_valid)):
    validset.append([X_valid[i],y_valid[i]])
    
trainloader = torch.utils.data.DataLoader(trainset, batch_size=100,
                                          shuffle=True, num_workers=1)
validloader = torch.utils.data.DataLoader(validset, batch_size=500,
                                          shuffle=True, num_workers=1)

## Define model architecture

In [79]:
class D_CNN(nn.Module):
    def __init__(self):
        super(D_CNN, self).__init__()

        self.convlayers = nn.Sequential(
          nn.Conv2d(in_channels = 1, out_channels = 21, kernel_size = (1, 25), stride = 1, padding = 0, dilation=1),
          nn.ReLU(),
          nn.Conv2d(in_channels = 3, out_channels = 6, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU(),
          nn.Conv2d(in_channels = 6, out_channels = 9, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU(),
          nn.Conv2d(in_channels = 9, out_channels = 9, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU(),
          nn.Conv2d(in_channels = 9, out_channels = 9, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU(),
          nn.Conv2d(in_channels = 9, out_channels = 9, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU(),
          nn.Conv2d(in_channels = 9, out_channels = 9, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU(),
          nn.Conv2d(in_channels = 9, out_channels = 9, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU(),
          nn.Conv2d(in_channels = 9, out_channels = 16, kernel_size = (3, 15), stride = 1, padding = 0, dilation=2),
          nn.ReLU())
        
        self.fclayers = nn.Sequential(
          nn.Linear(24960,1200),
          nn.ReLU(),
          nn.Linear(1200,184),
          nn.ReLU(),
          nn.Linear(184,10),nn.ReLU(),nn.Linear(10,1))
        
        
    def forward(self, x):
        
        x = self.convlayers(x)
        x = x.view(-1,24960)
        x = self.fclayers(x)
        x = torch.sigmoid(x)
        return x


d_cnn = D_CNN()

In [80]:
for p in d_cnn.parameters():
    print(p.shape)

torch.Size([3, 1, 3, 7])
torch.Size([3])
torch.Size([6, 3, 3, 15])
torch.Size([6])
torch.Size([9, 6, 3, 15])
torch.Size([9])
torch.Size([9, 9, 3, 15])
torch.Size([9])
torch.Size([9, 9, 3, 15])
torch.Size([9])
torch.Size([9, 9, 3, 15])
torch.Size([9])
torch.Size([9, 9, 3, 15])
torch.Size([9])
torch.Size([9, 9, 3, 15])
torch.Size([9])
torch.Size([16, 9, 3, 15])
torch.Size([16])
torch.Size([1200, 24960])
torch.Size([1200])
torch.Size([184, 1200])
torch.Size([184])
torch.Size([10, 184])
torch.Size([10])
torch.Size([1, 10])
torch.Size([1])


In [81]:
# Define hyperparameters
n_epochs = 20
lr=0.001

# Define Loss, Optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(d_cnn.parameters(), lr=lr)



In [1]:
def get_lo(li,di,ks=25):
    return li - di*(ks-1) -1 +1

li = 8406
dils = [0,2,4,6,8,16,32,64,128]

for i in range(1,9):
    dil = dils[i]
    lo = get_lo(li,dil)
    li = lo

In [5]:
li*5

10830

In [82]:
for epoch in range(0,n_epochs+1):
    
    
    for i,data in enumerate(trainloader,0):
        optimizer.zero_grad()
        
        input_ = data[0]
        label = data[1]


        ypred = d_cnn(input_)
        
        loss = criterion(ypred, label)
        loss.backward()
        optimizer.step()
        
    if epoch%10 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.6f}".format(loss.item()))


RuntimeError: Calculated padded input size per channel: (1 x 284). Kernel size: (5 x 29). Kernel size can't be greater than actual input size

In [67]:
num_correct = 0
all_labels = []
all_preds = []

for i,data in enumerate(validloader):
    inputs, labels = data
    
    all_labels.extend(labels.flatten().detach().numpy())
    
    
    outputs = d_cnn(inputs)    
    num_correct += (outputs.round() == labels).sum()
    
    all_preds.extend(outputs.round().flatten().detach().numpy())


In [68]:
num_correct.item()/(len(validset))

0.7865253595760787

In [69]:
np.unique(all_labels,return_counts=True)

(array([0., 1.], dtype=float32), array([ 564, 2078]))

In [70]:
np.unique(all_preds,return_counts=True)

(array([1.], dtype=float32), array([2642]))

In [71]:
from sklearn.metrics import f1_score,precision_score,recall_score

In [72]:
f1_score(all_labels,all_preds)

0.8805084745762711

In [73]:
precision_score(all_labels,all_preds,pos_label=0)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [74]:
recall_score(all_labels,all_preds,pos_label=0)

0.0