# This notebook train the Neural Network model using all training data by trunk, and evalute the Average Precision (AP) and other matrics by trunk

In [1]:
import numpy as np
import os, random
from pathlib import Path

In [39]:
def get_train_files(input_list):
    out_list=[]
    if type(input_list)==type(None):
        return out_list
    for item in input_list:
        
        if os.path.isdir(item):
            out_list.extend(list(Path(item).rglob("*.npz")))

        elif item[-4:]=='.npz':
            out_list.append(item)        
    
    random.seed(0)
    random.shuffle(out_list)
    return out_list

def read_from_file(file_name):
    data=np.load(file_name)
    return data['morgan'], data['protein'], data['labels']

def generate_batches(files, batch_size=1024):
    counter = 0
    
    print_freq=max(1, len(files)//10)
    
    while counter<len(files):
        file_name = files[counter]

        counter +=1
        
        data=read_from_file(file_name)

        morgan, protein, labels=data
        batch_size=max(batch_size,1)
        for local_index in range(0, labels.shape[0], batch_size):
            batch_morgan=morgan[local_index:(local_index + batch_size)]
            batch_protein=protein[local_index:(local_index + batch_size)]
            batch_labels=labels[local_index:(local_index + batch_size)]          

            yield batch_morgan, batch_protein, batch_labels
        
        if counter%print_freq==0:
            print('.', end='',flush=True)

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim, Tensor

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1=nn.Linear(1027,2048)
        #self.dropout1 = nn.Dropout(0.2)
        self.fc2=nn.Linear(2048,1024)
        self.fc3=nn.Linear(1024,512)
        self.fc4=nn.Linear(512,128)
        #self.dropout3 = nn.Dropout(0.2)
        self.out=nn.Linear(128,1)
        
    def forward(self, x):
        x=F.relu(self.fc1(x))
        x=F.relu(self.fc2(x))
        x=F.relu(self.fc3(x))
        x=F.relu(self.fc4(x))
        out=self.out(x)
        return out

In [49]:
dev='cuda'

In [50]:
net=Net()

In [51]:
net.to(dev);
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [52]:
model_details=str(net.to(torch.device(dev)))
print(model_details)
num_params=sum(p.numel() for p in net.parameters())
print('# Parameters=', num_params)

Net(
  (fc1): Linear(in_features=1027, out_features=2048, bias=True)
  (fc2): Linear(in_features=2048, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=512, bias=True)
  (fc4): Linear(in_features=512, out_features=128, bias=True)
  (out): Linear(in_features=128, out_features=1, bias=True)
)
# Parameters= 4794113


In [53]:
train_files=['morgan/data10.npz','morgan/data11.npz','morgan/data12.npz','morgan/data13.npz','morgan/data14.npz','morgan/data15.npz',
             'morgan/data16.npz','morgan/data17.npz','morgan/data18.npz','morgan/data19.npz']
test_files=['morgan/data20.npz']

In [57]:
train_files=get_files(['morgan/'])
test_files=get_files(['morgan_validation/'])

In [58]:
len(train_files)

473

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# Training loop
for i in range(epoch_num):
    net.train()
    train_loss = 0
    train_generator = generate_batches(train_files, batch_size=1024)
    for batch in train_generator:
        optimizer.zero_grad()
        batch_morgan, batch_protein, batch_labels = batch
        batch_morgan = np.concatenate((batch_morgan, batch_protein), 1)
        score = net(torch.Tensor(batch_morgan).to(dev))
        loss = torch.nn.functional.binary_cross_entropy_with_logits(score, torch.Tensor(batch_labels)[:, None].to(dev), pos_weight=pos_weight)
        train_loss += loss.cpu().item() * len(score)
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        net.eval()
        preds = []
        labels = []
        test_loss = 0
        test_generator = generate_batches(test_files, batch_size=1024)
        for batch in test_generator:
            batch_morgan, batch_protein, batch_labels = batch
            batch_morgan = np.concatenate((batch_morgan, batch_protein), 1)
            score = net(torch.Tensor(batch_morgan).to(dev))
            loss = torch.nn.functional.binary_cross_entropy_with_logits(score, torch.Tensor(batch_labels)[:, None].to(dev), pos_weight=pos_weight)
            preds.append(torch.sigmoid(score).cpu().numpy())
            labels.append(batch_labels)
            test_loss += loss.cpu().item() * len(score)

        preds = np.vstack(preds)[:, 0]
        labels = np.hstack(labels)

        # Calculate additional metrics
        ap = average_precision_score(labels, preds)
        precision = precision_score(labels, preds > 0.5)
        recall = recall_score(labels, preds > 0.5)
        f1 = f1_score(labels, preds > 0.5)
        roc_auc = roc_auc_score(labels, preds)

        print(f'\nEpoch={i+1} Train_Loss={train_loss/len(labels):.6f} Test_Loss={test_loss/len(labels):.6f} '
              f'Test AP={ap:.6f} Precision={precision:.6f} Recall={recall:.6f} F1={f1:.6f} ROC_AUC={roc_auc:.6f}')


....................
Epoch=1 Train_Loss=0.167275 Test_Loss=0.079275 Test AP=0.281868 Precision=0.241906 Recall=0.424797 F1=0.308266 ROC_AUC=0.914911
....................
Epoch=2 Train_Loss=0.159408 Test_Loss=0.073675 Test AP=0.299529 Precision=0.299274 Recall=0.397117 F1=0.341322 ROC_AUC=0.922195
........

In [47]:
#this is the original trainning part, keep it for reference

%%time
pos_weight=torch.Tensor(np.array(5.0))
epoch_num=10

for i in range(epoch_num):
    net.train()
    train_loss=0
    train_generator=generate_batches(train_files, batch_size=1024)
    for batch in train_generator:
        optimizer.zero_grad()
        batch_morgan, batch_protein, batch_labels=batch
        batch_morgan=np.concatenate((batch_morgan, batch_protein),1) #combine morgan fp with protein onehot encoding
        score=net(torch.Tensor(batch_morgan).to(dev))
        loss =  torch.nn.functional.binary_cross_entropy_with_logits(score, torch.Tensor(batch_labels)[:,None].to(dev), pos_weight=pos_weight)
        train_loss+=loss.cpu().item()*len(score)
        loss.backward()
        optimizer.step()
    
    
    with torch.no_grad():
        net.eval()
        preds=[]
        labels=[]
        test_loss=0
        test_generator=generate_batches(test_files, batch_size=1024)
        for batch in test_generator:
            batch_morgan, batch_protein, batch_labels=batch
            batch_morgan=np.concatenate((batch_morgan, batch_protein),1)
            score=net(torch.Tensor(batch_morgan).to(dev))
            loss =  torch.nn.functional.binary_cross_entropy_with_logits(score, torch.Tensor(batch_labels)[:,None].to(dev), pos_weight=pos_weight)
            preds.append(torch.nn.functional.sigmoid(score))
            labels.append(batch_labels)
            test_loss+=loss.cpu().item()*len(score)
            
        preds=torch.vstack(preds).cpu().numpy()[:,0]
        labels=np.hstack(labels)
        ap=average_precision_score(labels, preds)
        print('\n Epoch={} Train_Loss={} Test_Loss={} Test AP={}'.format(i+1, train_loss/len(labels), test_loss/len(labels), ap))

...............
 Epoch=1 Train_Loss=1.003138149199302 Test_Loss=0.04882204769590944 Test AP=0.13260512288866608
...............
 Epoch=2 Train_Loss=0.8343921044622662 Test_Loss=0.052541076241587104 Test AP=0.15806395912532023
...............
 Epoch=3 Train_Loss=0.7964697278223664 Test_Loss=0.06023136017497704 Test AP=0.16564010288688769
...............
 Epoch=4 Train_Loss=0.7719787498841771 Test_Loss=0.06954713337385077 Test AP=0.16598595814173905
...............
 Epoch=5 Train_Loss=0.7594206354114563 Test_Loss=0.11244394741869336 Test AP=0.17529272770581467
...............
 Epoch=6 Train_Loss=0.7488237145274264 Test_Loss=0.156620718493725 Test AP=0.1808955108795634
...............
 Epoch=7 Train_Loss=0.7353306274759498 Test_Loss=0.32474393346613667 Test AP=0.17521899748532252
...............
 Epoch=8 Train_Loss=0.7311725073708566 Test_Loss=0.4824021732787721 Test AP=0.18973572540806538
...............
 Epoch=9 Train_Loss=0.7230835853912253 Test_Loss=0.5686870524970539 Test AP=0.196638