In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score

from dnn import DNN

**Load data**

In [2]:
# Each .npy file holds a single pickled mapping keyed by split name; .item()
# unwraps it from the 0-d object array that np.load returns.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading —
# only point these paths at files produced by our own preprocessing step.
loaded_tox21_features_train_val_test = np.load(
    '../../preprocessing/preprocessed_data/tox21_features_train_val_test.npy',
    allow_pickle=True,
).item()

tox21_X_train, tox21_X_val, tox21_X_test = (
    loaded_tox21_features_train_val_test[split]
    for split in ('train', 'validation', 'test')
)

loaded_tox21_labels_train_val_test = np.load(
    '../../preprocessing/preprocessed_data/tox21_labels_train_val_test.npy',
    allow_pickle=True,
).item()

tox21_y_train, tox21_y_val, tox21_y_test = (
    loaded_tox21_labels_train_val_test[split]
    for split in ('train', 'validation', 'test')
)

**Check shapes**

In [3]:
# Sanity check: all three feature matrices should share the same column count.
print('Features Train/Val/Test Shapes:')
[features.shape for features in (tox21_X_train, tox21_X_val, tox21_X_test)]

Features Train/Val/Test Shapes:


[(4698, 2248), (1566, 2248), (1567, 2248)]

In [4]:
# Sanity check: row counts should match the feature matrices split by split.
print('Labels Train/Val/Test Shapes:')
[labels.shape for labels in (tox21_y_train, tox21_y_val, tox21_y_test)]

Labels Train/Val/Test Shapes:


[(4698, 12), (1566, 12), (1567, 12)]

In [5]:
# Replace every -1 label with 0 in each split (np.where returns new arrays, so
# re-running this cell is harmless).
# NOTE(review): if -1 marks a missing measurement rather than an inactive
# compound, this turns unknowns into negatives for both the loss and the AUC —
# confirm against the preprocessing step.
tox21_y_train, tox21_y_val, tox21_y_test = (
    np.where(labels == -1, 0, labels)
    for labels in (tox21_y_train, tox21_y_val, tox21_y_test)
)

**Create dataloaders**

In [6]:
def create_dataloaders(X_train, y_train, X_val, y_val, X_test, y_test, batch_size=512):
    """Wrap the train/validation/test arrays in PyTorch ``DataLoader`` objects.

    Every feature and label array is converted to a ``float32`` tensor and
    paired up in a ``TensorDataset``. Only the training loader shuffles; the
    validation and test loaders iterate in the stored order.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.
        batch_size: Batch size shared by all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    def _as_dataset(features, labels):
        # Features first, then labels, both cast to float32.
        return TensorDataset(
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(labels, dtype=torch.float32),
        )

    # (features, labels, shuffle?) for each split, in return order.
    split_specs = (
        (X_train, y_train, True),
        (X_val, y_val, False),
        (X_test, y_test, False),
    )

    train_loader, val_loader, test_loader = [
        DataLoader(_as_dataset(features, labels), batch_size=batch_size, shuffle=reshuffle)
        for features, labels, reshuffle in split_specs
    ]

    return train_loader, val_loader, test_loader

**Create training loop**

In [7]:
def train_validation_loop(model, dataloader_train, dataloader_val, loss_func, optimizer, num_epochs=10, device='cpu', val_every=1):
    """Train ``model`` and periodically print validation loss and ROC AUC.

    Each epoch runs one pass over ``dataloader_train`` with gradient updates.
    On validation epochs the model is switched to eval mode, run over
    ``dataloader_val`` without gradients, and a one-line summary is printed.
    The model is updated in place and nothing is returned.

    Args:
        model: Network to optimise. It is not moved here, so it must already
            live on ``device``.
        dataloader_train: Loader yielding ``(features, labels)`` training batches.
        dataloader_val: Loader yielding ``(features, labels)`` validation batches.
        loss_func: Callable ``loss_func(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of passes over the training data.
        device: Device every batch is moved to before the forward pass.
        val_every: Validate and print on epochs where ``epoch % val_every == 0``
            (0-based), so the first epoch is always reported. The default of 1
            reports every epoch.

    Raises:
        ValueError: If ``val_every`` is smaller than 1. ``roc_auc_score`` also
            raises ``ValueError`` when a label column of the validation set
            contains a single class.
    """
    if val_every < 1:
        raise ValueError(f'val_every must be a positive integer, got {val_every}')

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        for batch_X, batch_y in dataloader_train:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = loss_func(outputs, batch_y)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the
            # epoch average (assumes loss_func returns a per-batch mean).
            total_train_loss += loss.item() * batch_X.shape[0]

        train_loss_per_epoch = total_train_loss / len(dataloader_train.dataset)

        if epoch % val_every != 0:
            continue

        model.eval()
        total_val_loss = 0

        val_outputs = []
        val_labels = []

        with torch.no_grad():
            for batch_X, batch_y in dataloader_val:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                outputs = model(batch_X)
                loss = loss_func(outputs, batch_y)

                total_val_loss += loss.item() * batch_X.shape[0]

                # Collect on CPU so the metric below can see the whole split.
                val_outputs.append(outputs.cpu())
                val_labels.append(batch_y.cpu())

        val_loss_per_epoch = total_val_loss / len(dataloader_val.dataset)

        val_outputs = torch.cat(val_outputs).numpy()
        val_labels = torch.cat(val_labels).numpy()

        # One AUC per output column. Raw model outputs are passed as scores;
        # AUC depends only on their ranking.
        roc_auc_scores = [
            roc_auc_score(val_labels[:, task], val_outputs[:, task])
            for task in range(val_labels.shape[-1])
        ]

        roc_auc_scores_mean = np.mean(roc_auc_scores)
        roc_auc_scores_std = np.std(roc_auc_scores)

        print(f'Epoch {epoch+1}/{num_epochs}, '
              f'Train Loss: {train_loss_per_epoch:.4f}, '
              f'Validation Loss: {val_loss_per_epoch:.4f}, '
              f'Validation AUC mean: {roc_auc_scores_mean:.4f}, '
              f'Validation AUC std: {roc_auc_scores_std:.4f}')

**Set hyperparameters**

In [8]:
# Network and optimisation hyperparameters, kept in one dict.
params = {'input_size': 2248,
          'hidden_layers': [1024, 512],
          'output_size': 12,
          'learning_rate': 1e-3,
          'activation_function': nn.ReLU,
          'dropout_p': 0.25,
          'batch_size': 512}

# Look every value up by key: unpacking params.values() would silently assign
# the wrong values if an entry were ever added, removed or reordered.
input_size = params['input_size']
hidden_layers = params['hidden_layers']
output_size = params['output_size']
learning_rate = params['learning_rate']
activation_function = params['activation_function']
dropout_p = params['dropout_p']
batch_size = params['batch_size']

# NOTE(review): the training cell further down passes num_epochs=10 explicitly,
# so this value is not what the recorded run used.
num_epochs = 100
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Model training**

In [9]:
# Build the network from the hyperparameters above; DNN comes from the local
# dnn module, which takes the dropout probability as keyword `p`.
model = DNN(input_size, hidden_layers, output_size, activation_function, p=dropout_p)

In [10]:
# Move the parameters to the selected device; as the last expression of the
# cell, this also displays the layer summary.
model.to(device)

DNN(
  (input_layer): Linear(in_features=2248, out_features=1024, bias=True)
  (act1): ReLU()
  (dropout1): Dropout(p=0.25, inplace=False)
  (hidden_layers): Sequential(
    (0): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.25, inplace=False)
    )
  )
  (output_layer): Linear(in_features=512, out_features=12, bias=True)
)

In [11]:
# BCEWithLogitsLoss applies the sigmoid itself, so it is fed the model's raw
# outputs; with 12 output columns each task is scored independently.
loss_func = nn.BCEWithLogitsLoss()
# Adam over all model parameters, using the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
# Pass the configured batch size explicitly so that changing
# params['batch_size'] actually takes effect, instead of relying on the
# function default happening to match.
train_loader, val_loader, test_loader = create_dataloaders(tox21_X_train,
                                                           tox21_y_train,
                                                           tox21_X_val,
                                                           tox21_y_val,
                                                           tox21_X_test,
                                                           tox21_y_test,
                                                           batch_size=batch_size)

In [13]:
# NOTE(review): num_epochs is hard-coded to 10 here although the hyperparameter
# cell sets num_epochs = 100 — confirm which one is intended before re-running.
train_validation_loop(model, train_loader, val_loader, loss_func, optimizer, num_epochs=10, device=device)

Epoch 1/10, Train Loss: 0.3481, Validation Loss: 0.2298, Validation AUC mean: 0.7495, Validation AUC std: 0.0496
Epoch 2/10, Train Loss: 0.2032, Validation Loss: 0.2028, Validation AUC mean: 0.7653, Validation AUC std: 0.0523
Epoch 3/10, Train Loss: 0.1611, Validation Loss: 0.1973, Validation AUC mean: 0.7770, Validation AUC std: 0.0549
Epoch 4/10, Train Loss: 0.1333, Validation Loss: 0.1902, Validation AUC mean: 0.7854, Validation AUC std: 0.0551
Epoch 5/10, Train Loss: 0.1084, Validation Loss: 0.1958, Validation AUC mean: 0.7847, Validation AUC std: 0.0532
Epoch 6/10, Train Loss: 0.0857, Validation Loss: 0.2119, Validation AUC mean: 0.7834, Validation AUC std: 0.0547
Epoch 7/10, Train Loss: 0.0682, Validation Loss: 0.2260, Validation AUC mean: 0.7803, Validation AUC std: 0.0566
Epoch 8/10, Train Loss: 0.0548, Validation Loss: 0.2481, Validation AUC mean: 0.7747, Validation AUC std: 0.0564
Epoch 9/10, Train Loss: 0.0447, Validation Loss: 0.2721, Validation AUC mean: 0.7651, Validation

**Test the model**

In [14]:
def test_model(model, test_loader, loss_func, device='cpu', return_loss=False):
    """Evaluate ``model`` on ``test_loader`` and summarise per-task ROC AUC.

    The model is put in eval mode and run without gradients. One ROC AUC is
    computed per label column from the raw model outputs, and the mean and
    standard deviation across columns are returned.

    Args:
        model: Trained network. It is not moved here, so it must already live
            on ``device``.
        test_loader: Loader yielding ``(features, labels)`` batches.
        loss_func: Callable ``loss_func(outputs, labels)`` returning a scalar tensor.
        device: Device every batch is moved to before the forward pass.
        return_loss: When True, also return the sample-weighted average test
            loss, which was previously computed and thrown away.

    Returns:
        ``(auc_mean, auc_std)`` by default, or ``(auc_mean, auc_std, test_loss)``
        when ``return_loss`` is True.

    Raises:
        ValueError: Raised by ``roc_auc_score`` when a label column contains a
            single class.
    """
    model.eval()
    total_test_loss = 0

    test_outputs = []
    test_labels = []

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            outputs = model(batch_X)
            loss = loss_func(outputs, batch_y)

            # Weight by batch size so a smaller final batch does not skew the
            # average (assumes loss_func returns a per-batch mean).
            total_test_loss += loss.item() * batch_X.shape[0]

            # Collect on CPU so the metric below can see the whole split.
            test_outputs.append(outputs.cpu())
            test_labels.append(batch_y.cpu())

    test_loss = total_test_loss / len(test_loader.dataset)

    test_outputs = torch.cat(test_outputs).numpy()
    test_labels = torch.cat(test_labels).numpy()

    # One AUC per output column; AUC depends only on the ranking of the scores.
    roc_auc_scores = [
        roc_auc_score(test_labels[:, task], test_outputs[:, task])
        for task in range(test_labels.shape[-1])
    ]

    roc_auc_scores_mean = np.mean(roc_auc_scores)
    roc_auc_scores_std = np.std(roc_auc_scores)

    if return_loss:
        return roc_auc_scores_mean, roc_auc_scores_std, test_loss
    return roc_auc_scores_mean, roc_auc_scores_std

In [15]:
# Pass the device explicitly: test_model defaults to 'cpu', which would leave
# the batches on the CPU while the model sits on the GPU whenever CUDA is used.
roc_auc_scores_mean, roc_auc_scores_std = test_model(model, test_loader, loss_func, device=device)

In [18]:
# Report the test-set ROC AUC, averaged over the label columns, with its spread.
print(f'AUC mean: {roc_auc_scores_mean:.4f}, AUC std: {roc_auc_scores_std:.4f}')

AUC mean: 0.7788, AUC std: 0.0500
