In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score
import numpy as np
import scipy.sparse as sp
import numpy as np
import pandas as pd

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

**Load the toxcast dataset**

In [3]:
# Load the object stored in the .npy file; it is indexed by split name below.
# NOTE(review): allow_pickle=True unpickles the file contents, so only load
# files from a trusted source.
loaded_toxcast_features_train_val_test = np.load(
    '../preprocessing/preprocessed_data/toxcast_features_train_val_test.npy',
    allow_pickle=True,
).item()

# Pull the three splits out in one go (train, validation, test).
(loaded_toxcast_ecfp_descr_quantiles_scaled_X_train,
 loaded_toxcast_ecfp_descr_quantiles_scaled_X_val,
 loaded_toxcast_ecfp_descr_quantiles_scaled_X_test) = (
    loaded_toxcast_features_train_val_test[split_name]
    for split_name in ('train', 'validation', 'test')
)

In [4]:
# Stack the three feature splits row-wise, in train / validation / test order.
toxcast_features = np.concatenate(
    [
        loaded_toxcast_ecfp_descr_quantiles_scaled_X_train,
        loaded_toxcast_ecfp_descr_quantiles_scaled_X_val,
        loaded_toxcast_ecfp_descr_quantiles_scaled_X_test,
    ],
    axis=0,
)

In [5]:
# Display the shape of the concatenated feature matrix.
toxcast_features.shape

(8595, 2248)

In [6]:
# Load the object stored in the .npy file; it is indexed by split name below.
# NOTE(review): allow_pickle=True unpickles the file contents, so only load
# files from a trusted source.
loaded_toxcast_labels_train_val_test = np.load(
    '../preprocessing/preprocessed_data/toxcast_labels_train_val_test.npy',
    allow_pickle=True,
).item()

# Pull the three label splits out in one go (train, validation, test).
(loaded_toxcast_y_train,
 loaded_toxcast_y_val,
 loaded_toxcast_y_test) = (
    loaded_toxcast_labels_train_val_test[split_name]
    for split_name in ('train', 'validation', 'test')
)

In [7]:
# Stack the three label splits row-wise, in the same order as the features.
toxcast_labels = np.concatenate(
    [
        loaded_toxcast_y_train,
        loaded_toxcast_y_val,
        loaded_toxcast_y_test,
    ],
    axis=0,
)

In [8]:
# Display the shape of the concatenated label matrix.
toxcast_labels.shape

(8595, 617)

**Split the tasks (label columns) into validation and test task sets, then draw 5 active and 5 inactive training samples for each task (the remaining samples are held out)**

In [9]:
# Label columns 0-307 become the validation tasks, columns 308-616 the test tasks.
toxcast_labels_val, toxcast_labels_test = (
    toxcast_labels[:, :308],
    toxcast_labels[:, 308:617],
)

In [10]:
# Display the shapes of the validation-task and test-task label matrices.
toxcast_labels_val.shape, toxcast_labels_test.shape

((8595, 308), (8595, 309))

In [11]:
def split_indices_for_task(task_labels, n_per_class=5, seed=42):
    """Pick a small, class-balanced set of samples for one task.

    Parameters
    ----------
    task_labels : array-like, 1-D
        Labels of a single task. Entries equal to 1 are treated as active and
        entries equal to 0 as inactive; any other value is never sampled.
    n_per_class : int, default 5
        Number of active and number of inactive samples to draw.
    seed : int, default 42
        Seed of the private random generator used for the draw.

    Returns
    -------
    selected_active : np.ndarray
        Indices of the sampled active entries.
    selected_inactive : np.ndarray
        Indices of the sampled inactive entries.
    rest_indices : np.ndarray
        Sorted indices of every entry that was not sampled, regardless of its
        label value.

    Raises
    ------
    ValueError
        If the task has fewer than ``n_per_class`` active or inactive entries.
    """
    task_labels = np.asarray(task_labels)
    active_indices = np.where(task_labels == 1)[0]
    inactive_indices = np.where(task_labels == 0)[0]

    for class_name, pool in (("active", active_indices), ("inactive", inactive_indices)):
        if pool.size < n_per_class:
            raise ValueError(
                f"Task has only {pool.size} {class_name} samples; "
                f"{n_per_class} are required."
            )

    # A private RandomState yields exactly the draws that
    # np.random.seed(seed) + np.random.choice would, but leaves the global
    # NumPy random state untouched. The generator is re-created on every call,
    # so two tasks with identical label vectors receive identical selections.
    rng = np.random.RandomState(seed)
    selected_active = rng.choice(active_indices, size=n_per_class, replace=False)
    selected_inactive = rng.choice(inactive_indices, size=n_per_class, replace=False)

    rest_indices = np.setdiff1d(
        np.arange(task_labels.shape[0]),
        np.concatenate([selected_active, selected_inactive]),
    )

    return selected_active, selected_inactive, rest_indices

def process_indices_for_all_tasks(toxcast_labels_val):
    """Create the sampled/rest index split for every task (label column).

    Returns a list with one dict per column of ``toxcast_labels_val``; each
    dict holds the column number under "task" and the three index arrays
    returned by ``split_indices_for_task`` for that column.
    """
    def _split_record(task_id):
        # Split one label column and wrap the result together with its task id.
        chosen_active, chosen_inactive, remaining = split_indices_for_task(
            toxcast_labels_val[:, task_id]
        )
        return {
            "task": task_id,
            "indices_active": chosen_active,
            "indices_inactive": chosen_inactive,
            "indices_rest": remaining,
        }

    n_tasks = toxcast_labels_val.shape[1]
    return [_split_record(task_id) for task_id in range(n_tasks)]

In [12]:
# Build the per-task index splits for the validation tasks and for the test tasks.
val_tasks_indices, test_tasks_indices = map(
    process_indices_for_all_tasks,
    (toxcast_labels_val, toxcast_labels_test),
)

In [13]:
# Inspect the index split produced for the first validation task.
val_tasks_indices[0]

{'task': 0,
 'indices_active': array([6522, 2862, 1497, 7059, 1526], dtype=int64),
 'indices_inactive': array([6568, 1766, 2715, 6455, 3518], dtype=int64),
 'indices_rest': array([   0,    1,    2, ..., 8592, 8593, 8594])}

In [14]:
# Inspect the index split produced for the first test task.
test_tasks_indices[0]

{'task': 0,
 'indices_active': array([6210, 4636, 6840, 1168, 3190], dtype=int64),
 'indices_inactive': array([7281, 1869, 6455, 3755, 1701], dtype=int64),
 'indices_rest': array([   0,    1,    2, ..., 8592, 8593, 8594])}

**Define the DNN architecture and load the pretrained weights**

In [15]:
def create_layer(in_size, out_size, activation_function=nn.ReLU, p=0.25):
    """Build one hidden block: AlphaDropout -> Linear -> activation.

    ``activation_function`` is a module class (not an instance); it is
    instantiated without arguments. ``p`` is the dropout probability.
    """
    stages = [
        nn.AlphaDropout(p=p),
        nn.Linear(in_size, out_size),
        activation_function(),
    ]
    return nn.Sequential(*stages)


class DNN(nn.Module):
    """Fully connected network: input layer, hidden blocks, linear output layer.

    Parameters
    ----------
    input_layer : int
        Number of input features.
    hidden_layers : sequence of int
        Widths of the hidden layers; the first entry is the output width of
        the input layer, and each consecutive pair defines one hidden block
        built by ``create_layer``.
    output_layer : int
        Number of output units (no activation is applied to them).
    activation_function : nn.Module class
        Activation class, instantiated without arguments.
    p : float
        AlphaDropout probability used throughout the network.
    """

    def __init__(self, input_layer, hidden_layers, output_layer, activation_function=nn.ReLU, p=0.25):
        super().__init__()

        first_width, last_width = hidden_layers[0], hidden_layers[-1]

        self.input_layer = nn.Linear(input_layer, first_width)
        self.act1 = activation_function()
        self.dropout1 = nn.AlphaDropout(p=p)

        # One dropout/linear/activation block per consecutive pair of widths.
        blocks = []
        for width_in, width_out in zip(hidden_layers[:-1], hidden_layers[1:]):
            blocks.append(create_layer(width_in, width_out, activation_function, p))
        self.hidden_layers = nn.Sequential(*blocks)

        self.output_layer = nn.Linear(last_width, output_layer)

    def forward(self, x):
        """Return the raw outputs of the final linear layer for input ``x``."""
        # dropout1 is applied to the raw input, before the first linear layer.
        hidden = self.act1(self.input_layer(self.dropout1(x)))
        return self.output_layer(self.hidden_layers(hidden))

# Architecture settings used to rebuild the network before loading its weights.
params = {
    'input_size': 2248,
    'hidden_layers': [1024, 128],
    'output_size': 12,
    'activation_function': nn.SELU,
    'dropout_p': 0.45,
}

# Read every setting by key so the assignment does not depend on dict order.
input_size = params['input_size']
hidden_layers = params['hidden_layers']
output_size = params['output_size']
activation_function = params['activation_function']
dropout_p = params['dropout_p']

model = DNN(input_size, hidden_layers, output_size, activation_function, p=dropout_p)
# NOTE(review): torch.load unpickles the checkpoint file; only load files from a trusted source.
model.load_state_dict(torch.load('../pretraining/training/dnn_best_model.pth', map_location=device))

<All keys matched successfully>

In [16]:
# Move the network to the selected device; the returned module is displayed below.
model.to(device)

DNN(
  (input_layer): Linear(in_features=2248, out_features=1024, bias=True)
  (act1): SELU()
  (dropout1): AlphaDropout(p=0.45, inplace=False)
  (hidden_layers): Sequential(
    (0): Sequential(
      (0): AlphaDropout(p=0.45, inplace=False)
      (1): Linear(in_features=1024, out_features=128, bias=True)
      (2): SELU()
    )
  )
  (output_layer): Linear(in_features=128, out_features=12, bias=True)
)

**Train & validation loop**

In [17]:
def extract_features(model, features):
    """Return the activations of the model's last hidden layer as a NumPy array.

    The input is passed through ``model.input_layer``, ``model.act1`` and
    ``model.hidden_layers``; ``model.dropout1`` and ``model.output_layer`` are
    not applied. The model is switched to eval mode and left in eval mode.

    Parameters
    ----------
    model : nn.Module
        Network exposing ``input_layer``, ``act1`` and ``hidden_layers``.
    features : array-like
        Input rows; converted to a float32 tensor.

    Returns
    -------
    np.ndarray
        Hidden representation, one row per input row.
    """
    model.eval()

    # Run on whatever device the model's parameters live on instead of relying
    # on a module-level `device` variable that may not match the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        inputs = torch.tensor(features, dtype=torch.float32).to(model_device)
        last_hl_output = model.act1(model.input_layer(inputs))
        last_hl_output = model.hidden_layers(last_hl_output)
        last_hl_output = last_hl_output.cpu().numpy()
    return last_hl_output

def cosine_similarity_matrix(features):
    """Return the pairwise cosine similarity between the rows of ``features``.

    Rows with zero norm would otherwise divide by zero and fill their row and
    column with NaN; they are left unscaled instead, so their similarity to
    every row (including themselves) is 0.

    Parameters
    ----------
    features : array-like, shape (n_rows, n_features)

    Returns
    -------
    np.ndarray, shape (n_rows, n_rows)
    """
    features = np.asarray(features)
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    # Replace zero norms by 1 so all-zero rows stay all-zero after scaling;
    # rows with a positive norm are scaled exactly as before.
    safe_norms = np.where(norms > 0, norms, np.ones_like(norms))
    norm_features = features / safe_norms
    return np.dot(norm_features, norm_features.T)

def build_graph(features, k=10):
    """Build a symmetrically normalised k-nearest-neighbour similarity graph.

    For every row the ``k`` most cosine-similar other rows are kept as
    neighbours, weighted by their cosine similarity. The resulting (generally
    non-symmetric) adjacency matrix ``A`` is scaled to
    ``D^-1/2 A D^-1/2``, where ``D`` holds the row sums of ``A``.

    Parameters
    ----------
    features : array-like, shape (n_rows, n_features)
    k : int, default 10
        Number of neighbours kept per row.

    Returns
    -------
    np.ndarray, shape (n_rows, n_rows)
        The normalised adjacency matrix.
    """
    cosine_sim = cosine_similarity_matrix(features)
    adjacency_matrix = np.zeros_like(cosine_sim)

    for i in range(cosine_sim.shape[0]):
        ranked = np.argsort(-cosine_sim[i, :])
        # Drop the node itself *before* truncating: with ties or duplicated
        # rows the node is not guaranteed to be among its own top k + 1, and
        # truncating first would then keep k + 1 neighbours instead of k.
        top_k_indices = ranked[ranked != i][:k]
        adjacency_matrix[i, top_k_indices] = cosine_sim[i, top_k_indices]

    # NOTE(review): if the retained similarities of a row sum to a negative
    # value, the square root below yields NaN for that row and column.
    degree = adjacency_matrix.sum(axis=1)
    degree_inv_sqrt = 1 / np.sqrt(degree + 1e-7)

    # Scaling row i by d_i and column j by d_j equals D^-1/2 A D^-1/2 but
    # avoids building dense diagonal matrices and two n x n matrix products.
    normalized_adjacency = degree_inv_sqrt[:, None] * adjacency_matrix * degree_inv_sqrt[None, :]

    return normalized_adjacency

def feature_propagation(features, laplacian_matrix, alpha=0.5, k=3):
    """Diffuse features over a graph: ``(alpha * I + laplacian_matrix)^k @ features``.

    Parameters
    ----------
    features : np.ndarray, shape (n_rows, n_features)
    laplacian_matrix : np.ndarray, shape (n_rows, n_rows)
        Graph operator added to ``alpha * I``. Elsewhere in this notebook the
        normalised adjacency matrix returned by ``build_graph`` is passed here.
    alpha : float, default 0.5
        Weight of the identity (self) term.
    k : int, default 3
        Number of propagation steps (the matrix power).

    Returns
    -------
    np.ndarray, shape (n_rows, n_features)
    """
    identity_matrix = np.eye(features.shape[0])
    step_matrix = alpha * identity_matrix + laplacian_matrix

    if k <= 0:
        # Zero or negative powers keep the original matrix_power semantics
        # (identity for 0, powers of the inverse for negative k).
        return np.dot(np.linalg.matrix_power(step_matrix, k), features)

    # Applying the operator k times to the features gives the same product as
    # forming the k-th matrix power first, without any n x n matrix products.
    propagated = features
    for _ in range(k):
        propagated = np.dot(step_matrix, propagated)
    return propagated

class LRNN(nn.Module):
    """Logistic-regression head: a single linear unit followed by a sigmoid."""

    def __init__(self, input_size):
        super().__init__()
        self.output_layer = nn.Linear(in_features=input_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one value in (0, 1) per input row."""
        return self.sigmoid(self.output_layer(x))

def train_validate_task(
    dnn_model, 
    lr_model, 
    task_indices, 
    toxcast_features, 
    toxcast_labels_val, 
    optimizer, 
    criterion, 
    k=10, 
    alpha=0.5, 
    diffusion_k=3):
    """Run one optimisation step for a task's classifier and evaluate it.

    The sampled active/inactive rows of the task form the training set; the
    rows listed under 'indices_rest' whose label is not -1 form the validation
    set. Both sets are embedded with ``extract_features``, a graph is built
    over the joint embeddings with ``build_graph`` and the embeddings are
    smoothed with ``feature_propagation`` before being fed to ``lr_model``.

    Parameters
    ----------
    dnn_model : network passed to ``extract_features``.
    lr_model : per-task classifier that is trained and evaluated.
    task_indices : dict with keys 'task', 'indices_active',
        'indices_inactive' and 'indices_rest'.
    toxcast_features : feature matrix indexed by row.
    toxcast_labels_val : label matrix indexed by (row, task).
    optimizer, criterion : optimiser and loss used for ``lr_model``.
    k : number of neighbours passed to ``build_graph``.
    alpha, diffusion_k : passed to ``feature_propagation`` as alpha and k.

    Returns
    -------
    (train_loss, val_loss, roc_auc) : tuple of float
        ``roc_auc`` is NaN when the validation labels hold fewer than two
        distinct values.
    """
    task_id = task_indices['task']

    train_idx = np.concatenate([task_indices['indices_active'], task_indices['indices_inactive']])
    held_out_idx = task_indices['indices_rest']

    train_y = toxcast_labels_val[train_idx, task_id]
    held_out_y = toxcast_labels_val[held_out_idx, task_id]
    # Held-out rows labelled -1 are excluded from validation.
    has_label = held_out_y != -1
    held_out_y = held_out_y[has_label]

    train_emb = extract_features(dnn_model, toxcast_features[train_idx])
    held_out_emb = extract_features(dnn_model, toxcast_features[held_out_idx][has_label])

    # The graph spans training and validation rows together; training rows come first.
    n_train = len(train_emb)
    joint_emb = np.concatenate([train_emb, held_out_emb], axis=0)
    graph = build_graph(joint_emb, k=k)
    joint_smoothed = feature_propagation(joint_emb, graph, alpha=alpha, k=diffusion_k)

    def _float_tensor(array):
        # float32 copy of a NumPy array as a torch tensor
        return torch.tensor(array, dtype=torch.float32)

    train_x = _float_tensor(joint_smoothed[:n_train]).to(device)
    train_y = _float_tensor(train_y).unsqueeze(1).to(device)
    held_out_x = _float_tensor(joint_smoothed[n_train:]).to(device)
    held_out_y = _float_tensor(held_out_y).unsqueeze(1).to(device)

    # A single gradient step on the sampled training rows.
    lr_model.train()
    optimizer.zero_grad()
    loss = criterion(lr_model(train_x), train_y)
    loss.backward()
    optimizer.step()
    train_loss = loss.item()

    lr_model.eval()
    with torch.no_grad():
        held_out_pred = lr_model(held_out_x)
        val_loss = criterion(held_out_pred, held_out_y).item()
        held_out_pred = held_out_pred.cpu().numpy()
        held_out_y = held_out_y.cpu().numpy()

    # ROC AUC needs both classes present; an empty label array also ends up here as NaN.
    if np.unique(held_out_y).size > 1:
        roc_auc = roc_auc_score(held_out_y, held_out_pred)
    else:
        roc_auc = float('nan')

    return train_loss, val_loss, roc_auc

def _snapshot_state_dict(module):
    """Return the module's state dict with every tensor cloned (detached from the live parameters)."""
    snapshot = module.state_dict()
    for name in list(snapshot.keys()):
        snapshot[name] = snapshot[name].detach().clone()
    return snapshot


def train_all_tasks(dnn_model, toxcast_features, toxcast_labels_val, val_tasks_indices, num_epochs=10, learning_rate=0.001, patience=3, k=10, alpha=0.5, diffusion_k=3, save_path='best_models.pth'):
    """Train one logistic-regression head per task with early stopping on mean ROC AUC.

    Every epoch calls ``train_validate_task`` once per task (one optimisation
    step each). When the mean ROC AUC over tasks (NaNs ignored) improves, a
    copy of all task models' weights is stored. Training stops once the mean
    ROC AUC has not improved for ``patience`` consecutive epochs. The stored
    weights are written to ``save_path`` at the end.

    Parameters
    ----------
    dnn_model : network passed to ``train_validate_task`` for feature extraction.
    toxcast_features, toxcast_labels_val : feature and label matrices.
    val_tasks_indices : list of per-task index dicts (each has a 'task' key).
    num_epochs : maximum number of epochs.
    learning_rate : Adam learning rate of every task model.
    patience : number of non-improving epochs tolerated before stopping.
    k, alpha, diffusion_k : forwarded to ``train_validate_task``.
    save_path : file the best weights are saved to.

    Returns
    -------
    (train_losses, val_losses, mean_roc_aucs, best_epoch)
        Per-epoch summed training loss, summed validation loss and mean ROC
        AUC, plus the zero-based index of the best epoch (-1 if none).
    """
    # NOTE(review): train_validate_task re-extracts features and rebuilds the
    # graph on every call, i.e. once per task per epoch, although neither
    # depends on the task model being trained.
    all_train_losses_per_epoch = []
    all_val_losses_per_epoch = []
    mean_roc_aucs_per_epoch = []

    best_mean_roc_auc = float('-inf')
    best_epoch = -1
    no_improvement_counter = 0

    best_models = {}

    models = {}
    optimizers = {}
    criterions = {}

    # The task heads consume the representation that feeds the network's
    # output layer, so take their input width from the model itself; fall back
    # to the notebook-level `hidden_layers` when the model has no such layer.
    dnn_head = getattr(dnn_model, 'output_layer', None)
    input_size = dnn_head.in_features if dnn_head is not None else hidden_layers[-1]

    for task_indices in val_tasks_indices:
        task_id = task_indices['task']
        task_model = LRNN(input_size)
        task_model.to(device)

        models[task_id] = task_model
        optimizers[task_id] = optim.Adam(task_model.parameters(), lr=learning_rate)
        criterions[task_id] = nn.BCELoss()

    for epoch in range(num_epochs):
        epoch_roc_aucs = []
        total_train_loss = 0
        total_val_loss = 0

        for task_indices in val_tasks_indices:
            task_id = task_indices['task']

            train_loss, val_loss, roc_auc = train_validate_task(
                dnn_model, models[task_id], task_indices, toxcast_features, toxcast_labels_val,
                optimizers[task_id], criterions[task_id], k=k, alpha=alpha, diffusion_k=diffusion_k
            )

            epoch_roc_aucs.append(roc_auc)
            total_train_loss += train_loss
            total_val_loss += val_loss

        mean_roc_auc = np.nanmean(epoch_roc_aucs)

        all_train_losses_per_epoch.append(total_train_loss)
        all_val_losses_per_epoch.append(total_val_loss)
        mean_roc_aucs_per_epoch.append(mean_roc_auc)

        if mean_roc_auc > best_mean_roc_auc:
            best_mean_roc_auc = mean_roc_auc
            best_epoch = epoch
            no_improvement_counter = 0

            # state_dict() hands back tensors that share storage with the live
            # parameters; clone them, otherwise later optimiser steps would
            # silently overwrite the "best" weights.
            for task_indices in val_tasks_indices:
                best_models[task_indices["task"]] = _snapshot_state_dict(models[task_indices['task']])
        else:
            no_improvement_counter += 1

        if no_improvement_counter >= patience:
            print(f"Early stopping after {epoch + 1} epochs. Best AUC: {best_mean_roc_auc:.4f} at epoch {best_epoch + 1}")
            break

        print(f'Epoch {epoch + 1}/{num_epochs} - Mean ROC AUC: {mean_roc_auc:.4f}, '
              f'Total Train Loss: {total_train_loss:.4f}, ' 
              f'Total Validation Loss: {total_val_loss:.4f}, '
              f'Best AUC: {best_mean_roc_auc}')

    torch.save(best_models, save_path)

    return all_train_losses_per_epoch, all_val_losses_per_epoch, mean_roc_aucs_per_epoch, best_epoch

In [18]:
# Train the per-task heads on the validation tasks; every argument is passed
# by name so the call reads independently of the parameter order.
train_losses_per_epoch, val_losses_per_epoch, mean_roc_aucs_per_epoch, best_epoch = train_all_tasks(
    dnn_model=model,
    toxcast_features=toxcast_features,
    toxcast_labels_val=toxcast_labels_val,
    val_tasks_indices=val_tasks_indices,
    num_epochs=10,
    learning_rate=1e-3,
    patience=5,
    k=5,
    alpha=0.5,
    diffusion_k=1,
)

Epoch 1/10 - Mean ROC AUC: 0.5017, Total Train Loss: 374.7683, Total Validation Loss: 367.7821, Best AUC: 0.5016561024413451
Epoch 2/10 - Mean ROC AUC: 0.5100, Total Train Loss: 345.6468, Total Validation Loss: 345.7915, Best AUC: 0.5099528154041377
Epoch 3/10 - Mean ROC AUC: 0.5182, Total Train Loss: 319.0021, Total Validation Loss: 326.4700, Best AUC: 0.5181822341239953
Epoch 4/10 - Mean ROC AUC: 0.5264, Total Train Loss: 294.8560, Total Validation Loss: 309.7935, Best AUC: 0.5264124974426249
Epoch 5/10 - Mean ROC AUC: 0.5344, Total Train Loss: 273.1644, Total Validation Loss: 295.6478, Best AUC: 0.5344176505220728
Epoch 6/10 - Mean ROC AUC: 0.5424, Total Train Loss: 253.8270, Total Validation Loss: 283.8645, Best AUC: 0.5424352702662786
Epoch 7/10 - Mean ROC AUC: 0.5504, Total Train Loss: 236.6990, Total Validation Loss: 274.2456, Best AUC: 0.5503604334733173
Epoch 8/10 - Mean ROC AUC: 0.5579, Total Train Loss: 221.6016, Total Validation Loss: 266.5712, Best AUC: 0.5579036262909991
