## Training a Neural Network Model

- using `pytorch`

https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

In [1]:
import sys
import os

# Add the path to the 'code' directory
sys.path.append(os.path.abspath('../code'))

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms, models
from torchvision.utils import make_grid
import pandas as pd
import numpy as np

# include feature engineering pipeline
from feature_eng_pipeline import pipeline_nn

In [3]:
# switch to using CUDA - GPU
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [4]:
torch.cuda.is_available()

True

In [5]:
# loading data
data_path = '../../data/mean_with_labels.csv'

class RNANanoporeDataset(Dataset):
    """Dataset used to train and test RNA Nanopore data"""

    def __init__(self, csv_file):
        """Initializes instance of class RNANanoporeDataset.

        Args:
            csv_file (str): Path to the csv file with the nanopore data
        """
        # Load the CSV file
        self.df = pd.read_csv(csv_file)

        # Call pipeline to preprocess the data
        v, s, X_df, y_df = pipeline_nn(self.df)

        # Save target and predictors
        self.X = X_df.drop(["transcript_name", "gene_id", "nucleotide_seq"], axis=1).reset_index(drop=True)  # Reset index of the data after processing
        self.y = y_df.reset_index(drop=True).squeeze()  # Ensure y is 1D and indices are reset

    def __len__(self):
        """Returns the size of the dataset"""
        return len(self.X)

    def __getitem__(self, idx):
    # Handle if idx is a tensor (converting to list if needed)
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        # Extract the feature row (as numpy) and label
        signal_features = self.X.iloc[idx].values  # Assuming signal features are part of X
        label = self.y.iloc[idx]  # Use `.iloc` to make sure it's scalar, not Series

        # Convert to tensors
        signal_features = torch.tensor(signal_features, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.float32)

        return signal_features, label



In [6]:
# preparing data for training using DataLoaders
from torch.utils.data.dataset import random_split

dataset = RNANanoporeDataset(data_path)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
trainset, testset = random_split(dataset, [train_size, test_size])

# Dataloaders
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=False)

In [13]:
class ModNet(nn.Module):
    def __init__(self, signal_input_dim):
        super(ModNet, self).__init__()

        # Read-level Encoder: MLP with two hidden layers
        self.encoder = nn.Sequential(
            nn.Linear(signal_input_dim, 150),  # Change hidden dimensions as needed
            nn.ReLU(),
            nn.Linear(150, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Single output for binary classification
        )

    def forward(self, signal_features):
        # Pass through the encoder
        read_level_probs = self.encoder(signal_features)
        return torch.sigmoid(read_level_probs)  # Apply sigmoid for probabilities


    def noisy_or_pooling(self, read_level_probs):
        """
        :param read_level_probs: Tensor of shape (batch_size, 1)
        :return: Site-level modification probability for each site (batch_size, 1)
        """
        site_level_probs = 1 - torch.prod(1 - read_level_probs, dim=1)
        return site_level_probs


In [14]:
# Set CUDA launch blocking for better error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [15]:
# train the model
import torch.optim as optim
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Assume that ModNet is already defined
model = ModNet(signal_input_dim=50) 
               #trigram_vocab_size=64,
               #embedding_dim=20)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
def train_model(model, trainloader, criterion, optimizer, num_epochs=10):
    model.train()  # Set model to training mode
    for epoch in range(num_epochs):
        running_loss = 0.0
        
        # Training loop
        for i, data in enumerate(trainloader, 0):
            signal_features, labels = data
            signal_features = signal_features.to(device)
            labels = labels.to(device).float()  # Ensure labels are float for BCELoss

            print("Signal features shape:", signal_features.shape)  # Expected: [batch_size, 9]
            print("Labels shape:", labels.shape)                    # Expected: [batch_size]

            # Forward pass
            try:
                read_level_probs = model(signal_features)  # Output shape: [batch_size, 1]
                print("Read level probabilities shape:", read_level_probs.shape)  # Expected: [batch_size, 1]
                # Compute loss
                # For BCELoss, ensure labels are float
                loss = criterion(read_level_probs, labels.view(-1, 1).float())  # Reshape labels to [batch_size, 1]
                print("Loss:", loss.item())

                # Zero gradients, backward pass, and optimization step
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            except RuntimeError as e:
                print("RuntimeError during training:", e)
                break

            # Print statistics
            running_loss += loss.item()
            if i % 10 == 9:  # Print every 10 batches
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(trainloader)}], Loss: {running_loss / 10:.4f}')
                running_loss = 0.0

    print('Finished Training')

# Train and evaluate the model
for param in model.parameters():
    print(param.shape)
    
model.to(device)
train_model(model, trainloader, criterion, optimizer, num_epochs=10)

torch.Size([150, 50])
torch.Size([150])
torch.Size([32, 150])
torch.Size([32])
torch.Size([1, 32])
torch.Size([1])
Signal features shape: torch.Size([64, 50])
Labels shape: torch.Size([64])
Read level probabilities shape: torch.Size([64, 1])
RuntimeError during training: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [12]:
# Function to evaluate on the test set
def evaluate_model(model, testloader, criterion):
    model.eval()  # Set model to evaluation mode
    total_loss = 0.0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for data in testloader:
            signal_features, labels = data
            
            # Move data to device
            signal_features = signal_features.to(device)
            labels = labels.to(device)
            
            # Forward pass
            read_level_probs = model(signal_features)
            site_level_probs = model.noisy_or_pooling(read_level_probs).squeeze()  # Shape: (batch_size,)

            # Compute loss
            loss = criterion(site_level_probs, labels.float())
            total_loss += loss.item()

            # Collect predictions and labels for ROC and PR AUC
            all_labels.append(labels.cpu())
            all_predictions.append(site_level_probs.cpu())

    # Convert lists to tensors
    all_labels = torch.cat(all_labels)
    all_predictions = torch.cat(all_predictions)

    # Compute ROC-AUC
    roc_auc = roc_auc_score(all_labels, all_predictions)

    # Compute PR-AUC
    precision, recall, _ = precision_recall_curve(all_labels, all_predictions)
    pr_auc = auc(recall, precision)

    # Average loss
    avg_loss = total_loss / len(testloader)
    
    print(f'Test Loss: {avg_loss:.4f}, ROC-AUC: {roc_auc:.4f}, PR-AUC: {pr_auc:.4f}')
    
    return avg_loss, roc_auc, pr_auc

evaluate_model(model, testloader, criterion)

ValueError: Input contains NaN.