In [1]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Environment setup: OpenMP override, import path, random seeds and compute device.

# Intel OpenMP override, commonly used to suppress the "duplicate libiomp" abort.
# NOTE(review): torch/numpy are already imported when this line runs; confirm the
# override still takes effect here, or move it above the import cell.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# The project root is taken to be the parent of the current working directory.
# This depends on the cwd at execution time: a later cell calls
# os.chdir(project_root), so re-running this cell after that one would resolve
# one directory too high. Run it once, from a fresh kernel.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Seed the numpy and torch generators so that weight initialisation, dropout and
# any shuffling driven by them are repeatable across "Restart & Run All".
# The value matches the random_seed passed to the synthetic data generator.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [3]:
from src.config.loader import Config
from src.utils.data_loader import get_data_loaders
from src.models.mlp import MLP

# Load the YAML configuration that lives under the project root.
config_path = os.path.join(project_root, 'src/config/config.yaml')
config = Config(config_path)

# Echo the settings that drive the rest of the notebook, one "label: value" line each.
summary_fields = (
    ("Model", config.model['type']),
    ("Epochs", config.training['epochs']),
    ("Learning Rate", config.training['learning_rate']),
    ("Batch Size", config.training['batch_size']),
)
for field_label, field_value in summary_fields:
    print(f"{field_label}: {field_value}")

Model: mlp
Epochs: 50
Learning Rate: 0.001
Batch Size: 32


In [5]:
# Switch the working directory to the project root so the relative data path
# below is resolved from there. Note this changes the cwd for every later cell.
os.chdir(project_root)

from src.utils.data_generator import generate_synthetic_mutation_data

data_path = 'data/processed/feature_matrix.csv'

# Keyword arguments for the generator, gathered in one place for readability.
generation_options = {
    'n_samples': 2000,
    'output_path': data_path,
    'include_labels': True,
    'random_seed': 42,
}

# Kept as the final expression so the notebook displays whatever the call returns.
generate_synthetic_mutation_data(**generation_options)

✓ Synthetic data saved to: data/processed/feature_matrix.csv


Unnamed: 0,AF,CADD_PHRED,conservation_score,variant_type,impact_category,pathogenicity_label
0,0.022089,10.407791,0.678450,SNV,MODERATE,0
1,0.002761,25.850879,0.441873,SNV,LOW,0
2,0.091643,1.967608,0.189863,SNV,MODIFIER,0
3,0.005631,14.762573,0.318113,SNV,MODIFIER,0
4,0.062413,3.986900,0.752561,indel,LOW,1
...,...,...,...,...,...,...
1995,0.003397,3.608055,0.479496,SNV,LOW,1
1996,0.135533,4.010681,0.900242,SNV,MODERATE,0
1997,0.067081,1.247257,0.340254,SNV,MODERATE,0
1998,0.775286,14.544056,0.126218,indel,LOW,1


In [6]:
# Build the three data loaders and pick up the feature dimension for the model.
train_loader, val_loader, test_loader, input_dim = get_data_loaders(config)

print(f"Input Features: {input_dim}")

# Report the number of samples behind each split.
named_loaders = (
    ("Train", train_loader),
    ("Validation", val_loader),
    ("Test", test_loader),
)
for split_name, split_loader in named_loaders:
    print(f"{split_name} Samples: {len(split_loader.dataset)}")

Input Features: 9
Train Samples: 1400
Validation Samples: 200
Test Samples: 400


In [None]:
# Instantiate the MLP from the "mlp" section of the model config and move it
# to the selected device.
mlp_cfg = config.model['mlp']
model = MLP(
    input_dim=input_dim,
    hidden_layers=mlp_cfg['hidden_layers'],
    dropout=mlp_cfg['dropout'],
)
model = model.to(device)

# Binary classification on raw logits: the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=config.training['learning_rate'])

# Total number of scalar parameters across all tensors of the model.
n_parameters = 0
for param in model.parameters():
    n_parameters += param.numel()

print(f"Model Parameters: {n_parameters:,}")
print(model)

TypeError: __init__() got an unexpected keyword argument 'hidden_dims'

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Network producing one logit per sample.
        loader: Iterable of ``(features, labels)`` batches.
        criterion: Loss called as ``criterion(logits, targets)``; it is expected
            to return the batch *mean* (the default for ``nn.BCEWithLogitsLoss``).
        device: Device the batches are moved to before the forward pass.
        optimizer: When given, the pass is a training pass (train mode, gradients
            enabled, one optimizer step per batch). When ``None`` the pass is
            evaluation only (eval mode, gradients disabled).
        desc: Optional progress-bar description; no bar is shown when ``None``.

    Returns:
        Tuple of the per-sample mean loss and the accuracy at a 0.5 threshold.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    is_training = optimizer is not None
    # train(False) is equivalent to eval(): it switches dropout etc. off.
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    batches = tqdm(loader, desc=desc) if desc is not None else loader

    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in batches:
            X_batch = X_batch.to(device)
            # Labels are reshaped to a column so they line up with the logits.
            y_batch = y_batch.to(device).float().unsqueeze(1)

            if is_training:
                optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            if is_training:
                loss.backward()
                optimizer.step()

            batch_size = y_batch.size(0)
            batch_loss = loss.item()
            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one in the epoch average.
            loss_sum += batch_loss * batch_size
            preds = (torch.sigmoid(outputs) > 0.5).float()
            n_correct += (preds == y_batch).sum().item()
            n_seen += batch_size

            if desc is not None:
                batches.set_postfix({'loss': f'{batch_loss:.4f}'})

    return loss_sum / n_seen, n_correct / n_seen


# Per-epoch history, consumed by the plotting and checkpoint cells.
train_losses = []
val_losses = []
train_accs = []
val_accs = []

epochs = config.training['epochs']

for epoch in range(epochs):
    # Training pass: updates the weights and shows a progress bar.
    avg_train_loss, train_acc = _run_epoch(
        model, train_loader, criterion, device,
        optimizer=optimizer,
        desc=f"Epoch {epoch+1}/{epochs} [Train]",
    )
    # Validation pass: no optimizer, so no gradients and no weight updates.
    avg_val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}")

print("\nTraining Complete!")

In [None]:
# Side-by-side learning curves: loss on the left, accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

epoch_axis = range(1, epochs + 1)
panels = (
    (axes[0], 'Loss', train_losses, val_losses),
    (axes[1], 'Accuracy', train_accs, val_accs),
)

# Both panels share the same layout; only the plotted quantity differs.
for panel_ax, quantity, train_series, val_series in panels:
    panel_ax.plot(epoch_axis, train_series, label=f'Train {quantity}', marker='o')
    panel_ax.plot(epoch_axis, val_series, label=f'Val {quantity}', marker='s')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(quantity)
    panel_ax.set_title(f'Training and Validation {quantity}')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Score the held-out test split: collect probabilities, hard predictions and labels.
model.eval()

prob_buffer, pred_buffer, label_buffer = [], [], []

with torch.no_grad():
    for features, targets in test_loader:
        logits = model(features.to(device))
        batch_probs = torch.sigmoid(logits).cpu().numpy()
        # Threshold at 0.5 to turn probabilities into 0/1 predictions.
        batch_preds = (batch_probs > 0.5).astype(int)

        prob_buffer.extend(batch_probs.ravel())
        pred_buffer.extend(batch_preds.ravel())
        label_buffer.extend(targets.numpy())

all_preds = np.array(pred_buffer)
all_probs = np.array(prob_buffer)
all_labels = np.array(label_buffer)

# Fraction of test samples whose thresholded prediction equals the label.
is_correct = all_preds == all_labels
test_acc = is_correct.mean()
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Threshold-free metrics computed from the predicted probabilities.
fpr, tpr, _ = roc_curve(all_labels, all_probs)
precision, recall, _ = precision_recall_curve(all_labels, all_probs)

# Both areas use trapezoidal integration of the respective curve.
roc_auc = auc(fpr, tpr)
pr_auc = auc(recall, precision)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_ax, pr_ax = axes

# Left panel: ROC curve with the chance diagonal for reference.
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
roc_ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curve')
roc_ax.legend(loc='lower right')

# Right panel: precision-recall curve.
pr_ax.plot(recall, precision, color='green', lw=2, label=f'PR curve (AUC = {pr_auc:.3f})')
pr_ax.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curve')
pr_ax.legend(loc='lower left')

for panel_ax in (roc_ax, pr_ax):
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")

In [None]:
# Confusion matrix of the thresholded test predictions (rows: actual, columns: predicted).
cm = confusion_matrix(all_labels, all_preds)

class_names = ['Benign', 'Pathogenic']

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Row-major unpacking of the 2x2 matrix; these four names are reused by the summary cell.
tn, fp, fn, tp = cm.reshape(-1)

for count_label, count in (
    ("True Negatives", tn),
    ("False Positives", fp),
    ("False Negatives", fn),
    ("True Positives", tp),
):
    print(f"{count_label}: {count}")

sensitivity = tp / (tp + fn)
print(f"Sensitivity (Recall): {sensitivity:.4f}")
specificity = tn / (tn + fp)
print(f"Specificity: {specificity:.4f}")

In [None]:
# Per-class precision / recall / F1 table for the test predictions.
print("Classification Report:")
report_text = classification_report(
    all_labels,
    all_preds,
    target_names=['Benign', 'Pathogenic'],
    digits=4,
)
print(report_text)

In [None]:
# Gather the headline test metrics into one table (one row per metric).
summary_rows = [
    ('Test Accuracy', test_acc),
    ('ROC-AUC', roc_auc),
    ('PR-AUC', pr_auc),
    ('Sensitivity', tp / (tp + fn)),
    ('Specificity', tn / (tn + fp)),
]
results_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

# Horizontal bar chart of the scores on a fixed 0-1 axis.
plt.figure(figsize=(10, 6))
bars = plt.barh(results_df['Metric'], results_df['Value'], color='steelblue')
plt.xlabel('Score')
plt.title('Model Performance Summary')
plt.xlim(0, 1)
plt.grid(True, alpha=0.3, axis='x')

# Write each score just to the right of the end of its bar.
for row_pos, score in enumerate(results_df['Value']):
    plt.text(score + 0.02, row_pos, f'{score:.4f}', va='center')

plt.tight_layout()
plt.show()

print("\nFinal Results:")
print(results_df.to_string(index=False))

In [None]:
# Persist the trained weights, optimizer state and headline metrics.
#
# The checkpoint directory is anchored on project_root rather than written as
# '../reports/...': an earlier cell calls os.chdir(project_root), so a path
# relative to the cwd would land one level above the project.
checkpoint_dir = os.path.join(project_root, 'reports', 'results', 'checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)
model_save_path = os.path.join(checkpoint_dir, 'notebook_trained_model.pth')

torch.save({
    'epoch': epochs,                                   # number of epochs trained
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_loss': train_losses[-1],                    # loss of the final epoch
    'val_loss': val_losses[-1],
    'test_acc': test_acc,
    'roc_auc': roc_auc
}, model_save_path)

print(f"Model saved to: {model_save_path}")