In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import torch
from datetime import datetime

# Resolve the working directory once. The project root is taken to be two
# directory levels above it and is appended to sys.path (if not already
# present) so the project-local packages imported further down can be found.
BASE_DIR = os.getcwd()
PROJECT_ROOT = os.path.dirname(os.path.dirname(BASE_DIR))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Each run gets its own timestamped sub-folder under models/ and results/
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
MODEL_DIR = os.path.join(BASE_DIR, 'models', timestamp)
RESULTS_DIR = os.path.join(BASE_DIR, 'results', timestamp)
for _run_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(_run_dir, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from models.deep_surv_model import DeepSurvModel
from utils.evaluation import cindex_score

# Setup logging
# Every message is written both to a per-run log file inside RESULTS_DIR and
# to the notebook output stream.
log_file = os.path.join(RESULTS_DIR, 'training.log')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII log text (e.g. "±") is written the same
        # way regardless of the platform's default encoding.
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ],
    # Without force=True, basicConfig() is a silent no-op whenever the root
    # logger already has handlers -- e.g. when this cell is re-run with a new
    # timestamp -- and the new training.log would never be written.
    # force=True (Python 3.8+) removes and closes the old handlers first.
    force=True
)
logger = logging.getLogger(__name__)

# Check for CUDA: use the GPU when torch reports one, otherwise the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
logger.info(f"Using device: {device}")


2024-11-10 20:51:44,282 - INFO - Using device: cpu


In [4]:
import json

# Data configuration: the options handed to DataContainer when loading data
DATA_CONFIG = dict(
    use_pca=False,
    gene_type='intersection',
    use_imputed=True,
    use_cohorts=True,
)

# Training mode switch: True runs the cross-validated search,
# False trains one model directly with fixed hyper-parameters.
USE_CV = True

if not USE_CV:
    # Fixed hyper-parameters for direct training
    MODEL_CONFIG = dict(
        hidden_layers=[32, 16],
        learning_rate=0.01,
        batch_size=64,
        num_epochs=10,
        device=device,
        random_state=42,
    )
else:
    # Candidate values per hyper-parameter plus the CV settings
    MODEL_CONFIG = dict(
        params_cv=dict(
            hidden_layers=[[32, 16], [64, 32], [32, 32, 16]],
            learning_rate=[0.01, 0.001],
            batch_size=[32, 16],  # smaller batch sizes
            num_epochs=[10],
        ),
        use_cohort_cv=True,
        n_splits_inner=3,
    )

# Persist the exact configuration of this run next to its results
config_file = os.path.join(RESULTS_DIR, 'config.json')
with open(config_file, 'w') as f:
    f.write(json.dumps(
        dict(data_config=DATA_CONFIG, model_config=MODEL_CONFIG, use_cv=USE_CV),
        indent=4,
    ))

# Load the data set, record its feature names, and construct the (still
# untrained) DeepSurv model. Any failure is logged with its traceback and
# then re-raised so the notebook stops at this cell.
try:
    # Create DataContainer and load data
    logger.info("Loading data...")
    data_container = DataContainer(DATA_CONFIG, project_root=PROJECT_ROOT)
    X, y = data_container.load_data()

    logger.info(f"Loaded data with shape: X={X.shape}")

    # Save feature names so the column order of this run can be recovered
    feature_names = pd.DataFrame({'feature': X.columns})
    feature_names.to_csv(os.path.join(RESULTS_DIR, 'feature_names.csv'), index=False)

    # Initialize DeepSurv
    logger.info("Initializing DeepSurv model...")
    if not USE_CV:
        # Direct training: MODEL_CONFIG holds the constructor arguments
        deep_surv = DeepSurvModel(**MODEL_CONFIG)
    else:
        # CV training: hyper-parameters are passed to fit() instead
        deep_surv = DeepSurvModel(device=device, random_state=42)

except Exception as e:
    # logger.exception() appends the traceback, so the log file keeps the
    # failure location and not just the message.
    logger.exception("Error during initialization: %s", e)
    raise


2024-11-10 20:54:47,382 - INFO - Loading data...
2024-11-10 20:54:47,383 - INFO - Loading data...
2024-11-10 20:55:55,465 - INFO - Loaded data: 1091 samples, 13214 features
2024-11-10 20:56:00,938 - INFO - Loaded data with shape: X=(1091, 13214)
2024-11-10 20:56:00,954 - INFO - Initializing DeepSurv model...


In [5]:
# Train the model, either through the cross-validated fit (USE_CV) or with a
# single train/validation split. Any failure is logged with its traceback and
# then re-raised so the notebook stops at this cell.
try:
    logger.info("Starting model training...")

    if USE_CV:
        logger.info(f"Cross-validation config: {MODEL_CONFIG}")
        # Fit with CV; MODEL_CONFIG supplies params_cv, use_cohort_cv and
        # n_splits_inner as keyword arguments
        deep_surv.fit(
            X=X,
            y=y,
            data_container=data_container,
            **MODEL_CONFIG
        )

        # Log CV results
        cv_results = pd.DataFrame(deep_surv.cv_results_['cv_results'])
        logger.info("\nCross-validation results:")
        logger.info(f"Mean c-index: {deep_surv.cv_results_['mean_score']:.3f} "
                   f"± {deep_surv.cv_results_['std_score']:.3f}")

        # Save detailed CV results
        cv_results.to_csv(os.path.join(RESULTS_DIR, 'cv_results.csv'))

    else:
        # Direct training without CV
        X_train, y_train, X_val, y_val = data_container.get_train_val_split(X, y)
        deep_surv.fit(
            X=X_train,
            y=y_train,
            validation_data=(X_val, y_val)
        )

        # Evaluate on validation set
        val_pred = deep_surv.predict(X_val)
        val_score = cindex_score(y_val, val_pred)
        logger.info(f"\nValidation c-index: {val_score:.3f}")

    logger.info("Model training completed successfully!")

except Exception as e:
    # logger.exception() appends the traceback, so a failure during a long
    # unattended run can still be located from the log file.
    logger.exception("Error during training: %s", e)
    raise

2024-11-10 20:56:01,037 - INFO - Starting model training...
2024-11-10 20:56:01,038 - INFO - Cross-validation config: {'params_cv': {'hidden_layers': [[32, 16], [64, 32], [32, 32, 16]], 'learning_rate': [0.01, 0.001], 'batch_size': [32, 16], 'num_epochs': [10]}, 'use_cohort_cv': True, 'n_splits_inner': 3}
2024-11-10 20:56:01,040 - INFO - Starting DeepSurv training...
2024-11-10 20:56:01,041 - INFO - Starting nested cross-validation for DeepSurv...
2024-11-10 20:56:01,045 - INFO - Outer fold 1
2024-11-10 21:01:05,796 - INFO - Fold 1 complete - Test c-index: 0.574
2024-11-10 21:01:05,800 - INFO - Outer fold 2
2024-11-10 21:04:20,758 - INFO - Fold 2 complete - Test c-index: 0.528
2024-11-10 21:04:20,761 - INFO - Outer fold 3
2024-11-10 21:08:13,361 - INFO - Fold 3 complete - Test c-index: 0.536
2024-11-10 21:08:13,364 - INFO - Outer fold 4
2024-11-10 21:12:18,879 - INFO - Fold 4 complete - Test c-index: 0.540
2024-11-10 21:12:18,882 - INFO - Outer fold 5
2024-11-10 21:16:14,041 - INFO - F

In [9]:
# Store the model in this run's MODEL_DIR under a timestamped name
logger.info("Saving model...")
model_name = "deep_surv_model_" + timestamp
deep_surv.save(MODEL_DIR, model_name)


    





    


2024-11-11 06:27:49,790 - INFO - Saving model...
2024-11-11 06:27:49,830 - INFO - Model saved to /Users/jonasschernich/Library/Mobile Documents/com~apple~CloudDocs/Uni/Master/9. Semester/Consulting/Organization/PCaPrognostics/models/deep_surv/models/20241110_205142/model
2024-11-11 06:27:49,840 - INFO - Saved CV results to /Users/jonasschernich/Library/Mobile Documents/com~apple~CloudDocs/Uni/Master/9. Semester/Consulting/Organization/PCaPrognostics/models/deep_surv/models/20241110_205142/results/deep_surv_model_20241110_205142_cv_results.csv
