## Random Survival Forest Training
Dieses Notebook demonstriert das Training eines Random-Survival-Forest-Modells mit verschiedenen Optionen:
- Verschiedene Input-Typen (Kohorten, merged data)
- Mit/ohne PCA
- Grid/Random Search
- Verschiedene Cross-Validation Strategien


In [11]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sksurv.ensemble import RandomSurvivalForest
import logging

In [12]:
# Get absolute path to project root
def find_project_root(current_path, marker_file='requirements.txt'):
    """Locate the project root by searching upwards for a marker file.

    Starting at ``current_path`` (resolved to an absolute path), each
    directory on the way up is inspected, including the filesystem root
    itself, and the first one containing ``marker_file`` is returned.

    Args:
        current_path: Directory to start the search from (str or path-like).
        marker_file: Name of the file whose presence identifies the root.

    Returns:
        str: Absolute path of the nearest directory containing ``marker_file``.

    Raises:
        FileNotFoundError: If no directory up to and including the
            filesystem root contains ``marker_file``.
    """
    start = Path(current_path).resolve()
    # ``Path.parents`` ends with the filesystem root, so the root is checked
    # as well (a plain "while current != current.parent" loop skips it).
    for candidate in (start, *start.parents):
        if (candidate / marker_file).exists():
            return str(candidate)
    raise FileNotFoundError(f"Could not find project root with marker file {marker_file}")

# Resolve the project root via the marker-file search; when no marker is
# found, fall back to the directory two levels above the working directory.
try:
    PROJECT_ROOT = find_project_root(os.getcwd())
except FileNotFoundError:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
    print("Warning: Could not find project root automatically.")

# Make project-local packages importable from this notebook.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Output directories are created below the current working directory.
MODEL_DIR, RESULTS_DIR = (
    os.path.join(os.getcwd(), subdir) for subdir in ('model', 'results')
)
for output_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(output_dir, exist_ok=True)





In [13]:
from preprocessing.data_loader import DataLoader
from preprocessing.dimension_reduction import PCADimensionReduction
from models.rsf_model import RSFModel
from utils.evaluation import cindex_score
from utils.visualization import plot_survival_curves, plot_cv_results

## Setup und Konfiguration

In [15]:


# Configure logging: INFO and above, timestamped format, plus a logger
# named after this module.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Run configuration: every option a reader might want to tune.
CONFIG = {
    # --- Data options ---
    'USE_COHORTS': True,          # True for cohort-wise CV
    'USE_PCA': False,             # True for PCA dimensionality reduction
    'GENE_TYPE': 'intersection',  # 'intersection', 'common_genes', or 'all_genes'
    'USE_IMPUTED': True,          # True for imputed data

    # --- CV options ---
    'USE_COHORT_CV_INNER': True,  # True for cohort-wise inner CV
    'N_SPLITS_INNER': 5,          # number of inner CV splits when not cohort-based
    'USE_PARALLEL': True,         # parallel processing

    # --- RSF hyper-parameter candidates ---
    'N_ESTIMATORS': [10, 20],
    'MAX_DEPTH': [3, 5],
    'MIN_SAMPLES_SPLIT': [5, 10],
    'MIN_SAMPLES_LEAF': [3, 5],
}

# Persist the configuration next to the results so the run can be traced.
config_path = os.path.join(RESULTS_DIR, 'config.csv')
pd.Series(CONFIG).to_csv(config_path)
logger.info(f"Saved configuration to {config_path}")

# Create the project data loader.
logger.info("Initializing data loader...")
loader = DataLoader(PROJECT_ROOT)

# Load features and survival targets. Any failure is logged and re-raised so
# the notebook stops instead of continuing with missing data.
try:
    logger.info("Loading data...")
    X, pdata = loader.get_merged_data(
        use_imputed=CONFIG['USE_IMPUTED'],
        gene_type=CONFIG['GENE_TYPE'],
    )

    logger.info("Preparing survival data...")
    y = loader.prepare_survival_data(pdata)

    # One group label per sample: the part of the index label before the
    # first '.'. No groups when cohort handling is switched off.
    groups = (
        np.array([sample_id.split('.')[0] for sample_id in X.index])
        if CONFIG['USE_COHORTS']
        else None
    )

    # Report basic shapes and dtypes.
    logger.info("\nData shapes:")
    logger.info(f"X: {X.shape}")
    logger.info(f"y: {y.shape}")
    logger.info(f"y dtype: {y.dtype}")

    # Sanity-check the structured survival array.
    logger.info("\nSurvival data validation:")
    logger.info(f"Field names: {y.dtype.names}")
    # The event indicator is stored under 'status' if present, else 'event'.
    event_field = 'status' if 'status' in y.dtype.names else 'event'
    logger.info(f"Event field: {event_field}")
    logger.info(f"Number of events: {y[event_field].sum()}")
    times = y['time']
    logger.info(f"Time range: [{times.min():.1f}, {times.max():.1f}]")

    if CONFIG['USE_COHORTS']:
        logger.info("\nCohort distribution:")
        logger.info(pd.Series(groups).value_counts())

except Exception as err:
    logger.error(f"Error loading/preparing data: {str(err)}")
    raise

# Optional PCA: reduce the feature matrix and persist the fitted transform.
if CONFIG['USE_PCA']:
    logger.info("\nPerforming PCA...")
    # Remember the original width BEFORE X is overwritten; otherwise the log
    # message below would report the already-reduced dimension twice.
    n_features_before = X.shape[1]
    pca = PCADimensionReduction(variance_threshold=0.95)
    X = pca.fit_transform(X)
    pca.save(os.path.join(MODEL_DIR, 'pca_transform.pkl'))
    logger.info(f"Reduced dimensions from {n_features_before} to {pca.n_components} components")
    logger.info(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")

# Build the estimator pipeline: feature scaling followed by the survival forest.
logger.info("\nSetting up model pipeline...")
base_rsf = RandomSurvivalForest(n_estimators=10, random_state=42)
pipeline_steps = [('scaler', StandardScaler()), ('rsf', base_rsf)]

# Hyper-parameter grid; keys use the "<step>__<param>" form, prefixed with
# the pipeline step name 'rsf'.
param_grid = {
    f'rsf__{config_key.lower()}': CONFIG[config_key]
    for config_key in (
        'N_ESTIMATORS',
        'MAX_DEPTH',
        'MIN_SAMPLES_SPLIT',
        'MIN_SAMPLES_LEAF',
    )
}

# Fail early if the grid is malformed: it must be a dict whose values are
# list-like collections of candidates.
if not isinstance(param_grid, dict):
    raise ValueError("param_grid must be a dictionary")
for param_name, param_values in param_grid.items():
    is_sequence = isinstance(param_values, (list, tuple, np.ndarray))
    if not is_sequence:
        raise ValueError(f"Values for parameter {param_name} must be a list")

# Log the grid and the number of combinations it spans.
logger.info("\nParameter grid:")
for param, values in param_grid.items():
    logger.info(f"{param}: {values}")
grid_sizes = [len(candidates) for candidates in param_grid.values()]
logger.info(f"Total combinations: {np.prod(grid_sizes)}")

# Create the model wrapper; fitting happens in the next cell.
logger.info("\nTraining model...")
rsf = RSFModel()


2024-11-10 07:43:57,624 - INFO - Saved configuration to /Users/jonasschernich/Library/Mobile Documents/.Trash/models 07.39.00/rsf/results/config.csv
2024-11-10 07:43:57,625 - INFO - Initializing data loader...


KeyboardInterrupt: 

In [16]:
# Fit the model with the prepared data, pipeline and parameter grid.
# Errors are logged and re-raised so the cell fails visibly.
try:
    fit_kwargs = dict(
        X=X,
        y=y,
        groups=groups,
        fname='rsf_results',
        path=RESULTS_DIR,
        pipeline_steps=pipeline_steps,
        params_cv=param_grid,
        use_cohort_cv=CONFIG['USE_COHORT_CV_INNER'],
        n_splits_inner=CONFIG['N_SPLITS_INNER'],
        parallel=CONFIG['USE_PARALLEL'],
        refit=True,
    )
    rsf.fit_model(**fit_kwargs)
    logger.info("Model training completed successfully.")

except Exception as err:
    logger.error(f"\nError during model training: {str(err)}")
    raise

# Summarize cross-validation results when the model exposes them.
if hasattr(rsf, 'cv_results_'):
    logger.info("\nTraining Results:")
    cv_summary = rsf.cv_results_
    logger.info(f"Mean CV Score: {cv_summary['mean_score']:.3f} ± {cv_summary['std_score']:.3f}")
    if 'best_params' in cv_summary:
        logger.info("\nBest Parameters:")
        for param, value in cv_summary['best_params'].items():
            logger.info(f"{param}: {value}")

logger.info("\nTraining completed!")


2024-11-10 07:44:05,663 - INFO - Starting nested cross-validation...
2024-11-10 07:44:05,666 - INFO - Data shape: X=(1091, 13214), groups=9 unique
2024-11-10 07:44:05,668 - INFO - 
Outer fold 1
2024-11-10 07:44:06,156 - INFO - Test cohort: Atlanta_2014_Long
2024-11-10 07:44:06,167 - INFO - Starting inner grid search with 16 parameter combinations
2024-11-10 07:45:24,514 - INFO - New best score: 0.504 with params: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 10}
2024-11-10 07:47:58,669 - INFO - New best score: 0.508 with params: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 20}


KeyboardInterrupt: 

### Train Model


In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Project root: two directory levels above the current working directory.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
# Make project-local packages importable (added only once).
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Output directories are created below the current working directory.
MODEL_DIR, RESULTS_DIR = (
    os.path.join(os.getcwd(), subdir) for subdir in ('model', 'results')
)
for output_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from models.rsf_model import RSFModel
from utils.evaluation import cindex_score

# Configure logging: INFO and above, timestamped format, plus a logger
# named after this module.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Options handed to the DataContainer that loads the data.
DATA_CONFIG = dict(
    use_pca=False,
    gene_type='intersection',
    use_imputed=True,
    use_cohorts=True,
)

# Options unpacked into RSFModel.fit: the hyper-parameter grid
# ("rsf__"-prefixed keys) and the cross-validation settings.
MODEL_CONFIG = dict(
    params_cv=dict(
        rsf__n_estimators=[100, 200],
        rsf__min_samples_split=[5, 10],
        rsf__min_samples_leaf=[3, 5],
    ),
    use_cohort_cv=True,
    n_splits_inner=5,
)



In [2]:
# End-to-end training run: load data, fit, export feature importances and
# save the model. Any failure is logged and re-raised.
try:
    # Load features and targets through the data container.
    data_container = DataContainer(DATA_CONFIG, project_root=PROJECT_ROOT)
    X, y = data_container.load_data()

    # Fit the model; MODEL_CONFIG supplies the grid and CV settings.
    rsf = RSFModel()
    rsf.fit(X=X, y=y, data_container=data_container, **MODEL_CONFIG)

    # Export feature importances to the results directory.
    importance_df = rsf.get_feature_importance(feature_names=X.columns)
    importance_path = os.path.join(RESULTS_DIR, 'feature_importance.csv')
    importance_df.to_csv(importance_path)

    # Persist the fitted model.
    rsf.save(MODEL_DIR, "rsf_model")

    logger.info("Training completed successfully!")

except Exception as err:
    logger.error(f"Error during training: {str(err)}")
    raise

2024-11-10 11:14:48,054 - INFO - Loading data...
2024-11-10 11:15:46,824 - INFO - Loaded data: 1091 samples, 13214 features
2024-11-10 11:15:49,402 - INFO - Starting model training...
2024-11-10 11:15:49,402 - INFO - Input data shape: X=(1091, 13214)
2024-11-10 11:15:49,403 - INFO - Starting nested cross-validation...
2024-11-10 11:15:49,404 - INFO - Data shape: X=(1091, 13214), groups=9 unique
2024-11-10 11:15:49,407 - INFO - 
Outer fold 1
2024-11-10 11:15:49,444 - INFO - Test cohort: Atlanta_2014_Long
2024-11-10 11:15:49,444 - INFO - Starting inner grid search with 8 parameter combinations


KeyboardInterrupt: 