In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import pickle

In [18]:
# Setup paths
def find_project_root(current_path, marker_file='requirements.txt'):
    current = Path(current_path).resolve()
    while current != current.parent:
        if (current / marker_file).exists():
            return str(current)
        current = current.parent
    raise FileNotFoundError(f"Could not find project root with marker file {marker_file}")

try:
    PROJECT_ROOT = find_project_root(os.getcwd())
except FileNotFoundError:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
    print("Warning: Could not find project root automatically.")

if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
# Setup directories
MODEL_DIR = os.path.join(os.path.dirname(os.getcwd()), 'rsf', 'model')
RESULTS_DIR = os.path.join(os.path.dirname(os.getcwd()), 'rsf','results')



In [19]:
from preprocessing.data_loader import DataLoader
from models.rsf_model import RSFModel
from utils.evaluation import cindex_score
from utils.visualization import plot_survival_curves, plot_cv_results

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [20]:
# Load trained model
try:
    model_path = os.path.join(MODEL_DIR, 'rsf_model.pkl')
    logger.info(f"Loading model from {model_path}")
    
    with open(model_path, 'rb') as f:
        rsf = pickle.load(f)
        
    logger.info("Model loaded successfully")
    
except Exception as e:
    logger.error(f"Error loading model: {str(e)}")
    raise

2024-11-10 07:03:46,797 - INFO - Loading model from /Users/jonasschernich/Library/Mobile Documents/com~apple~CloudDocs/Uni/Master/9. Semester/Consulting/Organization/PCaPrognostics/models/rsf/model/rsf_model.pkl
2024-11-10 07:03:46,816 - INFO - Model loaded successfully


"""
### Test mit Beispiel-Testdaten
Diese Testdaten sind nur für Demo-Zwecke. Für echte Evaluierung müssen die tatsächlichen 
Testdaten aus dem Data Loader verwendet werden.
### Problem: Die features beim Testen müssen übereinstimmen -> Überlegen wie wir das lösen
"""

In [22]:
# Load same data that was used for training for testing
try:
    loader = DataLoader(PROJECT_ROOT)
    
    X, pdata = loader.get_merged_data(
        gene_type='intersection',
        use_imputed=True
    )
    y = loader.prepare_survival_data(pdata)
    groups = np.array([idx.split('.')[0] for idx in X.index])
    
    logger.info(f"Data loaded: {X.shape[0]} samples, {X.shape[1]} features")
    
except Exception as e:
    logger.error(f"Error loading data: {str(e)}")
    raise

## Create Test Subset and Evaluate

try:
    logger.info("Creating test subset from data...")
    
    # Use 20% of the data for testing
    n_test = int(len(X) * 0.2)
    test_indices = np.random.choice(len(X), n_test, replace=False)
    
    X_test = X.iloc[test_indices]
    y_test = y[test_indices]
    test_cohorts = groups[test_indices] if groups is not None else None
    
    logger.info(f"Test subset created: {X_test.shape[0]} samples, {X_test.shape[1]} features")
    
    # Make predictions
    logger.info("Making predictions...")
    predictions = rsf.predict(X_test)
    
    # Calculate c-index
    score = cindex_score(y_test, predictions)
    logger.info(f"Test Set C-index: {score:.3f}")
    
    # Plot survival curves for a few examples
    logger.info("Plotting survival curves...")
    rsf_model = rsf.named_steps['rsf']
    #plot_survival_curves(rsf_model, X_test.iloc[:5], y_test[:5])
    
    # Show performance by cohort
    if test_cohorts is not None:
        logger.info("\nPerformance by cohort:")
        cohort_scores = {}
        for cohort in np.unique(test_cohorts):
            mask = test_cohorts == cohort
            if sum(mask) > 0:  # Nur wenn Kohorte Samples hat
                cohort_score = cindex_score(y_test[mask], predictions[mask])
                cohort_scores[cohort] = cohort_score
                logger.info(f"{cohort}: {cohort_score:.3f} (n={sum(mask)})")
        
        # Save cohort scores
        pd.Series(cohort_scores).to_csv(
            os.path.join(RESULTS_DIR, 'cohort_test_scores.csv')
        )

except Exception as e:
    logger.error(f"Error evaluating model: {str(e)}")
    raise



2024-11-10 07:07:11,522 - INFO - Data loaded: 1091 samples, 13214 features
2024-11-10 07:07:11,535 - INFO - Creating test subset from data...
2024-11-10 07:07:11,610 - INFO - Test subset created: 218 samples, 13214 features
2024-11-10 07:07:11,611 - INFO - Making predictions...
2024-11-10 07:07:11,698 - INFO - Test Set C-index: 0.494
2024-11-10 07:07:11,699 - INFO - Plotting survival curves...
2024-11-10 07:07:11,699 - INFO - 
Performance by cohort:
2024-11-10 07:07:11,701 - INFO - Atlanta_2014_Long: 0.383 (n=21)
2024-11-10 07:07:11,702 - INFO - Belfast_2018_Jain: 0.573 (n=47)
2024-11-10 07:07:11,702 - INFO - CPC_GENE_2017_Fraser: 0.711 (n=16)
2024-11-10 07:07:11,703 - INFO - CPGEA_2020_Li: 0.447 (n=24)
2024-11-10 07:07:11,704 - INFO - CamCap_2016_Ross_Adams: 0.567 (n=27)
2024-11-10 07:07:11,705 - INFO - CancerMap_2017_Luca: 0.554 (n=28)
2024-11-10 07:07:11,706 - INFO - DKFZ_2018_Gerhauser: 0.418 (n=14)
2024-11-10 07:07:11,707 - INFO - MSKCC_2010_Taylor: 0.497 (n=25)
2024-11-10 07:07:1