In [2]:
import os
import sys

import numpy as np
import pandas as pd

# Setup paths: the project root is taken to be two directory levels above the
# current working directory, and is put on sys.path if it is not there yet.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Project-local import. It is deliberately placed *after* the sys.path setup
# above: importing it first (as this cell originally did) only succeeds when
# something else has already made the package importable.
# NOTE(review): presumably `models` is resolved via PROJECT_ROOT - confirm.
from models.survival_svm import prepare_survival_data, train_survival_svm, create_pipeline_and_param_grid  # noqa: E402

# Number of leading expression columns kept for this run (the rest are dropped).
N_EXPR_COLUMNS = 50

exprs_path = os.path.join(PROJECT_ROOT, 'data', 'merged_data', 'exprs', 'common_genes', 'common_genes_knn_imputed.csv')
pdata_path = os.path.join(PROJECT_ROOT, 'data', 'merged_data', 'pData', 'imputed', 'merged_imputed_pData.csv')

print("Loading data from:")
print(exprs_path)
print(pdata_path)

# Load data (first CSV column becomes the index); only the first
# N_EXPR_COLUMNS columns of the expression table are used.
exprs = pd.read_csv(exprs_path, index_col=0).iloc[:, :N_EXPR_COLUMNS]
pdata = pd.read_csv(pdata_path, index_col=0)

# Prepare survival data
exprs_clean, y, valid_mask = prepare_survival_data(pdata, exprs)

# One group label per row: the part of the index label before the first '.'.
# These labels are handed to the training routine as CV groups (the printed
# results report one test cohort per fold).
groups = np.array([idx.split('.')[0] for idx in exprs_clean.index])

# Setup model parameters: 7 * 4 * 6 = 168 candidate combinations.
param_grid = {
    'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
    'model__max_iter': [100, 500, 1000, 2000],
    'model__rank_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1.0]
}

# Train model
results = train_survival_svm(exprs_clean, y, param_grid, groups)

# Print results: overall summary first, then one entry per CV fold.
print(f"Mean CV Score: {results['mean_score']:.3f} ± {results['std_score']:.3f}")
print("Best Parameters:", results['best_params'])
print("\nDetailed CV Results:")
for fold_number, fold_result in enumerate(results['cv_results']['fold_results'], start=1):
    print(f"\nFold {fold_number}:")
    print(f"Test Cohort: {fold_result['test_cohort']}")
    print(f"Test Score: {fold_result['test_score']:.3f}")
    print(f"Best Parameters: {fold_result['best_params']}")

Loading data from:
/Users/jonasschernich/Library/Mobile Documents/com~apple~CloudDocs/Uni/Master/9. Semester/Consulting/Organization/PCaPrognostics/data/merged_data/exprs/common_genes/common_genes_knn_imputed.csv
/Users/jonasschernich/Library/Mobile Documents/com~apple~CloudDocs/Uni/Master/9. Semester/Consulting/Organization/PCaPrognostics/data/merged_data/pData/imputed/merged_imputed_pData.csv
Fitting 8 folds for each of 168 candidates, totalling 1344 fits




Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.7s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.5s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.5s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.5s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=1, model__max_iter=100, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=100, model__rank_ratio=0; total time=   0.1s
[CV] END model__alpha=0.0001, model__max_iter=100, mo



Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=0.0001, model__max_iter=100, model__rank_ratio=0.4; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=100, model__rank_ratio=0.8; total time=   0.3s
[CV] END model__alpha=0.0001, model__max_iter=100, model__rank_ratio=0.8; total time=   0.6s
[CV] END model__alpha=0.0001, model__max_iter=500, model__rank_ratio=0; total time=   0.1s
[CV] END model__alpha=0.0001, model__max_iter=500, model__rank_ratio=0; total time=   0.1s
[CV] END model__alpha=0.0001, model__max_iter=500, model__rank_ratio=0.2; total time=   0.2s
[CV] END model__alpha=0.0001, model__max_iter=500, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.0001, model__max_iter=500, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=500, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=1000, model__rank_ratio=0; total time=   0.1s
[CV] END mo



Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=0.0001, model__max_iter=1000, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.0001, model__max_iter=1000, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.0001, model__max_iter=1000, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=1000, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=1000, model__rank_ratio=1.0; total time=   0.3s
[CV] END model__alpha=0.0001, model__max_iter=1000, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=2000, model__rank_ratio=0.4; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=2000, model__rank_ratio=0.4; total time=   0.6s
[CV] END model__alpha=0.0001, model__max_iter=2000, model__rank_ratio=0.8; total time=   0.5s
[CV] END model__alpha=0.0001, model__max_iter=2000, model__rank_ratio=0.8; total time=   0

  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=0.0001, model__max_iter=2000, model__rank_ratio=0.8; total time=   0.4s
[CV] END model__alpha=0.0001, model__max_iter=2000, model__rank_ratio=1.0; total time=   0.5s
[CV] END model__alpha=0.001, model__max_iter=100, model__rank_ratio=0; total time=   0.3s
[CV] END model__alpha=0.001, model__max_iter=100, model__rank_ratio=0.2; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=100, model__rank_ratio=0.4; total time=   0.5s
[CV] END model__alpha=0.001, model__max_iter=100, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=100, model__rank_ratio=0.8; total time=   0.5s
[CV] END model__alpha=0.001, model__max_iter=100, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=500, model__rank_ratio=0; total time=   0.3s
[CV] END model__alpha=0.001, model__max_iter=500, model__rank_ratio=0.2; total time=   0.5s
[CV] END model__



Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=0.001, model__max_iter=500, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=500, model__rank_ratio=0.8; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=500, model__rank_ratio=0.8; total time=   0.7s
[CV] END model__alpha=0.001, model__max_iter=1000, model__rank_ratio=0; total time=   0.3s
[CV] END model__alpha=0.001, model__max_iter=1000, model__rank_ratio=0; total time=   0.2s
[CV] END model__alpha=0.001, model__max_iter=1000, model__rank_ratio=0.2; total time=   0.5s
[CV] END model__alpha=0.001, model__max_iter=1000, model__rank_ratio=0.4; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=1000, model__rank_ratio=0.6; total time=   0.3s
[CV] END model__alpha=0.001, model__max_iter=1000, model__rank_ratio=0.8; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=1000, model__rank_ratio=1.0; total time=   0.3s
[CV] END mode



Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=0.001, model__max_iter=2000, model__rank_ratio=0; total time=   0.2s
[CV] END model__alpha=0.001, model__max_iter=2000, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.001, model__max_iter=2000, model__rank_ratio=0.2; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=2000, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=2000, model__rank_ratio=0.6; total time=   0.4s
[CV] END model__alpha=0.001, model__max_iter=2000, model__rank_ratio=1.0; total time=   0.6s
[CV] END model__alpha=0.001, model__max_iter=2000, model__rank_ratio=1.0; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=100, model__rank_ratio=0.2; total time=   0.5s
[CV] END model__alpha=0.01, model__max_iter=100, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=100, model__rank_ratio=0.6; total time=   0.3s
[CV] END model



Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=0.01, model__max_iter=500, model__rank_ratio=1.0; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=1000, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=1000, model__rank_ratio=0.2; total time=   0.6s
[CV] END model__alpha=0.01, model__max_iter=1000, model__rank_ratio=0.6; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=1000, model__rank_ratio=0.6; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=1000, model__rank_ratio=1.0; total time=   0.4s
[CV] END model__alpha=0.01, model__max_iter=1000, model__rank_ratio=1.0; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=2000, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=2000, model__rank_ratio=0.2; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=2000, model__rank_ratio=0.6; total time=   0.3s
[CV] END model__a



Fitting 8 folds for each of 168 candidates, totalling 1344 fits
[CV] END model__alpha=0.01, model__max_iter=2000, model__rank_ratio=0.8; total time=   0.3s
[CV] END model__alpha=0.01, model__max_iter=2000, model__rank_ratio=0.8; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=100, model__rank_ratio=0; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=100, model__rank_ratio=0; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=100, model__rank_ratio=0.4; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=100, model__rank_ratio=0.4; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=100, model__rank_ratio=0.8; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=100, model__rank_ratio=0.8; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=500, model__rank_ratio=0; total time=   0.3s
[CV] END model__alpha=0.1, model__max_iter=500, model__rank_ratio=0; total time=   0.3s
[CV] END model__alpha=0.1, model__max_it



Mean CV Score: 0.628 ± 0.086
Best Parameters: {'model__alpha': 0.0001, 'model__max_iter': 100, 'model__rank_ratio': 0.2}

Detailed CV Results:

Fold 1:
Test Cohort: Atlanta_2014_Long
Test Score: 0.588
Best Parameters: {'model__alpha': 0.0001, 'model__max_iter': 100, 'model__rank_ratio': 0.2}

Fold 2:
Test Cohort: Belfast_2018_Jain
Test Score: 0.539
Best Parameters: {'model__alpha': 0.0001, 'model__max_iter': 100, 'model__rank_ratio': 0.2}

Fold 3:
Test Cohort: CPC_GENE_2017_Fraser
Test Score: 0.453
Best Parameters: {'model__alpha': 0.001, 'model__max_iter': 100, 'model__rank_ratio': 0}

Fold 4:
Test Cohort: CPGEA_2020_Li
Test Score: 0.595
Best Parameters: {'model__alpha': 0.001, 'model__max_iter': 100, 'model__rank_ratio': 0}

Fold 5:
Test Cohort: CamCap_2016_Ross_Adams
Test Score: 0.694
Best Parameters: {'model__alpha': 0.0001, 'model__max_iter': 100, 'model__rank_ratio': 0.2}

Fold 6:
Test Cohort: CancerMap_2017_Luca
Test Score: 0.708
Best Parameters: {'model__alpha': 0.001, 'model__

In [5]:
"""!!! Funktioniert aktuell nicht, weil wir nicht die Überlebenszeiten mit Zeit=0 bereinigen"""




import os
import sys
import numpy as np
import logging
from models.survival_svm import SurvivalSVMWrapper
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess

# Data configuration: options handed to ModellingProcess.prepare_data below.
DATA_CONFIG = dict(
    use_pca=True,
    pca_threshold=0.5,
    gene_type='intersection',
    use_imputed=True,
    use_cohorts=False,
)

# Model configuration: hyper-parameter grid (3 * 2 * 3 = 18 candidates) plus
# the flags handed to ModellingProcess.do_modelling below.
MODEL_CONFIG = dict(
    params_cv=dict(
        model__alpha=[0.001, 0.01, 0.1],
        model__max_iter=[100, 500],
        model__rank_ratio=[0.25, 0.5, 0.75],
    ),
    refit=True,
    do_nested_resampling=True,
)

# Pipeline setup: feature scaling followed by the survival SVM wrapper.
scaler_step = ('scaler', StandardScaler())
model_step = ('model', SurvivalSVMWrapper())
svm_pipeline_steps = [scaler_step, model_step]

# PROJECT_ROOT is not defined in this cell; it has to exist in the kernel
# already (an earlier cell of this notebook sets it).
mp = ModellingProcess()
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

# Training
# Known issue: in the recorded run most inner fits fail with
# "ValueError: observed time contains values smaller or equal to zero".
modelling_output = mp.do_modelling(svm_pipeline_steps, MODEL_CONFIG)
nstd_res_result, cmplt_model, cmplt_pipeline = modelling_output

# Print results
print("\nPerformance:")
mean_cindex = nstd_res_result['mean_score']
std_cindex = nstd_res_result['std_score']
print(f"Mean C-Index: {mean_cindex:.3f} ± {std_cindex:.3f}")

2024-11-21 17:54:27,966 - INFO - Loading data...
2024-11-21 17:55:46,461 - INFO - Applying PCA...
2024-11-21 17:55:49,582 - INFO - Selected 39 components explaining 50.0% of variance
2024-11-21 17:55:50,651 - INFO - Loaded data: 1091 samples, 39 features
2024-11-21 17:55:55,517 - INFO - Start model training...
2024-11-21 17:55:55,518 - INFO - Input data shape: X=(1091, 39)
2024-11-21 17:55:55,518 - INFO - Nested resampling...
2024-11-21 17:55:55,518 - INFO - Starting nested resampling...
2024-11-21 17:55:55,519 - INFO - Data shape: X=(1091, 39), groups=9 unique
2024-11-21 17:55:55,521 - INFO - 
Outer fold 1 - Phase 1
2024-11-21 17:55:55,524 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 18 candidates, totalling 144 fits


126 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
126 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jonasschernich/Library/Mobile Documents/com~apple~CloudDocs/Finanzen/Lev Strategy/StatisticalConsulting/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jonasschernich/Library/Mobile Documents/com~apple~CloudDocs/Finanzen/Lev Strategy/StatisticalConsulting/.venv/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/j

ValueError: observed time contains values smaller or equal to zero