In [2]:
import pandas as pd
import os
import numpy as np
from lifelines.utils import concordance_index
# Setup paths




### Merged Test Data

In [3]:
# Concordance index (C-index) of the risk scores on the merged test set.

# The project root is taken to be two directory levels above the current working directory.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))

# Define paths
SCORES_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'scores')
PDATA_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'merged_data', 'pData', 'imputed')

# Load data (the first CSV column becomes the row index of each frame)
scores = pd.read_csv(os.path.join(SCORES_DIR, 'test_scores.csv'), index_col=0)
pdata = pd.read_csv(os.path.join(PDATA_DIR, 'test_pData_imputed.csv'), index_col=0)

# Match indices
if len(pdata) != len(scores):
    raise ValueError(
        f"Row count mismatch: {len(pdata)} pData rows vs {len(scores)} score rows"
    )
labels_match = (
    pdata.index.is_unique
    and scores.index.is_unique
    and set(pdata.index) == set(scores.index)
)
if labels_match:
    # Both files carry the same row labels: reorder pData by label so each patient
    # is paired with their own score even if the two files are sorted differently.
    pdata = pdata.loc[scores.index]
# When the labels do not match, the rows are paired purely by position.
# NOTE(review): that fallback assumes both files list patients in the same order - confirm.
pdata.index = scores.index

# Extract survival data
times = pdata['MONTH_TO_BCR'].values
events = pdata['BCR_STATUS'].astype(bool).values
# NOTE(review): flatten() assumes the scores file holds a single score column - confirm.
risk_scores = scores.values.flatten()

# Calculate C-index. The risk scores are negated because concordance_index expects
# predictions that increase with the observed time (higher risk => earlier event).
cindex = concordance_index(times, -risk_scores, events)

print(f"C-Index: {cindex:.3f}")

C-Index der Scores: 0.777


### Test Cohort 1

In [4]:
# Concordance index (C-index) of the risk scores on test cohort 1.

# The project root is taken to be two directory levels above the current working directory.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))

# Define paths
SCORES_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'scores')
PDATA_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'cohort_data', 'pData', 'imputed')

# Load data (the first CSV column becomes the row index of each frame)
scores = pd.read_csv(os.path.join(SCORES_DIR, 'test_scores_cohort1.csv'), index_col=0)
pdata = pd.read_csv(os.path.join(PDATA_DIR, 'test_pData_cohort1_imputed.csv'), index_col=0)

# Match indices
if len(pdata) != len(scores):
    raise ValueError(
        f"Row count mismatch: {len(pdata)} pData rows vs {len(scores)} score rows"
    )
labels_match = (
    pdata.index.is_unique
    and scores.index.is_unique
    and set(pdata.index) == set(scores.index)
)
if labels_match:
    # Both files carry the same row labels: reorder pData by label so each patient
    # is paired with their own score even if the two files are sorted differently.
    pdata = pdata.loc[scores.index]
# When the labels do not match, the rows are paired purely by position.
# NOTE(review): that fallback assumes both files list patients in the same order - confirm.
pdata.index = scores.index

# Extract survival data
times = pdata['MONTH_TO_BCR'].values
events = pdata['BCR_STATUS'].astype(bool).values
# NOTE(review): flatten() assumes the scores file holds a single score column - confirm.
risk_scores = scores.values.flatten()

# Calculate C-index. The risk scores are negated because concordance_index expects
# predictions that increase with the observed time (higher risk => earlier event).
cindex = concordance_index(times, -risk_scores, events)

print(f"C-Index: {cindex:.3f}")

C-Index der Scores: 0.754


### Test Cohort 2

In [5]:
# Concordance index (C-index) of the risk scores on test cohort 2.

# The project root is taken to be two directory levels above the current working directory.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))

# Define paths
SCORES_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'scores')
PDATA_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'cohort_data', 'pData', 'imputed')

# Load data (the first CSV column becomes the row index of each frame)
scores = pd.read_csv(os.path.join(SCORES_DIR, 'test_scores_cohort2.csv'), index_col=0)
pdata = pd.read_csv(os.path.join(PDATA_DIR, 'test_pData_cohort2_imputed.csv'), index_col=0)

# Match indices
if len(pdata) != len(scores):
    raise ValueError(
        f"Row count mismatch: {len(pdata)} pData rows vs {len(scores)} score rows"
    )
labels_match = (
    pdata.index.is_unique
    and scores.index.is_unique
    and set(pdata.index) == set(scores.index)
)
if labels_match:
    # Both files carry the same row labels: reorder pData by label so each patient
    # is paired with their own score even if the two files are sorted differently.
    pdata = pdata.loc[scores.index]
# When the labels do not match, the rows are paired purely by position.
# NOTE(review): that fallback assumes both files list patients in the same order - confirm.
pdata.index = scores.index

# Extract survival data
times = pdata['MONTH_TO_BCR'].values
events = pdata['BCR_STATUS'].astype(bool).values
# NOTE(review): flatten() assumes the scores file holds a single score column - confirm.
risk_scores = scores.values.flatten()

# Calculate C-index. The risk scores are negated because concordance_index expects
# predictions that increase with the observed time (higher risk => earlier event).
cindex = concordance_index(times, -risk_scores, events)

print(f"C-Index: {cindex:.3f}")

C-Index der Scores: 0.870


### All Cohorts (train cohorts and the two test cohorts)

In [16]:
# C-index of the cohort-specific risk scores for every pData CSV in the imputed
# cohort directory. PROJECT_ROOT must already be defined by an earlier cell.

# Import required libraries (the same imports as the first cell of the notebook)
import pandas as pd
import numpy as np
from lifelines.utils import concordance_index
import os

# One record per cohort: cohort name, C-index and number of patients
results = []

# Define the directories
scores_dir = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'scores', 'cohort_specific')
pdata_dir = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'cohort_data', 'pData', 'imputed')

# Loop through each pdata file. os.listdir returns entries in arbitrary order,
# so the names are sorted to keep the row order of the result table reproducible.
for pdata_filename in sorted(os.listdir(pdata_dir)):
    if not pdata_filename.endswith('.csv'):
        continue

    # Cohort name is the file name without its extension; the matching score file
    # is expected at <scores_dir>/<cohort_name>_scores.csv
    cohort_name = os.path.splitext(pdata_filename)[0]
    score_filename = cohort_name + '_scores.csv'
    score_path = os.path.join(scores_dir, score_filename)

    # Read both files
    pdata = pd.read_csv(os.path.join(pdata_dir, pdata_filename))
    scores_df = pd.read_csv(score_path)

    # Fail early, naming the cohort, when the two files cannot be paired row by row.
    # NOTE(review): rows are paired purely by position - confirm that both files
    # list the patients in the same order.
    if len(pdata) != len(scores_df):
        raise ValueError(
            f"{cohort_name}: {len(pdata)} pData rows but {len(scores_df)} score rows"
        )

    # Extract survival data
    times = pdata['MONTH_TO_BCR'].values
    events = pdata['BCR_STATUS'].astype(bool).values
    risk_scores = scores_df['risk_score'].values

    # Calculate c-index. The risk scores are negated because concordance_index
    # expects predictions that increase with the observed time.
    c_index = concordance_index(times, -risk_scores, events)

    # Store results
    results.append({
        'cohort': cohort_name,
        'c_index': c_index,
        'n_patients': len(pdata)
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display results
print(results_df)

# Target location for the results table. The write itself is currently disabled;
# uncomment the last line to save the table.
save_dir = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'results_modelling_splits')
file_path = os.path.join(save_dir, 'splits_score_cindices.csv')
# results_df.to_csv(file_path, index=False)


                        cohort   c_index  n_patients
0            Belfast_2018_Jain  0.687273         248
1         CPC_GENE_2017_Fraser  0.698545          73
2          DKFZ_2018_Gerhauser  0.782862          82
3          CancerMap_2017_Luca  0.688883         133
4            MSKCC_2010_Taylor  0.695264         131
5   test_pData_cohort2_imputed  0.869605         164
6            Atlanta_2014_Long  0.650289         100
7       CamCap_2016_Ross_Adams  0.749662         112
8    Stockholm_2016_Ross_Adams  0.683429          92
9   test_pData_cohort1_imputed  0.754160         332
10               CPGEA_2020_Li  0.675168         120
