In [1]:
import pandas as pd
import os
import numpy as np
from lifelines.utils import concordance_index
# Setup paths




### Merged Test Data

In [2]:
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
PROJECT_ROOT

# Define paths
SCORES_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics','data', 'scores')
PDATA_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'merged_data', 'pData', 'imputed')

# Load data
scores = pd.read_csv(SCORES_DIR + '/test_scores.csv', index_col=0)
pdata = pd.read_csv(PDATA_DIR + '/test_pData_imputed.csv', index_col=0)

# Match indices
pdata.index = scores.index

# Extract survival data
times = pdata['MONTH_TO_BCR'].values
events = pdata['BCR_STATUS'].astype(bool).values
risk_scores = scores.values.flatten()

# Calculate C-index
cindex = concordance_index(times, -risk_scores, events)

print(f"C-Index der Scores: {cindex:.3f}")

C-Index der Scores: 0.777


### Test Cohort 1

In [3]:
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
PROJECT_ROOT

# Define paths
SCORES_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics','data', 'scores')
PDATA_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'cohort_data', 'pData', 'imputed')

# Load data
scores = pd.read_csv(SCORES_DIR + '/test_scores_cohort1.csv', index_col=0)
pdata = pd.read_csv(PDATA_DIR + '/test_pData_cohort1_imputed.csv', index_col=0)

# Match indices
pdata.index = scores.index

# Extract survival data
times = pdata['MONTH_TO_BCR'].values
events = pdata['BCR_STATUS'].astype(bool).values
risk_scores = scores.values.flatten()

# Calculate C-index
cindex = concordance_index(times, -risk_scores, events)

print(f"C-Index der Scores: {cindex:.3f}")

C-Index der Scores: 0.754


### Test Cohort 2

In [4]:
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
PROJECT_ROOT

# Define paths
SCORES_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics','data', 'scores')
PDATA_DIR = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'cohort_data', 'pData', 'imputed')

# Load data
scores = pd.read_csv(SCORES_DIR + '/test_scores_cohort2.csv', index_col=0)
pdata = pd.read_csv(PDATA_DIR + '/test_pData_cohort2_imputed.csv', index_col=0)

# Match indices
pdata.index = scores.index

# Extract survival data
times = pdata['MONTH_TO_BCR'].values
events = pdata['BCR_STATUS'].astype(bool).values
risk_scores = scores.values.flatten()

# Calculate C-index
cindex = concordance_index(times, -risk_scores, events)

print(f"C-Index der Scores: {cindex:.3f}")

C-Index der Scores: 0.870


### All Train Cohorts

In [5]:
# Import required libraries
import pandas as pd
import numpy as np
from lifelines.utils import concordance_index
import os

# Initialize results storage
results = []

# Define the directories
scores_dir = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'scores', 'cohort_specific')
pdata_dir = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'data', 'cohort_data', 'pData', 'imputed')

# Loop through each pdata file
for pdata_filename in os.listdir(pdata_dir):
    if pdata_filename.endswith('.csv'):
        # Get cohort name and construct score filename
        cohort_name = pdata_filename.replace('.csv', '')
        score_filename = cohort_name + '_scores.csv'
        score_path = os.path.join(scores_dir, score_filename)

        # Check if corresponding score file exists
        if os.path.exists(score_path):
            print(f"\nProcessing cohort: {cohort_name}")

            # Read both files
            pdata = pd.read_csv(os.path.join(pdata_dir, pdata_filename))
            scores_df = pd.read_csv(score_path)

            print(f"Number of samples in pdata file: {len(pdata)}")
            print(f"Number of samples in scores file: {len(scores_df)}")

            try:
                # Extract survival data
                times = pdata['MONTH_TO_BCR'].values
                events = pdata['BCR_STATUS'].astype(bool).values
                risk_scores = scores_df['risk_score'].values

                # Calculate c-index
                c_index = concordance_index(times, -risk_scores, events)

                # Store results
                results.append({
                    'cohort': cohort_name,
                    'c_index': c_index,
                    'n_patients': len(pdata)
                })
                print(f"Successfully calculated c-index: {c_index:.3f}")

            except Exception as e:
                print(f"Error calculating c-index: {str(e)}")
        else:
            print(f"\nNo score file found for cohort: {cohort_name}")

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display results
print("\nC-indices by cohort:")
print(results_df)

# Save results
save_dir = os.path.join(PROJECT_ROOT, 'PCaPrognostics', 'results_modelling_splits')
file_path = os.path.join(save_dir, 'splits_score_cindices.csv')
results_df.to_csv(results_df.to_csv(file_path, index=False), index=False)



No score file found for cohort: high_risk_pData_test_cohort1

No score file found for cohort: high_risk_pData_test_cohort2

Processing cohort: Belfast_2018_Jain
Number of samples in pdata file: 248
Number of samples in scores file: 248
Successfully calculated c-index: 0.687

Processing cohort: CPC_GENE_2017_Fraser
Number of samples in pdata file: 73
Number of samples in scores file: 73
Successfully calculated c-index: 0.699

Processing cohort: DKFZ_2018_Gerhauser
Number of samples in pdata file: 82
Number of samples in scores file: 82
Successfully calculated c-index: 0.783

Processing cohort: CancerMap_2017_Luca
Number of samples in pdata file: 133
Number of samples in scores file: 133
Successfully calculated c-index: 0.689

No score file found for cohort: test_pData_cohort2_imputed_1_example

Processing cohort: MSKCC_2010_Taylor
Number of samples in pdata file: 131
Number of samples in scores file: 131
Successfully calculated c-index: 0.695

No score file found for cohort: low_risk_p

'cohort,c_index,n_patients\nBelfast_2018_Jain,0.6872734595247684,248\nCPC_GENE_2017_Fraser,0.6985446985446986,73\nDKFZ_2018_Gerhauser,0.7828618968386023,82\nCancerMap_2017_Luca,0.6888829645427886,133\nMSKCC_2010_Taylor,0.6952639751552795,131\nAtlanta_2014_Long,0.6502890173410405,100\nCamCap_2016_Ross_Adams,0.7496617050067659,112\nStockholm_2016_Ross_Adams,0.6834289571380358,92\nCPGEA_2020_Li,0.6751684311838306,120\n'

In [6]:
# Berechne den gewichteten Durchschnitt des c-Index, gewichtet nach n_patients
weighted_average = (results_df['c_index'] * results_df['n_patients']).sum() / results_df['n_patients'].sum()

print(f"Gewichteter Durchschnitt des c-Index: {weighted_average:.3f}")


Gewichteter Durchschnitt des c-Index: 0.698
