In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.util import Surv
import warnings


In [2]:
class CoxPipeline:
    def __init__(self, alpha=0.5, n_splits=5, random_state=42):
        self.alpha = alpha
        self.n_splits = n_splits
        self.random_state = random_state
        self.model = None
        warnings.filterwarnings("ignore")

    def split_data(self, data, features, target, duration):
        # Train-test split
        train_val_data, test_data = train_test_split(
            data, test_size=0.2, stratify=data[target], random_state=self.random_state
        )

        # Further split train_val_data into train and validation
        train_data, val_data = train_test_split(
            train_val_data, test_size=0.2, stratify=train_val_data[target], random_state=self.random_state
        )

        # Prepare data for Cox model
        X_train = train_data[features]
        y_train = Surv.from_dataframe(event=target, time=duration, data=train_data)

        X_val = val_data[features]
        y_val = Surv.from_dataframe(event=target, time=duration, data=val_data)

        X_test = test_data[features]
        y_test = Surv.from_dataframe(event=target, time=duration, data=test_data)

        return X_train, y_train, X_val, y_val, X_test, y_test

    def cross_validate(self, X_train, y_train):
        # Ensure scaling of features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        concordance_indices = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train[target])):
            X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
            y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

            estimator = CoxPHSurvivalAnalysis(alpha=self.alpha)
            estimator.fit(X_train_fold, y_train_fold)

            prediction = estimator.predict(X_val_fold)
            result = concordance_index_censored(
                y_val_fold[target], y_val_fold[duration], prediction
            )
            concordance_indices.append(result[0])

            print(f"Fold {fold + 1} Concordance Index: {result[0]}")

        average_concordance_index = np.mean(concordance_indices)
        print("Average Concordance Index:", average_concordance_index)
        return average_concordance_index

    def train_and_evaluate(self, X_train, y_train, X_test, y_test):
        # Ensure scaling of features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        self.model = CoxPHSurvivalAnalysis(alpha=self.alpha)
        self.model.fit(X_train_scaled, y_train)

        test_predictions = self.model.predict(X_test_scaled)
        test_result = concordance_index_censored(
            y_test[target], y_test[duration], test_predictions
        )
        print("Test Data Concordance Index:", test_result[0])

        coefficients = pd.Series(self.model.coef_, index=X_train.columns)
        print(coefficients)

        return test_result[0], coefficients

    def plot_kaplan_meier(self, data, target, duration):
        time = np.array(list(data[duration].astype(int)))
        events = np.array(list(data[target].astype(bool)))

        survival_probabilities, time_points, conf_int = kaplan_meier_estimator(
            events, time, conf_type="log-log"
        )

        plt.figure(figsize=(10, 6))
        plt.step(time_points, survival_probabilities, where='post', label='Kaplan-Meier Estimate')

        if conf_int.shape[1] == 2 and len(conf_int) == len(time_points):
            plt.fill_between(
                time_points, conf_int[:, 0], conf_int[:, 1], color='grey', alpha=0.2, label='95% Confidence Interval'
            )

        plt.xlabel('Time')
        plt.ylabel('Survival Probability')
        plt.title('Kaplan-Meier Survival Estimate')
        plt.legend()
        plt.grid(True)
        plt.show()

    def run_pipeline(self, data, features, target, duration):
        X_train, y_train, X_val, y_val, X_test, y_test = self.split_data(data, features, target, duration)
        self.cross_validate(X_train, y_train)
        test_concordance, coefficients = self.train_and_evaluate(X_train, y_train, X_test, y_test)
        # Uncomment to plot Kaplan-Meier estimate
        # self.plot_kaplan_meier(data, target, duration)
        return test_concordance, coefficients


In [3]:
def analyze_file(data):
    print("Dataset:")
    display(data)
    
    # Output the shape of the dataset
    print("\nShape of the dataset (rows, columns):")
    print(data.shape)
    
    # Output the data types of each column
    print("\nData types of each column:")
    print(data.dtypes)
    
    # Output the count of missing values in each column
    print("\nCount of missing values in each column:")
    print(data.isnull().sum())
def get_high_correlation_features(df, target_columns, threshold=0.9):
    """
    Get features with correlation greater than the specified threshold with the target variables.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with features and target variables.
    - target_columns (list of str): List of target variable column names.
    - threshold (float): Correlation threshold to filter features.

    Returns:
    - dict: A dictionary where keys are column names and values are their correlation values.
    """
    high_corr_features = {}

    # Compute correlation matrix
    corr_matrix = df.corr()

    # Iterate over columns in the DataFrame
    for column in df.columns:
        if column not in target_columns:
            # Check correlation with each target variable
            for target in target_columns:
                correlation_value = corr_matrix.loc[column, target]
                if abs(correlation_value) > threshold:
                    high_corr_features[column] = correlation_value
                    break  # No need to check other targets for this feature

    return high_corr_features

def show_correlation_heatmap(df, top_n_columns):
    """
    Displays a correlation matrix heatmap for the specified top_n_columns along with 'OS' and 'OS.time'.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    top_n_columns (list): A list of column names for which the correlation matrix will be displayed.
    """
    # Ensure 'OS' and 'OS.time' are included in the correlation matrix
    columns_to_include = top_n_columns + ['OS', 'OS.time']
    
    # Select only the specified columns, including 'OS' and 'OS.time'
    selected_data = df[columns_to_include]
    
    # Compute the correlation matrix
    correlation_matrix = selected_data.corr()
    
    # Plot heatmap
    plt.figure(figsize=(24, 22))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title(f'Correlation Matrix Heatmap for Top {len(top_n_columns)} Features + OS and OS.time')
    plt.show()



In [4]:
data = pd.read_csv('/Users/simrantanwar/Desktop/College/DDP/survival_analysis/data/genomics_brain_16_09_24.csv')
analyze_file(data)

Dataset:


Unnamed: 0.1,Unnamed: 0,Patient ID,LASP1,HOXA11,CREBBP,ETV1,GAS7,CD79B,PAX7,BTK,...,CUX1,PIERCE2,SRGAP2,NCOA4,SSX4,TAF15,MLLT6,DUX4L1,OS,OS.time
0,0,TCGA-02-0047,8.923858,3.526361,8.244071,8.900549,9.184510,3.555348,2.302585,6.255750,...,8.282483,4.127134,8.053251,9.012621,0.0,8.319474,8.334952,0.0,1.0,448.0
1,1,TCGA-02-0055,8.460199,3.970292,7.816014,7.650645,7.964156,2.890372,1.609438,6.614726,...,8.368693,4.276666,7.794823,8.727130,0.0,8.235361,7.855157,0.0,1.0,76.0
2,2,TCGA-02-2483,9.164296,4.682131,8.572628,10.186973,8.340217,2.944439,1.609438,6.146329,...,8.606302,3.931826,8.222554,9.270965,0.0,8.883779,8.491260,0.0,0.0,466.0
3,3,TCGA-02-2485,9.133675,3.970292,8.474077,9.155250,6.938284,2.564949,1.791759,5.780744,...,8.629629,4.369448,8.235361,8.687779,0.0,9.019301,8.461046,0.0,0.0,470.0
4,4,TCGA-02-2486,8.626765,3.526361,7.090910,6.838405,8.690642,2.995732,0.693147,7.329750,...,8.512382,5.793014,7.963460,8.760139,0.0,8.525161,8.006034,0.0,1.0,618.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,8698,TCGA-WY-A85A,9.093245,1.791759,8.631414,10.231928,7.642524,0.693147,2.397895,5.602119,...,8.810460,3.891820,7.765993,8.910586,0.0,8.927977,9.006509,0.0,0.0,1320.0
661,8699,TCGA-WY-A85B,8.977778,2.397895,8.905037,10.492523,7.574558,2.484907,1.386294,6.572283,...,9.015906,4.234107,7.955074,9.328568,0.0,9.316770,9.229849,0.0,0.0,1393.0
662,8700,TCGA-WY-A85C,8.820552,6.282267,9.278466,9.983315,7.007601,1.945910,1.945910,5.730100,...,9.177714,3.610918,8.107720,9.145909,0.0,9.151757,9.219696,0.0,0.0,1426.0
663,8701,TCGA-WY-A85D,8.793612,0.000000,8.634443,8.751316,7.677400,2.197225,0.000000,5.983936,...,8.861208,3.637586,7.914983,8.731498,0.0,9.364005,8.856234,0.0,0.0,1147.0



Shape of the dataset (rows, columns):
(665, 752)

Data types of each column:
Unnamed: 0      int64
Patient ID     object
LASP1         float64
HOXA11        float64
CREBBP        float64
               ...   
TAF15         float64
MLLT6         float64
DUX4L1        float64
OS            float64
OS.time       float64
Length: 752, dtype: object

Count of missing values in each column:
Unnamed: 0    0
Patient ID    0
LASP1         0
HOXA11        0
CREBBP        0
             ..
TAF15         0
MLLT6         0
DUX4L1        0
OS            0
OS.time       0
Length: 752, dtype: int64


In [5]:
data = data.drop(['Unnamed: 0','Patient ID'],axis=1)
analyze_file(data)

Dataset:


Unnamed: 0,LASP1,HOXA11,CREBBP,ETV1,GAS7,CD79B,PAX7,BTK,BRCA1,WAS,...,CUX1,PIERCE2,SRGAP2,NCOA4,SSX4,TAF15,MLLT6,DUX4L1,OS,OS.time
0,8.923858,3.526361,8.244071,8.900549,9.184510,3.555348,2.302585,6.255750,6.257668,7.026427,...,8.282483,4.127134,8.053251,9.012621,0.0,8.319474,8.334952,0.0,1.0,448.0
1,8.460199,3.970292,7.816014,7.650645,7.964156,2.890372,1.609438,6.614726,5.605802,7.315884,...,8.368693,4.276666,7.794823,8.727130,0.0,8.235361,7.855157,0.0,1.0,76.0
2,9.164296,4.682131,8.572628,10.186973,8.340217,2.944439,1.609438,6.146329,6.932448,6.867974,...,8.606302,3.931826,8.222554,9.270965,0.0,8.883779,8.491260,0.0,0.0,466.0
3,9.133675,3.970292,8.474077,9.155250,6.938284,2.564949,1.791759,5.780744,6.970730,6.242223,...,8.629629,4.369448,8.235361,8.687779,0.0,9.019301,8.461046,0.0,0.0,470.0
4,8.626765,3.526361,7.090910,6.838405,8.690642,2.995732,0.693147,7.329750,5.683580,7.600902,...,8.512382,5.793014,7.963460,8.760139,0.0,8.525161,8.006034,0.0,1.0,618.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,9.093245,1.791759,8.631414,10.231928,7.642524,0.693147,2.397895,5.602119,5.849325,6.272877,...,8.810460,3.891820,7.765993,8.910586,0.0,8.927977,9.006509,0.0,0.0,1320.0
661,8.977778,2.397895,8.905037,10.492523,7.574558,2.484907,1.386294,6.572283,6.376727,7.059618,...,9.015906,4.234107,7.955074,9.328568,0.0,9.316770,9.229849,0.0,0.0,1393.0
662,8.820552,6.282267,9.278466,9.983315,7.007601,1.945910,1.945910,5.730100,6.347389,6.428105,...,9.177714,3.610918,8.107720,9.145909,0.0,9.151757,9.219696,0.0,0.0,1426.0
663,8.793612,0.000000,8.634443,8.751316,7.677400,2.197225,0.000000,5.983936,5.796058,6.858565,...,8.861208,3.637586,7.914983,8.731498,0.0,9.364005,8.856234,0.0,0.0,1147.0



Shape of the dataset (rows, columns):
(665, 750)

Data types of each column:
LASP1      float64
HOXA11     float64
CREBBP     float64
ETV1       float64
GAS7       float64
            ...   
TAF15      float64
MLLT6      float64
DUX4L1     float64
OS         float64
OS.time    float64
Length: 750, dtype: object

Count of missing values in each column:
LASP1      0
HOXA11     0
CREBBP     0
ETV1       0
GAS7       0
          ..
TAF15      0
MLLT6      0
DUX4L1     0
OS         0
OS.time    0
Length: 750, dtype: int64


In [6]:
column_names = data.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)

# Output results
#print("Test Concordance Index:", test_concordance)
#print("\nModel Coefficients:")
#print(coefficients)


OS
OS.time
Fold 1 Concordance Index: 0.7933130699088146
Fold 2 Concordance Index: 0.7743031951053705
Fold 3 Concordance Index: 0.812539184952978
Fold 4 Concordance Index: 0.8076639646278556
Fold 5 Concordance Index: 0.826735885788449
Average Concordance Index: 0.8029110600766934
Test Data Concordance Index: 0.846307385229541
LASP1     0.307537
HOXA11   -0.810686
CREBBP    0.434702
ETV1     -0.563096
GAS7      0.523358
            ...   
NCOA4    -0.894213
SSX4     -0.044134
TAF15    -0.401231
MLLT6    -0.278987
DUX4L1    0.000000
Length: 748, dtype: float64


In [7]:
gene_c_index = pd.read_csv('/Users/simrantanwar/Desktop/College/DDP/survival_analysis/data/gene_expression_c_index copy.csv')
#df_sorted = gene_c_index.sort_values(by = 'c_index',ascending=False)
gene_c_index.head()

Unnamed: 0,gene_expression,c_index
0,HOXD11,0.795951
1,MSN,0.79488
2,FKBP9,0.793804
3,HOXC13,0.792626
4,CSMD3,0.791731


# C-index

# top 10

In [8]:
gene_expressions_10 = gene_c_index['gene_expression'][:10].tolist()
print(gene_expressions_10)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = gene_expressions_10 + target_columns

# Create a new DataFrame with these columns
gene_expressions_10_df = data[columns_to_select]

['HOXD11', 'MSN', 'FKBP9', 'HOXC13', 'CSMD3', 'PAX3', 'TMSB4X', 'HMGA2', 'BCR', 'CPEB3']


In [9]:
column_names = gene_expressions_10_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)

# Output results
#print("Test Concordance Index:", test_concordance)
#print("\nModel Coefficients:")
#print(coefficients)


OS
OS.time
Fold 1 Concordance Index: 0.8328267477203647
Fold 2 Concordance Index: 0.7709041468388851
Fold 3 Concordance Index: 0.8294670846394985
Fold 4 Concordance Index: 0.823876197494473
Fold 5 Concordance Index: 0.8682673588578845
Average Concordance Index: 0.8250683071102213
Test Data Concordance Index: 0.8585685771314514
HOXD11    0.215545
MSN       0.352890
FKBP9     0.369941
HOXC13    0.248283
CSMD3     0.045269
PAX3      0.102294
TMSB4X    0.085048
HMGA2     0.071469
BCR      -0.343771
CPEB3    -0.014962
dtype: float64


# top 20

In [10]:
gene_expressions_20 = gene_c_index['gene_expression'][:20].tolist()
print(gene_expressions_20)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = gene_expressions_20 + target_columns

# Create a new DataFrame with these columns
gene_expressions_20_df = data[columns_to_select]

['HOXD11', 'MSN', 'FKBP9', 'HOXC13', 'CSMD3', 'PAX3', 'TMSB4X', 'HMGA2', 'BCR', 'CPEB3', 'IGF2BP2', 'ALDH2', 'ABI1', 'SOCS1', 'KAT6B', 'MN1', 'RUNX1T1', 'MYD88', 'MACC1', 'FOXO4']


In [11]:
column_names = gene_expressions_20_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)



OS
OS.time
Fold 1 Concordance Index: 0.8115501519756839
Fold 2 Concordance Index: 0.7865397688647179
Fold 3 Concordance Index: 0.8112852664576803
Fold 4 Concordance Index: 0.8364038319823139
Fold 5 Concordance Index: 0.8617780661907852
Average Concordance Index: 0.8215114170942363
Test Data Concordance Index: 0.8614200171086399
HOXD11     0.163664
MSN        0.332832
FKBP9      0.225252
HOXC13     0.219895
CSMD3      0.118254
PAX3       0.084634
TMSB4X     0.158366
HMGA2      0.041705
BCR       -0.392972
CPEB3      0.185457
IGF2BP2    0.039050
ALDH2      0.010756
ABI1      -0.271523
SOCS1      0.126338
KAT6B      0.199787
MN1        0.150965
RUNX1T1   -0.187463
MYD88      0.187766
MACC1      0.020489
FOXO4     -0.153478
dtype: float64


# top 30

In [12]:
gene_expressions_30 = gene_c_index['gene_expression'][:30].tolist()
print(gene_expressions_10)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = gene_expressions_30 + target_columns

# Create a new DataFrame with these columns
gene_expressions_30_df = data[columns_to_select]

column_names = gene_expressions_30_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)



['HOXD11', 'MSN', 'FKBP9', 'HOXC13', 'CSMD3', 'PAX3', 'TMSB4X', 'HMGA2', 'BCR', 'CPEB3']
OS
OS.time
Fold 1 Concordance Index: 0.7857142857142857
Fold 2 Concordance Index: 0.8055744391570361
Fold 3 Concordance Index: 0.8163009404388715
Fold 4 Concordance Index: 0.8422991893883567
Fold 5 Concordance Index: 0.8656716417910447
Average Concordance Index: 0.823112099297919
Test Data Concordance Index: 0.870544625035643
HOXD11     0.204569
MSN        0.263503
FKBP9      0.176913
HOXC13     0.333878
CSMD3      0.204691
PAX3       0.066949
TMSB4X     0.355162
HMGA2     -0.066057
BCR       -0.370482
CPEB3      0.065264
IGF2BP2    0.167460
ALDH2     -0.004151
ABI1      -0.322862
SOCS1      0.127097
KAT6B      0.101294
MN1        0.214413
RUNX1T1   -0.093548
MYD88      0.232993
MACC1     -0.100252
FOXO4     -0.227568
PATZ1     -0.698401
PTPRT      0.110296
NUMA1      0.962997
BCL7A      0.073727
HLF        0.015483
BCL3       0.049788
TET1       0.054345
LRP1B      0.020867
MLLT6     -0.341527
OLI

# top 50

In [13]:
gene_expressions_50 = gene_c_index['gene_expression'][:50].tolist()
print(gene_expressions_10)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = gene_expressions_50 + target_columns

# Create a new DataFrame with these columns
gene_expressions_50_df = data[columns_to_select]

column_names = gene_expressions_50_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)



['HOXD11', 'MSN', 'FKBP9', 'HOXC13', 'CSMD3', 'PAX3', 'TMSB4X', 'HMGA2', 'BCR', 'CPEB3']
OS
OS.time
Fold 1 Concordance Index: 0.8168693009118541
Fold 2 Concordance Index: 0.778382053025153
Fold 3 Concordance Index: 0.8288401253918495
Fold 4 Concordance Index: 0.8282977155490051
Fold 5 Concordance Index: 0.8604802076573653
Average Concordance Index: 0.8225738805070453
Test Data Concordance Index: 0.8637011690903906
HOXD11      0.254980
MSN         0.089796
FKBP9       0.045796
HOXC13      0.287501
CSMD3       0.212408
PAX3        0.126535
TMSB4X      0.677608
HMGA2      -0.035175
BCR        -0.478221
CPEB3      -0.440076
IGF2BP2     0.191155
ALDH2       0.244390
ABI1       -0.182505
SOCS1       0.238210
KAT6B       0.206629
MN1         0.038503
RUNX1T1     0.082642
MYD88       0.345199
MACC1      -0.041862
FOXO4       0.030114
PATZ1      -0.787526
PTPRT       0.189065
NUMA1       1.249818
BCL7A      -0.106863
HLF        -0.093394
BCL3        0.104274
TET1        0.223593
LRP1B       0.0

# Correlated

In [14]:
target_columns = ['OS','OS.time']
corr_value = get_high_correlation_features(data, target_columns, threshold=0.3)
#print(corr_value)
print(len(corr_value))
sorted_corr_value = dict(sorted(corr_value.items(), key=lambda item: item[1], reverse=True))


131


# top 10

In [15]:
top_10_genes = list(sorted_corr_value.keys())[:10]
print(top_10_genes)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = top_10_genes + target_columns

# Create a new DataFrame with these columns
top_10_genes_df = data[columns_to_select]

column_names = top_10_genes_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)

['HOXD11', 'MSN', 'IGF2BP2', 'TMSB4X', 'HOXC11', 'FKBP9', 'HOXC13', 'HOXD13', 'HMGA2', 'CDKN2C']
OS
OS.time
Fold 1 Concordance Index: 0.8168693009118541
Fold 2 Concordance Index: 0.77906186267845
Fold 3 Concordance Index: 0.8106583072100313
Fold 4 Concordance Index: 0.8312453942520265
Fold 5 Concordance Index: 0.8715120051914341
Average Concordance Index: 0.8218693740487593
Test Data Concordance Index: 0.8448816652409467
HOXD11     0.189379
MSN        0.204836
IGF2BP2    0.077783
TMSB4X     0.295567
HOXC11     0.564582
FKBP9      0.348741
HOXC13    -0.179171
HOXD13     0.010995
HMGA2      0.150032
CDKN2C     0.007428
dtype: float64


# top 20

In [16]:
top_20_genes = list(sorted_corr_value.keys())[:20]
print(top_20_genes)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = top_20_genes + target_columns

# Create a new DataFrame with these columns
top_20_genes_df = data[columns_to_select]

column_names = top_20_genes_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)

['HOXD11', 'MSN', 'IGF2BP2', 'TMSB4X', 'HOXC11', 'FKBP9', 'HOXC13', 'HOXD13', 'HMGA2', 'CDKN2C', 'STIL', 'MACC1', 'PAX3', 'CHEK2', 'SOCS1', 'BCL2L12', 'COL3A1', 'LATS2', 'TPM4', 'ASPM']
OS
OS.time
Fold 1 Concordance Index: 0.8358662613981763
Fold 2 Concordance Index: 0.7831407205982325
Fold 3 Concordance Index: 0.8169278996865204
Fold 4 Concordance Index: 0.8459837877671333
Fold 5 Concordance Index: 0.8559377027903958
Average Concordance Index: 0.8275712744480916
Test Data Concordance Index: 0.8622754491017964
HOXD11     0.185054
MSN        0.163556
IGF2BP2   -0.004158
TMSB4X     0.461054
HOXC11     0.469886
FKBP9      0.253069
HOXC13    -0.132602
HOXD13    -0.033937
HMGA2      0.208178
CDKN2C    -0.105883
STIL      -0.244773
MACC1      0.089830
PAX3       0.072445
CHEK2      0.031573
SOCS1      0.273820
BCL2L12   -0.337399
COL3A1    -0.008903
LATS2      0.081562
TPM4      -0.065709
ASPM       0.577151
dtype: float64


# top 30

In [17]:
top_30_genes = list(sorted_corr_value.keys())[:30]
print(top_30_genes)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = top_30_genes + target_columns

# Create a new DataFrame with these columns
top_30_genes_df = data[columns_to_select]

column_names = top_30_genes_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)

['HOXD11', 'MSN', 'IGF2BP2', 'TMSB4X', 'HOXC11', 'FKBP9', 'HOXC13', 'HOXD13', 'HMGA2', 'CDKN2C', 'STIL', 'MACC1', 'PAX3', 'CHEK2', 'SOCS1', 'BCL2L12', 'COL3A1', 'LATS2', 'TPM4', 'ASPM', 'HOXA9', 'BCL3', 'MYD88', 'CASP8', 'COL1A1', 'FCGR2B', 'PDCD1LG2', 'BRIP1', 'RMI2', 'BUB1B']
OS
OS.time
Fold 1 Concordance Index: 0.8328267477203647
Fold 2 Concordance Index: 0.7845003399048266
Fold 3 Concordance Index: 0.8100313479623824
Fold 4 Concordance Index: 0.8135593220338984
Fold 5 Concordance Index: 0.8500973393900065
Average Concordance Index: 0.8182030194022957
Test Data Concordance Index: 0.8637011690903906
HOXD11      0.245136
MSN         0.107795
IGF2BP2    -0.059545
TMSB4X      0.508607
HOXC11      0.555800
FKBP9       0.292064
HOXC13     -0.165310
HOXD13     -0.039051
HMGA2       0.291819
CDKN2C     -0.097266
STIL       -0.338153
MACC1       0.099815
PAX3        0.075325
CHEK2       0.025352
SOCS1       0.232120
BCL2L12    -0.352918
COL3A1     -0.383580
LATS2       0.096851
TPM4       -0

# top 50

In [18]:
top_50_genes = list(sorted_corr_value.keys())[:50]
print(top_50_genes)

# Assuming 'target' and 'duration' are already identified from the column names
target_columns = ['OS','OS.time']

# Combine the top 20 genes and the additional columns
columns_to_select = top_50_genes + target_columns

# Create a new DataFrame with these columns
top_50_genes_df = data[columns_to_select]

column_names = top_50_genes_df.columns.tolist()
features = column_names[:-2]
target = column_names[-2]
print(target)
duration = column_names[-1]
print(duration)

# Initialize CoxPipeline with default parameters
pipeline = CoxPipeline(alpha=0.5, n_splits=5, random_state=42)

# Run the pipeline
test_concordance, coefficients = pipeline.run_pipeline(data, features, target, duration)

['HOXD11', 'MSN', 'IGF2BP2', 'TMSB4X', 'HOXC11', 'FKBP9', 'HOXC13', 'HOXD13', 'HMGA2', 'CDKN2C', 'STIL', 'MACC1', 'PAX3', 'CHEK2', 'SOCS1', 'BCL2L12', 'COL3A1', 'LATS2', 'TPM4', 'ASPM', 'HOXA9', 'BCL3', 'MYD88', 'CASP8', 'COL1A1', 'FCGR2B', 'PDCD1LG2', 'BRIP1', 'RMI2', 'BUB1B', 'POLQ', 'RUNX1', 'APOBEC3B', 'TNFRSF14', 'FANCD2', 'KNL1', 'ELF4', 'CASP3', 'BRCA2', 'FADD', 'HOXA11', 'HK3', 'TFRC', 'SLC34A2', 'FANCA', 'SFRP4', 'CD79B', 'CHIC2', 'WT1', 'CD74']
OS
OS.time
Fold 1 Concordance Index: 0.8123100303951368
Fold 2 Concordance Index: 0.8008157715839564
Fold 3 Concordance Index: 0.7749216300940439
Fold 4 Concordance Index: 0.8305084745762712
Fold 5 Concordance Index: 0.8416612589227774
Average Concordance Index: 0.8120434331144372
Test Data Concordance Index: 0.8756772169945822
HOXD11      0.249465
MSN         0.212950
IGF2BP2    -0.135972
TMSB4X      0.596003
HOXC11      0.415499
FKBP9       0.231488
HOXC13     -0.019015
HOXD13     -0.127883
HMGA2       0.328153
CDKN2C     -0.050140
S