In [None]:
# Install lifelines, which supplies the CoxPHFitter and concordance_index imported below.
!pip install lifelines

In [None]:
# Install scikit-survival.
# NOTE(review): no visible cell imports anything from scikit-survival — confirm it is still needed.
!pip install scikit-survival


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

In [None]:
# Load data
# Read the training set, the test set and the data dictionary, in that order.
train, test, data_dict = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/equity-post-HCT-survival-predictions/train.csv',
        '/kaggle/input/equity-post-HCT-survival-predictions/test.csv',
        '/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv',
    )
)


In [None]:
# Quick data inspection
# Report the dimensions of both frames and the training column names.
for _label, _value in (
    ("Train Data Shape:", train.shape),
    ("Test Data Shape:", test.shape),
    ("Columns in Train:", train.columns),
):
    print(_label, _value)


In [None]:
# Extract target variables
# Names of the event-indicator column and the duration column used throughout the notebook.
event_col, time_col = 'efs', 'efs_time'

In [None]:
# Preprocess the training data
# Encode categorical variables
# Every object/category column is integer-encoded in place; the fitted encoder
# for each column is kept in `label_encoders` so it can be reused later.
categorical_cols = train.select_dtypes(include=['object', 'category']).columns
label_encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    # Missing entries become the explicit 'Missing' category before encoding.
    train[col] = encoder.fit_transform(train[col].fillna('Missing'))
    label_encoders[col] = encoder


In [None]:
# Scale numerical variables excluding race_group
# The two target columns and the race_group label are kept out of the scaler.
numerical_cols = (
    train.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop([time_col, event_col, 'race_group'])
)
scaler = StandardScaler()
# NaNs are replaced by 0 first, then the filled columns are standardised in place.
train[numerical_cols] = scaler.fit_transform(train[numerical_cols].fillna(0))

# Ensure race_group remains an integer
# Only re-encode when the column is not already of integer dtype.
if train['race_group'].dtype != 'int':
    race_group_encoder = LabelEncoder()
    train['race_group'] = race_group_encoder.fit_transform(train['race_group'])
train['race_group'] = train['race_group'].astype(int)

# Stratification label used for the cross-validation splits.
y = train['race_group']



In [None]:
# Scale numerical variables
# race_group is excluded here, exactly as in the other scaling cells: it is the
# integer stratification label, and standardising it would turn the group codes
# into floats that then have to be re-encoded.
# NOTE(review): the previous scaling cell already standardised these columns, so
# this re-fit makes `scaler` learn statistics of already-scaled data — confirm
# the repeated fit is intended before `scaler` is reused on other data.
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns.drop([time_col, event_col, 'race_group'])
scaler = StandardScaler()
train[numerical_cols] = train[numerical_cols].fillna(0)  # Fill NaNs with 0 before scaling
train[numerical_cols] = scaler.fit_transform(train[numerical_cols])


In [None]:
def stratified_c_index(df, risk_scores, time_col, event_col, stratify_col):
    """Return the mean minus the standard deviation of per-group concordance indices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the duration, event and stratification columns.
    risk_scores : pandas.Series
        Predicted risk per row, indexed like (a subset of) ``df``. Higher values
        must mean higher risk, i.e. a shorter expected time to event.
    time_col, event_col, stratify_col : str
        Names of the duration, event-indicator and grouping columns in ``df``.

    Returns
    -------
    float
        ``mean(c) - std(c)`` over the concordance index ``c`` of every group
        that has at least one scored row.

    Raises
    ------
    ValueError
        If no row of ``df`` has a matching entry in ``risk_scores``.
    """
    c_indices = []

    for group in df[stratify_col].unique():
        # Rows of this group that actually received a score.
        group_index = df.index[df[stratify_col] == group].intersection(risk_scores.index)
        if len(group_index) == 0:
            # Nothing to evaluate for this group (e.g. it is absent from the fold).
            continue

        group_data = df.loc[group_index]
        group_risk = risk_scores.loc[group_index]

        # concordance_index treats larger scores as longer survival, whereas a
        # risk score is larger for shorter survival, so the sign is flipped.
        c_index = concordance_index(group_data[time_col], -group_risk, group_data[event_col])
        c_indices.append(c_index)

    if not c_indices:
        raise ValueError("risk_scores shares no index labels with df; nothing to evaluate")

    return np.mean(c_indices) - np.std(c_indices)



In [None]:
# Scale numerical variables excluding race_group
# NOTE(review): an earlier cell already fits a StandardScaler on these columns;
# confirm that fitting `scaler` again on the transformed values is intended.
numerical_cols = (
    train.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop([time_col, event_col, 'race_group'])
)
scaler = StandardScaler()
# NaNs are replaced by 0 first, then the filled columns are standardised in place.
train[numerical_cols] = scaler.fit_transform(train[numerical_cols].fillna(0))

# Ensure race_group remains as a categorical variable
# Re-encode only when the column is not already of integer dtype, then cast explicitly.
if train['race_group'].dtype != 'int':
    race_group_encoder = LabelEncoder()
    train['race_group'] = race_group_encoder.fit_transform(train['race_group'])
train['race_group'] = train['race_group'].astype(int)

# Prepare data for Cox Proportional Hazards model
# Features are everything except the two targets and the stratification label.
y = train['race_group']
X = train.drop([time_col, event_col, 'race_group'], axis=1)

# Validation to check if y is suitable for StratifiedKFold
unique_groups = np.unique(y)
print("Unique values in y before splitting:", unique_groups)
assert y.dtype == int, "y should be integer type for StratifiedKFold"
assert len(unique_groups) > 1, "y must have more than one unique value for stratification"


In [None]:
# (Removed) This cell called cox_model.predict_partial_hazard(X_val) before either
# `cox_model` or `X_val` had been assigned, so it raised NameError on a fresh
# Restart & Run All. The same per-fold prediction is performed inside the
# cross-validation loop in the following cell.


In [None]:
# Proceed with StratifiedKFold
# Folds are stratified on y (race_group).
skf = StratifiedKFold(n_splits=5)
for train_index, val_index in skf.split(X, y):
    # The validation slice is copied explicitly: a column is added to it below,
    # and writing into a bare .iloc slice triggers pandas' SettingWithCopyWarning.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index].copy()
    # Despite their names, these hold the full training rows (features and targets).
    y_train, y_val = train.iloc[train_index], train.iloc[val_index]

    # Train Cox Proportional Hazards model on the fold's features plus the
    # duration and event columns.
    cox_model = CoxPHFitter()
    cox_model.fit(pd.concat([X_train, y_train[[time_col, event_col]]], axis=1), duration_col=time_col, event_col=event_col)

    # Predict risk scores; the result keeps X_val's index so it can be aligned later.
    X_val['risk_score'] = cox_model.predict_partial_hazard(X_val)

    # Use risk scores directly (indices aligned with X_val)
    risk_scores = X_val['risk_score']

    # Calculate Stratified Concordance Index
    c_index_score = stratified_c_index(train, risk_scores, time_col, event_col, stratify_col='race_group')

    print("Stratified Concordance Index on Validation:", c_index_score)



In [None]:
# Prepare data for Cox Proportional Hazards model
# Features are everything except the two targets and the stratification label.
X = train.drop(columns=[time_col, event_col, 'race_group'])
y = train['race_group']  # Ensure y is the correct column for stratification

# Validation to check if y is suitable for StratifiedKFold
print("Unique values in y before splitting:", np.unique(y))
assert y.dtype == int, "y should be integer type for StratifiedKFold"
assert len(np.unique(y)) > 1, "y must have more than one unique value for stratification"

# Proceed with StratifiedKFold
# NOTE(review): this repeats the cross-validation of the preceding cell with the
# same unshuffled splitter — consider keeping only one of the two.
skf = StratifiedKFold(n_splits=5)
for train_index, val_index in skf.split(X, y):
    # The validation slice is copied explicitly: a column is added to it below,
    # and writing into a bare .iloc slice triggers pandas' SettingWithCopyWarning.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index].copy()
    # Despite their names, these hold the full training rows (features and targets).
    y_train, y_val = train.iloc[train_index], train.iloc[val_index]

    # Train Cox Proportional Hazards model on the fold's features plus the
    # duration and event columns.
    cox_model = CoxPHFitter()
    cox_model.fit(pd.concat([X_train, y_train[[time_col, event_col]]], axis=1), duration_col=time_col, event_col=event_col)

    # Calculate risk scores and Stratified Concordance Index
    X_val['risk_score'] = cox_model.predict_partial_hazard(X_val)
    c_index_score = stratified_c_index(train, X_val['risk_score'], time_col, event_col, stratify_col='race_group')

    print("Stratified Concordance Index on Validation:", c_index_score)



In [None]:
# Split data for cross-validation
# NOTE(review): this repeats the cross-validation of the preceding cells with the
# same unshuffled splitter — consider keeping only one copy.
skf = StratifiedKFold(n_splits=5)
for train_index, val_index in skf.split(X, y):
    # The validation slice is copied explicitly: a column is added to it below,
    # and writing into a bare .iloc slice triggers pandas' SettingWithCopyWarning.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index].copy()
    # Despite their names, these hold the full training rows (features and targets).
    y_train, y_val = train.iloc[train_index], train.iloc[val_index]

    # Train Cox Proportional Hazards model on the fold's features plus the
    # duration and event columns.
    cox_model = CoxPHFitter()
    cox_model.fit(pd.concat([X_train, y_train[[time_col, event_col]]], axis=1), duration_col=time_col, event_col=event_col)

    # Calculate risk scores and Stratified Concordance Index
    X_val['risk_score'] = cox_model.predict_partial_hazard(X_val)
    c_index_score = stratified_c_index(train, X_val['risk_score'], time_col, event_col, stratify_col='race_group')

    print("Stratified Concordance Index on Validation:", c_index_score)

In [None]:
# Check for NaN values before fitting
# If any feature cell is missing, report it and replace all missing values with 0.
if X.isnull().values.any():
    print("NaN values found in features. Filling with 0.")
    X = X.fillna(0)

In [None]:
# Ensure race_group is categorical
print("Unique values in y before splitting:", np.unique(y))
assert y.dtype == int, "y should be integer type for StratifiedKFold"

# Ensure no NaNs or unexpected values in y
assert not y.isnull().any(), "y contains NaNs!"
assert len(np.unique(y)) > 1, "y must have more than one unique value for stratification"

# Proceed with StratifiedKFold
# NOTE(review): the `cox_model` left behind after this loop is the one fitted on
# the last fold's training rows only; it is what the test-set prediction cell
# uses — confirm that a model fitted on all rows is not wanted instead.
skf = StratifiedKFold(n_splits=5)
for train_index, val_index in skf.split(X, y):
    # The validation slice is copied explicitly: a column is added to it below,
    # and writing into a bare .iloc slice triggers pandas' SettingWithCopyWarning.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index].copy()
    # Despite their names, these hold the full training rows (features and targets).
    y_train, y_val = train.iloc[train_index], train.iloc[val_index]

    # Train Cox Proportional Hazards model on the fold's features plus the
    # duration and event columns.
    cox_model = CoxPHFitter()
    cox_model.fit(pd.concat([X_train, y_train[[time_col, event_col]]], axis=1), duration_col=time_col, event_col=event_col)

    # Calculate risk scores and Stratified Concordance Index
    X_val['risk_score'] = cox_model.predict_partial_hazard(X_val)
    c_index_score = stratified_c_index(train, X_val['risk_score'], time_col, event_col, stratify_col='race_group')

    print("Stratified Concordance Index on Validation:", c_index_score)


In [None]:
# Preprocess test data
# Keep the untouched identifiers first. `numerical_cols` is built from every
# float64/int64 column of train, so if `ID` is numeric it is among the columns
# that scaler.transform overwrites below — the submission must not carry those
# transformed values.
test_ids = test['ID'].copy()

for col in categorical_cols:
    test[col] = test[col].fillna('Missing')
    if col in label_encoders:
        known_classes = label_encoders[col].classes_
        known_lookup = set(known_classes)
        fallback = known_classes[0]
        # Categories the encoder never saw are replaced by its first class so
        # that transform() does not raise on unknown labels.
        test[col] = test[col].map(lambda x: x if x in known_lookup else fallback)
        test[col] = label_encoders[col].transform(test[col])

# Add any numerical column the test frame lacks, with default value 0.
for col in numerical_cols:
    if col not in test.columns:
        test[col] = 0

test[numerical_cols] = test[numerical_cols].fillna(0)  # Fill NaNs with 0 before scaling
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Predict on test set
test['risk_score'] = cox_model.predict_partial_hazard(test)

# Build the submission from the saved raw IDs; constructing a fresh frame also
# avoids renaming the columns of a slice of `test`.
submission = pd.DataFrame({'ID': test_ids, 'prediction': test['risk_score']})

In [None]:
# Save submission file
# Write the submission frame to CSV without the DataFrame index, then print the output location.
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission file saved as '/kaggle/working/submission.csv'")