In [2]:
!pip install lifelines

Collecting lifelines
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting autograd>=1.5 (from lifelines)
  Downloading autograd-1.7.0-py3-none-any.whl.metadata (7.5 kB)
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading lifelines-0.30.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading autograd-1.7.0-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading formulaic-1.0.2-py3-

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

# Load data
train = pd.read_csv('/mnt/data/train.csv')
test = pd.read_csv('/mnt/data/test.csv')
data_dict = pd.read_csv('/mnt/data/data_dictionary.csv')

# Quick data inspection
print("Train Data Shape:", train.shape)
print("Test Data Shape:", test.shape)
print("Columns in Train:", train.columns)

# Extract target variables
event_col = 'efs'
time_col = 'efs_time'

# Numerical *feature* columns to standardise.
# This selection must happen BEFORE label encoding: LabelEncoder emits integer
# codes, so selecting numeric dtypes afterwards would also sweep up every
# encoded categorical column. The targets and the row identifier are excluded
# as well: standardising the 0/1 event indicator destroys its meaning,
# standardising 'ID' corrupts the identifiers, and the target columns are not
# available when the same scaler is later applied to the test frame.
non_feature_cols = [col for col in (time_col, event_col, 'ID') if col in train.columns]
numerical_cols = (
    train.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(non_feature_cols)
)

# Preprocess the training data
# Encode categorical variables
categorical_cols = train.select_dtypes(include=['object', 'category']).columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Missing values become their own explicit category.
    train[col] = train[col].fillna('Missing')
    train[col] = le.fit_transform(train[col])
    # Keep the fitted encoder so the same mapping can be applied to test data.
    label_encoders[col] = le

# Scale numerical variables
scaler = StandardScaler()
train[numerical_cols] = scaler.fit_transform(train[numerical_cols])
# StandardScaler passes NaNs through untouched. After standardisation the
# training mean of every scaled column is 0, so filling with 0.0 is mean
# imputation; the Cox model fitted downstream cannot handle NaNs.
train[numerical_cols] = train[numerical_cols].fillna(0.0)

# Stratified Concordance Index calculation
def stratified_c_index(df, risk_scores, time_col, event_col, stratify_col):
    """Compute the stratified concordance index: mean minus std of per-group C-indices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the duration, event and stratification columns.
    risk_scores : pandas.Series or array-like
        Risk score per row of ``df``, where a HIGHER value means HIGHER risk
        (i.e. shorter expected survival), e.g. Cox partial hazards. A Series is
        looked up by the index labels of ``df``; any other array-like is
        assumed to be in the same row order as ``df``.
    time_col : str
        Name of the duration column in ``df``.
    event_col : str
        Name of the event-observed column in ``df``.
    stratify_col : str
        Name of the column whose distinct values define the groups.

    Returns
    -------
    float
        ``mean(c_indices) - std(c_indices)`` over the groups, using the
        population standard deviation (``np.std`` default).
    """
    if isinstance(risk_scores, pd.Series):
        scores = risk_scores
    else:
        # Plain arrays carry no labels: attach df's index so that per-group
        # label lookups below pick the right rows even when df's index is not
        # a default RangeIndex.
        scores = pd.Series(np.asarray(risk_scores).ravel(), index=df.index)

    c_indices = []

    # groupby skips rows whose stratification value is NaN; comparing against a
    # NaN value from unique() would select zero rows and yield an empty group.
    for _, group_data in df.groupby(stratify_col, sort=False):
        group_scores = scores.loc[group_data.index]
        # concordance_index expects scores where larger means LONGER survival.
        # Risk scores run the other way, so they are negated; without this the
        # result is the complement of the true concordance.
        c_index = concordance_index(
            group_data[time_col],
            -group_scores,
            group_data[event_col],
        )
        c_indices.append(c_index)

    return np.mean(c_indices) - np.std(c_indices)

# Prepare data for CoxPH model
# The targets and the stratification column are not covariates; neither is
# 'ID', which is only a row identifier and must not be fed to the model.
excluded_cols = [time_col, event_col, 'race_group']
if 'ID' in train.columns:
    excluded_cols.append('ID')

X = train.drop(columns=excluded_cols)
# Numerical features were standardised upstream, so 0.0 is their training
# mean and this is mean imputation. CoxPHFitter does not accept NaNs.
X = X.fillna(0.0)
y = train[[time_col, event_col]]

cox_model = CoxPHFitter()
cox_model.fit(pd.concat([X, y], axis=1), duration_col=time_col, event_col=event_col)

# Calculate risk scores and Stratified Concordance Index
train['risk_score'] = cox_model.predict_partial_hazard(X)
c_index_score = stratified_c_index(train, train['risk_score'], time_col, event_col, stratify_col='race_group')

print("Stratified Concordance Index on Train:", c_index_score)

# Preprocess test data
# Capture the identifiers before any transformation touches the frame.
test_ids = test['ID'].copy()

for col in categorical_cols:
    if col not in test.columns:
        continue
    # Rebuild the fitted encoder's label -> code mapping (a label's code is its
    # position in classes_). Unlike LabelEncoder.transform, a dict lookup does
    # not raise on categories that never appeared in training.
    code_of = {label: code for code, label in enumerate(label_encoders[col].classes_)}
    # Unseen categories fall back to the 'Missing' code when the encoder has
    # one, otherwise to code 0.
    fallback_code = code_of.get('Missing', 0)
    test[col] = (
        test[col]
        .fillna('Missing')
        .map(code_of)
        .fillna(fallback_code)
        .astype(int)
    )

# The scaler must be given exactly the columns it was fitted on, and
# numerical_cols may contain columns absent from the test set (e.g. the event
# indicator). reindex supplies any absent column as NaN, which StandardScaler
# passes through; only columns that really exist in test are written back.
scale_cols = list(numerical_cols)
scaled_test = pd.DataFrame(
    scaler.transform(test.reindex(columns=scale_cols)),
    columns=scale_cols,
    index=test.index,
)
shared_cols = [col for col in scale_cols if col in test.columns and col != 'ID']
# 0.0 is the training mean after standardisation (mean imputation), matching
# the treatment of the training features above.
test[shared_cols] = scaled_test[shared_cols].fillna(0.0)

# Predict on test set
# Pass exactly the covariates the model was fitted on, in the same order.
test['risk_score'] = cox_model.predict_partial_hazard(test[X.columns])
submission = pd.DataFrame({'ID': test_ids, 'prediction': test['risk_score']})

# Save submission file
submission.to_csv('/mnt/data/submission.csv', index=False)
print("Submission file saved as 'submission.csv'")
