In [1]:
# Install required libraries
!pip install optuna xgboost pandas numpy scikit-learn lifelines

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from lifelines.utils import concordance_index
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest, f_regression
import optuna

# Step 1: Load and Preprocess Dataset
def load_and_preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate numeric and categorical columns
    numeric_cols = train_data.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = train_data.select_dtypes(include=["object"]).columns.tolist()

    # Handle missing values
    train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].median())
    test_data = test_data.reindex(columns=train_data.columns, fill_value=0)
    test_data[numeric_cols] = test_data[numeric_cols].fillna(train_data[numeric_cols].median())

    train_data[categorical_cols] = train_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])
    test_data[categorical_cols] = test_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])

    # Key columns
    key_columns = ["ID", "efs", "efs_time"]
    if 'survival_time' in train_data.columns and 'event_occurred' in train_data.columns:
        key_columns.extend(["survival_time", "event_occurred"])

    key_train_data = train_data[key_columns]
    key_test_data = test_data[["ID"]]

    # One-hot encode categorical variables
    train_data = pd.get_dummies(train_data.drop(columns=key_columns), drop_first=True)
    test_data = pd.get_dummies(test_data.drop(columns=["ID"]), drop_first=True)

    train_data, test_data = train_data.align(test_data, join="left", axis=1)
    test_data.fillna(0, inplace=True)

    # Reattach key columns
    train_data = pd.concat([key_train_data, train_data], axis=1)
    test_data = pd.concat([key_test_data, test_data], axis=1)

    train_data = shuffle(train_data, random_state=42)
    return train_data, test_data

# Step 2: Prepare Features and Targets
def prepare_features_and_targets(train_data, test_data):
    X = train_data.drop(columns=["ID", "efs", "efs_time"])
    target_column = "efs_time" if "efs_time" in train_data.columns else "survival_time"
    y = train_data[target_column]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    selector = SelectKBest(f_regression, k=min(10, X.shape[1]))
    X_selected = selector.fit_transform(X_scaled, y)

    X_test = test_data.drop(columns=["ID"])
    X_test_scaled = scaler.transform(X_test)
    X_test_selected = selector.transform(X_test_scaled)

    return X_selected, y, X_test_selected, test_data["ID"]

# Step 3: Define Objective Function for Optuna
def objective(trial, X, y):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
    }

    model = XGBRegressor(**params, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    c_indices = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)
        c_index = concordance_index(y_val, -val_predictions)
        c_indices.append(c_index)

    return np.mean(c_indices)

# Step 4: Hyperparameter Optimization
def optimize_hyperparameters(X, y, n_trials=50):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X, y), n_trials=n_trials)
    return study.best_params

# Step 5: Train Final Model and Make Predictions
def train_and_predict(X, y, X_test, best_params):
    best_model = XGBRegressor(**best_params, random_state=42)
    best_model.fit(X, y)
    predictions = best_model.predict(X_test)
    return predictions

# Main Execution
train_path = "/kaggle/input/hcat-data/train.csv"
test_path = "/kaggle/input/hcat-data/test.csv"
train_data, test_data = load_and_preprocess_data(train_path, test_path)
X, y, X_test, test_ids = prepare_features_and_targets(train_data, test_data)

best_params = optimize_hyperparameters(X, y, n_trials=50)
print("Best parameters:", best_params)

predictions = train_and_predict(X, y, X_test, best_params)

# Create Submission File
submission = pd.DataFrame({
    "ID": test_ids.astype(int),
    "prediction": predictions
})
submission.to_csv("submission.csv", index=False)

# Additional Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate_model(X, y, best_params):
    best_model = XGBRegressor(**best_params, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    mae_scores, rmse_scores, c_indices = [], [], []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        best_model.fit(X_train, y_train)
        val_predictions = best_model.predict(X_val)

        mae_scores.append(mean_absolute_error(y_val, val_predictions))
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, val_predictions)))
        c_indices.append(concordance_index(y_val, -val_predictions))

    print("Mean Absolute Error (MAE):", np.mean(mae_scores))
    print("Root Mean Squared Error (RMSE):", np.mean(rmse_scores))
    print("Concordance Index (C-index):", np.mean(c_indices))

evaluate_model(X, y, best_params)


Collecting lifelines
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading lifelines-0.30.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading formulaic-1.1.1-py3-none-any.whl (115 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for aut

[I 2025-01-26 10:29:47,228] A new study created in memory with name: no-name-5c2177fb-d26b-4678-a41b-9279115973db
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
[I 2025-01-26 10:30:01,548] Trial 0 finished with value: 0.407588682892182 and parameters: {'n_estimators': 869, 'learning_rate': 0.02147880014276471, 'max_depth': 9, 'subsample': 0.9709043136355503, 'colsample_bytree': 0.9779429622217032, 'reg_alpha': 0.07758691682618124, 'reg_lambda': 0.003536448404801696}. Best is trial 0 with value: 0.407588682892182.
[I 2025-01-26 10:30:08,270] Trial 1 finished with value: 0.3939604632774049 and parameters: {'n_estimators': 1196, 'learning_rate': 0.08265985920696794, 'max_depth': 3, 'subsample': 0.65

Best parameters: {'n_estimators': 1381, 'learning_rate': 0.09074663435834068, 'max_depth': 11, 'subsample': 0.7860117323972773, 'colsample_bytree': 0.9699622653443711, 'reg_alpha': 0.06501236757815101, 'reg_lambda': 0.056270653111641214}
Mean Absolute Error (MAE): 19.9054285697362
Root Mean Squared Error (RMSE): 28.04351630160472
Concordance Index (C-index): 0.4412656714841128
