In [1]:
# Install required libraries
!pip install autograd==1.7.0 autograd-gamma==0.5.0 interface_meta==1.3.0 formulaic==1.0.2 lifelines==0.30.0
!pip install optuna xgboost lightgbm catboost pandas numpy scikit-learn

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from lifelines.utils import concordance_index
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.feature_selection import SelectKBest, f_regression
import optuna
import os

# Step 1: Load and Preprocess Dataset
def load_and_preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate numeric and categorical columns
    numeric_cols = train_data.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = train_data.select_dtypes(include=["object"]).columns.tolist()

    # Handle missing values
    train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].median())
    test_data = test_data.reindex(columns=train_data.columns, fill_value=0)
    test_data[numeric_cols] = test_data[numeric_cols].fillna(train_data[numeric_cols].median())

    train_data[categorical_cols] = train_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])
    test_data[categorical_cols] = test_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])

    # Key columns
    key_columns = ["ID", "efs", "efs_time"]
    key_train_data = train_data[key_columns]
    key_test_data = test_data[["ID"]]

    # One-hot encode categorical variables
    train_data = pd.get_dummies(train_data.drop(columns=key_columns), drop_first=True)
    test_data = pd.get_dummies(test_data.drop(columns=["ID"]), drop_first=True)

    train_data, test_data = train_data.align(test_data, join="left", axis=1)
    test_data.fillna(0, inplace=True)

    # Reattach key columns
    train_data = pd.concat([key_train_data, train_data], axis=1)
    test_data = pd.concat([key_test_data, test_data], axis=1)

    train_data = shuffle(train_data, random_state=42)
    return train_data, test_data

# Step 2: Prepare Features and Targets
def prepare_features_and_targets(train_data, test_data):
    X = train_data.drop(columns=["ID", "efs", "efs_time"])
    y = train_data["efs_time"]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    selector = SelectKBest(f_regression, k=min(10, X.shape[1]))
    X_selected = selector.fit_transform(X_scaled, y)

    X_test = test_data.drop(columns=["ID"])
    X_test_scaled = scaler.transform(X_test)
    X_test_selected = selector.transform(X_test_scaled)

    return X_selected, y, X_test_selected, test_data["ID"]

# Step 3: Define Objective Function for Optuna
def objective(trial, X, y):
    model_type = trial.suggest_categorical('model_type', ['xgboost', 'lightgbm', 'catboost'])
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
    }
    if model_type == 'xgboost':
        params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
        params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
        params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
        model = XGBRegressor(**params, random_state=42)
    elif model_type == 'lightgbm':
        params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
        params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
        params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
        model = LGBMRegressor(**params, random_state=42)
    elif model_type == 'catboost':
        model = CatBoostRegressor(**params, random_state=42, verbose=0)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    c_indices = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)
        c_index = concordance_index(y_val, -val_predictions)
        c_indices.append(c_index)

    return np.mean(c_indices)

# Step 4: Hyperparameter Optimization
def optimize_hyperparameters(X, y, n_trials=100):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X, y), n_trials=n_trials)
    return study.best_params

# Step 5: Train Final Model and Make Predictions
def train_and_predict(X, y, X_test, best_params):
    if best_params['model_type'] == 'xgboost':
        model = XGBRegressor(**best_params, random_state=42)
    elif best_params['model_type'] == 'lightgbm':
        model = LGBMRegressor(**best_params, random_state=42)
    elif best_params['model_type'] == 'catboost':
        model = CatBoostRegressor(**best_params, random_state=42, verbose=0)
    
    model.fit(X, y)
    predictions = model.predict(X_test)
    return predictions

# Main Execution
train_path = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
test_path = "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"
train_data, test_data = load_and_preprocess_data(train_path, test_path)
X, y, X_test, test_ids = prepare_features_and_targets(train_data, test_data)

# Hyperparameter Optimization
best_params = optimize_hyperparameters(X, y, n_trials=100)
print("Best parameters:", best_params)

# Train Final Model and Make Predictions
predictions = train_and_predict(X, y, X_test, best_params)

# Step 1: Load the Submission File
submission = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv")

# Step 3: Save the Submission File
submission = pd.DataFrame({
    "ID": test_ids.astype(int),
    "prediction": predictions
})
submission.to_csv("/kaggle/working/Submission.csv", index=False)

# Step 4: Print the Shape of Submission File
print("Sub shape:", submission.shape)

# Step 5: Display the First Few Rows
print(submission.head())

# Additional Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate_model(X, y, best_params):
    if best_params['model_type'] == 'xgboost':
        model = XGBRegressor(**best_params, random_state=42)
    elif best_params['model_type'] == 'lightgbm':
        model = LGBMRegressor(**best_params, random_state=42)
    elif best_params['model_type'] == 'catboost':
        model = CatBoostRegressor(**best_params, random_state=42, verbose=0)
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    mae_scores


Collecting autograd-gamma==0.5.0
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting interface_meta==1.3.0
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting formulaic==1.0.2
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting lifelines==0.30.0
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Downloading formulaic-1.0.2-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.5/94.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lifelines-0.30.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel 

[I 2025-01-28 03:55:31,662] A new study created in memory with name: no-name-213c2b96-68df-424d-ab83-76bfc0b0ace8
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
[I 2025-01-28 03:56:46,596] Trial 0 finished with value: 0.40870480215892435 and parameters: {'model_type': 'catboost', 'n_estimators': 1185, 'learning_rate': 0.0728038718543334, 'max_depth': 11, 'subsample': 0.9850268167513618}. Best is trial 0 with value: 0.40870480215892435.
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set:

[I 2025-01-28 03:56:56,514] Trial 1 finished with value: 0.3997043693607437 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1337, 'learning_rate': 0.045569861414372505, 'max_depth': 9, 'subsample': 0.7567890549424783, 'colsample_bytree': 0.937845591696771, 'reg_alpha': 0.007684961882634736, 'reg_lambda': 0.08042738635547002}. Best is trial 0 with value: 0.40870480215892435.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bi

[I 2025-01-28 03:57:05,674] Trial 2 finished with value: 0.39532581648001985 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1348, 'learning_rate': 0.048567044316046006, 'max_depth': 4, 'subsample': 0.6454828702585297, 'colsample_bytree': 0.9624417923490945, 'reg_alpha': 0.01436477753368571, 'reg_lambda': 0.025073277404419444}. Best is trial 0 with value: 0.40870480215892435.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start trai

[I 2025-01-28 03:57:12,005] Trial 3 finished with value: 0.3932792718090513 and parameters: {'model_type': 'lightgbm', 'n_estimators': 775, 'learning_rate': 0.029810458611565067, 'max_depth': 12, 'subsample': 0.9035953167812831, 'colsample_bytree': 0.8700215137401164, 'reg_alpha': 0.05161951323325421, 'reg_lambda': 0.0005235545446171093}. Best is trial 0 with value: 0.40870480215892435.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
[I 2025-01-28 03:57:25,927] Trial 4 finished with value: 0.3894956569851942 and parameters: {'model_type': 'catboost', 'n_estimators': 1309, 'learning_rate': 0.01367737265864179, 'max_depth': 5, 'subsample': 0.9252601722923515}. Best is trial 0 with value: 0.40870480215892435.
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_log

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

[I 2025-01-28 03:57:34,278] Trial 5 finished with value: 0.39314152858752827 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1011, 'learning_rate': 0.01753541479178952, 'max_depth': 6, 'subsample': 0.6118807488782804, 'colsample_bytree': 0.9943282329349517, 'reg_alpha': 0.02307359483263558, 'reg_lambda': 0.011381612137354126}. Best is trial 0 with value: 0.40870480215892435.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
[I 2025-01-28 03:57:39,658] Trial 6 finished with value: 0.3945359745805702 and parameters: {'model_type': 'xgboost', 'n_estimators': 570, 'learning_rate': 0.03439102861415738, 'max_depth': 6, 'subsample': 0.7781954828521452, 'colsample_bytree':

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set:

[I 2025-01-28 04:07:10,111] Trial 29 finished with value: 0.39829658052287675 and parameters: {'model_type': 'lightgbm', 'n_estimators': 657, 'learning_rate': 0.07326248765060885, 'max_depth': 10, 'subsample': 0.6723530826958375, 'colsample_bytree': 0.8337458281509799, 'reg_alpha': 0.000632127811567464, 'reg_lambda': 0.09292361299072568}. Best is trial 23 with value: 0.4388516146368698.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
[I 2025-01-28 04:08:24,982] Trial 30 finished with value: 0.4084521196948673 and parameters: {'model_type': 'catboost', 'n_estimators': 1231, 'learning_rate': 0.06838705031095552, 'max_depth': 11, 'subsample': 0.6035919660791549}. Best is trial 23 with value: 0.4388516146368698.
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_l

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001955 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start trai

[I 2025-01-28 04:09:19,599] Trial 34 finished with value: 0.3986608748725192 and parameters: {'model_type': 'lightgbm', 'n_estimators': 837, 'learning_rate': 0.06367676768824665, 'max_depth': 11, 'subsample': 0.6385486702962015, 'colsample_bytree': 0.9208409129054079, 'reg_alpha': 0.00023155302540787463, 'reg_lambda': 0.014787941258194088}. Best is trial 23 with value: 0.4388516146368698.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
[I 2025-01-28 04:09:34,031] Trial 35 finished with value: 0.4263219903606917 and parameters: {'model_type': 'xgboost', 'n_estimators': 1041, 'learning_rate': 0.048490108400174085, 'max_depth': 9, 'subsample': 0.7263189259943416, 'colsample_by

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start trai

[I 2025-01-28 04:09:43,299] Trial 36 finished with value: 0.40497309399194503 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1232, 'learning_rate': 0.08359838121333384, 'max_depth': 10, 'subsample': 0.6771259064708178, 'colsample_bytree': 0.9446917675932092, 'reg_alpha': 0.005572797928442026, 'reg_lambda': 0.06640970794219371}. Best is trial 23 with value: 0.4388516146368698.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
[I 2025-01-28 04:10:15,198] Trial 37 finished with value: 0.440053303279611 and parameters: {'model_type': 'xgboost', 'n_estimators': 948, 'learning_rate': 0.06053342025450243, 'max_depth': 12, 'subsample': 0.6273389723271785, 'colsample_bytre

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

[I 2025-01-28 04:10:56,746] Trial 39 finished with value: 0.39789437897597574 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1090, 'learning_rate': 0.043371363072613134, 'max_depth': 12, 'subsample': 0.6229386311972116, 'colsample_bytree': 0.9734844407962896, 'reg_alpha': 0.0010753796881615345, 'reg_lambda': 0.09281273501746974}. Best is trial 37 with value: 0.440053303279611.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
[I 2025-01-28 04:11:07,684] Trial 40 finished with value: 0.3910212683016466 and parameters: {'model_type': 'catboost', 'n_estimators': 995, 'learning_rate': 0.06137727048898851, 'max_depth': 5, 'subsample': 0.7719500288195053}. Best is trial 37 with value: 0.440053303279611.
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_lo

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bi

[I 2025-01-28 04:23:22,061] Trial 59 finished with value: 0.40623775748100976 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1450, 'learning_rate': 0.09882431618075166, 'max_depth': 11, 'subsample': 0.8914789615913874, 'colsample_bytree': 0.601488317147369, 'reg_alpha': 0.010352824301919738, 'reg_lambda': 0.013396127368070622}. Best is trial 58 with value: 0.4442414545524092.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
[I 2025-01-28 04:23:58,340] Trial 60 finished with value: 0.4415107232535389 and parameters: {'model_type': 'xgboost', 'n_estimators': 1343, 'learning_rate': 0.07873603639994041, 'max_depth': 12, 'subsample': 0.601915781785688, 'colsample_bytr

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start trai

[I 2025-01-28 04:34:28,431] Trial 78 finished with value: 0.406923217130433 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1332, 'learning_rate': 0.09986996356383494, 'max_depth': 8, 'subsample': 0.6772794546426876, 'colsample_bytree': 0.6550345578870703, 'reg_alpha': 0.005399928236547739, 'reg_lambda': 0.0036540029284460925}. Best is trial 58 with value: 0.4442414545524092.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
[I 2025-01-28 04:34:57,158] Trial 79 finished with value: 0.4388983175380129 and parameters: {'model_type': 'xgboost', 'n_estimators': 1273, 'learning_rate': 0.07642329780178239, 'max_depth': 11, 'subsample': 0.6663083910994488, 'colsample_bytr

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.370257
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 10
[LightGBM] [Info] Start training from score 23.176595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set:

[I 2025-01-28 04:46:57,594] Trial 96 finished with value: 0.4059382320533583 and parameters: {'model_type': 'lightgbm', 'n_estimators': 1476, 'learning_rate': 0.08104152792849521, 'max_depth': 12, 'subsample': 0.6271830899348732, 'colsample_bytree': 0.7875914706472921, 'reg_alpha': 0.006363007184025647, 'reg_lambda': 0.004071561362170131}. Best is trial 92 with value: 0.44447640873860506.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0)
  params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
  params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-4, 1e-1)
  params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-4, 1e-1)
[I 2025-01-28 04:47:42,170] Trial 97 finished with value: 0.44350033599066674 and parameters: {'model_type': 'xgboost', 'n_estimators': 1412, 'learning_rate': 0.09263101152445041, 'max_depth': 12, 'subsample': 0.6229527576416449, 'colsample_b

Best parameters: {'model_type': 'xgboost', 'n_estimators': 1410, 'learning_rate': 0.09257481429832366, 'max_depth': 12, 'subsample': 0.6135228936223254, 'colsample_bytree': 0.7872598677222539, 'reg_alpha': 0.00643677836424401, 'reg_lambda': 0.002933945497885441}
Sub shape: (3, 2)
      ID  prediction
0  28800   49.039345
1  28801  -24.584087
2  28802   20.959532


In [2]:
import os

# Step 3: Save the Submission File
submission = pd.DataFrame({
    "ID": test_ids.astype(int),
    "prediction": predictions
})

# Save the file and confirm its creation
submission_file_path = "/kaggle/working/submission.csv"
submission.to_csv(submission_file_path, index=False)
print(f"Submission file saved to: {submission_file_path}")

# Check if the file exists
if os.path.exists(submission_file_path):
    print("Submission file exists.")
else:
    print("Error: Submission file does not exist!")

# Display the first few rows of the file
try:
    generated_submission = pd.read_csv(submission_file_path)
    print("Preview of the generated submission file:")
    print(generated_submission.head())
    print(f"Submission file shape: {generated_submission.shape}")
except Exception as e:
    print(f"Error reading the generated submission file: {e}")


Submission file saved to: /kaggle/working/submission.csv
Submission file exists.
Preview of the generated submission file:
      ID  prediction
0  28800   49.039345
1  28801  -24.584087
2  28802   20.959532
Submission file shape: (3, 2)


In [3]:
print(f"Number of rows in test data: {test_data.shape[0]}")
print(f"Length of predictions: {len(predictions)}")
print(f"Length of test_ids: {len(test_ids)}")
# Check test data size
print(f"Number of rows in test data: {test_data.shape[0]}")

# Check predictions and IDs
print(f"Length of predictions: {len(predictions)}")
print(f"Length of test_ids: {len(test_ids)}")

# Debug alignment issues if lengths mismatch
if len(predictions) != len(test_ids):
    print("Error: Mismatch between predictions and test IDs!")


Number of rows in test data: 3
Length of predictions: 3
Length of test_ids: 3
Number of rows in test data: 3
Length of predictions: 3
Length of test_ids: 3


In [4]:
test_data_raw = pd.read_csv(test_path)
print(test_data_raw.head())
print(f"Test dataset shape: {test_data_raw.shape}")


      ID                       dri_score psych_disturb    cyto_score diabetes  \
0  28800  N/A - non-malignant indication            No           NaN       No   
1  28801                    Intermediate            No  Intermediate       No   
2  28802  N/A - non-malignant indication            No           NaN       No   

   hla_match_c_high  hla_high_res_8          tbi_status arrhythmia  \
0               NaN             NaN              No TBI         No   
1               2.0             8.0  TBI +- Other, >cGy         No   
2               2.0             8.0              No TBI         No   

   hla_low_res_6  ... karnofsky_score hepatic_mild          tce_div_match  \
0            6.0  ...            90.0           No                    NaN   
1            6.0  ...            90.0           No  Permissive mismatched   
2            6.0  ...            90.0           No  Permissive mismatched   

  donor_related      melphalan_dose  hla_low_res_8 cardiac  \
0     Unrelated  N/A, M

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
