In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Raise pandas' display limits so wide / long frames are not truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Read the three input tables from the local data/ folder.
dd = pd.read_csv('data/data_dictionary.csv')
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [4]:
# Features: every training column except the two outcome columns.
X = train.drop(columns=['efs', 'efs_time'])

# Outcome columns plus 'race_group', which is kept alongside them for scoring.
# An explicit copy is taken because a new column ('y') is added to this frame
# later; writing into a slice of `train` triggers SettingWithCopyWarning.
y = train[['efs', 'efs_time', 'race_group']].copy()

## Data Cleaning

In [None]:
# Stack train features and test rows so both receive identical preprocessing.
df = pd.concat([X, test])

# Ideas tried earlier and currently disabled (kept here for reference):
#   - drop 'tce_match' (many missing values)
#   - encode 'mrd_hct' by hand: fill NaN with 'Unknown', then map
#     {'Negative': 0, 'Positive': 1, 'Unknown': -1}
#   - fill 'cyto_score_detail' with its mode within each 'dri_score' group
#     (falling back to 'Unknown' when a group has no mode)
#   - fill every remaining categorical column with 'Unknown'

# Numeric columns: impute missing values with the column median.
# DataFrame.fillna with a Series fills column-by-column using the Series'
# labels, so only the numeric columns are touched.
numerical_cols = df.select_dtypes(include=['number']).columns
df = df.fillna(df[numerical_cols].median())

# NOTE(review): 'age_at_hct' and 'year_hct' used to be filled again here
# (median and mode respectively). Both are numeric and already imputed above,
# so those statements never changed anything and were removed. If the mode
# was really intended for 'year_hct', fill it *before* the median step.

# Non-numeric columns (reassigned again in the label-encoding cell).
categorical_cols = df.select_dtypes(exclude=['number']).columns

# Categorical columns where a specific value indicates missing data.
# Assigning the result back (instead of `df[col].fillna(..., inplace=True)`)
# avoids chained in-place assignment, which pandas deprecates and which stops
# modifying `df` once Copy-on-Write is enabled.
missing_value_labels = {
    'dri_score': 'Missing disease status',
    'cyto_score': 'Not tested',
    'diabetes': 'Not done',
}
for col, label in missing_value_labels.items():
    df[col] = df[col].fillna(label)


print("Check for missing data in train set:")
print(df.isnull().sum().sort_values(ascending=False))

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Pairwise correlations between the numeric columns of the combined frame.
numeric_part = df.select_dtypes(include=['number'])
corr_matrix = pd.DataFrame(numeric_part.corr())

# Annotated heatmap drawn on an explicitly created 15x13 inch figure.
fig, ax = plt.subplots(figsize=(15, 13))
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)

In [7]:
# Feature Engineering: Count missing values per row
# NOTE(review): the cleaning cell median-imputes the numeric columns before
# this runs, so only values still missing at this point are counted -- confirm
# that is the intent.
df['missing_count'] = df.isnull().sum(axis=1)

# Feature Engineering: Combine HLA match scores into a single metric
hla_cols = [
    'hla_high_res_6', 'hla_high_res_8', 'hla_high_res_10',
    'hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high',
    'hla_match_dqb1_high', 'hla_low_res_6', 'hla_low_res_8'
]
df['hla_match_avg'] = df[hla_cols].mean(axis=1)

# Feature Engineering: Binary feature for high-risk DRI scores
# (case-sensitive substring match; missing values count as not high-risk)
df['dri_high_risk'] = (
    df['dri_score'].str.contains('High|Very high', na=False).astype(int)
)

# Feature Engineering: Binary feature for any psychiatric disturbance
df['psych_disturb_binary'] = df['psych_disturb'].eq('Yes').astype(int)

# Feature Engineering: Create a composite comorbidity score
# (number of listed conditions whose value is exactly 'Yes')
comorbidity_cols = [
    'cardiac', 'renal_issue', 'hepatic_severe', 'pulm_severe',
    'rheum_issue', 'diabetes', 'obesity', 'vent_hist', 'arrhythmia'
]
df['comorbidity_severity'] = df[comorbidity_cols].eq('Yes').sum(axis=1)

# Feature Engineering: Binary feature for severe conditions
# (1 when at least one of the three columns is 'Yes')
severe_cols = ['hepatic_severe', 'pulm_severe', 'renal_issue']
df['severe_conditions'] = df[severe_cols].eq('Yes').any(axis=1).astype(int)

# Feature Engineering: Encode donor-recipient sex match as numerical categories
sex_match_mapping = {'M-M': 0, 'F-F': 1, 'M-F': 2, 'F-M': 3, np.nan: -1}
df['sex_match_encoded'] = df['sex_match'].map(sex_match_mapping)

# Feature Engineering: Create interaction features
# (HLA-C high-resolution match, kept only where cyto_score is 'Favorable')
df['hla_cyto_interaction'] = (
    df['hla_match_c_high'] * df['cyto_score'].eq('Favorable').astype(int)
)

# Feature Engineering: Extract year from 'year_hct' and categorize into decades
# (floor division propagates NaN, so missing years stay missing)
df['year_hct_decade'] = (df['year_hct'] // 10) * 10

# Feature Engineering: Add a risk index based on comorbidity and DRI
# NOTE(review): this reads 'comorbidity_score', which is not created in this
# notebook (presumably a raw dataset column), rather than the
# 'comorbidity_severity' feature built above -- confirm which one is intended.
df['risk_index'] = df['comorbidity_score'] + df['dri_high_risk']

# Feature Engineering: Binarize CMV status
df['cmv_positive'] = df['cmv_status'].isin(['+/-', '+/+']).astype(int)


In [None]:
'''from sklearn.preprocessing import OneHotEncoder

object_cols = df.select_dtypes(include=['object']).columns

# Apply OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')  # Updated sparse parameter
encoded_array = encoder.fit_transform(df[object_cols])

# Create a DataFrame for the encoded data
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(object_cols), index=df.index)

# Remove original object columns and add the encoded ones
df = pd.concat([df.drop(columns=object_cols), encoded_df], axis=1)'''

In [None]:
from sklearn.preprocessing import LabelEncoder

# Every column that is still object-typed receives its own integer encoding.
categorical_cols = df.select_dtypes(include=['object']).columns

# One fitted encoder per column, stored so the integer codes can be mapped
# back later (e.g. with inverse_transform).
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Check if all columns are now numeric
print(df.dtypes)


In [10]:
# Undo the earlier pd.concat([X, test]): the first len(train) rows of `df` are
# the training rows and the remainder are the test rows. Deriving the boundary
# from the data (instead of a hard-coded 3) keeps the split correct for a test
# file of any size.
n_train_rows = len(train)
X = df.iloc[:n_train_rows]
X_test = df.iloc[n_train_rows:]
assert len(X_test) == len(test), "train/test split does not match the test file"

In [None]:
# Show the (rows, columns) shape of the train and test feature frames.
shape_report = f"{X.shape} {X_test.shape}"
print("Different columns between X and test:", shape_report, sep="\n")

In [None]:
from lifelines import KaplanMeierFitter

# Fit a single Kaplan-Meier curve on all training rows.
kmf = KaplanMeierFitter()
kmf.fit(durations=y['efs_time'], event_observed=y['efs'])

# Regression target: the fitted survival function evaluated at each row's own
# efs_time.
survival_at_efs_time = kmf.survival_function_at_times(y['efs_time'])
y['y'] = survival_at_efs_time.values

# Plot survival curve
kmf.plot_survival_function()

In [None]:
'''from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

rdc = XGBRegressor(random_state=42,
                   enable_categorical=True)
rdc.fit(X_train, y_train['y'])

rdc.score(X_val, y_val['y'])'''

In [None]:
'''preds = rdc.predict(X_val)
'''

fe: 0.5831621880666985,


In [None]:
'''from scipy.stats import rankdata 
preds = rankdata(preds)
preds'''

In [None]:
# Number of missing values in each outcome column.
y.isnull().sum()

In [None]:
'''# kfold cross-validation
from metric import score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
from functools import partial
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import Pool, CatBoostRegressor

models = {
    # 1. Linear Regression: Default settings
    "Linear Regression": LinearRegression(),

    # 2. Decision Tree Regressor: Simple tree with a small depth
    "Decision Tree Regressor": DecisionTreeRegressor(
        max_depth=5, 
        min_samples_split=10, 
        random_state=42
    ),

    # 3. Random Forest Regressor: Ensemble with a few trees
    "Random Forest Regressor": RandomForestRegressor(
        n_estimators=100, 
        max_depth=7, 
        min_samples_split=10, 
        random_state=42
    ),

    # 4. Gradient Boosting Regressor: Small learning rate and estimators
    "Gradient Boosting Regressor": GradientBoostingRegressor(
        n_estimators=100, 
        learning_rate=0.1, 
        max_depth=4, 
        random_state=42
    ),

    # 5. XGBoost Regressor: Commonly used settings
    "XGBoost Regressor": XGBRegressor(
        n_estimators=100, 
        learning_rate=0.1, 
        max_depth=4, 
        subsample=0.8, 
        colsample_bytree=0.8, 
        random_state=42
    ),

    # 6. LightGBM Regressor: Fast and efficient gradient boosting
    "LightGBM Regressor": LGBMRegressor(
        n_estimators=100, 
        learning_rate=0.1, 
        max_depth=-1, 
        num_leaves=31, 
        random_state=42
    ),

    # 7. CatBoost Regressor: Fast and powerful gradient boosting
    "CatBoost Regressor": CatBoostRegressor(
        iterations=100, 
        learning_rate=0.1, 
        depth=6, 
        verbose=0, 
        random_state=42
    ),

    # 8. SVR (Support Vector Regressor): Default kernel and parameters
    "SVR (Support Vector Regressor)": SVR(
        kernel='rbf', 
        C=1.0, 
        epsilon=0.1
    ),

    # 9. Gaussian Process Regressor: Default kernel
    "Gaussian Process Regressor": GaussianProcessRegressor(
        alpha=1e-10, 
        normalize_y=True
    ),

    # 10. SGD Regressor: Stochastic Gradient Descent for regression
    "SGD Regressor": SGDRegressor(
        max_iter=1000, 
        tol=1e-3, 
        learning_rate='invscaling', 
        eta0=0.01, 
        random_state=42
    )
}


kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits(X)

scores = {
    "Linear Regression": [],
    "Decision Tree Regressor": [],
    "Random Forest Regressor": [],
    "Gradient Boosting Regressor": [],
    "XGBoost Regressor": [],
    "LightGBM Regressor": [],
    "CatBoost Regressor": [],
    "SVR (Support Vector Regressor)": [],
    "Gaussian Process Regressor": [],
    "SGD Regressor": [],
}

for name, model in models.items():
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        model.fit(X.iloc[train_index], y.iloc[train_index]['y'])
        
        preds = pd.DataFrame({
            'prediction': model.predict(X.iloc[test_index]),
            'ID': X.iloc[test_index]['ID']
        })
             
        y_true = y.iloc[test_index][["efs", "efs_time","race_group"]]
        y_true['ID'] = X.iloc[test_index]['ID']

        scores[name].append(score(y_true.copy(), preds.copy(), "ID"))

        
    print(f"{name}: {scores[name]}, Mean: {np.mean(scores[name])}")
    
    '''

In [None]:
# kfold cross-validation -- recorded results
#
# Figures noted from an earlier run of the k-fold model comparison. The code
# that produced them is kept, quoted out, in the previous cell; this cell used
# to hold a verbatim copy of it with the table below left as bare text between
# two string literals, which made the cell a SyntaxError.
# NOTE(review): the three-digit values look like truncated decimals of the
# mean CV score and "---" appears to mark models that were ruled out --
# confirm before relying on them.
#
# Linear Regression                134 ---
# Decision Tree Regressor          119 ---
# Random Forest Regressor          142 ---
# Gradient Boosting Regressor      161
# XGBoost Regressor                161
# LightGBM Regressor               163
# CatBoost Regressor               161
# SVR (Support Vector Regressor)   077 ---
# Gaussian Process Regressor       000 ---
# SGD Regressor                    013 ---

In [18]:
# custom_scorer

from sklearn.metrics import make_scorer

# Custom scorer to use additional columns in y_true
def custom_scorer(y_true, y_pred):
    """
    Score predictions with the competition metric (``metric.score``).

    Parameters
    ----------
    y_true : pandas.DataFrame
        Must contain the columns 'efs', 'efs_time', 'race_group' and 'ID';
        any other columns are ignored.
    y_pred : array-like
        One prediction per row of ``y_true``, in the same row order.

    Returns
    -------
    Whatever ``score(solution, submission, "ID")`` returns.
    """
    # Imported here so the function also works on a fresh kernel; no
    # executed cell imports `score` at module level.
    from metric import score

    # Extract additional columns ['efs', 'efs_time', 'race_group', 'ID']
    additional_cols = y_true[['efs', 'efs_time', 'race_group', 'ID']]

    # Prepare predictions DataFrame with 'ID'. Predictions are paired with the
    # IDs by position (np.asarray drops any index y_pred may carry) so a
    # differently indexed y_pred cannot be silently misaligned into NaNs.
    preds = pd.DataFrame({
        'prediction': np.asarray(y_pred),
        'ID': additional_cols['ID']
    })

    # Call the custom metric function
    return score(additional_cols, preds, "ID")

# Wrap custom_scorer as a scikit-learn scorer object; greater_is_better=True
# marks higher metric values as better. This object is what the search cells
# further down pass as `scoring=`.
scorer = make_scorer(custom_scorer, greater_is_better=True)

In [44]:
# CustomRandomizedSearchCV
# This cell was quoted out as a string literal, yet the LightGBM and CatBoost
# search cells instantiate the class, so it has to be live for the notebook to
# survive Restart & Run All.
import numpy as np
import pandas as pd
from sklearn.base import clone, BaseEstimator
from sklearn.metrics import make_scorer
from sklearn.model_selection import ParameterSampler
from tqdm import tqdm  # For progress tracking

from metric import score  # competition metric used to rank parameter sets


class CustomRandomizedSearchCV:
    """
    Randomised hyper-parameter search ranked on out-of-fold predictions.

    For every sampled parameter set the estimator is cross-validated, the
    predictions of all validation folds are stitched into one out-of-fold
    vector, and that vector is scored once with ``metric.score``.

    Parameters
    ----------
    estimator : object with ``fit`` / ``predict`` / ``set_params``
        Cloned with ``sklearn.base.clone`` for every fold.
    param_distributions : dict
        Passed to ``sklearn.model_selection.ParameterSampler``.
    n_iter : int
        Number of parameter sets to sample.
    cv : splitter exposing ``split(X, y)``
    scoring : any
        Stored for API familiarity only; ``fit`` never reads it.
    random_state : int or None
        Seeds both NumPy's global RNG and the parameter sampler.
    verbose : int
        Any truthy value enables the start message and the progress bar.

    Attributes set by ``fit``
    -------------------------
    results_ : list of {'params': dict, 'score': value}, best score first
    best_params_, best_score_ : the first entry of ``results_``
    best_estimator_ : clone with ``best_params_``, refitted on all rows
    """

    def __init__(self, estimator, param_distributions, n_iter=10, cv=None, scoring=None, random_state=None, verbose=2):
        self.estimator = estimator
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.cv = cv
        self.scoring = scoring
        self.random_state = random_state
        self.verbose = verbose
        self.results_ = []

    def fit(self, X, y):
        """
        Run the search.

        X must contain an 'ID' column; y must contain 'efs', 'efs_time',
        'race_group' and the regression target 'y'. Rows of X and y are paired
        by position during cross-validation. Returns ``self``.
        """
        # Start from an empty list so calling fit() twice does not mix the
        # results of two searches.
        self.results_ = []

        np.random.seed(self.random_state)
        sampled_params = list(ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state))

        # Progress bar
        if self.verbose:
            print(f"Evaluating {self.n_iter} parameter combinations...")

        for params in tqdm(sampled_params, disable=not self.verbose):
            # Cross-validation logic
            oof_predictions = np.zeros(len(X))  # Array to store OOF predictions

            for train_idx, test_idx in self.cv.split(X, y):
                # Split data into training and validation sets
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                # Clone the estimator and set parameters
                model = clone(self.estimator)
                model.set_params(**params)
                model.fit(X_train, y_train['y'])  # Train only on target y['y']
                oof_predictions[test_idx] = model.predict(X_test)  # Save predictions for validation set

            # Score all out-of-fold predictions at once with the competition metric.
            y_true = y[["efs", "efs_time", "race_group"]].copy()
            y_true[['ID']] = X[["ID"]]
            y_pred = X[["ID"]].copy()
            y_pred["prediction"] = oof_predictions
            m = score(y_true.copy(), y_pred.copy(), "ID")

            self.results_.append({'params': params, 'score': m})

        # Select best result (highest score first)
        self.results_ = sorted(self.results_, key=lambda x: x['score'], reverse=True)
        self.best_params_ = self.results_[0]['params']
        self.best_score_ = self.results_[0]['score']
        self.best_estimator_ = clone(self.estimator).set_params(**self.best_params_)
        self.best_estimator_.fit(X, y['y'])

        return self

    def get_results(self):
        """
        Return the search results.

        Returns:
        - results: list of dictionaries with the keys 'params' and 'score',
          sorted from best to worst score once ``fit`` has run.
        """
        return self.results_

In [112]:
'''# GradientBoostingRegressor
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(50, 3000),         # Number of trees
    'learning_rate': uniform(0.01, 0.2),     # Shrinkage rate
    'max_depth': randint(3, 10),             # Maximum depth of each tree
    'min_samples_split': randint(2, 20),     # Minimum number of samples to split a node
    'min_samples_leaf': randint(1, 20),      # Minimum samples per leaf node
    'subsample': uniform(0.5, 0.5),          # Fraction of samples used for fitting each tree
    'max_features': ['sqrt', 'log2', None],  # Number of features considered for best split
}

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GradientBoostingRegressor model
model = GradientBoostingRegressor()

# Initialize the CustomScorerCVWrapper
wrapper = CustomRandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    scoring=scorer,
    cv=kf,
    n_iter=100,
    random_state=42
)

# Fit the wrapper with X and y_full
wrapper.fit(X, y)

print(wrapper)
# Print the best parameters and score
print("Best Parameters:", wrapper.best_params_)
print("Best Score:", wrapper.best_score_)
print("All Results:", wrapper.get_results())'''

'# GradientBoostingRegressor\nfrom scipy.stats import randint, uniform\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.model_selection import KFold\n\n# Define the parameter grid for RandomizedSearchCV\nparam_distributions = {\n    \'n_estimators\': randint(50, 3000),         # Number of trees\n    \'learning_rate\': uniform(0.01, 0.2),     # Shrinkage rate\n    \'max_depth\': randint(3, 10),             # Maximum depth of each tree\n    \'min_samples_split\': randint(2, 20),     # Minimum number of samples to split a node\n    \'min_samples_leaf\': randint(1, 20),      # Minimum samples per leaf node\n    \'subsample\': uniform(0.5, 0.5),          # Fraction of samples used for fitting each tree\n    \'max_features\': [\'sqrt\', \'log2\', None],  # Number of features considered for best split\n}\n\n# Define cross-validation strategy\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\n\n# Initialize the GradientBoostingRegressor model\nmodel = GradientBoosting

In [None]:
'''# XGBoost Regressor
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

param_distributions = {
    'n_estimators': randint(1000, 3000),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 40),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 1),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
    'scale_pos_weight': uniform(1, 10),
}

# Initialize XGBoost Regressor
xgb_regressor = XGBRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Set up RandomizedSearchCV with 50 iterations and KFold cross-validation
wrapperxgb = CustomRandomizedSearchCV(
    estimator=xgb_regressor,
    param_distributions=param_distributions,
    scoring=scorer,
    cv=kf,
    n_iter=150,
    random_state=42
)

# Fit the wrapper with X and y_full
wrapperxgb.fit(X, y)

# Print the best parameters and score
print("Best Parameters:", wrapperxgb.best_params_)
print("Best Score:", wrapperxgb.best_score_)
'''

In [109]:
%%time
'''xgbparams = {'colsample_bytree': 0.7, 
             'learning_rate': 0.02, 
             'max_depth': 5, 
             'min_child_weight': 10, 
             'n_estimators': 2500, 
             'reg_alpha': 0.7,  
             'scale_pos_weight': 5, 
             'subsample': 0.7,
             'enable_categorical':True}'''

xgb_regressor.set_params(**xgbparams)

xgb_regressor = XGBRegressor(
        random_state=42,
        device="cuda",
        max_depth=3,  
        colsample_bytree=0.5,  
        subsample=0.7,  
        n_estimators=2500,  
        learning_rate=0.02,  
        enable_categorical=True,
        min_child_weight=80,
        #early_stopping_rounds=25,
    )

FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
oof_xgb = np.zeros(len(X))
pred_xgb = np.zeros(len(X_test))

for i, (train_index, test_index) in enumerate(kf.split(X, y)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)
    
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    
    xgb_regressor.fit(
        X_train, y_train['y'],
        eval_set=[(X_valid, y_valid['y'])],  
        verbose=500 
    )

    # INFER OOF
    oof_xgb[test_index] = xgb_regressor.predict(X_valid)
    # INFER TEST
    pred_xgb += xgb_regressor.predict(X_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb /= FOLDS

#########################
### Fold 1
#########################
[0]	validation_0-rmse:0.17778
[500]	validation_0-rmse:0.16049
[1000]	validation_0-rmse:0.15827
[1500]	validation_0-rmse:0.15749
[2000]	validation_0-rmse:0.15692
[2499]	validation_0-rmse:0.15673
#########################
### Fold 2
#########################
[0]	validation_0-rmse:0.17357
[500]	validation_0-rmse:0.15628
[1000]	validation_0-rmse:0.15467
[1500]	validation_0-rmse:0.15406
[2000]	validation_0-rmse:0.15374
[2499]	validation_0-rmse:0.15358
#########################
### Fold 3
#########################
[0]	validation_0-rmse:0.17731
[500]	validation_0-rmse:0.15853
[1000]	validation_0-rmse:0.15608
[1500]	validation_0-rmse:0.15521
[2000]	validation_0-rmse:0.15477
[2499]	validation_0-rmse:0.15456
#########################
### Fold 4
#########################
[0]	validation_0-rmse:0.17928
[500]	validation_0-rmse:0.16074
[1000]	validation_0-rmse:0.15850
[1500]	validation_0-rmse:0.15773
[2000]	validation_0-rmse:0.15745
[2499

In [110]:
# Imported here because no other executed cell brings `score` into scope.
from metric import score

# Solution frame: outcome columns plus the row ID.
y_true = y[["efs","efs_time","race_group"]].copy()
y_true[['ID']] = X[["ID"]]

# Submission frame: row ID plus the XGBoost out-of-fold prediction.
y_pred = X[["ID"]].copy()
y_pred["prediction"] = oof_xgb

m = score(y_true.copy(), y_pred.copy(), "ID")
# Report the computed value itself; the label used to carry a hard-coded score
# from an older run, which goes stale whenever the model changes.
print(f"\nOverall CV for XGBoost KaplanMeier = {m}")


Overall CV for XGBoost KaplanMeier = 0.67069,  0.6709587632859836


In [None]:
# LightGBM Regressor
from lightgbm import LGBMRegressor
# Imported here because no other executed cell brings these names into scope.
from scipy.stats import randint, uniform

# Search space. scipy's uniform(loc, scale) draws from [loc, loc + scale] and
# randint(low, high) draws integers from [low, high). Single-element lists are
# fixed settings.
param_distributions = {
    'n_estimators': randint(1000, 3000),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'num_leaves': randint(31, 255),
    'min_child_samples': randint(10, 100),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
    'scale_pos_weight': uniform(1, 10),
    'min_split_gain': uniform(0, 1),
    'device': ["gpu"],
    'verbose':[-1],
    'objective':["regression"],

}

# Initialize LightGBM Regressor
lgb_regressor = LGBMRegressor(random_state=42)

# Set up the custom randomized search: 150 sampled parameter sets, evaluated
# with the `kf` splitter defined in an earlier cell.
wrapperLGBM = CustomRandomizedSearchCV(
    estimator=lgb_regressor,
    param_distributions=param_distributions,
    scoring=scorer,
    cv=kf,
    n_iter=150,
    random_state=42
)

# Fit the wrapper with X and y
wrapperLGBM.fit(X, y)

# Print the best parameters and score
print("Best Parameters:", wrapperLGBM.best_params_)
print("Best Score:", wrapperLGBM.best_score_)
print("All Results:", wrapperLGBM.get_results())


In [113]:
# Flatten the search results into one row per parameter set: the sampled
# parameters as columns, followed by the score that set achieved.
result_rows = []
for entry in wrapperLGBM.get_results():
    row = dict(entry['params'])
    row['score'] = entry['score']
    result_rows.append(row)

lgmbres = pd.DataFrame(result_rows)
lgmbres

Unnamed: 0,colsample_bytree,device,learning_rate,max_depth,min_child_samples,min_split_gain,n_estimators,num_leaves,objective,reg_alpha,reg_lambda,scale_pos_weight,subsample,verbose,score
0,0.749816,gpu,0.295214,5,81,0.598658,2638,152,regression,0.155995,0.058084,9.661761,0.840446,-1,0.652585
1,0.883229,gpu,0.016175,4,97,0.832443,1805,160,regression,0.181825,0.183405,4.042422,0.809903,-1,0.649241
2,0.772778,gpu,0.097369,5,51,0.046666,1699,238,regression,0.232771,0.090606,7.18386,0.752985,-1,0.666127
3,0.993292,gpu,0.150029,7,60,0.680308,1840,197,regression,0.013265,0.942202,6.632882,0.754167,-1,0.650983
4,0.606387,gpu,0.079268,6,69,0.122038,1508,38,regression,0.034389,0.90932,3.5878,0.865009,-1,0.664947
5,0.724684,gpu,0.16602,4,13,0.184854,1702,176,regression,0.775133,0.939499,9.948274,0.83916,-1,0.661511
6,0.96875,gpu,0.036548,9,71,0.32533,1719,112,regression,0.539692,0.586751,10.652553,0.842814,-1,0.658167
7,0.7104,gpu,0.098882,7,74,0.015636,1520,118,regression,0.394882,0.293488,1.140798,0.679537,-1,0.665802
8,0.884537,gpu,0.247053,5,14,0.926301,2017,71,regression,0.91496,0.850039,5.494507,0.638164,-1,0.650325
9,0.748327,gpu,0.210652,7,53,0.637557,2570,223,regression,0.382927,0.971712,9.489138,0.888692,-1,0.651702


In [None]:
# CatBoost Regressor
from catboost import CatBoostRegressor
# Imported here because no other executed cell brings these names into scope.
from scipy.stats import randint, uniform

# Search space. scipy's uniform(loc, scale) draws from [loc, loc + scale] and
# randint(low, high) draws integers from [low, high). Single-element lists are
# fixed settings.
param_distributions = {
    'learning_rate': uniform(0.01, 0.3),
    'depth': randint(3, 12),
    'l2_leaf_reg': uniform(1, 10),
    'border_count': randint(32, 255),
    'subsample': uniform(0.6, 0.4),
    'colsample_bylevel': uniform(0.6, 0.4),
    'min_data_in_leaf': randint(1, 20),
    'max_ctr_complexity': randint(1, 10),
    'grow_policy': ["Lossguide"],
    'bootstrap_type':['Bernoulli'],
    'leaf_estimation_method':['Gradient']
}
# Initialize CatBoost Regressor
catboost_regressor = CatBoostRegressor(random_state=42, silent=True)

# Set up the custom randomized search: 150 sampled parameter sets, evaluated
# with the `kf` splitter defined in an earlier cell.
wrappercat = CustomRandomizedSearchCV(
    estimator=catboost_regressor,
    param_distributions=param_distributions,
    scoring=scorer,
    cv=kf,
    n_iter=150,
    random_state=42
)

# Fit the wrapper with X and y
wrappercat.fit(X, y)

# Print the best parameters and score
print("Best Parameters:", wrappercat.best_params_)
print("Best Score:", wrappercat.best_score_)