In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from lightgbm import log_evaluation
from sklearn.model_selection import StratifiedKFold
from lifelines import NelsonAalenFitter
from lifelines.utils import concordance_index
import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.impute import KNNImputer, SimpleImputer

# Load Training Data
# Read the training table from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/cibmtr-data/train.csv")

# Identify numerical and categorical columns
# Columns are partitioned purely by dtype; the two target columns ('efs',
# 'efs_time') are taken out of the numeric list so they are neither imputed
# nor capped later on.
num_cols = list(train.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(train.select_dtypes(include=['object']).columns)
for target_name in ('efs', 'efs_time'):
    num_cols.remove(target_name)

# Handle missing values
# Numeric columns: KNN imputation with 7 neighbours, fitted on the full table.
num_imputer = KNNImputer(n_neighbors=7)
imputed_numeric = num_imputer.fit_transform(train[num_cols])
train[num_cols] = imputed_numeric

# Object columns: fill gaps with each column's most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent')
imputed_categorical = cat_imputer.fit_transform(train[cat_cols])
train[cat_cols] = imputed_categorical

# Function to detect and cap outliers automatically using IQR
def cap_outliers_auto(df, columns):
    """Clamp each listed column to its Tukey fences, modifying ``df`` in place.

    For every column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of that column. Values below
    the lower fence are replaced by it, values above the upper fence are
    replaced by the upper fence; everything else is left as is.

    Args:
        df: DataFrame holding the columns to process. It is mutated.
        columns: Iterable of column names in ``df`` to cap.

    Returns:
        The same ``df`` object that was passed in.
    """
    whisker = 1.5
    for name in columns:
        q_low = df[name].quantile(0.25)
        q_high = df[name].quantile(0.75)
        spread = q_high - q_low
        floor = q_low - whisker * spread
        ceiling = q_high + whisker * spread

        # Raise the low tail first, then lower the high tail, and write the
        # column back once.
        floored = np.where(df[name] < floor, floor, df[name])
        df[name] = np.where(floored > ceiling, ceiling, floored)

    return df

# Apply outlier handling
# Every numeric (non-target) column is clamped to its IQR fences.
train = cap_outliers_auto(train, num_cols)

print("Outliers successfully capped using dynamic IQR calculation!")

# Feature Selection
# Everything except the identifier and target-like columns is a feature.
# The list is frozen here, before any derived target/fold columns are added.
RMV = ["ID", "efs", "efs_time", "y"]
FEATURES = [name for name in train.columns if name not in RMV]

# Encode Categorical Features
# Replace each object/category column by its integer category codes.
columns_to_encode = train.select_dtypes(include=['object', 'category']).columns
for col in columns_to_encode:
    as_category = train[col].astype('category')
    train[col] = as_category.cat.codes

# Nelson-Aalen Target Transformation
def create_nelson(data):
    """Return the negated Nelson-Aalen cumulative hazard for every row.

    A ``NelsonAalenFitter`` (smoothing disabled) is fitted with ``efs_time``
    as durations and ``efs`` as the event indicator; the fitted cumulative
    hazard is then evaluated at each row's own ``efs_time``.

    Args:
        data: Table with ``efs_time`` and ``efs`` columns.

    Returns:
        Array with one value per row of ``data``: the cumulative hazard at
        that row's ``efs_time``, multiplied by -1.
    """
    times = data['efs_time']
    fitter = NelsonAalenFitter(nelson_aalen_smoothing=0)
    fitter.fit(durations=times, event_observed=data['efs'])
    hazard = fitter.cumulative_hazard_at_times(times)
    return hazard.values * -1

# Negated cumulative hazard for every row.
train["y_nel"] = create_nelson(train)

# For rows with efs == 0 the target is replaced by the negated square root
# of its magnitude.
censored_rows = train.efs == 0
censored_target = train.loc[censored_rows, "y_nel"]
train.loc[censored_rows, "y_nel"] = -((-censored_target) ** 0.5)

def logit_transform(y, eps=2e-2, eps_mul=1.1):
    """Min-max scale ``y`` strictly inside (0, 1) and return its logit.

    The numerator is shifted by ``eps`` and the denominator widened by
    ``eps_mul * eps`` so that, with the defaults, neither the minimum nor
    the maximum of ``y`` lands exactly on 0 or 1.

    Args:
        y: Array-like exposing ``.min()`` / ``.max()`` (NumPy array or
            pandas Series).
        eps: Offset added to the shifted values.
        eps_mul: Multiplier applied to ``eps`` in the denominator.

    Returns:
        ``log(p / (1 - p))`` of the scaled values, same container type as
        NumPy returns for ``y``.
    """
    lowest = y.min()
    highest = y.max()
    scaled = (y - lowest + eps) / (highest - lowest + eps_mul * eps)
    odds = scaled / (1 - scaled)
    return np.log(odds)

# Regression target: logit of the min-max scaled Nelson-Aalen target.
train["y_transformed"] = logit_transform(train["y_nel"])

# Stratified KFold for Cross-Validation
# Folds are stratified on `race_group`; the seed is fixed so the split is
# reproducible.
FOLDS = 20
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
train["fold"] = -1  # -1 marks "not yet assigned"

# `skf.split` yields *positional* row numbers, so the fold id is written
# through `.iloc`. Label-based `.loc` only gives the same result while
# `train` still has its default 0..n-1 index, and would silently mislabel
# rows after any filtering or re-indexing.
fold_position = train.columns.get_loc("fold")
for fold, (_, val_idx) in enumerate(skf.split(train, train["race_group"])):
    train.iloc[val_idx, fold_position] = fold

# Model Training and Stacking
# Out-of-fold (OOF) predictions: every row is predicted exactly once, by the
# models that did not see it during training. These become the inputs of the
# meta-model.
n_rows = len(train)
xgb_oof, lgb_oof, cat_oof = np.zeros(n_rows), np.zeros(n_rows), np.zeros(n_rows)
# One fitted model per fold and per library, kept for inference.
xgb_models, lgb_models, cat_models = [], [], []

for fold in range(FOLDS):
    # Positional boolean mask of the held-out rows. The OOF arrays are plain
    # NumPy arrays, so they must be addressed by position; indexing them with
    # `train.index[...]` labels is only correct for a default 0..n-1 index.
    valid_mask = (train["fold"] == fold).to_numpy()
    train_mask = ~valid_mask

    x_train, y_train = train.loc[train_mask, FEATURES], train.loc[train_mask, "y_transformed"]
    x_valid, y_valid = train.loc[valid_mask, FEATURES], train.loc[valid_mask, "y_transformed"]

    # XGBoost Model
    model_xgb = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=4, subsample=0.8)
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=True)
    xgb_models.append(model_xgb)
    xgb_oof[valid_mask] = model_xgb.predict(x_valid)

    # LightGBM Model
    model_lgb = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.01, max_depth=6, num_leaves=31)
    model_lgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], callbacks=[log_evaluation(500)])
    lgb_models.append(model_lgb)
    lgb_oof[valid_mask] = model_lgb.predict(x_valid)

    # CatBoost Model
    model_cat = cb.CatBoostRegressor(iterations=1000, learning_rate=0.01, depth=6, verbose=500)
    model_cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=True)
    cat_models.append(model_cat)
    cat_oof[valid_mask] = model_cat.predict(x_valid)

# Save Models
# Each file holds the full list of per-fold models for one library.
for model_path, fold_models in (
    ("xgboost_models.pkl", xgb_models),
    ("lightgbm_models.pkl", lgb_models),
    ("catboost_models.pkl", cat_models),
):
    joblib.dump(fold_models, model_path)
print("Models saved successfully.")

# Prepare Training Data for Meta-Model
# One row per training sample, one column per base learner
# (XGBoost, LightGBM, CatBoost), filled with the out-of-fold predictions.
stacked_train = np.column_stack((xgb_oof, lgb_oof, cat_oof))

# Define Neural Network Meta-Model (Improved)
# Three hidden stages of Dense -> BatchNormalization -> LeakyReLU -> Dropout,
# given as (units, dropout rate), followed by a single linear output unit.
meta_layers = []
for units, drop_rate in ((256, 0.3), (128, 0.3), (64, 0.2)):
    if meta_layers:
        dense = layers.Dense(units, kernel_initializer='he_normal')
    else:
        # Only the very first layer declares the input width (3 base models).
        dense = layers.Dense(units, kernel_initializer='he_normal', input_shape=(3,))
    meta_layers += [
        dense,
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Dropout(drop_rate),
    ]
meta_layers.append(layers.Dense(1, activation='linear'))  # Output layer
meta_model = keras.Sequential(meta_layers)

# Compile model
adam = keras.optimizers.Adam(learning_rate=0.001)
meta_model.compile(optimizer=adam, loss='mse')

# Train meta-model
# Fitted on all OOF rows against the transformed target.
meta_model.fit(
    stacked_train,
    train["y_transformed"],
    epochs=30,
    batch_size=16,
    verbose=1,
)

# Save Meta-Model
meta_model.save("meta_model.h5")
print("Meta-model saved successfully.")

# Save preprocessors for inference
# Persist the fitted imputers so the same fill values/neighbours can be
# applied to unseen data.
for fitted_imputer, imputer_path in (
    (num_imputer, "num_imputer.pkl"),
    (cat_imputer, "cat_imputer.pkl"),
):
    joblib.dump(fitted_imputer, imputer_path)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



ModuleNotFoundError: No module named 'tensorflow'