I used Tabnet instead of LGBM/Catboost based on this [notebook](https://www.kaggle.com/code/snnclsr/tabular-ensemble-lgbm-catboost).

(The parameters have not been optimized.)

Original comment:

In my previous work [here](https://www.kaggle.com/code/snnclsr/lgbm-baseline-with-new-features), I showed the effectiveness of additional tabular features. Since there are not enough positive samples, every bit of contribution will be important in the final stage.

With that motivation, I want to show how ensembles are useful in predictions.

**Edit:** New version adds two image model predictions from here: https://www.kaggle.com/code/motono0223/isic-tabular-model-image-model-features and one from my own model (augmentations used [here](https://www.kaggle.com/code/snnclsr/image-augmentations-from-winning-solutions)).

In [1]:
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl pytorch-tabnet

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [2]:
NO_RUN = False

import os

if NO_RUN and not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # To save some time.
    import pandas as pd
    df_sub = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
    df_sub.to_csv("submission.csv", index=False)
    exit(0)

# Imports

In [3]:
import numpy as np
import pandas as pd
import pandas.api.types
import matplotlib.pyplot as plt
import os

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

import optuna
import catboost as cb
import lightgbm as lgb
import xgboost as xgb

from pytorch_tabnet.tab_model import TabNetClassifier
import torch

OPTIMIZE_OPTUNA = False
SUBSAMPLE = False
SUBSAMPLE_RATIO = 0.5 # only effective if SUBSAMPLE=True
DISPLAY_FEATURE_IMPORTANCE = False

## Generating the image level predictions

In [4]:
!python /kaggle/input/isic-script-inference-effnetv1b0-f313ae/main.py /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
!mv submission.csv submission_effnetv1b0.csv

BEST_WEIGHT = /kaggle/input/isic-pytorch-training-baseline-image-only/AUROC0.5171_Loss0.3476_epoch35.bin
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]


In [5]:
!python /kaggle/input/isic-script-inference-eva02/main.py /kaggle/input/isic-pytorch-training-baseline-eva02/AUROC0.5177_Loss0.2829_epoch7.bin
!mv submission.csv submission_eva02.csv

BEST_WEIGHT = /kaggle/input/isic-pytorch-training-baseline-eva02/AUROC0.5177_Loss0.2829_epoch7.bin
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.69it/s]


In [6]:
# My model
!python /kaggle/input/isic-2024-pl-submission-script-and-preds/pl_submission.py
!mv submission.csv submission_image3.csv

  df_train_meta = pd.read_csv(BASE_DATA_DIR + "train-metadata.csv")


# Feature Engineering

In [7]:
df_train = pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv")
df_test = pd.read_csv("/kaggle/input/isic-2024-challenge/test-metadata.csv")

def feature_engineering(df):
    # New features to try...
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2) 
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]
    
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df_train["tbp_lv_y"], df_train["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    # Taken from: https://www.kaggle.com/code/dschettler8845/isic-detect-skin-cancer-let-s-learn-together
    df["color_variance_ratio"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"] = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
    df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / df["age_approx"]
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    df["border_length_ratio"] = df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    # Until here.
    
    new_num_cols = [
        "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
        "luminance_contrast", "lesion_color_difference", "border_complexity",
        "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
        "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",

        "size_age_interaction", "hue_color_std_interaction", "lesion_severity_index", 
        "shape_complexity_index", "color_contrast_index", "log_lesion_area",
        "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
        "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
        "symmetry_perimeter_interaction", "comprehensive_lesion_index",
        
        "color_variance_ratio", "border_color_interaction", "size_color_contrast_ratio",
        "age_normalized_nevi_confidence", "color_asymmetry_index", "3d_volume_approximation",
        "color_range", "shape_color_consistency", "border_length_ratio", "age_size_symmetry_index",
    ]
    new_cat_cols = ["combined_anatomical_site"]
    return df, new_num_cols, new_cat_cols

num_cols = [
    'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 
    'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 
    'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 
    'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
    'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
    'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
    'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
]
df_train[num_cols] = df_train[num_cols].fillna(df_train[num_cols].median())
df_test[num_cols] = df_test[num_cols].fillna(df_train[num_cols].median())
df_train, new_num_cols, new_cat_cols = feature_engineering(df_train.copy())
df_test, _, _ = feature_engineering(df_test.copy())
num_cols += new_num_cols

# anatom_site_general
cat_cols = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"] + new_cat_cols
train_cols = num_cols + cat_cols

df_eff = pd.read_csv("/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv")
df_eff = df_eff[["target_effnetv1b0"]]

df_eva = pd.read_csv("/kaggle/input/isic-inference-eva02-for-training-data/train_eva02.csv")
df_eva = df_eva[["target_eva02"]]

df_image_3 = pd.read_csv("/kaggle/input/isic-2024-pl-submission-script-and-preds/train_preds.csv")

df_train["target_effnetv1b0"] = df_eff["target_effnetv1b0"]
df_train["target_eva02"] = df_eva["target_eva02"]
df_train["target_3"] = df_image_3["pred"]

train_cols += ["target_effnetv1b0","target_eva02", "target_3"]

category_encoder = OrdinalEncoder(
    categories='auto',
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-2,
    encoded_missing_value=-1,
)

X_cat = category_encoder.fit_transform(df_train[cat_cols])
for c, cat_col in enumerate(cat_cols):
    df_train[cat_col] = X_cat[:, c]

  df_train = pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv")
  df_eff = pd.read_csv("/kaggle/input/isic-inference-effnetv1b0-for-training-data/train_effnetv1b0.csv")
  df_eva = pd.read_csv("/kaggle/input/isic-inference-eva02-for-training-data/train_eva02.csv")


# CV Setup

In [8]:
N_SPLITS = 5
gkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

if SUBSAMPLE:
    df_pos = df_train[df_train["target"] == 1]
    df_neg = df_train[df_train["target"] == 0]
    df_neg = df_neg.sample(frac=SUBSAMPLE_RATIO, random_state=42)
    df_train = pd.concat([df_pos, df_neg]).sample(frac=1.0, random_state=42).reset_index(drop=True)    

df_train["fold"] = -1
for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])):
    df_train.loc[val_idx, "fold"] = idx

# Competition Metric

In [9]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    v_gt = abs(np.asarray(solution.values)-1)
    v_pred = np.array([1.0 - x for x in submission.values])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

def custom_lgbm_metric(y_true, y_hat):
    # TODO: Refactor with the above.
    min_tpr = 0.80
    v_gt = abs(y_true-1)
    v_pred = np.array([1.0 - x for x in y_hat])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return "pauc80", partial_auc, True

# Tabnet

In [10]:
def preprocess_data(X):
    X = X.replace([np.inf, -np.inf], np.nan)
    
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    
    scaler = RobustScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    return X

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tabnet_params = {
    "n_d": 64,
    "n_a": 64,
    "n_steps": 5,
    "gamma": 1.5,
    "n_independent": 2,
    "n_shared": 2,
    "lambda_sparse": 1e-4,
    "optimizer_params": dict(lr=2e-2),
    "scheduler_params": dict(mode="max", patience=5, min_lr=1e-5, factor=0.9),
    "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
    "verbose": 10,
    "device_name": device  # デバイスを指定
}


tabnet_scores = []
tabnet_models = []
oof_df = pd.DataFrame()

df_train[train_cols] = preprocess_data(df_train[train_cols])

for fold in range(N_SPLITS):
    print(f"Training fold {fold}")
    
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(_df_train[train_cols])
    X_valid = scaler.transform(_df_valid[train_cols])
    y_train = _df_train["target"].values
    y_valid = _df_valid["target"].values
    
    model = TabNetClassifier(**tabnet_params)
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['auc'],  
        max_epochs=100,
        patience=20,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=4,
        drop_last=False
    )
    
    preds = model.predict_proba(X_valid)[:, 1]
    
    solution_df = pd.DataFrame({'target': y_valid})
    submission_df = pd.DataFrame({'prediction': preds})
    score = comp_score(solution_df, submission_df, row_id_column_name="", min_tpr=0.80)
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    
    tabnet_scores.append(score)
    tabnet_models.append(model)
    
    oof_single = _df_valid[["isic_id", "target"]].copy()
    oof_single["pred"] = preds
    oof_df = pd.concat([oof_df, oof_single])

mean_score = np.mean(tabnet_scores)
print(f"Mean Partial AUC Score: {mean_score:.5f}")

Using device: cuda
Training fold 0




epoch 0  | loss: 0.0101  | val_0_auc: 0.76022 |  0:00:22s
epoch 10 | loss: 0.0055  | val_0_auc: 0.95622 |  0:03:53s
epoch 20 | loss: 0.00501 | val_0_auc: 0.95148 |  0:07:29s
epoch 30 | loss: 0.00482 | val_0_auc: 0.95957 |  0:11:01s

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.96268




fold: 0 - Partial AUC Score: 0.17454
Training fold 1




epoch 0  | loss: 0.01105 | val_0_auc: 0.8649  |  0:00:20s
epoch 10 | loss: 0.00565 | val_0_auc: 0.95012 |  0:03:49s
epoch 20 | loss: 0.00547 | val_0_auc: 0.96701 |  0:07:18s
epoch 30 | loss: 0.00531 | val_0_auc: 0.97914 |  0:10:47s
epoch 40 | loss: 0.00549 | val_0_auc: 0.95793 |  0:14:16s

Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.98189




fold: 1 - Partial AUC Score: 0.18761
Training fold 2




epoch 0  | loss: 0.01095 | val_0_auc: 0.81997 |  0:00:21s
epoch 10 | loss: 0.00594 | val_0_auc: 0.96478 |  0:03:51s
epoch 20 | loss: 0.00536 | val_0_auc: 0.97387 |  0:07:20s
epoch 30 | loss: 0.00534 | val_0_auc: 0.97916 |  0:10:48s
epoch 40 | loss: 0.00548 | val_0_auc: 0.98198 |  0:14:19s
epoch 50 | loss: 0.00545 | val_0_auc: 0.98482 |  0:17:49s
epoch 60 | loss: 0.00509 | val_0_auc: 0.98413 |  0:21:18s
epoch 70 | loss: 0.00515 | val_0_auc: 0.9799  |  0:24:48s
epoch 80 | loss: 0.0051  | val_0_auc: 0.98567 |  0:28:22s
epoch 90 | loss: 0.00494 | val_0_auc: 0.98588 |  0:31:52s
Stop training because you reached max_epochs = 100 with best_epoch = 99 and best_val_0_auc = 0.98648




fold: 2 - Partial AUC Score: 0.19077
Training fold 3




epoch 0  | loss: 0.01046 | val_0_auc: 0.65961 |  0:00:20s
epoch 10 | loss: 0.00545 | val_0_auc: 0.93921 |  0:03:48s
epoch 20 | loss: 0.00557 | val_0_auc: 0.96572 |  0:07:16s
epoch 30 | loss: 0.00536 | val_0_auc: 0.94292 |  0:10:46s
epoch 40 | loss: 0.00503 | val_0_auc: 0.95721 |  0:14:16s

Early stopping occurred at epoch 43 with best_epoch = 23 and best_val_0_auc = 0.96643




fold: 3 - Partial AUC Score: 0.17418
Training fold 4




epoch 0  | loss: 0.01002 | val_0_auc: 0.65884 |  0:00:21s
epoch 10 | loss: 0.00547 | val_0_auc: 0.96988 |  0:03:59s
epoch 20 | loss: 0.00568 | val_0_auc: 0.96277 |  0:07:35s
epoch 30 | loss: 0.00559 | val_0_auc: 0.96652 |  0:11:09s
epoch 40 | loss: 0.00537 | val_0_auc: 0.97946 |  0:14:42s
epoch 50 | loss: 0.0052  | val_0_auc: 0.97521 |  0:18:16s
epoch 60 | loss: 0.00499 | val_0_auc: 0.97989 |  0:21:52s
epoch 70 | loss: 0.005   | val_0_auc: 0.97858 |  0:25:29s
epoch 80 | loss: 0.00514 | val_0_auc: 0.9736  |  0:29:00s

Early stopping occurred at epoch 84 with best_epoch = 64 and best_val_0_auc = 0.98204




fold: 4 - Partial AUC Score: 0.18785
Mean Partial AUC Score: 0.18299


In [11]:
'''
import joblib

def save_tabnet_model(model, path):
    save_dict = {}
    
    save_dict['model'] = model
    
    if hasattr(model, 'feature_importances_'):
        save_dict['feature_importances'] = model.feature_importances_
    
    for attr in ['input_dim', 'output_dim', 'n_d', 'n_a', 'n_steps']:
        if hasattr(model, attr):
            save_dict[attr] = getattr(model, attr)
    
    joblib.dump(save_dict, path)

save_dir = "tabnet_model"
os.makedirs(save_dir, exist_ok=True)

for i, model in enumerate(tabnet_models):
    save_path = os.path.join(save_dir, f"tabnet_model_fold_{i}.joblib")
    save_tabnet_model(model, save_path)

print(f"Models saved in {save_dir}")
'''

'\nimport joblib\n\ndef save_tabnet_model(model, path):\n    save_dict = {}\n    \n    save_dict[\'model\'] = model\n    \n    if hasattr(model, \'feature_importances_\'):\n        save_dict[\'feature_importances\'] = model.feature_importances_\n    \n    for attr in [\'input_dim\', \'output_dim\', \'n_d\', \'n_a\', \'n_steps\']:\n        if hasattr(model, attr):\n            save_dict[attr] = getattr(model, attr)\n    \n    joblib.dump(save_dict, path)\n\nsave_dir = "tabnet_model"\nos.makedirs(save_dir, exist_ok=True)\n\nfor i, model in enumerate(tabnet_models):\n    save_path = os.path.join(save_dir, f"tabnet_model_fold_{i}.joblib")\n    save_tabnet_model(model, save_path)\n\nprint(f"Models saved in {save_dir}")\n'

In [12]:
def load_tabnet_model(path):
    loaded_dict = joblib.load(path)
    model = loaded_dict['model']
    return model

#loaded_model = load_tabnet_model("path/to/model.joblib")

# Inference

In [13]:
df_eff = pd.read_csv("submission_effnetv1b0.csv")
df_test["target_effnetv1b0"] = df_eff["target"]

df_eva = pd.read_csv("submission_eva02.csv")
df_test["target_eva02"] = df_eva["target"]

df_3 = pd.read_csv("submission_image3.csv")
df_test["target_3"] = df_3["target"]

In [14]:
X_cat = category_encoder.transform(df_test[cat_cols])
for c, cat_col in enumerate(cat_cols):
    df_test[cat_col] = X_cat[:, c]

In [15]:
df_test[train_cols] = preprocess_data(df_test[train_cols])
X_test = StandardScaler().fit_transform(df_test[train_cols])

tabnet_preds = np.mean([model.predict_proba(X_test)[:, 1] for model in tabnet_models], axis=0)

In [16]:
df_sub = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
df_sub["target"] = tabnet_preds
df_sub

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.000882
1,ISIC_0015729,1.5e-05
2,ISIC_0015740,5.7e-05


In [17]:
df_sub.to_csv("submission.csv", index=False)