https://www.kaggle.com/code/abdmental01/multimodel-isic

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import *
from sklearn.preprocessing import *

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import *
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

pd.set_option('display.max_columns', None)

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce

from tqdm import tqdm

# Random seed passed as `random_state` to the CV splitter and to each model.
SEED = 42
# Number of StratifiedKFold folds used by Train_ML.
n_splits = 3

# Load and preprocess data

In [2]:
%%time 

# Read both metadata tables; the per-image identifier is not used as a feature.
test = pd.read_csv('/kaggle/input/isic-2024-challenge/test-metadata.csv').drop(columns='isic_id')
train = pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv').drop(columns='isic_id')

# Report which columns exist on only one side.
test_columns, train_columns = set(test.columns), set(train.columns)
diff_test_train = test_columns.difference(train_columns)
diff_train_test = train_columns.difference(test_columns)

if diff_test_train or diff_train_test:
    print("Columns present in test but not in train:", diff_test_train)
    print("Columns present in train but not in test:", diff_train_test)
else:
    print("Both DataFrames have the same columns.")

# Train-only columns (everything except `target`) are unavailable at inference
# time, so they are removed from the training frame.
train_only_columns = [
    'iddx_4', 'mel_mitotic_index', 'iddx_1', 'lesion_id', 'tbp_lv_dnn_lesion_confidence',
    'iddx_5', 'mel_thick_mm', 'iddx_2', 'iddx_full', 'iddx_3',
]
train = train.drop(columns=train_only_columns)

Columns present in test but not in train: set()
Columns present in train but not in test: {'iddx_1', 'tbp_lv_dnn_lesion_confidence', 'iddx_5', 'iddx_3', 'mel_thick_mm', 'target', 'iddx_2', 'lesion_id', 'mel_mitotic_index', 'iddx_4', 'iddx_full'}
CPU times: user 6.49 s, sys: 743 ms, total: 7.23 s
Wall time: 9.44 s


In [3]:
def fe(df):
    """Append hand-crafted lesion features to ``df`` in place.

    Twenty-six columns are added (25 numeric, 1 string), all derived from the
    ``tbp_lv_*`` measurements, ``clin_size_long_diam_mm``, ``age_approx`` and
    the two anatomical-location columns already present in ``df``.

    Returns:
        (df, n_cat): the same DataFrame object that was passed in, and the
        list of newly created categorical column names.
    """
    # Inputs that feed more than one feature.
    area = df["tbp_lv_areaMM2"]
    perimeter = df["tbp_lv_perimeterMM"]
    long_diam = df["clin_size_long_diam_mm"]
    age = df["age_approx"]
    hue_in, hue_out = df["tbp_lv_H"], df["tbp_lv_Hext"]
    delta_a, delta_b, delta_l = df["tbp_lv_deltaA"], df["tbp_lv_deltaB"], df["tbp_lv_deltaL"]
    delta_lb_norm = df["tbp_lv_deltaLBnorm"]
    norm_border = df["tbp_lv_norm_border"]
    norm_color = df["tbp_lv_norm_color"]
    symm_2axis = df["tbp_lv_symm_2axis"]
    eccentricity = df["tbp_lv_eccentricity"]
    area_perim_ratio = df["tbp_lv_area_perim_ratio"]
    color_std_mean = df["tbp_lv_color_std_mean"]
    pos_x, pos_y, pos_z = df["tbp_lv_x"], df["tbp_lv_y"], df["tbp_lv_z"]

    # Shared intermediate terms.
    lab_sq_sum = delta_a**2 + delta_b**2 + delta_l**2
    lab_sum = delta_a + delta_b + delta_l
    shape_index = area / (perimeter**2)
    border_complexity = norm_border + symm_2axis

    # Insertion order below is the order in which columns are appended.
    derived = {
        # a sort of eccentricity
        "lesion_size_ratio": df["tbp_lv_minorAxisMM"] / long_diam,
        # another dimensionless measure of eccentricity (think circle / square)
        "lesion_shape_index": shape_index,
        # contrast between hue inside and outside
        "hue_contrast": (hue_in - hue_out).abs(),
        # contrast between luminance inside and outside
        "luminance_contrast": (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs(),
        # LAB is another color space similar to RGB; deltas are inside vs. outside
        "lesion_color_difference": np.sqrt(lab_sq_sum),
        # both metrics increase when asymmetry is higher and are on scale 0-10
        "border_complexity": border_complexity,
        # position on 3D TBP
        "3d_position_distance": np.sqrt(pos_x**2 + pos_y**2 + pos_z**2),
        # another measure of irregularity
        "perimeter_to_area_ratio": perimeter / area,
        # contrast between lesion and surrounding (5-25) + color variation (0-10)
        "lesion_visibility_score": delta_lb_norm + norm_color,
        # both are location indicators
        "combined_anatomical_site": df["anatom_site_general"] + "_" + df["tbp_lv_location"],
        # only large when both factors are large (cf. border_complexity)
        "symmetry_border_consistency": symm_2axis * norm_border,
        # whether the variation in color is similar inside and outside the lesion
        "color_consistency": df["tbp_lv_stdL"] / df["tbp_lv_stdLExt"],
        # interactions are just products
        "size_age_interaction": long_diam * age,
        # hue inside and color irregularity
        "hue_color_std_interaction": hue_in * color_std_mean,
        # three measures of irregularity combined
        "lesion_severity_index": (norm_border + norm_color + eccentricity) / 3,
        "shape_complexity_index": border_complexity + shape_index,
        # first three terms are average contrast, last is contrast in the immediately surrounding skin
        "color_contrast_index": lab_sum + delta_lb_norm,
        # malignant lesions can be far larger; a log scale may capture this better
        "log_lesion_area": np.log(area + 1),
        # perhaps lesions grow in size with age
        "normalized_lesion_size": long_diam / age,
        # internal and external hue averaged (a mean, despite the column name)
        "mean_hue_difference": (hue_in + hue_out) / 2,
        # combining inner contrast assuming Gaussian
        "std_dev_contrast": np.sqrt(lab_sq_sum / 3),
        # color and shape metrics combined; both could be more irregular for malignant lesions
        "color_shape_composite_index": (color_std_mean + area_perim_ratio + symm_2axis) / 3,
        "3d_lesion_orientation": np.arctan2(pos_y, pos_x),
        "overall_color_difference": lab_sum / 3,
        "symmetry_perimeter_interaction": symm_2axis * perimeter,
        # the larger this value, the larger the "irregularity"
        "comprehensive_lesion_index": (area_perim_ratio + eccentricity + norm_color + symm_2axis) / 4,
    }
    for column_name, values in derived.items():
        df[column_name] = values

    # categorical columns
    n_cat = ["combined_anatomical_site"]

    return df, n_cat

# Apply the same feature engineering to both splits; the list of new
# categorical columns is identical for both, so only one copy is kept.
train, n_cat = fe(train)
test, _ = fe(test)

# columns with categories
cat_cols = [
    "sex",
    "tbp_tile_type",
    "tbp_lv_location",
    "tbp_lv_location_simple",
    'patient_id',
    'anatom_site_general',
    'copyright_license',
    'attribution',
    'image_type',
    *n_cat,
]

# drop additional columns in one set
def align_columns(train, test):
    """Restrict both frames to the columns they have in common.

    Returns:
        (train, test): each input reduced to the shared columns, in the order
        given by ``train.columns.intersection(test.columns)``.
    """
    shared = train.columns.intersection(test.columns)
    return tuple(frame[shared] for frame in (train, test))

# Set the target aside: it exists only in train, so align_columns would drop it,
# and it must not be passed through the encoder. It is added back at the end.
target = train['target']
train_features = train.drop(columns=['target'], errors='ignore')

# Keep only the feature columns that both train and test provide.
train_features_aligned, test_features_aligned = align_columns(train_features, test)

# Ordinal-encode the categorical columns; the mapping is learned from train only.
# NOTE(review): category_encoders documents handle_unknown as one of 'error',
# 'return_nan' or 'value'; 'ignore' does not appear to be among them — confirm
# how categories seen only in test end up encoded.
encoder = ce.OrdinalEncoder(cols=cat_cols, handle_unknown='ignore')
train = encoder.fit_transform(train_features_aligned)
# transform() reuses the category mapping fitted on train, so both frames share one encoding.
test = encoder.transform(test_features_aligned)

# Re-attach the target (assignment aligns on the row index).
train['target'] = target

In [4]:
# Split the encoded training frame into the feature matrix and the label vector.
y = train['target']
X = train.drop(columns='target')

def pauc_above_tpr(solution: "pd.Series | pd.DataFrame",
                   submission: "pd.Series | pd.DataFrame",
                   min_tpr: float = 0.80):
    """Partial AUC of the ROC curve restricted to TPR >= ``min_tpr``.

    The region above a TPR threshold is obtained by flipping both the labels
    and the scores, so that it becomes the region below
    ``max_fpr = 1 - min_tpr``, which ``roc_auc_score`` handles through its
    ``max_fpr`` argument. The standardised value it returns is then mapped
    back to an unscaled partial area.

    Args:
        solution: ground-truth binary labels (0/1); only ``.values`` is used.
        submission: predicted scores for the positive class; only ``.values``
            is used, so its index need not match ``solution``.
        min_tpr: lower bound of the TPR range over which area is accumulated.

    Returns:
        The unscaled partial AUC as a float.
    """
    # Flip labels and scores in one vectorised step each (no per-row Python loop).
    v_gt = np.abs(np.asarray(solution.values) - 1)
    v_pred = 1.0 - np.asarray(submission.values)

    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)

    # Undo the standardisation roc_auc_score applies when max_fpr is given:
    # 0.5 maps to the chance-level area and 1.0 to the full area of the strip.
    min_area = 0.5 * max_fpr**2
    partial_auc = min_area + (max_fpr - min_area) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

def Train_ML(Model, X, y, test_data):
    """Cross-validate an estimator and collect per-fold test predictions.

    A fresh, unfitted copy of ``Model`` is trained on every fold, so the
    returned ``models`` list holds one independently fitted estimator per fold
    and ``Model`` itself is left unfitted.

    Args:
        Model: an unfitted scikit-learn compatible classifier exposing
            ``fit`` and ``predict_proba``; used as a prototype and cloned.
        X: training features (DataFrame, indexed positionally via ``.iloc``).
        y: training labels (Series, indexed positionally via ``.iloc``).
        test_data: features to predict with each fold's model.

    Returns:
        (model, test_predictions, models): the estimator fitted on the last
        fold, a list with one array of positive-class probabilities for
        ``test_data`` per fold, and the list of per-fold fitted estimators.
    """
    # k-fold cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    train_scores = []
    val_scores = []
    test_predictions = []
    models = []
    model = None

    for fold, (train_index, val_index) in enumerate(tqdm(skf.split(X, y), total=n_splits), 1):
        # The splitter yields positional indices for the train / validation parts.
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Clone so each fold gets its own estimator; reusing one instance would
        # make every entry of `models` the same (last-fitted) object.
        model = clone(Model)
        model.fit(X_train, y_train)

        # record performance on the training and validation parts
        y_train_pred_proba = model.predict_proba(X_train)[:, 1]
        train_pauc = pauc_above_tpr(y_train, pd.DataFrame(y_train_pred_proba, columns=["prediction"]), min_tpr=0.8)
        train_scores.append(train_pauc)

        y_val_pred_proba = model.predict_proba(X_val)[:, 1]
        val_pauc = pauc_above_tpr(y_val, pd.DataFrame(y_val_pred_proba, columns=["prediction"]), min_tpr=0.8)
        val_scores.append(val_pauc)

        # Predict on the frame passed in, not on a notebook-level global.
        y_test_pred_proba = model.predict_proba(test_data)[:, 1]
        test_predictions.append(y_test_pred_proba)

        models.append(model)

        print(f"Fold {fold}: Train pAUC = {train_pauc:.4f}, Validation pAUC = {val_pauc:.4f}")

    # mean pAUC across the folds' models
    mean_train_pauc = np.mean(train_scores)
    mean_val_pauc = np.mean(val_scores)

    print(f"\nMean Train pAUC: {mean_train_pauc:.4f}")
    print(f"Mean Validation pAUC: {mean_val_pauc:.4f}")

    # The last-fold model stays first in the tuple for existing callers.
    return model, test_predictions, models

LightGBM

In [5]:
%%time

# LightGBM hyper-parameters (DART boosting, shallow trees).
# NOTE(review): `subsample` is set but no `subsample_freq` / `bagging_freq` is
# given; LightGBM's docs say bagging needs a non-zero frequency — confirm
# whether row subsampling is actually active.
params =  {
        'objective': 'binary', 'colsample_bytree': 0.6852015051268027, 'max_depth': 4, 
        'learning_rate': 0.05714390301637632, 'n_estimators': 1010, 'subsample': 0.13326633837138008, 
        'lambda_l1': 1.4445754309498806e-08, 'lambda_l2': 0.11031259304642657, 'boosting_type': 'dart'
            }

# NOTE(review): `reg_alpha` / `reg_lambda` here and `lambda_l1` / `lambda_l2`
# in `params` look like aliases of the same two LightGBM settings, given
# different values — confirm which pair takes effect and drop the other.
Model = LGBMClassifier(**params,verbose=-1,random_state=SEED,
                      extra_tree=True,max_bin=250,reg_alpha=0.1,reg_lambda=0.8
                      )

# Train_ML returns (fitted model, per-fold test probabilities, list of models),
# so `train_lgb` holds a model, not training data.
train_lgb, test_preds , all_models = Train_ML(Model, X, y, test)

 33%|███▎      | 1/3 [04:44<09:29, 284.64s/it]

Fold 1: Train pAUC = 0.1938, Validation pAUC = 0.1705


 67%|██████▋   | 2/3 [13:00<06:49, 409.05s/it]

Fold 2: Train pAUC = 0.1941, Validation pAUC = 0.1513


100%|██████████| 3/3 [21:27<00:00, 429.19s/it]

Fold 3: Train pAUC = 0.1920, Validation pAUC = 0.1635

Mean Train pAUC: 0.1933
Mean Validation pAUC: 0.1617
CPU times: user 25min 25s, sys: 2.36 s, total: 25min 28s
Wall time: 21min 27s





CatBoost

In [6]:
%%time

# CatBoost hyper-parameters, gathered in one mapping so they can be inspected
# or logged separately from the estimator.
cat_params = {
    'verbose': 0,
    'random_state': SEED,
    'iterations': 1000,
    'learning_rate': 0.01,
    'objective': 'Logloss',
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'colsample_bylevel': 0.08656159895289164,
    'subsample': 0.46623542352578917,
    'depth': 9,
}
Cat_Model = CatBoostClassifier(**cat_params)

# Same cross-validation routine as the other models; the first returned value
# is a fitted model, followed by per-fold test probabilities and fold models.
train_cat, cat_test_preds, Cat_all_models = Train_ML(Cat_Model, X, y, test)

 33%|███▎      | 1/3 [01:05<02:10, 65.28s/it]

Fold 1: Train pAUC = 0.1908, Validation pAUC = 0.1711


 67%|██████▋   | 2/3 [02:09<01:04, 64.90s/it]

Fold 2: Train pAUC = 0.1933, Validation pAUC = 0.1535


100%|██████████| 3/3 [03:14<00:00, 64.83s/it]

Fold 3: Train pAUC = 0.1923, Validation pAUC = 0.1619

Mean Train pAUC: 0.1922
Mean Validation pAUC: 0.1621
CPU times: user 9min 13s, sys: 1min 29s, total: 10min 42s
Wall time: 3min 14s





XGB

In [7]:
%%time

# XGBoost hyper-parameters, one per line for easier diffing.
xgb_params2 = {
    'objective': 'binary:logistic',
    'colsample_bytree': 0.11756728710020253,
    'max_depth': 4,
    'learning_rate': 0.009393224320850784,
    'n_estimators': 1227,
    'subsample': 0.9589462514195692,
    'lambda': 0.34216652262461505,
    'alpha': 1.150597512455824e-07,
}

xgb_Model = XGBClassifier(random_state=SEED, **xgb_params2)

# Same cross-validation routine as the other models; the first returned value
# is a fitted model, followed by per-fold test probabilities and fold models.
train_xgb, xgb_test_preds, xgb_all_models = Train_ML(xgb_Model, X, y, test)

 33%|███▎      | 1/3 [01:40<03:21, 100.52s/it]

Fold 1: Train pAUC = 0.1937, Validation pAUC = 0.1710


 67%|██████▋   | 2/3 [03:18<01:39, 99.22s/it] 

Fold 2: Train pAUC = 0.1937, Validation pAUC = 0.1536


100%|██████████| 3/3 [04:58<00:00, 99.51s/it]

Fold 3: Train pAUC = 0.1937, Validation pAUC = 0.1590

Mean Train pAUC: 0.1937
Mean Validation pAUC: 0.1612
CPU times: user 4min 57s, sys: 662 ms, total: 4min 58s
Wall time: 4min 58s





Test

In [8]:
%%time

Sample = pd.read_csv('/kaggle/input/isic-2024-challenge/sample_submission.csv')

# `isic_id` was dropped from `test` right after loading, so re-read the ids
# from the same file the features came from. This ties each prediction to its
# id by construction instead of relying on sample_submission.csv having the
# same row order as test-metadata.csv.
test_ids = pd.read_csv('/kaggle/input/isic-2024-challenge/test-metadata.csv',
                       usecols=['isic_id'])['isic_id']

# Average the per-fold test probabilities of each model family.
lgb_test = np.mean(test_preds, axis=0)
cat_test = np.mean(cat_test_preds, axis=0)
xgb_test = np.mean(xgb_test_preds, axis=0)

# Equal-weight blend of the three model families.
ensemble_preds = (lgb_test + cat_test + xgb_test) / 3

assert len(test_ids) == len(ensemble_preds), "one prediction is required per test row"

sub = pd.DataFrame({
    'isic_id': test_ids,
    'target': ensemble_preds
})

sub.to_csv('submission.csv', index=False)
sub.head()

CPU times: user 7.01 ms, sys: 0 ns, total: 7.01 ms
Wall time: 14.2 ms


Unnamed: 0,isic_id,target
0,ISIC_0015657,0.000268
1,ISIC_0015729,6.5e-05
2,ISIC_0015740,0.00026
