https://www.kaggle.com/code/abdmental01/multimodel-isic

In [1]:
%load_ext memory_profiler

import io
import warnings
import gc
import sys
import os
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from tqdm import tqdm

import h5py
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

from scipy.special import softmax

from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import torch
from torchvision.transforms import v2 as transforms
from transformers import AutoModelForImageClassification
from torch.utils.data import Dataset, DataLoader

pd.set_option('display.max_columns', None)

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce

import joblib

# True: read data/models from local `data/` paths and hub ids; False: use the Kaggle input paths.
OWN_INSTANCE = True
# Passed as random_state to the CV splitter and to every gradient-boosting model.
SEED = 42
# Number of stratified cross-validation folds used by Train_ML.
n_splits = 3

# Output directory for the serialized gradient-boosting models.
os.makedirs('gradboost', exist_ok = True)

# Load and preprocess metadata

In [2]:
%%time 

# Metadata CSV locations: local copies on our own instance, Kaggle inputs otherwise.
if OWN_INSTANCE:
    test_metadata_file = 'data/test-metadata.csv'
    train_metadata_file = 'data/train-metadata.csv'
else:
    test_metadata_file = '/kaggle/input/isic-2024-challenge/test-metadata.csv'
    train_metadata_file = '/kaggle/input/isic-2024-challenge/train-metadata.csv'

test = pd.read_csv(test_metadata_file)
train = pd.read_csv(train_metadata_file)

# Report which columns exist in only one of the two splits.
test_columns = set(test.columns)
train_columns = set(train.columns)

diff_test_train = test_columns.difference(train_columns)
diff_train_test = train_columns.difference(test_columns)

if diff_test_train or diff_train_test:
    print("Columns present in test but not in train:", diff_test_train)
    print("Columns present in train but not in test:", diff_train_test)
else:
    print("Both DataFrames have the same columns.")

# Remove the train-only columns up front; 'target' is deliberately kept.
train.drop(
    columns=[
        'iddx_4', 'mel_mitotic_index', 'iddx_1', 'lesion_id', 'tbp_lv_dnn_lesion_confidence',
        'iddx_5', 'mel_thick_mm', 'iddx_2', 'iddx_full', 'iddx_3',
    ],
    inplace=True,
)

Columns present in test but not in train: set()
Columns present in train but not in test: {'mel_thick_mm', 'lesion_id', 'iddx_2', 'iddx_5', 'iddx_3', 'target', 'iddx_4', 'tbp_lv_dnn_lesion_confidence', 'iddx_full', 'mel_mitotic_index', 'iddx_1'}
CPU times: user 2.52 s, sys: 318 ms, total: 2.84 s
Wall time: 2.83 s


In [3]:
%%time

def fe(df):
    """Append engineered lesion features to ``df`` and name the categorical ones.

    The frame is modified in place (the new columns are added in a fixed
    order) and is also returned.

    Returns:
        tuple: ``(df, n_cat)`` where ``n_cat`` lists the newly created
        categorical column names.
    """
    # Raw columns that feed more than one derived feature.
    area = df["tbp_lv_areaMM2"]
    perimeter = df["tbp_lv_perimeterMM"]
    diameter = df["clin_size_long_diam_mm"]
    age = df["age_approx"]
    hue = df["tbp_lv_H"]
    hue_ext = df["tbp_lv_Hext"]
    delta_a = df["tbp_lv_deltaA"]
    delta_b = df["tbp_lv_deltaB"]
    delta_l = df["tbp_lv_deltaL"]
    delta_lb_norm = df["tbp_lv_deltaLBnorm"]
    norm_border = df["tbp_lv_norm_border"]
    norm_color = df["tbp_lv_norm_color"]
    symmetry = df["tbp_lv_symm_2axis"]
    eccentricity = df["tbp_lv_eccentricity"]
    color_std = df["tbp_lv_color_std_mean"]
    area_perim_ratio = df["tbp_lv_area_perim_ratio"]
    pos_x = df["tbp_lv_x"]
    pos_y = df["tbp_lv_y"]
    pos_z = df["tbp_lv_z"]

    # Intermediate quantities shared by several features.
    delta_sum = delta_a + delta_b + delta_l
    delta_sq_sum = delta_a**2 + delta_b**2 + delta_l**2
    lesion_shape_index = area / (perimeter**2)
    border_complexity = norm_border + symmetry

    # Insertion order of this dict is the order in which columns are added.
    derived = {
        # a sort of eccentricity
        "lesion_size_ratio": df["tbp_lv_minorAxisMM"] / diameter,
        # another dimensionless measure of eccentricity (think circle / square)
        "lesion_shape_index": lesion_shape_index,
        # contrast between hue inside and outside
        "hue_contrast": (hue - hue_ext).abs(),
        # contrast between luminance inside and outside
        "luminance_contrast": (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs(),
        # LAB is another color space similar to RGB; deltas are inside vs. outside
        "lesion_color_difference": np.sqrt(delta_sq_sum),
        # both metrics increase when asymmetry is higher and are on a 0-10 scale
        "border_complexity": border_complexity,
        # position on 3D TBP
        "3d_position_distance": np.sqrt(pos_x**2 + pos_y**2 + pos_z**2),
        # another measure of irregularity
        "perimeter_to_area_ratio": perimeter / area,
        # contrast between lesion and surrounding (5-25) plus color variation (0-10)
        "lesion_visibility_score": delta_lb_norm + norm_color,
        # both are location indicators
        "combined_anatomical_site": df["anatom_site_general"] + "_" + df["tbp_lv_location"],
        # only scores high when both are large (cf. border_complexity)
        "symmetry_border_consistency": symmetry * norm_border,
        # whether the variation in color is similar inside and outside the lesion
        "color_consistency": df["tbp_lv_stdL"] / df["tbp_lv_stdLExt"],
        # interactions are just products
        "size_age_interaction": diameter * age,
        # hue inside and color irregularity
        "hue_color_std_interaction": hue * color_std,
        # three measures of irregularity combined
        "lesion_severity_index": (norm_border + norm_color + eccentricity) / 3,
        "shape_complexity_index": border_complexity + lesion_shape_index,
        # first three terms are average contrast, last is contrast in the immediately surrounding skin
        "color_contrast_index": delta_sum + delta_lb_norm,
        # malignant lesions can be far larger; a log scale may capture this better
        "log_lesion_area": np.log(area + 1),
        # perhaps lesions grow in size with age
        "normalized_lesion_size": diameter / age,
        # internal and external hue averaged (despite the column name)
        "mean_hue_difference": (hue + hue_ext) / 2,
        # combining inner contrast assuming Gaussian
        "std_dev_contrast": np.sqrt(delta_sq_sum / 3),
        # color and shape metrics combined; both could be more irregular for malignant lesions
        "color_shape_composite_index": (color_std + area_perim_ratio + symmetry) / 3,
        "3d_lesion_orientation": np.arctan2(pos_y, pos_x),
        "overall_color_difference": delta_sum / 3,
        "symmetry_perimeter_interaction": symmetry * perimeter,
        # the larger this value, the larger the "irregularity"
        "comprehensive_lesion_index": (area_perim_ratio + eccentricity + norm_color + symmetry) / 4,
    }
    for column, values in derived.items():
        df[column] = values

    # categorical columns created above
    n_cat = ["combined_anatomical_site"]

    return df, n_cat

# Engineer features on both splits; only the train call's category list is kept.
train, n_cat = fe(train)
test, _ = fe(test)

# Columns to ordinal-encode: the raw categorical columns plus the engineered ones.
cat_cols = [
    "sex",
    "tbp_tile_type",
    "tbp_lv_location",
    "tbp_lv_location_simple",
    'patient_id',
    'anatom_site_general',
    'copyright_license',
    'attribution',
    'image_type',
    *n_cat,
]

# keep only the columns that both frames share
def align_columns(train, test):
    """Return ``(train, test)`` restricted to the columns present in both frames.

    The inputs are not modified; each result is a column selection of its input.
    """
    shared = train.columns.intersection(test.columns)
    return train[shared], test[shared]

# align_columns would discard 'target' (it is absent from test), so set it
# aside here and re-attach it once encoding is done.
target = train['target']
train_features = train.drop(columns=['target'], errors='ignore')

train_features_aligned, test_features_aligned = align_columns(train_features, test)

# Fit the ordinal mapping on train only, then reuse that fitted mapping for test.
# NOTE(review): 'ignore' is not among the handle_unknown values listed in the
# category_encoders docs ('error', 'return_nan', 'value') — confirm it is intended.
encoder = ce.OrdinalEncoder(cols=cat_cols, handle_unknown='ignore')

# isic_id is an identifier, not a feature, so it is dropped from the model inputs.
train = encoder.fit_transform(train_features_aligned).drop(columns=['isic_id'])
test = encoder.transform(test_features_aligned).drop(columns=['isic_id'])

train['target'] = target

CPU times: user 1.66 s, sys: 471 ms, total: 2.13 s
Wall time: 2.13 s


# Load the image classifiers and extract class-1 softmax probabilities as features

## Preparation for image dataset

In [4]:
%%time

# Deterministic preprocessing applied to every image: fixed 224x224 resize,
# tensor conversion, then per-channel normalization.
# NOTE(review): the mean/std values look like the usual ImageNet statistics — confirm
# they match what the classifiers were trained with.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Model and image-archive locations: local / hub on our own instance, Kaggle inputs otherwise.
if OWN_INSTANCE:
    model_path = 'TobanDjan/vit'
    hdf_test_path = 'data/test-image.hdf5'
    hdf_train_path = 'data/train-image.hdf5'
else:
    model_path = '/kaggle/input/vit/transformers/default/1/'
    hdf_test_path = '/kaggle/input/isic-2024-challenge/test-image.hdf5'
    hdf_train_path = '/kaggle/input/isic-2024-challenge/train-image.hdf5'

# Decode one encoded image held in memory
def load_image_from_encoded_data(encoded_data):
    """Open the encoded image bytes with PIL and return the image converted to RGB."""
    buffer = io.BytesIO(encoded_data)
    decoded = Image.open(buffer)
    return decoded.convert('RGB')

# Dataset over encoded images that were already read from an HDF5 file
class HDF5TestDataset(Dataset):
    """Map-style dataset over in-memory encoded images.

    Each item is ``(image, id)``: the image is decoded to RGB and, when a
    transform was supplied, passed through it.
    """

    def __init__(self, image_data, ids, transform=None):
        self.image_data, self.ids, self.transform = image_data, ids, transform

    def __len__(self):
        return len(self.image_data)

    def __getitem__(self, idx):
        image = load_image_from_encoded_data(self.image_data[idx])
        if not self.transform:
            return image, self.ids[idx]
        # A transformed image was measured at 602112 B (0.574 MB).
        return self.transform(image), self.ids[idx]

def get_dataset(hdf_file_path):
    """Read every entry of the HDF5 file into memory and wrap it in a dataset.

    The file's keys become the sample ids; images are preprocessed with the
    module-level ``val_transform`` when accessed.
    """
    with h5py.File(hdf_file_path, 'r') as hdf:
        ids = list(hdf.keys())
        # [()] reads the full value of each entry while the file is still open.
        image_data = [hdf[image_id][()] for image_id in tqdm(ids)]

    return HDF5TestDataset(image_data=image_data, ids=ids, transform=val_transform)

# Load both image archives fully into memory; %memit reports the memory used by the train load.
%memit train_dataset = get_dataset(hdf_train_path)
test_dataset = get_dataset(hdf_test_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 401059/401059 [00:54<00:00, 7413.29it/s]


peak memory: 2900.26 MiB, increment: 1360.78 MiB


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 340.01it/s]


CPU times: user 52 s, sys: 3.94 s, total: 56 s
Wall time: 56.8 s


In [10]:
# Inference batch size: 2**8 on our own instance, 2**9 on Kaggle.
batch_size = 2 ** 8 if OWN_INSTANCE else 2 ** 9

# Drop any model left in the notebook namespace and release cached GPU memory.
model = None
gc.collect()
torch.cuda.empty_cache()

def predict(model_path, dataset):
    """Score every sample of ``dataset`` with the image classifier at ``model_path``.

    Args:
        model_path: Location handed to
            ``AutoModelForImageClassification.from_pretrained``.
        dataset: Dataset yielding ``(image_tensor, sample_id)`` pairs.

    Returns:
        tuple: ``(isic_ids, targets)`` — the sample ids in iteration order and,
        for each one, the softmax probability of class index 1.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # Fall back to CPU so the notebook does not crash on a machine without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForImageClassification.from_pretrained(model_path)
    model.to(device)
    # Make inference mode explicit instead of relying on the loader's default.
    model.eval()

    isic_ids = []
    targets = []

    with torch.no_grad():
        for inputs, batch_ids in tqdm(dataloader, total = len(dataloader)):
            logits = model(inputs.to(device)).logits
            # scipy's softmax operates on NumPy arrays, so convert explicitly
            # rather than relying on implicit tensor-to-array coercion.
            class_one_probability = softmax(logits.cpu().numpy(), axis=1)[:, 1]

            isic_ids.extend(batch_ids)
            targets.extend(class_one_probability)

    return isic_ids, targets

# Image classifiers to score (Kaggle model directories by default).
model_paths = [ 
    '/kaggle/input/efficientnet-b7-full/transformers/default/3/',
    '/kaggle/input/beit/transformers/default/1',
    '/kaggle/input/vitmae-sup/transformers/default/1',
    '/kaggle/input/vit/transformers/default/1'
]

if OWN_INSTANCE:
    model_paths = [
        'TobanDjan/efficientnet-b7-2024',
        'TobanDjan/beit',
        'TobanDjan/vitmae-sup',
        'TobanDjan/vit'
    ]

# Column prefix for each entry of model_paths (same order).
model_names = ['effnet', 'beit', 'vitmae', 'vit']


def _scores_in_row_order(row_ids, predicted_ids, scores):
    """Reorder ``scores`` so that entry ``i`` belongs to ``row_ids[i]``.

    Args:
        row_ids: ids of the metadata rows, in row order.
        predicted_ids: ids returned by ``predict``, in prediction order.
        scores: one score per entry of ``predicted_ids``.

    Returns:
        NumPy array of scores aligned with ``row_ids``.

    Raises:
        ValueError: if ``predicted_ids`` contains duplicates or a row id has
            no prediction.
    """
    scores_by_id = pd.Series(list(scores), index=list(predicted_ids))
    if not scores_by_id.index.is_unique:
        raise ValueError("predict() returned duplicate image ids")

    row_ids = list(row_ids)
    missing = pd.Index(row_ids).difference(scores_by_id.index)
    if len(missing) > 0:
        raise ValueError(
            f"{len(missing)} metadata rows have no image prediction, e.g. {missing[0]}"
        )
    return scores_by_id.reindex(row_ids).to_numpy()


# The encoded frames no longer carry isic_id, but the aligned (pre-encoding)
# frames have the same rows in the same order and still do. Matching on the id
# avoids silently assuming that HDF5 key order equals metadata row order.
for model_name, model_path in zip(model_names, model_paths):
    test_isic_ids, test_targets = predict(model_path, test_dataset)
    test[model_name + '_target'] = _scores_in_row_order(
        test_features_aligned['isic_id'], test_isic_ids, test_targets
    )

    train_isic_ids, train_targets = predict(model_path, train_dataset)
    train[model_name + '_target'] = _scores_in_row_order(
        train_features_aligned['isic_id'], train_isic_ids, train_targets
    )

print(test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1567/1567 [18:21<00:00,  1.42it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.11s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1567/1567 [23:08<00:00,  1.13it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1567/1567 [1:10:06<00:00,  2.68s/it]
100%|███████████████████████████████████████████████

   patient_id  age_approx  sex  anatom_site_general  clin_size_long_diam_mm  \
0         NaN        45.0    1                    3                    2.70   
1         NaN        35.0    2                    1                    2.52   
2         NaN        65.0    1                    3                    3.16   

   image_type  tbp_tile_type  tbp_lv_A  tbp_lv_Aext  tbp_lv_B  tbp_lv_Bext  \
0           1              2  22.80433    20.007270  28.38412    27.043640   
1           1              2  16.64867     9.657964  31.31752    27.524318   
2           1              2  24.25384    19.937380  30.46368    28.384240   

    tbp_lv_C  tbp_lv_Cext   tbp_lv_H  tbp_lv_Hext  tbp_lv_L  tbp_lv_Lext  \
0  36.410100    33.640000  51.220960    53.505430  24.97985    31.114600   
1  35.467806    29.169579  62.004494    70.664619  59.90409    68.141071   
2  38.939500    34.686660  51.474730    54.915410  35.81945    41.358640   

   tbp_lv_areaMM2  tbp_lv_area_perim_ratio  tbp_lv_color_std_mean

In [11]:
# Clear the notebook-level model reference and release cached GPU memory
# before the gradient-boosting stage.
model = None
gc.collect()
torch.cuda.empty_cache()

# Gradient boosting preparations

In [12]:
# Feature matrix (everything except the label) and label vector.
X = train.drop(columns=['target'])
y = train['target']

def pauc_above_tpr(solution, submission, min_tpr: float=0.80):
    """Partial AUC computed via ``roc_auc_score`` with a ``max_fpr`` bound.

    Labels and scores are both flipped (``|y - 1|`` and ``1 - p``) so that the
    ``min_tpr`` constraint becomes ``max_fpr = |1 - min_tpr|``. The scaled
    value returned by ``roc_auc_score`` is then mapped back to an unscaled area.

    Args:
        solution: Ground-truth binary labels.
        submission: Predicted scores, one per label.
        min_tpr: Lower TPR bound of the region of interest.

    Returns:
        The unscaled partial AUC.
    """
    flipped_truth = np.abs(np.asarray(solution) - 1)
    flipped_scores = 1.0 - np.asarray(submission)
    max_fpr = abs(1 - min_tpr)

    partial_auc_scaled = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    # Map the scaled score (0.5 .. 1.0) back onto the area range
    # [0.5 * max_fpr**2, max_fpr].
    min_area = 0.5 * max_fpr**2
    return min_area + (max_fpr - min_area) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

def Train_ML(model_factory, X, y, test_data):
    """Fit one model per stratified fold and predict ``test_data`` with each.

    Args:
        model_factory: Zero-argument callable returning a fresh, unfitted
            classifier exposing ``fit`` and ``predict_proba``.
        X: Feature DataFrame.
        y: Label Series aligned row-by-row with ``X``.
        test_data: Feature DataFrame to predict with every fold's model.

    Returns:
        tuple: ``(model, test_predictions, models)`` — the last fold's model
        (kept for backward compatibility), the list of per-fold class-1
        probability arrays for ``test_data``, and the list of all fitted models.
    """
    # k-fold cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    train_scores = []
    val_scores = []
    test_predictions = []
    models = []

    for fold, (train_index, val_index) in enumerate(tqdm(skf.split(X, y), total=n_splits), 1):
        # The splitter yields positional indices, so select by position for
        # both X and y; label-based y[...] is only right for a default index.
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = model_factory()

        model.fit(X_train, y_train)

        # record performance on the training and validation parts of the fold
        y_train_pred_proba = model.predict_proba(X_train)[:, 1]
        train_pauc = pauc_above_tpr(y_train, y_train_pred_proba, min_tpr=0.8)
        train_scores.append(train_pauc)

        y_val_pred_proba = model.predict_proba(X_val)[:, 1]
        val_pauc = pauc_above_tpr(y_val, y_val_pred_proba, min_tpr=0.8)
        val_scores.append(val_pauc)

        # Predict the frame passed in by the caller, not the notebook-level `test`.
        y_test_pred_proba = model.predict_proba(test_data)[:, 1]
        test_predictions.append(y_test_pred_proba)

        models.append(model)

        print(f"Fold {fold}: Train pAUC = {train_pauc:.4f}, Validation pAUC = {val_pauc:.4f}")

    # mean pAUC across the per-fold models
    mean_train_pauc = np.mean(train_scores)
    mean_val_pauc = np.mean(val_scores)

    print(f"\nMean Train pAUC: {mean_train_pauc:.4f}")
    print(f"Mean Validation pAUC: {mean_val_pauc:.4f}")

    # `model` is only the last fold's model; `models` holds all of them.
    return model,test_predictions, models

# LightGBM

In [13]:
%%time

def lgbm_factory():
    """Return a fresh, unfitted LightGBM classifier with the notebook's fixed settings."""
    # NOTE(review): 'lambda_l1'/'lambda_l2' below and reg_alpha/reg_lambda in the
    # constructor call look like aliases of the same LightGBM regularisation
    # settings with different values — confirm which pair actually takes effect.
    booster_settings = {
        'objective': 'binary',
        'colsample_bytree': 0.6852015051268027,
        'max_depth': 4,
        'learning_rate': 0.05714390301637632,
        'n_estimators': 1010,
        'subsample': 0.13326633837138008,
        'lambda_l1': 1.4445754309498806e-08,
        'lambda_l2': 0.11031259304642657,
        'boosting_type': 'dart',
    }

    classifier = LGBMClassifier(
        **booster_settings,
        verbose=-1,
        random_state=SEED,
        extra_tree=True,
        max_bin=250,
        reg_alpha=0.1,
        reg_lambda=0.8,
    )
    return classifier

# Cross-validate LightGBM; keeps the last model, the per-fold test predictions and all fold models.
train_lgb, test_preds, all_models = Train_ML(lgbm_factory, X, y, test)


 33%|████████████████████████████████████████▋                                                                                 | 1/3 [00:26<00:53, 26.53s/it]

Fold 1: Train pAUC = 0.1990, Validation pAUC = 0.1934


 67%|█████████████████████████████████████████████████████████████████████████████████▎                                        | 2/3 [00:51<00:25, 25.75s/it]

Fold 2: Train pAUC = 0.1990, Validation pAUC = 0.1829


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:16<00:00, 25.66s/it]

Fold 3: Train pAUC = 0.1990, Validation pAUC = 0.1898

Mean Train pAUC: 0.1990
Mean Validation pAUC: 0.1887
CPU times: user 19min 1s, sys: 1.58 s, total: 19min 3s
Wall time: 1min 16s





# CatBoost

In [14]:
%%time

def cat_factory():
    """Return a fresh, unfitted CatBoost classifier with the notebook's fixed settings."""
    settings = {
        'verbose': 0,
        'random_state': SEED,
        'iterations': 1000,
        'learning_rate': 0.01,
        'objective': 'Logloss',
        'boosting_type': 'Plain',
        'bootstrap_type': 'Bernoulli',
        'colsample_bylevel': 0.08656159895289164,
        'subsample': 0.46623542352578917,
        'depth': 9,
    }
    return CatBoostClassifier(**settings)

# Cross-validate CatBoost under %memit to record the memory used by training.
%memit train_cat, cat_test_preds , Cat_all_models = Train_ML(cat_factory, X, y, test)


 33%|████████████████████████████████████████▋                                                                                 | 1/3 [00:42<01:24, 42.24s/it]

Fold 1: Train pAUC = 0.1986, Validation pAUC = 0.1932


 67%|█████████████████████████████████████████████████████████████████████████████████▎                                        | 2/3 [01:24<00:42, 42.52s/it]

Fold 2: Train pAUC = 0.1994, Validation pAUC = 0.1855


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:07<00:00, 42.49s/it]

Fold 3: Train pAUC = 0.1991, Validation pAUC = 0.1904

Mean Train pAUC: 0.1990
Mean Validation pAUC: 0.1897
peak memory: 4563.01 MiB, increment: 457.35 MiB
CPU times: user 14min 17s, sys: 2min 3s, total: 16min 21s
Wall time: 2min 8s





# XGB

In [15]:
%%time

def xgb_factory():
    """Return a fresh, unfitted XGBoost classifier with the notebook's fixed settings."""
    booster_settings = {
        'objective': 'binary:logistic',
        'colsample_bytree': 0.11756728710020253,
        'max_depth': 4,
        'learning_rate': 0.009393224320850784,
        'n_estimators': 1227,
        'subsample': 0.9589462514195692,
        'lambda': 0.34216652262461505,
        'alpha': 1.150597512455824e-07,
    }
    return XGBClassifier(**booster_settings, random_state=SEED)

# Cross-validate XGBoost under %memit to record the memory used by training.
%memit train_xgb, xgb_test_preds , xgb_all_models = Train_ML(xgb_factory, X, y, test)


 33%|████████████████████████████████████████▋                                                                                 | 1/3 [00:14<00:28, 14.08s/it]

Fold 1: Train pAUC = 0.1987, Validation pAUC = 0.1939


 67%|█████████████████████████████████████████████████████████████████████████████████▎                                        | 2/3 [00:28<00:14, 14.20s/it]

Fold 2: Train pAUC = 0.1990, Validation pAUC = 0.1871


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:42<00:00, 14.14s/it]

Fold 3: Train pAUC = 0.1987, Validation pAUC = 0.1890

Mean Train pAUC: 0.1988
Mean Validation pAUC: 0.1900
peak memory: 4766.06 MiB, increment: 317.96 MiB
CPU times: user 9min 32s, sys: 2.25 s, total: 9min 34s
Wall time: 42.9 s





# Test

In [16]:
%%time

sample_file = '/kaggle/input/isic-2024-challenge/sample_submission.csv'

if OWN_INSTANCE:
    sample_file = 'data/sample_submission.csv'
    
Sample = pd.read_csv(sample_file)

# Average the per-fold test probabilities within each framework ...
lgb_test = np.mean(test_preds, axis=0)
cat_test = np.mean(cat_test_preds, axis=0)
xgb_test = np.mean(xgb_test_preds, axis=0)

# ... then average the three frameworks with equal weight.
ensemble_preds = (lgb_test + cat_test + xgb_test) / 3

# The predictions follow the row order of the test metadata. Look them up by
# isic_id so the submission does not depend on the sample file listing the
# ids in that same order.
preds_by_id = pd.Series(ensemble_preds, index=test_features_aligned['isic_id'].to_numpy())
unmatched = Sample.loc[~Sample['isic_id'].isin(preds_by_id.index), 'isic_id']
if len(unmatched) > 0:
    raise ValueError(
        f"{len(unmatched)} ids of the sample submission have no prediction, e.g. {unmatched.iloc[0]}"
    )

sub = pd.DataFrame({
    'isic_id': Sample['isic_id'],
    'target': preds_by_id.reindex(Sample['isic_id']).to_numpy()
})

sub.to_csv('submission.csv', index=False)
sub.head()

CPU times: user 0 ns, sys: 6.34 ms, total: 6.34 ms
Wall time: 4.9 ms


Unnamed: 0,isic_id,target
0,ISIC_0015657,4.2e-05
1,ISIC_0015729,1.9e-05
2,ISIC_0015740,3.3e-05


# Save models

In [17]:
def dump_models(framework, models):
    """Serialize each model to ``gradboost/<framework>_<position>.joblib``."""
    for position, fitted in enumerate(models):
        destination = f'gradboost/{framework}_{position}.joblib'
        joblib.dump(fitted, destination)

# Persist every fold model of each framework under gradboost/.
dump_models("lgbm", all_models)
dump_models("catboost", Cat_all_models)
dump_models("xgb", xgb_all_models)