https://www.kaggle.com/code/abdmental01/multimodel-isic

In [1]:
%load_ext memory_profiler

import io
import warnings
import gc
import sys
import os
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from tqdm import tqdm

import h5py
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

from scipy.special import softmax

from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import torch
from torchvision.transforms import v2 as transforms
from transformers import AutoModelForImageClassification
from torch.utils.data import Dataset, DataLoader

pd.set_option('display.max_columns', None)

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce

import joblib

# True when running outside Kaggle: later cells then read inputs from ./data
# (and the ViT weights from a model id) instead of /kaggle/input.
OWN_INSTANCE = True
# Seed handed to the cross-validation splitter and to every boosting model.
SEED = 42
# Number of folds used by StratifiedKFold inside Train_ML.
n_splits = 3

# Output directory that dump_models writes the fitted fold models into.
os.makedirs('gradboost', exist_ok = True)

# Load and preprocess metadata

In [2]:
%%time 

# Pick the metadata locations: local ./data copies on a self-hosted run,
# the Kaggle competition inputs otherwise.
if OWN_INSTANCE:
    test_metadata_file = 'data/test-metadata.csv'
    train_metadata_file = 'data/train-metadata.csv'
else:
    test_metadata_file = '/kaggle/input/isic-2024-challenge/test-metadata.csv'
    train_metadata_file = '/kaggle/input/isic-2024-challenge/train-metadata.csv'

test = pd.read_csv(test_metadata_file)
train = pd.read_csv(train_metadata_file)

# Report which columns exist in only one of the two files.
test_columns = set(test.columns)
train_columns = set(train.columns)
diff_test_train = test_columns - train_columns
diff_train_test = train_columns - test_columns

if diff_test_train or diff_train_test:
    print("Columns present in test but not in train:", diff_test_train)
    print("Columns present in train but not in test:", diff_train_test)
else:
    print("Both DataFrames have the same columns.")

# Train-only columns (everything reported above except `target`) are removed
# so they cannot be used as features.
train_only_columns = [
    'iddx_4', 'mel_mitotic_index', 'iddx_1', 'lesion_id', 'tbp_lv_dnn_lesion_confidence',
    'iddx_5', 'mel_thick_mm', 'iddx_2', 'iddx_full', 'iddx_3',
]
train = train.drop(columns=train_only_columns)

Columns present in test but not in train: set()
Columns present in train but not in test: {'mel_mitotic_index', 'tbp_lv_dnn_lesion_confidence', 'iddx_1', 'iddx_2', 'iddx_full', 'iddx_5', 'lesion_id', 'target', 'mel_thick_mm', 'iddx_4', 'iddx_3'}
CPU times: user 2.51 s, sys: 313 ms, total: 2.83 s
Wall time: 2.84 s


In [3]:
%%time

def fe(df):
    """Add hand-crafted lesion features to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata table holding the raw ``tbp_lv_*``, ``clin_size_long_diam_mm``,
        ``age_approx`` and ``anatom_site_general`` columns read below.

    Returns
    -------
    tuple
        ``(df, n_cat)``: the same (mutated) frame and the list of newly created
        categorical column names.
    """
    # Raw columns that feed more than one derived feature.
    area = df["tbp_lv_areaMM2"]
    perimeter = df["tbp_lv_perimeterMM"]
    diameter = df["clin_size_long_diam_mm"]
    age = df["age_approx"]
    hue_in, hue_out = df["tbp_lv_H"], df["tbp_lv_Hext"]
    delta_a, delta_b, delta_l = df["tbp_lv_deltaA"], df["tbp_lv_deltaB"], df["tbp_lv_deltaL"]
    border = df["tbp_lv_norm_border"]
    asymmetry = df["tbp_lv_symm_2axis"]
    color_var = df["tbp_lv_norm_color"]
    color_std = df["tbp_lv_color_std_mean"]
    eccentricity = df["tbp_lv_eccentricity"]
    area_perim = df["tbp_lv_area_perim_ratio"]
    contrast_norm = df["tbp_lv_deltaLBnorm"]

    # Shared inside-vs-outside colour delta terms.
    delta_sq_sum = delta_a**2 + delta_b**2 + delta_l**2
    delta_sum = delta_a + delta_b + delta_l

    # Minor axis over longest diameter: a rough eccentricity.
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / diameter
    # Dimensionless area / perimeter^2 shape measure.
    df["lesion_shape_index"] = area / (perimeter**2)
    # Absolute hue and luminance gaps between lesion and surrounding skin.
    df["hue_contrast"] = (hue_in - hue_out).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    # Euclidean norm of the three colour deltas.
    df["lesion_color_difference"] = np.sqrt(delta_sq_sum)
    # Sum of the border-irregularity and asymmetry scores.
    df["border_complexity"] = border + asymmetry
    # Distance of the lesion from the origin of the 3D body coordinates.
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["perimeter_to_area_ratio"] = perimeter / area
    # Lesion-vs-surroundings contrast plus colour variation.
    df["lesion_visibility_score"] = contrast_norm + color_var
    # Both inputs are location indicators; joined into one categorical value.
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    # Product form: only large when both scores are large (cf. border_complexity).
    df["symmetry_border_consistency"] = asymmetry * border
    # Luminance spread inside the lesion relative to outside.
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_stdLExt"]
    # Plain product interactions.
    df["size_age_interaction"] = diameter * age
    df["hue_color_std_interaction"] = hue_in * color_std
    # Mean of three irregularity measures.
    df["lesion_severity_index"] = (border + color_var + eccentricity) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    # Summed colour deltas plus the normalised lesion/background contrast.
    df["color_contrast_index"] = delta_sum + contrast_norm
    # Log scale for the area, which can span a wide range.
    df["log_lesion_area"] = np.log(area + 1)
    # Diameter relative to age (lesions may grow with age).
    df["normalized_lesion_size"] = diameter / age
    # Average of the inside and outside hue (a mean, despite the column name).
    df["mean_hue_difference"] = (hue_in + hue_out) / 2
    # Root-mean-square of the three colour deltas.
    df["std_dev_contrast"] = np.sqrt(delta_sq_sum / 3)
    # Mixed colour/shape irregularity average.
    df["color_shape_composite_index"] = (color_std + area_perim + asymmetry) / 3
    # Angle of the lesion position in the x/y plane.
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = delta_sum / 3
    df["symmetry_perimeter_interaction"] = asymmetry * perimeter
    # Larger values mean a more "irregular" lesion overall.
    df["comprehensive_lesion_index"] = (area_perim + eccentricity + color_var + asymmetry) / 4

    # Newly created categorical columns.
    return df, ["combined_anatomical_site"]

# Engineer the same features on both tables; the list of new categorical
# columns is identical for both, so only the train call keeps it.
train, n_cat = fe(train)
test, _ = fe(test)

# Columns treated as categorical by the ordinal encoder.
cat_cols = [
    "sex",
    "tbp_tile_type",
    "tbp_lv_location",
    "tbp_lv_location_simple",
    'patient_id',
    'anatom_site_general',
    'copyright_license',
    'attribution',
    'image_type',
    *n_cat,
]

def align_columns(train, test):
    """Restrict both frames to the columns they have in common.

    Returns the pair ``(train, test)`` where each frame keeps only the shared
    columns, selected in the same order for both.
    """
    shared = train.columns.intersection(test.columns)
    return train[shared], test[shared]

# `target` exists only in train, so align_columns would discard it:
# set it aside here and attach it again once encoding is done.
target = train['target']
train_features = train.drop(columns=['target'], errors='ignore')

train_features_aligned, test_features_aligned = align_columns(train_features, test)

# Fit the ordinal mapping on train only; transform() then reuses that fitted
# mapping for test.
# NOTE(review): confirm that handle_unknown='ignore' is a supported option of
# category_encoders' OrdinalEncoder — unseen test categories currently come out as NaN.
encoder = ce.OrdinalEncoder(cols=cat_cols, handle_unknown='ignore')
train = encoder.fit_transform(train_features_aligned).drop(columns=['isic_id'])
test = encoder.transform(test_features_aligned).drop(columns=['isic_id'])

train['target'] = target

CPU times: user 1.67 s, sys: 544 ms, total: 2.22 s
Wall time: 2.21 s


# Load ViT and extract features from the last hidden layer

## Preparation for image dataset

In [4]:
%%time

# Per-channel normalisation constants applied after conversion to a tensor
# (presumably the ImageNet statistics — confirm against the ViT checkpoint).
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Deterministic preprocessing used for feature extraction: resize, tensorise, normalise.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
])

# Model weights and image archives: local copies on a self-hosted run,
# Kaggle inputs otherwise.
if OWN_INSTANCE:
    model_path = 'TobanDjan/vit'
    hdf_test_path = 'data/test-image.hdf5'
    hdf_train_path = 'data/train-image.hdf5'
else:
    model_path = '/kaggle/input/vit/transformers/default/1/'
    hdf_test_path = '/kaggle/input/isic-2024-challenge/test-image.hdf5'
    hdf_train_path = '/kaggle/input/isic-2024-challenge/train-image.hdf5'

def load_image_from_encoded_data(encoded_data):
    """Decode an encoded image byte string and return it converted to RGB."""
    buffer = io.BytesIO(encoded_data)
    decoded = Image.open(buffer)
    return decoded.convert('RGB')

class HDF5TestDataset(Dataset):
    """Dataset over encoded images held in memory, paired with their ids.

    ``image_data[i]`` is the encoded byte content of the image whose id is
    ``ids[i]``. Images are decoded on access and, when ``transform`` is set,
    passed through it before being returned.
    """

    def __init__(self, image_data, ids, transform=None):
        self.image_data, self.ids, self.transform = image_data, ids, transform

    def __len__(self):
        return len(self.image_data)

    def __getitem__(self, idx):
        """Return ``(image, id)`` for position ``idx``."""
        image = load_image_from_encoded_data(self.image_data[idx])
        if not self.transform:
            return image, self.ids[idx]
        # With the notebook's val_transform each item is a 3x224x224 float tensor.
        return self.transform(image), self.ids[idx]

def get_dataset(hdf_file_path):
    """Read every encoded image of an HDF5 file into memory and wrap it in a dataset.

    Each top-level key of the file is used as the image id and its value as the
    encoded image. The returned HDF5TestDataset applies ``val_transform``.
    """
    with h5py.File(hdf_file_path, 'r') as hdf:
        encoded_images = [hdf[key][()] for key in tqdm(hdf.keys())]
        image_ids = list(hdf.keys())

    # Everything has been copied out of the file, so it can be closed before
    # the dataset is built.
    return HDF5TestDataset(image_data=encoded_images, ids=image_ids, transform=val_transform)

# Load both image archives fully into memory; %memit reports the memory cost
# of the (much larger) train archive.
%memit train_dataset = get_dataset(hdf_train_path)
test_dataset = get_dataset(hdf_test_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 401059/401059 [00:55<00:00, 7188.85it/s]


peak memory: 2903.96 MiB, increment: 1445.95 MiB


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 408.58it/s]

CPU times: user 53.4 s, sys: 4.13 s, total: 57.6 s
Wall time: 57.5 s





In [5]:
# Create the train/test dataloaders; a smaller batch is used on a self-hosted instance.
batch_size = 2 ** 8 if OWN_INSTANCE else 2 ** 9

# shuffle=False keeps the batches in dataset order, which combine_vit_features
# relies on when it writes features row by row.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Drop any model left over from a previous run of this cell before loading a new one.
model = None
gc.collect()
torch.cuda.empty_cache()

# Use the GPU when one is present, otherwise fall back to the CPU instead of
# failing when the model is moved to a missing CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForImageClassification.from_pretrained(model_path)
model.to(device)

def combine_vit_features(dataloader, metadata):
    """Append the ViT class-1 probability and CLS embedding to ``metadata``.

    Parameters
    ----------
    dataloader : iterable of ``(inputs, batch_ids)``
        Image batches in the same row order as ``metadata``. Rows are matched
        by position only; ``batch_ids`` is used for the batch size.
    metadata : pandas.DataFrame
        One row per image.

    Returns
    -------
    pandas.DataFrame
        The metadata columns, then ``vit_target`` (softmax probability of
        class 1), then columns ``0 .. 767`` holding the first-token vector of
        the last hidden state.

    Raises
    ------
    ValueError
        If the dataloader does not yield exactly one sample per metadata row.

    Uses the module-level ``model`` and ``device``.
    """
    NUM_FEATURES = 768
    feat_cols = list(range(NUM_FEATURES))

    n_rows = metadata.shape[0]
    # Pre-allocate the output: metadata | vit_target | NUM_FEATURES embedding columns.
    table = np.hstack((metadata.values, np.zeros((n_rows, NUM_FEATURES + 1))))
    columns = metadata.columns.values.tolist()
    columns.append('vit_target')
    columns.extend(feat_cols)

    row_offset = 0

    model.eval()
    with torch.no_grad():
        for inputs, batch_ids in tqdm(dataloader, total = len(dataloader)):
            inputs = inputs.to(device)
            outputs = model(inputs, output_hidden_states = True)

            logits = outputs.logits.cpu().numpy()
            proba = softmax(logits, axis=1)[:, 1]

            # Keep the (batch, features) shape as is: squeezing would collapse a
            # batch of one sample to a 1-D vector and misplace its features.
            cls_features = outputs.hidden_states[-1][:, 0, :].cpu().numpy()

            batch_rows = slice(row_offset, row_offset + len(batch_ids))
            table[batch_rows, -NUM_FEATURES - 1] = proba
            table[batch_rows, -NUM_FEATURES:] = cls_features

            row_offset += len(batch_ids)

    if row_offset != n_rows:
        raise ValueError(
            f"dataloader yielded {row_offset} samples but metadata has {n_rows} rows"
        )

    return pd.DataFrame(table, columns = columns)

# Replace the metadata-only frames with metadata + ViT features. Both names are
# rebound, so re-running this cell without re-running the metadata cells would
# append the ViT columns a second time.
%memit test = combine_vit_features(test_dataloader, test)
print(test)
%memit train = combine_vit_features(train_dataloader, train)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]


peak memory: 3115.38 MiB, increment: 65.44 MiB
   patient_id  age_approx  sex  anatom_site_general  clin_size_long_diam_mm  \
0         NaN        45.0  1.0                  3.0                    2.70   
1         NaN        35.0  2.0                  1.0                    2.52   
2         NaN        65.0  1.0                  3.0                    3.16   

   image_type  tbp_tile_type  tbp_lv_A  tbp_lv_Aext  tbp_lv_B  tbp_lv_Bext  \
0         1.0            2.0  22.80433    20.007270  28.38412    27.043640   
1         1.0            2.0  16.64867     9.657964  31.31752    27.524318   
2         1.0            2.0  24.25384    19.937380  30.46368    28.384240   

    tbp_lv_C  tbp_lv_Cext   tbp_lv_H  tbp_lv_Hext  tbp_lv_L  tbp_lv_Lext  \
0  36.410100    33.640000  51.220960    53.505430  24.97985    31.114600   
1  35.467806    29.169579  62.004494    70.664619  59.90409    68.141071   
2  38.939500    34.686660  51.474730    54.915410  35.81945    41.358640   

   tbp_lv_areaMM2 

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1567/1567 [24:04<00:00,  1.09it/s]


peak memory: 6137.11 MiB, increment: 3023.30 MiB


In [6]:
# Drop the reference to the ViT model now that the features are extracted.
model = None
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()

# Heap snapshot to see which objects hold the remaining memory.
from guppy import hpy; h=hpy()
print(h.heap())

# Sanity check on the combined tables.
print(len(train))
print(test)

Partition of a set of 2480314 objects. Total size = 6564178064 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0      5   0 3734255479  57 3734255479  57 pandas.core.frame.DataFrame
     1 401062  16 2289308988  35 6023564467  92 numpy.bytes_
     2    490   0 279287074   4 6302851541  96 numpy.ndarray
     3 867003  35 94244943   1 6397096484  97 str
     4 387400  16 29354192   0 6426450676  98 tuple
     5 116637   5 20813393   0 6447264069  98 types.CodeType
     6 179690   7 16433999   0 6463698068  98 bytes
     7 102616   4 14776704   0 6478474772  99 function
     8  49915   2 13150952   0 6491625724  99 dict (no owner)
     9  11428   0 11107488   0 6502733212  99 type
<3237 more rows. Type e.g. '_.more' to view.>
401059
   patient_id  age_approx  sex  anatom_site_general  clin_size_long_diam_mm  \
0         NaN        45.0  1.0                  3.0                    2.70   
1         NaN        35.0  2.0                  1.0             

# Gradient boosting preparations

In [7]:
# Separate the label from the feature matrix used by the boosting models.
y = train['target']
X = train.drop(columns=['target'])

def pauc_above_tpr(solution, submission, min_tpr: float=0.80):
    """Partial AUC score computed from flipped labels and flipped scores.

    Labels and scores are both flipped (``|label - 1|`` and ``1 - score``), so
    the ``min_tpr`` threshold becomes a ``max_fpr = |1 - min_tpr|`` bound for
    ``roc_auc_score``. The value returned by ``roc_auc_score`` is then rescaled
    so that the result lies between ``0.5 * max_fpr**2`` and ``max_fpr``.

    Parameters
    ----------
    solution : array-like
        Ground-truth binary labels.
    submission : iterable of float
        Predicted scores for the positive class.
    min_tpr : float
        True-positive-rate threshold.
    """
    flipped_truth = abs(np.asarray(solution) - 1)
    flipped_scores = 1.0 - np.asarray(list(submission))
    max_fpr = abs(1 - min_tpr)

    scaled_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    lower_bound = 0.5 * max_fpr**2
    return lower_bound + (max_fpr - lower_bound) / (1.0 - 0.5) * (scaled_auc - 0.5)

def Train_ML(model_factory, X, y, test_data):
    """Train one model per stratified fold and predict ``test_data`` with each.

    Parameters
    ----------
    model_factory : callable
        Zero-argument callable returning a fresh, unfitted classifier that
        exposes ``fit`` and ``predict_proba``.
    X : pandas.DataFrame
        Training features.
    y : pandas.Series or array-like
        Binary training labels, in the same row order as ``X``.
    test_data : pandas.DataFrame
        Features to predict with every fold model.

    Returns
    -------
    tuple
        ``(model, test_predictions, models)``: the model of the last fold, the
        list of per-fold class-1 probabilities for ``test_data``, and the list
        of all fold models.

    Uses the module-level ``n_splits`` and ``SEED`` and prints the per-fold and
    mean pAUC on the train and validation parts.
    """
    # k-fold cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    train_scores = []
    val_scores = []
    test_predictions = []
    models = []

    def take(labels, positions):
        # The splitter yields positions, not index labels, so select
        # positionally whenever the container distinguishes the two.
        return labels.iloc[positions] if hasattr(labels, "iloc") else labels[positions]

    model = None
    for fold, (train_index, val_index) in enumerate(tqdm(skf.split(X, y), total=n_splits), 1):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = take(y, train_index), take(y, val_index)

        model = model_factory()
        model.fit(X_train, y_train)

        # record performance on the training and validation parts
        y_train_pred_proba = model.predict_proba(X_train)[:, 1]
        train_pauc = pauc_above_tpr(y_train, y_train_pred_proba, min_tpr=0.8)
        train_scores.append(train_pauc)

        y_val_pred_proba = model.predict_proba(X_val)[:, 1]
        val_pauc = pauc_above_tpr(y_val, y_val_pred_proba, min_tpr=0.8)
        val_scores.append(val_pauc)

        # Predict the frame passed by the caller, not a global of a similar name.
        y_test_pred_proba = model.predict_proba(test_data)[:, 1]
        test_predictions.append(y_test_pred_proba)

        models.append(model)

        print(f"Fold {fold}: Train pAUC = {train_pauc:.4f}, Validation pAUC = {val_pauc:.4f}")

    # mean pAUC over the fold models
    mean_train_pauc = np.mean(train_scores)
    mean_val_pauc = np.mean(val_scores)

    print(f"\nMean Train pAUC: {mean_train_pauc:.4f}")
    print(f"Mean Validation pAUC: {mean_val_pauc:.4f}")

    # `model` (last fold only) is returned first to keep the existing call sites working.
    return model, test_predictions, models

# LightGBM

In [8]:
%%time

def lgbm_factory():
    """Return a fresh, unfitted LGBMClassifier with the notebook's fixed hyperparameters."""
    settings = dict(
        objective='binary',
        boosting_type='gbdt',
        n_estimators=1010,
        learning_rate=0.05714390301637632,
        max_depth=4,
        colsample_bytree=0.6852015051268027,
        # NOTE(review): row subsampling in LightGBM also needs a non-zero
        # subsample_freq / bagging_freq, which is not set here — confirm this has any effect.
        subsample=0.13326633837138008,
        lambda_l1=1.4445754309498806e-08,
        lambda_l2=0.11031259304642657,
        # NOTE(review): reg_alpha / reg_lambda look like aliases of lambda_l1 /
        # lambda_l2 above, so two values are supplied for each — confirm which one is applied.
        reg_alpha=0.1,
        reg_lambda=0.8,
        extra_tree=True,
        max_bin=250,
        verbose=-1,
        random_state=SEED,
    )
    return LGBMClassifier(**settings)

# Cross-validate LightGBM. test_preds / all_models hold the per-fold LightGBM
# test predictions and fitted models used by the ensemble and save cells.
train_lgb, test_preds, all_models = Train_ML(lgbm_factory, X, y, test)


 33%|████████████████████████████████████████▋                                                                                 | 1/3 [01:14<02:29, 74.65s/it]

Fold 1: Train pAUC = 0.2000, Validation pAUC = 0.1769


 67%|█████████████████████████████████████████████████████████████████████████████████▎                                        | 2/3 [02:27<01:13, 73.58s/it]

Fold 2: Train pAUC = 0.2000, Validation pAUC = 0.1724


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:40<00:00, 73.55s/it]

Fold 3: Train pAUC = 0.2000, Validation pAUC = 0.1729

Mean Train pAUC: 0.2000
Mean Validation pAUC: 0.1741
CPU times: user 51min 37s, sys: 8.49 s, total: 51min 46s
Wall time: 3min 40s





# CatBoost

In [9]:
%%time

def cat_factory():
    """Return a fresh, unfitted CatBoostClassifier with the notebook's fixed hyperparameters."""
    settings = dict(
        objective='Logloss',
        boosting_type='Plain',
        bootstrap_type='Bernoulli',
        iterations=1000,
        learning_rate=0.01,
        depth=9,
        colsample_bylevel=0.08656159895289164,
        subsample=0.46623542352578917,
        verbose=0,
        random_state=SEED,
    )
    return CatBoostClassifier(**settings)

# Cross-validate CatBoost (memory-profiled); keeps the per-fold test predictions and models.
%memit train_cat, cat_test_preds , Cat_all_models = Train_ML(cat_factory, X, y, test)


 33%|████████████████████████████████████████▎                                                                                | 1/3 [01:58<03:56, 118.15s/it]

Fold 1: Train pAUC = 0.1933, Validation pAUC = 0.1802


 67%|████████████████████████████████████████████████████████████████████████████████▋                                        | 2/3 [03:53<01:56, 116.39s/it]

Fold 2: Train pAUC = 0.1957, Validation pAUC = 0.1724


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:56<00:00, 118.94s/it]

Fold 3: Train pAUC = 0.1933, Validation pAUC = 0.1766

Mean Train pAUC: 0.1941
Mean Validation pAUC: 0.1764
CPU times: user 39min 8s, sys: 19min 21s, total: 58min 30s
Wall time: 5min 57s





# XGB

In [10]:
%%time

def xgb_factory():
    """Return a fresh, unfitted XGBClassifier with the notebook's fixed hyperparameters."""
    hyperparams = dict(
        objective='binary:logistic',
        n_estimators=1227,
        learning_rate=0.009393224320850784,
        max_depth=4,
        colsample_bytree=0.11756728710020253,
        subsample=0.9589462514195692,
    )
    # `lambda` is a Python keyword, so the two regularisation terms are set by key.
    hyperparams['lambda'] = 0.34216652262461505
    hyperparams['alpha'] = 1.150597512455824e-07

    return XGBClassifier(random_state=SEED, **hyperparams)

# Cross-validate XGBoost (memory-profiled); keeps the per-fold test predictions and models.
%memit train_xgb, xgb_test_preds , xgb_all_models = Train_ML(xgb_factory, X, y, test)


 33%|████████████████████████████████████████▎                                                                                | 1/3 [01:45<03:30, 105.23s/it]

Fold 1: Train pAUC = 0.1999, Validation pAUC = 0.1811


 67%|████████████████████████████████████████████████████████████████████████████████▋                                        | 2/3 [03:24<01:41, 101.79s/it]

Fold 2: Train pAUC = 0.1999, Validation pAUC = 0.1722


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:03<00:00, 101.26s/it]

Fold 3: Train pAUC = 0.1999, Validation pAUC = 0.1790

Mean Train pAUC: 0.1999
Mean Validation pAUC: 0.1774
CPU times: user 1h 13min 49s, sys: 8.36 s, total: 1h 13min 58s
Wall time: 5min 4s





# Test

In [11]:
%%time

if OWN_INSTANCE:
    sample_file = 'data/sample_submission.csv'
else:
    sample_file = '/kaggle/input/isic-2024-challenge/sample_submission.csv'

Sample = pd.read_csv(sample_file)

# Average the fold predictions within each framework, then average the three
# frameworks with equal weight.
lgb_test, cat_test, xgb_test = (
    np.mean(fold_preds, axis=0)
    for fold_preds in (test_preds, cat_test_preds, xgb_test_preds)
)
ensemble_preds = (lgb_test + cat_test + xgb_test) / 3

# NOTE(review): the ids come from the sample submission while the predictions
# follow the row order of `test` — presumably both files list images in the
# same order; confirm.
sub = pd.DataFrame({'isic_id': Sample['isic_id'], 'target': ensemble_preds})

sub.to_csv('submission.csv', index=False)
sub.head()

CPU times: user 3.17 ms, sys: 3.43 ms, total: 6.6 ms
Wall time: 5.11 ms


Unnamed: 0,isic_id,target
0,ISIC_0015657,2.5e-05
1,ISIC_0015729,1.2e-05
2,ISIC_0015740,1.8e-05


# Save models

In [13]:
def dump_models(framework, models):
    """Write each model to ``gradboost/<framework>_<position>.joblib``.

    ``position`` is the zero-based place of the model in ``models``. The
    ``gradboost`` directory must already exist.
    """
    for position, fitted in enumerate(models):
        destination = f'gradboost/{framework}_{position}.joblib'
        joblib.dump(fitted, destination)

# Persist the per-fold models of each framework under gradboost/.
dump_models("lgbm", all_models)
dump_models("catboost", Cat_all_models)
dump_models("xgb", xgb_all_models)