# Skin cancer detection

Loading of torcheval library

In [1]:
!pip install torcheval --no-index --find-links=file:///kaggle/input/torcheval/torcheval

Looking in links: file:///kaggle/input/torcheval/torcheval
Processing /kaggle/input/torcheval/torcheval/torcheval-0.0.7-py3-none-any.whl
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


Loading of CV Model trained before on dataset of training

In [2]:
import numpy as np, pandas as pd, polars as pl
from colorama import Fore, Back, Style
import os
from sklearn.metrics import roc_curve, auc
import h5py
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb, catboost as cb, xgboost as xgb
import io
import albumentations as A
import torchvision.transforms as transform
from PIL import Image 
import gc
import cv2
import math
import copy
import time
import random
import glob
from matplotlib import pyplot as plt
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision
from torcheval.metrics.functional import binary_auroc
import joblib
from tqdm import tqdm
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold 
import timm
# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2
# For colored terminal text
from colorama import Fore, Back, Style
import warnings
import glob
import warnings

warnings.filterwarnings("ignore")
b_ = Fore.BLUE
sr_ = Style.RESET_ALL
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

ROOT_DIR = "/kaggle/input/isic-2024-challenge"
TEST_DIR = f'{ROOT_DIR}/test-image.hdf5'
test_images = sorted(glob.glob(f"{TEST_DIR}/*.jpg"))
df_test = pd.read_csv(f"{ROOT_DIR}/test-metadata.csv")

def get_filepath(image_id):
    return f"{TEST_DIR}/{image_id}.jpg"

class SkinCancerNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 5)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(16, 32, 5)
        self.bn2 = nn.BatchNorm2d(32)

        self.conv3 = nn.Conv2d(32, 64, 3)
        self.bn3 = nn.BatchNorm2d(64)

        self.conv4 = nn.Conv2d(64, 128, 3)
        self.bn4 = nn.BatchNorm2d(128)

        # Calculate the size of the feature map after the conv and pool layers
        # Input size: 128x128
        # After conv1: (128 - 5 + 1) = 124 -> 124x124
        # After pool1: 124 / 2 = 62 -> 62x62
        # After conv2: (62 - 5 + 1) = 58 -> 58x58
        # After pool2: 58 / 2 = 29 -> 29x29
        # After conv3: (29 - 3 + 1) = 27 -> 27x27
        # After pool3: 27 / 2 = 13.5 -> 13x13 (rounding down)
        # After conv4: (13 - 3 + 1) = 11 -> 11x11
        # After pool4: 11 / 2 = 5.5 -> 5x5 (rounding down)

        self.fc1 = nn.Linear(128 * 5 * 5, 512)
        self.dropout1 = nn.Dropout(0.5)

        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(0.5)

        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 1)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.pool(F.relu(self.bn4(self.conv4(x))))

        x = torch.flatten(x, 1)

        x = F.relu(self.fc1(x))
        x = self.dropout1(x)

        x = F.relu(self.fc2(x))
        x = self.dropout2(x)

        x = F.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))

        return x
    
    
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

model = SkinCancerNet()
model = model.to(device)

# Load the model weights
model.load_state_dict(torch.load('/kaggle/input/model/pytorch/default/1/my_model.pth'))
model.eval() 

class ISIC2024Dataset2(Dataset):
    def __init__(self, metadata: pd.DataFrame, ids_images: dict, test: bool=False,transform = None):
        self.metadata = metadata
        self.fp_hdf = h5py.File(TEST_DIR, mode="r")
        self.ids_images = ids_images
        self.test = test
        self.transform = transform

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, item):
        isic_row = self.metadata.iloc[item]
        isic_id = isic_row.isic_id
        image = Image.open(io.BytesIO(self.fp_hdf[isic_id][()]))
        if self.transform:
            image = self.transform(image=np.array(image))['image']
        if self.test:
            return image
        return image
    
    

ids = set(df_test["isic_id"])
def prepare_loaders2(df,ids):
    data_transform  = A.Compose([
        A.Resize(128, 128),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.)
    df = df.reset_index(drop=True)
    valid_dataset = ISIC2024Dataset2(df,ids,transform = data_transform )
    valid_loader = DataLoader(valid_dataset, batch_size=32, 
                             num_workers=2, shuffle=False, pin_memory=True)
    return valid_loader


valid_loader  = prepare_loaders2(df_test,ids)

  data = fetch_version_info()


Device: cuda:0


Prediction of the CV model on test data

In [3]:
all_probs = []
with torch.no_grad():
     for i, data in enumerate(tqdm(valid_loader)):
        inputs = data.to(device)
         # calculate outputs by running images through the network
        probs = model(inputs).flatten()
        all_probs.extend(probs.cpu().tolist())
         # the class with the highest energy is what we choose as prediction
        predicted = (probs >= 0.5).float()
        
df_test["pred"] = all_probs

100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


# Machine Learning

Feature engineering

In [4]:
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+ 
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

new_num_cols = [
    'lesion_size_ratio',                 # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_shape_index',                # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                      # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',                # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',           # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
    'border_complexity',                 # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',                  # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max

    'position_distance_3d',              # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',           # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',           # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',           # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',       # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',       # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',                 # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',                 # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',              # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',         # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',             # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',            # border_complexity       + lesion_shape_index
    'color_contrast_index',              # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',                   # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',            # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',               # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',                  # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',       # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',             # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',          # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',    # tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',        # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',              # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',          # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',
    'size_color_contrast_ratio',         # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',    # tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',
    'color_asymmetry_index',             # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',           # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                       # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',           # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',               # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',           # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',           # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']
norm_cols = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
special_cols = ['count_per_patient']
feature_cols = num_cols + new_num_cols + cat_cols + norm_cols + special_cols

def read_data(path):
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        .to_pandas()
        .set_index(id_col)
    )

Read of train data

In [5]:
err = 1e-5
id_col = 'isic_id'
root = Path('/kaggle/input/isic-2024-challenge')

train_path = root / 'train-metadata.csv'
test_path = root / 'test-metadata.csv'
subm_path = root / 'sample_submission.csv'

df_train = pd.read_csv("/kaggle/input/df-train/df_train.csv")
df_cleaned = read_data(train_path)
df_cleaned["pred"] = df_train["pred"].values

Preprocessing of train data

In [1]:
def preprocess(df_cleaned):
    mask = df_cleaned['anatom_site_general'].isna() & (df_cleaned['target'] != 1)
    df_cleaned = df_cleaned[~mask]

    mask = df_cleaned['sex'].isna() & (df_cleaned['target'] != 1)

    # Drop rows based on the mask
    df_cleaned = df_cleaned[~mask]

    mask = df_cleaned['age_approx'].isna() & (df_cleaned['target'] != 1)
    df_cleaned = df_cleaned[~mask]
    df_cleaned["age_approx"]= df_cleaned["age_approx"].apply(lambda x: df_cleaned["age_approx"].mean() if pd.isna(x) else x)
    df_cleaned["sex"] = df_cleaned["sex"].apply(lambda x: 1 if pd.isna(x) else x)
    df_cleaned["sex"] = df_cleaned["sex"].apply(lambda x:1 if x== "male" else 0)
    df_cleaned["sex"] = df_cleaned["sex"].fillna(1).astype(int)
    df_cleaned["tbp_tile_type"] = df_cleaned["tbp_tile_type"].apply(lambda x:1 if x== "3D: white" else 0).astype(float)
    text_mapping = {
        'posterior torso': 2,
        'lower extremity': 0,
        'anterior torso': 1,
        'upper extremity': 3,
        'head/neck': 4  # Assuming 0 if the count isn't provided or replace with actual count
    }
    df_cleaned['anatom_site_general'] = df_cleaned['anatom_site_general'].map(text_mapping)
    df_cleaned['anatom_site_general'] = df_cleaned['anatom_site_general'].astype(float)
    df_cleaned["sex"]= df_cleaned["sex"].astype(int)
    y = df_cleaned['target']

    del df_cleaned["tbp_lv_location_simple"]
    del df_cleaned["tbp_lv_location"]
    del df_cleaned["patient_id"] 
    del df_cleaned["iddx_full"]
    del df_cleaned["attribution"]
    del df_cleaned["iddx_1"]
    del df_cleaned["image_type"]
    del df_cleaned["mel_mitotic_index"] 
    del df_cleaned["target"]
    del df_cleaned["iddx_5"]
    del df_cleaned["iddx_4"]
    del df_cleaned["iddx_3"]
    del df_cleaned["iddx_2"]
    del df_cleaned["mel_thick_mm"] 
    del df_cleaned["lesion_id"] 
    del df_cleaned["copyright_license"]
    del df_cleaned["tbp_lv_dnn_lesion_confidence"]
    del df_cleaned["combined_anatomical_site"]
    df_cleaned.reset_index(drop=True, inplace=True)
    
    return df_cleaned

df_cleaned = preprocess(df_cleaned)

LGB model on tabular data

In [7]:
# split the dataset into the training set and test set

sampling_ratio = 0.01
X = df_cleaned
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,shuffle=True, random_state = 0)

lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           200,
    'boosting_type':    'gbdt',
    'random_state':     42,
    'lambda_l1':        0.08758718919397321, 
    'lambda_l2':        0.0039689175176025465, 
    'learning_rate':    0.03231007103195577, 
    'max_depth':        4, 
    'num_leaves':       103, 
    'colsample_bytree': 0.8329551585827726, 
    'colsample_bynode': 0.4025961355653304, 
    'bagging_fraction': 0.7738954452473223, 
    'bagging_freq':     4, 
    'min_data_in_leaf': 85, 
    'scale_pos_weight': 2.7984184778875543,
}

lgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=42)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=42)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

CB model on tabular data

In [8]:
sampling_ratio = 0.01
X = df_cleaned
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,shuffle=True, random_state = 0)

cb_params = {
    'loss_function':     'Logloss',
    'iterations':        250,
    'verbose':           False,
    'random_state':      42,
    'max_depth':         7, 
    'learning_rate':     0.06936242010150652, 
    'scale_pos_weight':  2.6149345838209532, 
    'l2_leaf_reg':       6.216113851699493, 
    'subsample':         0.6249261779711819, 
    'min_data_in_leaf':  24
}
cb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=42)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=42)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

Stacking (Aggregating of 2 "complex model")

In [9]:
estimator = VotingClassifier([
    ('lgb', lgb_model), ('cb', cb_model)             # ('xgb', xgb_model),
], voting='soft',weights=[0.5,0.50])

Fitting of models

In [10]:
lgb_model.fit(X_train, y_train)
cb_model.fit(X_train, y_train)
estimator.fit(X_train, y_train)

Feature importance LGB model

In [11]:
# Extract the fitted model from the pipeline
model = lgb_model.named_steps['classifier']

# Get feature importance scores
feature_importances = model.feature_importances_

# Create a DataFrame to hold feature names and their importance
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

Feature Importances:
                                           Feature  Importance
156                                           pred          96
155                              count_per_patient          82
147  age_normalized_nevi_confidence_2_patient_norm          82
79                         age_approx_patient_norm          75
71                age_normalized_nevi_confidence_2          69
..                                             ...         ...
131            shape_complexity_index_patient_norm           2
29                     tbp_lv_radial_color_std_max           2
27                               tbp_lv_norm_color           1
17                           tbp_lv_color_std_mean           1
1                                              sex           0

[157 rows x 2 columns]


Feature importance CB model

In [12]:
# Extract the fitted model from the pipeline
model = lgb_model.named_steps['classifier']

# Get feature importance scores
feature_importances = model.feature_importances_

# Create a DataFrame to hold feature names and their importance
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

Feature Importances:
                                           Feature  Importance
156                                           pred          96
155                              count_per_patient          82
147  age_normalized_nevi_confidence_2_patient_norm          82
79                         age_approx_patient_norm          75
71                age_normalized_nevi_confidence_2          69
..                                             ...         ...
131            shape_complexity_index_patient_norm           2
29                     tbp_lv_radial_color_std_max           2
27                               tbp_lv_norm_color           1
17                           tbp_lv_color_std_mean           1
1                                              sex           0

[157 rows x 2 columns]


Evaluation on df_train of model

In [13]:
y_pred_proba = estimator.predict_proba(X_test)[:, 1]
def score(solution: np.array, submission: np.array, min_tpr: float=0.80) -> float:

    # rescale the target. set 0s to 1s and 1s to 0s (since sklearn only has max_fpr)
    v_gt = abs(solution-1)
    v_pred = -1.0 * submission

    max_fpr = abs(1-min_tpr)

    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=None)
    if max_fpr is None or max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)
        
    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)
    
    return partial_auc

#print(f"Accuracy of the model: {100 * correct // total} %")
print(f"AUC of the model: {score(np.array(y_test), np.array(y_pred_proba), min_tpr=0.0)}")
print(f"pAUC-TPR(0.8) of the model: {score(np.array(y_test), np.array(y_pred_proba))}")

AUC of the model: 0.9628906904530173
pAUC-TPR(0.8) of the model: 0.16886259033073434


# Submission

Read of test data

In [14]:
err = 1e-5
id_col = 'isic_id'
root = Path('/kaggle/input/isic-2024-challenge')

train_path = root / 'train-metadata.csv'
test_path = root / 'test-metadata.csv'

df_sub = read_data(test_path)
df_sub["pred"] = df_test["pred"].values

Preprocessing of test data

In [15]:
del df_sub["tbp_lv_location_simple"]
del df_sub["tbp_lv_location"]
del df_sub["patient_id"] 
del df_sub["attribution"]
del df_sub["image_type"]
del df_sub["combined_anatomical_site"]
del df_sub["copyright_license"]
df_sub.reset_index(drop=True, inplace=True)
df_sub["age_approx"]= df_cleaned2["age_approx"].apply(lambda x: df_cleaned["age_approx"].mean() if pd.isna(x) else x)
df_sub["sex"] = df_cleaned2["sex"].apply(lambda x: 1 if pd.isna(x) else x)
df_sub["sex"] = df_cleaned2["sex"].apply(lambda x:1 if x== "male" else 0)
df_sub["tbp_tile_type"] = df_cleaned2["tbp_tile_type"].apply(lambda x:1 if x== "3D: white" else 0).astype(float)
text_mapping = {
    'posterior torso': 2,
    'lower extremity': 0,
    'anterior torso': 1,
    'upper extremity': 3,
    'head/neck': 4
}
df_sub['anatom_site_general'] = df_sub['anatom_site_general'].map(text_mapping).astype(float)
df_sub["sex"]= df_sub["sex"].astype(int)

Prediction on test data

In [16]:
y_pred_proba = estimator.predict_proba(df_sub)
y_pred_proba = y_pred_proba[:,1]
submission = pd.DataFrame()
submission["isic_id"] = df_test["isic_id"]
submission["target"] = y_pred_proba
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.26324
1,ISIC_0015729,0.341653
2,ISIC_0015740,0.511797
