In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold
from lightgbm import LGBMClassifier
import os

## Definition of path and downloading the data
As sometimes I run the notebook on kaggle and sometimes on my PC I must change the path.

In [4]:
#global variables
KAGGLE = 0
PATH = "/kaggle/input/isic-2024-challenge/" if KAGGLE ==1 else "input/"

In [5]:
df_train = pd.read_csv( os.path.join(PATH,"train-metadata.csv"))
df_test = pd.read_csv( os.path.join(PATH,"test-metadata.csv"))

  df_train = pd.read_csv( os.path.join(PATH,"train-metadata.csv"))


## Explonatory DF
I create a df with columns description to consult it if necessary inside the notebook.

In [6]:
data = {
    "field_name": [
        "target", "lesion_id", "iddx_full", "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5", 
        "mel_mitotic_index", "mel_thick_mm", "tbp_lv_dnn_lesion_confidence", 
        "isic_id", "patient_id", "age_approx", "sex", "anatom_site_general", 
        "clin_size_long_diam_mm", "image_type", "tbp_tile_type", "tbp_lv_A", "tbp_lv_Aext", 
        "tbp_lv_B", "tbp_lv_Bext", "tbp_lv_C", "tbp_lv_Cext", "tbp_lv_H", "tbp_lv_Hext", 
        "tbp_lv_L", "tbp_lv_Lext", "tbp_lv_areaMM2", "tbp_lv_area_perim_ratio", 
        "tbp_lv_color_std_mean", "tbp_lv_deltaA", "tbp_lv_deltaB", "tbp_lv_deltaL", 
        "tbp_lv_deltaLBnorm", "tbp_lv_eccentricity", "tbp_lv_location", 
        "tbp_lv_location_simple", "tbp_lv_minorAxisMM", "tbp_lv_nevi_confidence", 
        "tbp_lv_norm_border", "tbp_lv_norm_color", "tbp_lv_perimeterMM", 
        "tbp_lv_radial_color_std_max", "tbp_lv_stdL", "tbp_lv_stdLExt", 
        "tbp_lv_symm_2axis", "tbp_lv_symm_2axis_angle", "tbp_lv_x", "tbp_lv_y", "tbp_lv_z", 
        "attribution", "copyright_license"
    ],
    "description": [
        "Binary class {0: benign, 1: malignant}.", 
        "Unique lesion identifier. Present in lesions that were manually tagged as a lesion of interest.", 
        "Fully classified lesion diagnosis.", 
        "First level lesion diagnosis.", 
        "Second level lesion diagnosis.", 
        "Third level lesion diagnosis.", 
        "Fourth level lesion diagnosis.", 
        "Fifth level lesion diagnosis.", 
        "Mitotic index of invasive malignant melanomas.", 
        "Thickness in depth of melanoma invasion.", 
        "Lesion confidence score (0-100 scale).", 
        "Unique case identifier.", 
        "Unique patient identifier.", 
        "Approximate age of patient at time of imaging.", 
        "Sex of the person.", 
        "Location of the lesion on the patient's body.", 
        "Maximum diameter of the lesion (mm).", 
        "Structured field of the ISIC Archive for image type.", 
        "Lighting modality of the 3D TBP source image.", 
        "A inside lesion.", 
        "A outside lesion.", 
        "B inside lesion.", 
        "B outside lesion.", 
        "Chroma inside lesion.", 
        "Chroma outside lesion.", 
        "Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).", 
        "Hue outside lesion.", 
        "L inside lesion.", 
        "L outside lesion.", 
        "Area of lesion (mm^2).", 
        "Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.", 
        "Color irregularity, calculated as the variance of colors within the lesion's boundary.", 
        "Average A contrast (inside vs. outside lesion).", 
        "Average B contrast (inside vs. outside lesion).", 
        "Average L contrast (inside vs. outside lesion).", 
        "Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.", 
        "Eccentricity.", 
        "Classification of anatomical location, divides arms & legs to upper & lower; torso into thirds.", 
        "Classification of anatomical location, simple.", 
        "Smallest lesion diameter (mm).", 
        "Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.", 
        "Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.", 
        "Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.", 
        "Perimeter of lesion (mm).", 
        "Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.", 
        "Standard deviation of L inside lesion.", 
        "Standard deviation of L outside lesion.", 
        "Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.", 
        "Lesion border asymmetry angle.", 
        "X-coordinate of the lesion on 3D TBP.", 
        "Y-coordinate of the lesion on 3D TBP.", 
        "Z-coordinate of the lesion on 3D TBP.", 
        "Image attribution, synonymous with image source.", 
        "Copyright license."
    ],
    "train_only": [
        True, True, True, True, True, True, True, True, True, True, True,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False
    ]
}

df_explained = pd.DataFrame(data)


In [8]:
pd.set_option('display.max_colwidth', None)
print(df_explained['description'][df_explained['field_name']=="tbp_lv_color_std_mean"])

31    Color irregularity, calculated as the variance of colors within the lesion's boundary.
Name: description, dtype: object


In [9]:
#creating a variable with columns that appear both in test and train
test_and_target = list(df_explained.columns[df['train_only']==False])
test_and_target.append('target')
print(test_and_target)
test_and_target=["target", 
        "isic_id", "patient_id", "age_approx", "sex", "anatom_site_general", 
        "clin_size_long_diam_mm", "image_type", "tbp_tile_type", "tbp_lv_A", "tbp_lv_Aext", 
        "tbp_lv_B", "tbp_lv_Bext", "tbp_lv_C", "tbp_lv_Cext", "tbp_lv_H", "tbp_lv_Hext", 
        "tbp_lv_L", "tbp_lv_Lext", "tbp_lv_areaMM2", "tbp_lv_area_perim_ratio", 
        "tbp_lv_color_std_mean", "tbp_lv_deltaA", "tbp_lv_deltaB", "tbp_lv_deltaL", 
        "tbp_lv_deltaLBnorm", "tbp_lv_eccentricity", "tbp_lv_location", 
        "tbp_lv_location_simple", "tbp_lv_minorAxisMM", "tbp_lv_nevi_confidence", 
        "tbp_lv_norm_border", "tbp_lv_norm_color", "tbp_lv_perimeterMM", 
        "tbp_lv_radial_color_std_max", "tbp_lv_stdL", "tbp_lv_stdLExt", 
        "tbp_lv_symm_2axis", "tbp_lv_symm_2axis_angle", "tbp_lv_x", "tbp_lv_y", "tbp_lv_z", 
        "attribution", "copyright_license"
    ]

In [34]:
#creating a df with columns that appear both in train and test
df_train_filtered=df_train[test_and_target]
categorical_columns = df_train_filtered.select_dtypes(include=['object', 'category']).columns
numerical_columns = df_train_filtered.select_dtypes(include=['number']).columns

## EDA


In [11]:
# checking columns with zero values as they can be really null values
def value_counts (df, value):
    value_columns = df.columns[(df == value).any()]
    print(value_columns)

    # Count number of 0s in each column that has at least one 0
    value_counts = df[value_columns].eq(0).sum()

    # Print the number of 0s in each relevant column
    print(f"Number of {value} in each column that has at least one {value}:{value_counts}")

value_counts(df_train_filtered,0)
 

Index(['target', 'tbp_lv_color_std_mean', 'tbp_lv_nevi_confidence',
       'tbp_lv_norm_color', 'tbp_lv_radial_color_std_max',
       'tbp_lv_symm_2axis_angle'],
      dtype='object')
Number of 0 in each column that has at least one 0:target                         400666
tbp_lv_color_std_mean           26187
tbp_lv_nevi_confidence              6
tbp_lv_norm_color               26182
tbp_lv_radial_color_std_max     29733
tbp_lv_symm_2axis_angle         18562
dtype: int64


In [31]:
# Calculate mean for numerical columns grouped by 'target'
means_by_target = df_train_filtered[numerical_columns].groupby(df_train_filtered['target']).mean()
print(means_by_target)

# Check the number of different values in cat columns to see which cat columns can be useful
for c in categorical_columns:
    num_dif = df_train_filtered[c].nunique()
    if num_dif < 30:
        print (f'{c}: {num_dif} different values : {df_train_filtered[c].unique()}')
    else:
        print (f'{c}: {num_dif} different values')             
    

Categorical columns: Index(['isic_id', 'patient_id', 'sex', 'anatom_site_general', 'image_type',
       'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple',
       'attribution', 'copyright_license'],
      dtype='object')
Numerical columns: Index(['target', 'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A',
       'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext',
       'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2',
       'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA',
       'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence',
       'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z'],
      dtype='object')
        target  age_approx  clin_size_long_d

In [10]:
# Check distribution of one cat variable for target 0 and 1
place_by_target = df_train_filtered.groupby(['anatom_site_general', 'target']).size().unstack(fill_value=0)
place_totals = place_by_target.sum(axis=1)
place_by_target['percentage_1'] = (place_by_target[1] / place_totals) * 100
print(place_by_target)

target                    0    1  percentage_1
anatom_site_general                           
anterior torso        87688   82      0.093426
head/neck             11968   78      0.647518
lower extremity      102955   73      0.070855
posterior torso      121799  103      0.084494
upper extremity       70500   57      0.080786


## Preprocessing

In [35]:
df_train_filtered = df_train_filtered[df_train_filtered['anatom_site_general'].notna()]
df_train_filtered.reset_index(drop=True, inplace=True)

In [37]:
def feature_engineering(df):
    # Taken from https://www.kaggle.com/code/snnclsr/tabular-ensemble-lgbm-catboost
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2) 
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"] 
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]    
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df_train["tbp_lv_y"], df_train["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4
    df["color_variance_ratio"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"] = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
    df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / df["age_approx"]
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    df["border_length_ratio"] = df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]   
    return df

In [90]:
def feature_engineering_process(df, num_columns, medians):
     # Handle numerical columns missing values
    for nc in num_columns:
        df[nc] = df[nc].fillna(medians.get(nc, 0))  
    # Replacement for 0 to na to avoid incorrect calulation in feature engineering as 0 really mean missing value
    replace_cols = ['tbp_lv_color_std_mean', 'tbp_lv_norm_color', 'tbp_lv_radial_color_std_max']
    for col in replace_cols:
        if col in df.columns:
            df[col] = df[col].replace(0, np.nan)
    #Feature engineering
    df_num = df[num_columns]
    df_num = feature_engineering(df_num)   
    numerical_columns_list = list(df_num.columns)    
    
    return df_num, numerical_columns_list

In [91]:
class Preprocessor:
    def __init__(self):
        self.medians = {}
        self.one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        self.scaler = StandardScaler()
        self.encoded_cols = []
        self.numerical_columns = []
        self.cat_cols = ["sex", "anatom_site_general"]       

    def fit(self, df):
        # Check if input is a numpy array and convert to DataFrame if necessary
        if isinstance(df, np.ndarray):
            df = pd.DataFrame(df)
        
        # Determine numerical columns
        self.numerical_columns = list(df.select_dtypes(include=['number']).columns)       
        if "target" in self.numerical_columns:
            self.numerical_columns.remove("target") 
        
        # Compute medians for numerical columns
        num_medians = df[self.numerical_columns].median()
        self.medians = dict(num_medians)

        # Compute median for angle
        if 'tbp_lv_symm_2axis_angle' in df.columns:
            self.medians['tbp_lv_symm_2axis_angle'] = df['tbp_lv_symm_2axis_angle'].median()
        
        # Fit OneHotEncoder
        self.one_hot_encoder.fit(df[self.cat_cols])
        self.encoded_cols = self.one_hot_encoder.get_feature_names_out(self.cat_cols)                 
        
        # I have to do feature engineering process in fit as I nedd the result to fit the scaler
        #Feature engineering
        df, numerical_columns_list = feature_engineering_process (df,self.numerical_columns,self.medians)      
        
        # Scale numerical columns           
        self.scaler.fit(df[numerical_columns_list])

        return self

    def transform(self, df):
        # Handle categorical columns missing values
        df['sex'] = df['sex'].fillna('male')
        df[self.cat_cols] = df[self.cat_cols].fillna('other')            
        
        # Specific replacement for 'tbp_lv_symm_2axis_angle' as 0 mean really missing value
        if 'tbp_lv_symm_2axis_angle' in df.columns:
            df['tbp_lv_symm_2axis_angle'] = df['tbp_lv_symm_2axis_angle'].replace(0, self.medians['tbp_lv_symm_2axis_angle'])        
       
        # Apply one-hot encoding
        X_cat = self.one_hot_encoder.transform(df[self.cat_cols])
        df_encoded = pd.DataFrame(X_cat, columns=self.encoded_cols)
        
        #Feature engineering
        df_num, numerical_columns_list = feature_engineering_process (df,self.numerical_columns,self.medians)          
        
        # Scale numerical columns       
        df_num[numerical_columns_list] = self.scaler.transform(df_num[numerical_columns_list])
        
        # Combine encoded and scaled data  
        df_final = pd.concat([df_num, df_encoded], axis=1)
        
        # Add id columns and target back if they exist
        if "isic_id" in df.columns and "patient_id" in df.columns:
            ids = df[["isic_id", "patient_id"]]
            df_final = pd.concat([df_final, ids], axis=1)
        if "target" in df.columns:
            targets = df[["target"]]
            df_final = pd.concat([df_final, targets], axis=1)
        
        return df_final
    

In [92]:
preprocessor = Preprocessor()
preprocessor.fit(df_train_filtered)
df_train_final = preprocessor.transform(df_train_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["hue_contrast"] = (df["tbp_lv_H"] - df

In [93]:
len(df_train_final.columns)

80

## Training

### Competition metric

In [96]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    v_gt = abs(np.asarray(solution.values)-1)
    v_pred = np.array([1.0 - x for x in submission.values])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

### CV validation and shuffling

In [98]:
N_SPLITS = 10
gkf = GroupKFold(n_splits=N_SPLITS)

df_train_final = df_train_final.sample(frac=1).reset_index(drop=True)
df_train_final["fold"] = -1
for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train_final, df_train_final["target"], groups=df_train_final["patient_id"])):
    df_train_final.loc[val_idx, "fold"] = idx
df_train_final

Unnamed: 0,age_approx,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,...,sex_other,anatom_site_general_anterior torso,anatom_site_general_head/neck,anatom_site_general_lower extremity,anatom_site_general_posterior torso,anatom_site_general_upper extremity,isic_id,patient_id,target,fold
0,1.995929,-0.252675,-1.111864,-1.525065,0.410464,0.240092,-0.107212,-0.280735,1.508285,1.805312,...,0.0,1.0,0.0,0.0,0.0,0.0,ISIC_2190126,IP_9726832,0,3
1,1.626848,-0.706192,-0.111441,-0.035609,-0.216502,-0.533490,-0.236793,-0.473204,-0.070983,-0.371731,...,0.0,0.0,0.0,1.0,0.0,0.0,ISIC_2146527,IP_5143034,0,7
2,0.519606,-0.838228,-0.374813,-1.323398,2.776886,2.545639,2.093720,1.812540,2.185408,2.488234,...,0.0,0.0,0.0,0.0,1.0,0.0,ISIC_0775151,IP_9103159,0,7
3,0.888687,-0.809525,0.398180,0.679849,0.899092,0.623714,0.816109,0.718582,0.398045,-0.219368,...,0.0,1.0,0.0,0.0,0.0,0.0,ISIC_2484964,IP_9745479,0,1
4,1.626848,-0.413415,1.270533,1.081792,0.053574,0.228276,0.563036,0.563115,-1.071243,-0.859295,...,0.0,0.0,0.0,1.0,0.0,0.0,ISIC_2673643,IP_1045154,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395298,-0.218555,-0.321564,0.765632,0.419500,0.290336,-0.016208,0.506748,0.110414,-0.425619,-0.429301,...,0.0,0.0,0.0,0.0,1.0,0.0,ISIC_3623037,IP_3927284,0,2
395299,-1.694878,1.400649,0.330752,-1.288558,0.482597,-0.292302,0.470141,-0.683799,0.127802,1.231513,...,0.0,1.0,0.0,0.0,0.0,0.0,ISIC_0513111,IP_1480068,0,3
395300,-0.587636,-0.476563,-0.537608,-0.763755,-0.223299,-0.062814,-0.411518,-0.335657,0.365212,0.770549,...,0.0,0.0,0.0,0.0,0.0,1.0,ISIC_3754950,IP_6832954,0,5
395301,-0.218555,1.061948,-0.951427,-0.904552,-1.220572,-1.281168,-1.333924,-1.392979,-0.210383,0.003897,...,0.0,0.0,0.0,1.0,0.0,0.0,ISIC_3826317,IP_9496166,0,2


### training

In [107]:
lgb_params = {
    'objective': 'binary',
    # "random_state": 42,
    "n_estimators": 1500,
    'learning_rate': 0.001,
    'bagging_freq': 1,
    'pos_bagging_fraction': 0.75,
    'neg_bagging_fraction': 0.05,
    'feature_fraction': 0.6,
    'lambda_l1': 0.2,
    'lambda_l2': 0.7,
    'num_leaves': 35,
    "min_data_in_leaf": 50,
    "verbosity": -1,
    # "device": "gpu"
    # "extra_trees": True
}

lgb_scores = []
lgb_models = []
train_cols = list(df_train_final.columns)
for c in ["fold", "target","isic_id", "patient_id"]:
    train_cols.remove(c)
for fold in range(N_SPLITS):
    _df_train = df_train_final[df_train_final["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train_final[df_train_final["fold"] == fold].reset_index(drop=True)
    model = LGBMClassifier(**lgb_params)    
    # model = VotingClassifier([(f"lgb_{i}", lgb.LGBMClassifier(random_state=i, **lgb_params)) for i in range(1)], voting="soft")
    model.fit(_df_train[train_cols], _df_train["target"])
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    lgb_scores.append(score)
    lgb_models.append(model)

fold: 0 - Partial AUC Score: 0.17948
fold: 1 - Partial AUC Score: 0.15543
fold: 2 - Partial AUC Score: 0.16172
fold: 3 - Partial AUC Score: 0.16401
fold: 4 - Partial AUC Score: 0.13093
fold: 5 - Partial AUC Score: 0.18273
fold: 6 - Partial AUC Score: 0.14980
fold: 7 - Partial AUC Score: 0.17018
fold: 8 - Partial AUC Score: 0.09771
fold: 9 - Partial AUC Score: 0.16992


In [108]:
lgbm_score = np.mean(lgb_scores)
print(f"LGBM Score: {lgbm_score:.5f}")

LGBM Score: 0.15619


### Prediction

In [115]:
df_test_final = preprocessor.transform(df_test)
test_cols = list(df_test_final.columns)
for c in ["isic_id", "patient_id"]:
    test_cols.remove(c)
print(df_test_final[test_cols])
lgb_preds =np.mean([model.predict_proba(df_test_final[test_cols])[:, 1] for model in lgb_models],axis=0)
lgb_preds 

   age_approx  clin_size_long_diam_mm  tbp_lv_A  tbp_lv_Aext  tbp_lv_B  \
0   -0.956716               -0.706192  0.711846     1.452077  0.024677   
1   -1.694878               -0.809525 -0.829749    -1.492132  0.582124   
2    0.519606               -0.442119  1.074854     1.432195  0.419865   

   tbp_lv_Bext  tbp_lv_C  tbp_lv_Cext  tbp_lv_H  tbp_lv_Hext  ...  \
0     0.036119  0.290498     0.573242 -0.621119    -1.331326  ...   
1     0.143826  0.124961    -0.357369  1.334613     1.718607  ...   
2     0.336511  0.734847     0.791126 -0.575095    -1.080711  ...   

   border_length_ratio  age_size_symmetry_index  sex_female  sex_male  \
0             0.812669                -0.246080         0.0       1.0   
1             0.040993                -0.626517         1.0       0.0   
2             0.144205                 0.066274         0.0       1.0   

   sex_other  anatom_site_general_anterior torso  \
0        0.0                                 0.0   
1        0.0                 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["hue_contrast"] = (df["tbp_lv_H"] - df

array([0.00141939, 0.00070982, 0.00290841])

### Submission

In [117]:
df_sub = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
df_sub["target"] = lgb_preds
df_sub.to_csv("submission.csv", index=False)
df_sub

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.001419
1,ISIC_0015729,0.00071
2,ISIC_0015740,0.002908
