In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, PowerTransformer
import catboost as cb
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

dtype_dict = {
    'isic_id': 'object',
    'target': 'int64',
    'patient_id': 'object',
    'age_approx': 'float64',
    'sex': 'string',
    'anatom_site_general': 'string',
    'clin_size_long_diam_mm': 'float64',
    'image_type': 'object',
    'tbp_tile_type': 'string',
    'tbp_lv_A': 'float64',
    'tbp_lv_Aext': 'float64',
    'tbp_lv_B': 'float64',
    'tbp_lv_Bext': 'float64',
    'tbp_lv_C': 'float64',
    'tbp_lv_Cext': 'float64',
    'tbp_lv_H': 'float64',
    'tbp_lv_Hext': 'float64',
    'tbp_lv_L': 'float64',
    'tbp_lv_Lext': 'float64',
    'tbp_lv_areaMM2': 'float64',
    'tbp_lv_area_perim_ratio': 'float64',
    'tbp_lv_color_std_mean': 'float64',
    'tbp_lv_deltaA': 'float64',
    'tbp_lv_deltaB': 'float64',
    'tbp_lv_deltaL': 'float64',
    'tbp_lv_deltaLB': 'float64',
    'tbp_lv_deltaLBnorm': 'float64',
    'tbp_lv_eccentricity': 'float64',
    'tbp_lv_location': 'string',
    'tbp_lv_location_simple': 'string',
    'tbp_lv_minorAxisMM': 'float64',
    'tbp_lv_nevi_confidence': 'float64',
    'tbp_lv_norm_border': 'float64',
    'tbp_lv_norm_color': 'float64',
    'tbp_lv_perimeterMM': 'float64',
    'tbp_lv_radial_color_std_max': 'float64',
    'tbp_lv_stdL': 'float64',
    'tbp_lv_stdLExt': 'float64',
    'tbp_lv_symm_2axis': 'float64',
    'tbp_lv_symm_2axis_angle': 'int64',
    'tbp_lv_x': 'float64',
    'tbp_lv_y': 'float64',
    'tbp_lv_z': 'float64',
    'attribution': 'object',
    'copyright_license': 'object'
}


# Load the CSV files into DataFrames
train_df = pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv',low_memory=False,dtype=dtype_dict)
test_df = pd.read_csv('/kaggle/input/isic-2024-challenge/test-metadata.csv',low_memory=False,dtype=dtype_dict)

# Find the columns present in the train DataFrame but not in the test DataFrame
# Ensure that the 'target' column is not removed
columns_to_remove = [col for col in train_df.columns if col not in test_df.columns and col != 'target']

# Drop these columns from the train DataFrame
train_df = train_df.drop(columns=columns_to_remove)

# First, replace NaN values with None if they are not already None
train_df['sex'] = train_df['sex'].replace({np.nan: None})

# Now apply your mapping
train_df['sex'] = train_df['sex'].map({'male': 1, 'female': 0, None: 0.5})

# Calculate the mode of the column
mode_value = train_df['anatom_site_general'].mode()[0]

# Replace NaN values in the column with the mode
train_df['anatom_site_general'] = train_df['anatom_site_general'].fillna(mode_value)

# Calculate the mode of the column
mean_value = train_df['age_approx'].mean()

# Replace NaN values in the column with the mode
train_df['age_approx'] = train_df['age_approx'].fillna(mean_value)

targets = train_df['target']
train_df = train_df.drop(columns = ['target'])

In [2]:
# Assuming train_df is your DataFrame
# train_df = pd.read_csv('your_train_df.csv')

# Step 1: Divide 'tbp_lv_nevi_confidence' by 100
train_df['tbp_lv_nevi_confidence'] = train_df['tbp_lv_nevi_confidence'] / 100

# Step 2: Drop specified columns if they exist
columns_to_drop = [
    'isic_id', 'patient_id', 'image_type', 
    'attribution', 'copyright_license'
]
train_df.drop(columns=columns_to_drop, inplace=True)

# Step 3: Min-Max Scaling for 'age'
scaler = MinMaxScaler()
train_df['age_approx'] = scaler.fit_transform(train_df[['age_approx']])

# Step 4: One-Hot Encoding for categorical variables
categorical_columns = [
    'sex', 'anatom_site_general', 'tbp_tile_type', 
    'tbp_lv_location', 'tbp_lv_location_simple'
]
categorical_features_indices = [train_df.columns.get_loc(col) for col in categorical_columns]

# Step 5: Yeo-Johnson and Z-Score Scaling for specified columns
columns_to_transform = [
    'clin_size_long_diam_mm', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean',
    'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 
    'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 
    'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 
    'tbp_lv_symm_2axis', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'tbp_lv_A', 'tbp_lv_Aext', 
    'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 
    'tbp_lv_L', 'tbp_lv_symm_2axis_angle'
]

yeo_transformer = PowerTransformer(method='yeo-johnson')
standard_scaler = StandardScaler()


# Apply Yeo-Johnson transformation
train_df[columns_to_transform] = yeo_transformer.fit_transform(train_df[columns_to_transform])
# Apply Z-Score normalization
train_df[columns_to_transform] = standard_scaler.fit_transform(train_df[columns_to_transform])

# Save the preprocessed DataFrame to a new CSV file
# train_df.to_csv('preprocessed_train_df.csv', index=False)

In [3]:
# Handling missing values and mapping for test_df
test_df['sex'] = test_df['sex'].replace({np.nan: None})
test_df['sex'] = test_df['sex'].map({'male': 1, 'female': 0, None: 0.5})

test_df['anatom_site_general'] = test_df['anatom_site_general'].fillna(mode_value)
test_df['age_approx'] = test_df['age_approx'].fillna(mean_value)
test_df['tbp_lv_nevi_confidence'] = test_df['tbp_lv_nevi_confidence'] / 100

# Drop specified columns
isic = test_df["isic_id"]
test_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Min-Max Scaling for 'age_approx'
test_df['age_approx'] = scaler.transform(test_df[['age_approx']])

test_df[columns_to_transform] = yeo_transformer.transform(test_df[columns_to_transform])
test_df[columns_to_transform] = standard_scaler.transform(test_df[columns_to_transform])

for col in categorical_features_indices:
    train_df.iloc[:, col] = train_df.iloc[:, col].astype(str)
for col in categorical_features_indices:
    test_df.iloc[:, col] = test_df.iloc[:, col].astype(str)

1         1.0
2         1.0
3         1.0
4         1.0
         ... 
401054    1.0
401055    1.0
401056    0.0
401057    0.0
401058    1.0
Name: sex, Length: 401059, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  train_df.iloc[:, col] = train_df.iloc[:, col].astype(str)
1    0.0
2    1.0
Name: sex, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  test_df.iloc[:, col] = test_df.iloc[:, col].astype(str)


In [4]:
def pauc(y_true, y_pred):
    min_tpr = 0.8
    v_gt = abs(np.asarray(y_true) - 1)
    v_pred = np.array([1.0 - x for x in y_pred])
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr * 2 + (max_fpr - 0.5 * max_fpr * 2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

In [5]:
from sklearn.model_selection import StratifiedKFold
import gc

FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
test_preds = np.zeros((len(test_df), FOLDS), dtype=np.float32)
scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, targets)):
    print("#" * 25)
    print(f"# Fold {fold + 1}")
    print("#" * 25)
    X_train_fold = train_df.loc[train_idx]
    y_train_fold = targets.loc[train_idx].values
    X_val_fold = train_df.loc[val_idx]
    y_val_fold = targets.loc[val_idx].values
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=categorical_features_indices)
    X_val_pool = Pool(X_val_fold, y_val_fold, cat_features=categorical_features_indices)
    X_test_pool = Pool(test_df, cat_features=categorical_features_indices)
    
    # Train CatBoost
    model = cb.CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        class_names=[0, 1],
        learning_rate=0.075,
        iterations=10000,
        depth=9,
        random_strength=0,
        l2_leaf_reg=0.5,
        max_leaves=512,
        fold_permutation_block=64,
        task_type='GPU',
        random_seed=42,
        verbose=False
    )

    model.fit(X=X_train_pool, 
              eval_set=X_val_pool, 
              verbose=500, 
              early_stopping_rounds=200)

    score = model.best_score_['validation']['AUC']
    print('Fold ROC-AUC score: ', score)
    scores.append(score)

    test_preds[:, fold] = model.predict_proba(test_df)[:, 1]
    
    del X_train_fold, y_train_fold
    del X_val_fold, y_val_fold
    del X_train_pool, X_val_pool, X_test_pool
    del model
    gc.collect()

#########################
# Fold 1
#########################


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8019479	best: 0.8019479 (0)	total: 12.7s	remaining: 1d 11h 12m 3s
bestTest = 0.9355524182
bestIteration = 125
Shrink model to first 126 iterations.
Fold ROC-AUC score:  0.9355524182319641
#########################
# Fold 2
#########################


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6823626	best: 0.6823626 (0)	total: 71.2ms	remaining: 11m 51s
bestTest = 0.9532405734
bestIteration = 145
Shrink model to first 146 iterations.
Fold ROC-AUC score:  0.9532405734062195
#########################
# Fold 3
#########################


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7318250	best: 0.7318250 (0)	total: 69.7ms	remaining: 11m 37s
bestTest = 0.9378029108
bestIteration = 63
Shrink model to first 64 iterations.
Fold ROC-AUC score:  0.9378029108047485
#########################
# Fold 4
#########################


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7375234	best: 0.7375234 (0)	total: 69.6ms	remaining: 11m 35s
bestTest = 0.9369870424
bestIteration = 182
Shrink model to first 183 iterations.
Fold ROC-AUC score:  0.936987042427063
#########################
# Fold 5
#########################


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8351799	best: 0.8351799 (0)	total: 1.14s	remaining: 3h 10m 40s
bestTest = 0.9423976243
bestIteration = 120
Shrink model to first 121 iterations.
Fold ROC-AUC score:  0.9423976242542267


In [6]:
test_preds = np.mean(test_preds, axis=1)
submission = pd.read_csv('/kaggle/input/isic-2024-challenge/sample_submission.csv')
submission['target'] = test_preds.astype(np.float32)
submission['isic_id'] = isic
submission.to_csv('submission.csv', index=False)