In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import roc_auc_score, auc, roc_curve
from scipy.stats import yeojohnson, boxcox
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
import time

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the train and test metadata tables from the Kaggle input directory.
train_meta, test_meta = (
    pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv"),
    pd.read_csv("/kaggle/input/isic-2024-challenge/test-metadata.csv"),
)

In [None]:
# Keep the test identifiers aside; they are needed again when the submission is built.
test_id = test_meta["isic_id"]
# Shared feature set: every test-metadata column except the three dropped here.
base_features = test_meta.columns.drop(['patient_id', 'attribution', 'copyright_license'])

In [None]:
# Train frame = shared feature columns followed by the label; test frame = features only.
train_meta_df = train_meta.loc[:, [*base_features, 'target']]
test_meta_df = test_meta.loc[:, base_features]

In [None]:
# Stack the train rows on top of the test rows under a fresh 0..n-1 index so
# that feature preprocessing can be applied to both at once.
all_meta = pd.concat(
    [train_meta_df, test_meta_df],
    ignore_index=True,
    sort=False,
)

In [None]:
def distplot(x):
    """Draw a small histogram with a KDE overlay for the values in ``x``.

    A new 3x3 inch figure is opened for every call, so calling this in a loop
    produces one figure per call.

    Parameters
    ----------
    x : array-like or pandas.Series
        One-dimensional numeric data to plot.
    """
    plt.figure(figsize=(3, 3))
    # ``sns.distplot`` is deprecated and slated for removal from seaborn;
    # a density-scaled ``histplot`` with a KDE overlay is the documented
    # drop-in replacement (``cut=3`` mirrors distplot's KDE extent).
    sns.histplot(x, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
def quantile_transform(x):
    """Overwrite column ``x`` of the global ``all_meta`` with its quantile-transformed values.

    A fresh ``QuantileTransformer`` (normal output distribution, fixed seed) is
    fitted on the whole column and the result is written back in place.

    Parameters
    ----------
    x : column label
        Name of the ``all_meta`` column to transform.
    """
    # The transformer expects a 2-D (n_samples, 1) array.
    column_2d = all_meta[x].values.reshape(-1, 1)
    transformer = QuantileTransformer(random_state=42, output_distribution='normal')
    # Collapse the (n, 1) result back to 1-D before storing it in the frame.
    all_meta[x] = transformer.fit_transform(column_2d).ravel()

In [None]:
def data_preprocess(df):
    """Apply ``quantile_transform`` to every column named in ``df``.

    Only the column labels of ``df`` are used; the transformation itself is
    performed by ``quantile_transform`` on the column of that name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels select the features to transform.
    """
    for feature_name in list(df.columns):
        quantile_transform(feature_name)

In [None]:
# Object-dtype columns (the id column excluded) are treated as categorical.
categorical_features = (
    all_meta
    .drop(columns=['isic_id'])
    .select_dtypes(include=object)
    .columns
)
# Everything else, minus the id, the label and `age_approx`, gets quantile-transformed.
numerical_features_to_update = (
    all_meta
    .drop(columns=categorical_features)
    .drop(columns=['isic_id', 'target', 'age_approx'])
    .columns
)

In [None]:
# (number of numeric features to transform, number of categorical features)
tuple(map(len, (numerical_features_to_update, categorical_features)))

In [None]:
# Quantile-transform every selected numeric column of `all_meta`.
data_preprocess(all_meta.loc[:, numerical_features_to_update])

In [None]:
# Print the column dtypes and non-null counts of the combined frame after the transform step.
all_meta.info()

In [None]:
def show_distributions(df):
    """Plot each column of ``df`` in its own figure using ``distplot``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted one after another.
    """
    # Iterating a DataFrame yields its column labels.
    for column_name in df:
        distplot(df[column_name])

In [None]:
# Optional visual check of the transformed numeric features. The plots are
# off by default because one figure is drawn per column; set the flag to True
# to render them. (The stray `plt.fig` expression that used to be here raised
# AttributeError, since pyplot has no such attribute.)
SHOW_DISTRIBUTIONS = False
if SHOW_DISTRIBUTIONS:
    show_distributions(all_meta[numerical_features_to_update])

In [None]:
# Give absent categorical values an explicit 'missing' label.
all_meta[categorical_features] = (
    all_meta.loc[:, categorical_features].fillna(value='missing')
)

In [None]:
# Rows that carry a label go to the training frame; rows without one form the
# test frame. The id column is dropped from both, the empty label from test.
train_meta_df_upd = all_meta.loc[all_meta['target'].notna()].drop(columns='isic_id')
test_meta_df_upd = all_meta.loc[all_meta['target'].isna()].drop(columns=['isic_id', 'target'])

In [None]:
# (train shape, test shape) after the split
tuple(frame.shape for frame in (train_meta_df_upd, test_meta_df_upd))

In [None]:
def calculate_pauc(y_true, y_scores, tpr_threshold=0.8):
    """Partial AUC above a true-positive-rate floor.

    Returns the area enclosed between the ROC curve and the horizontal line
    ``TPR = tpr_threshold``. The value therefore lies in
    ``[0, 1 - tpr_threshold]``: a perfect ranking scores ``1 - tpr_threshold``
    and a chance-level ranking scores ``(1 - tpr_threshold) ** 2 / 2``.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    y_scores : array-like
        Predicted scores for the positive class (higher = more positive).
    tpr_threshold : float, default 0.8
        Minimum true-positive rate; only the part of the ROC curve at or
        above this level contributes.

    Returns
    -------
    float
        Partial area under the ROC curve above ``tpr_threshold``.

    Raises
    ------
    ValueError
        If no ROC point reaches ``tpr_threshold`` (e.g. the threshold is
        above 1 or the TPR is undefined because there are no positives).
    """
    fpr, tpr, _ = roc_curve(y_true, y_scores)

    above = tpr >= tpr_threshold
    if not above.any():
        raise ValueError("Not enough points above the TPR threshold for pAUC calculation.")

    # roc_curve returns TPR in non-decreasing order, so the first point at or
    # above the threshold marks where the region of interest begins.
    start = int(np.argmax(above))
    fpr_region = fpr[start:]
    tpr_region = tpr[start:]

    if start > 0 and tpr[start] > tpr_threshold:
        # The curve crosses the threshold between two ROC points: linearly
        # interpolate the FPR at the crossing so that segment is not lost.
        fpr_lo, fpr_hi = fpr[start - 1], fpr[start]
        tpr_lo, tpr_hi = tpr[start - 1], tpr[start]
        fpr_cross = fpr_lo + (tpr_threshold - tpr_lo) * (fpr_hi - fpr_lo) / (tpr_hi - tpr_lo)
        fpr_region = np.concatenate(([fpr_cross], fpr_region))
        tpr_region = np.concatenate(([tpr_threshold], tpr_region))

    if fpr_region.size < 2:
        # A single point spans no width, hence no area.
        return 0.0

    # Integrate only the height above the threshold line, not the full TPR.
    return float(auc(fpr_region, tpr_region - tpr_threshold))

In [None]:
# Separate the label column from the model inputs.
y = train_meta_df_upd['target']
X = train_meta_df_upd.drop(columns='target')

In [None]:
# Positional indices of the categorical columns within X, as passed to CatBoost.
cat_features_indices = list(map(X.columns.get_loc, categorical_features))

In [None]:
# 10-fold stratified splitter, shuffled with a fixed seed for repeatable folds.
skf = StratifiedKFold(
    shuffle=True,
    random_state=42,
    n_splits=10,
)

In [None]:
# Per-fold validation pAUC and the fitted model of each fold.
pauc_scores, models = [], []

for train_index, test_index in skf.split(X, y):
    # Rows for fitting vs. the held-out rows of this fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # A fresh CatBoost classifier per fold, same hyper-parameters every time.
    model = CatBoostClassifier(
        cat_features=cat_features_indices,
        iterations=1000,
        learning_rate=0.1,
        depth=9,
        l2_leaf_reg=3,
        eval_metric='AUC',
        early_stopping_rounds=100,
        random_seed=42,
        verbose=100,
    )

    # The held-out fold doubles as the evaluation set that drives early stopping.
    model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        early_stopping_rounds=100,
    )
    models.append(model)

    # Positive-class probability on the held-out fold, scored with pAUC.
    test_pred = model.predict_proba(X_test)[:, 1]
    pauc = calculate_pauc(y_test, test_pred)
    pauc_scores.append(pauc)

In [None]:
# Mean of the per-fold pAUC values, shown to four decimals.
print(f'Average pAUC score: {np.asarray(pauc_scores).mean():.4f}')

In [None]:
# Score the test rows with every fold model (positive-class probability) ...
submit_score = [
    fold_model.predict_proba(test_meta_df_upd)[:, 1]
    for fold_model in models
]

# ... and average the per-fold predictions row by row.
submit_pred = np.mean(submit_score, axis=0)

In [None]:
# Pair each test id with its fold-averaged prediction.
submission = pd.DataFrame({'isic_id': test_id}).assign(target=submit_pred)

# Save
submission.to_csv('submission.csv', index=False)

submission