In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import xgboost as xgb
import matplotlib.pyplot as plt

# Load the train and test datasets
train_df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
test_df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')

# Path to images
train_image_dir = '/kaggle/input/siim-isic-melanoma-classification/jpeg/train/'
test_image_dir = '/kaggle/input/siim-isic-melanoma-classification/jpeg/test/'

# Set constants
IMG_SIZE = 196  # Reduced image size for faster computation
BATCH_SIZE = 16

# Image preprocessing function
def preprocess_image(image_path, img_size=IMG_SIZE):
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (img_size, img_size))
    img = img / 255.0  # Normalize
    return img

def load_and_preprocess_image(image_name, label=None, is_train=True):
    image_path = tf.strings.join([train_image_dir, image_name, '.jpg'], separator='')
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    image = image / 255.0  # Normalize

    if is_train:
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_brightness(image, max_delta=0.1)
        image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
        image = tf.image.random_saturation(image, lower=0.8, upper=1.2)
        image = tf.image.random_hue(image, max_delta=0.1)

    if label is not None:
        return image, label
    return image


def create_dataset(df, is_train=True):
    dataset = tf.data.Dataset.from_tensor_slices((df['image_name'], df['target']))
    dataset = dataset.map(lambda x, y: (load_and_preprocess_image(x, y)), 
                          num_parallel_calls=tf.data.AUTOTUNE)
    if is_train:
        dataset = dataset.shuffle(buffer_size=len(df)).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    else:
        dataset = dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

In [5]:
train_df.shape

(33126, 8)

In [6]:
train_df["target"].value_counts()

target
0    32542
1      584
Name: count, dtype: int64

In [7]:
train_df["anatom_site_general_challenge"].value_counts()

anatom_site_general_challenge
torso              16845
lower extremity     8417
upper extremity     4983
head/neck           1855
palms/soles          375
oral/genital         124
Name: count, dtype: int64

In [8]:
train_df = train_df.dropna(subset=['sex'])

In [9]:
train_df["sex"].value_counts()

sex
male      17080
female    15981
Name: count, dtype: int64

In [10]:
train_df.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


## Fill Missing values

In [11]:
# Fill missing values for age
train_df['age_approx'].fillna(train_df['age_approx'].mean(), inplace=True)

# Fill missing categorical features with 'unknown'
train_df['sex'].fillna('unknown', inplace=True)
train_df['anatom_site_general_challenge'].fillna(train_df['anatom_site_general_challenge'].mode()[0], inplace=True)
# train_df['diagnosis'].fillna('unknown', inplace=True)
# train_df['benign_malignant'].fillna('unknown', inplace=True)

# Encode categorical features using one-hot encoding
one_hot_columns = ['sex', 'anatom_site_general_challenge']
train_df = pd.get_dummies(train_df, columns=one_hot_columns)

y = train_df['target']

# Train-Validation Split
# Set the random seed for reproducibility
np.random.seed(0)

# Generate one random number between 0 and 100
random_number = np.random.randint(0, 50) 
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=random_number)

train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df, is_train=False)

train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['age_approx'].fillna(train_df['age_approx'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['sex'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

In [12]:
train_df.head()

Unnamed: 0,image_name,patient_id,age_approx,diagnosis,benign_malignant,target,sex_female,sex_male,anatom_site_general_challenge_head/neck,anatom_site_general_challenge_lower extremity,anatom_site_general_challenge_oral/genital,anatom_site_general_challenge_palms/soles,anatom_site_general_challenge_torso,anatom_site_general_challenge_upper extremity
31785,ISIC_9603756,IP_4281194,80.0,unknown,benign,0,True,False,False,False,False,False,True,False
30581,ISIC_9240979,IP_2358028,45.0,nevus,benign,0,False,True,False,False,False,False,False,True
22527,ISIC_6831244,IP_3145948,45.0,nevus,benign,0,False,True,False,False,False,False,True,False
4531,ISIC_1465113,IP_8029095,45.0,unknown,benign,0,True,False,False,False,False,False,True,False
30740,ISIC_9290617,IP_7718399,40.0,nevus,benign,0,True,False,True,False,False,False,False,False


In [13]:
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
# import tensorflow_addons as tfa
# from tensorflow_addons.optimizers import CyclicalLearningRate

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4, decay_steps=10000, decay_rate=0.9)

steps_per_epoch = len(train_df) // BATCH_SIZE
validation_steps = len(val_df) // BATCH_SIZE

# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Calculate class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)

# Convert class weights to a dictionary
class_weights = {0: class_weights[0], 1: class_weights[1]}


In [15]:
def load_and_preprocess_image_test(image_name, label=None):
    image_path = tf.strings.join([test_image_dir, image_name, '.jpg'], separator='')
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    image = image / 255.0  # Normalize
    
    # Move data augmentation to GPU
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=0.1)
    
    if label is not None:
        return image, label
    return image

# Prepare the test dataset
def create_test_dataset(df):
    dataset = tf.data.Dataset.from_tensor_slices(df['image_name'])
    dataset = dataset.map(lambda x: load_and_preprocess_image_test(x), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return dataset

In [16]:
train_df = train_df.drop(columns=['image_name', 'patient_id', 'diagnosis', 'benign_malignant'], axis=1)
val_df = val_df.drop(columns=['image_name', 'patient_id', 'diagnosis', 'benign_malignant'], axis=1)

# Extract tabular features for RandomForest and XGBoost
X_train_tabular = train_df.drop(columns=['target'], axis=1)
y_train = train_df['target']

X_val_tabular = val_df.drop(columns=['target'], axis=1)
y_val = val_df['target']

# Train RandomForest

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tabular, y_train)

# Train XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_tabular, y_train)

In [17]:
# Fill missing values for age
test_df['age_approx'].fillna(test_df['age_approx'].mean(), inplace=True)

# Fill missing categorical features with 'unknown'
test_df['sex'].fillna('unknown', inplace=True)
test_df['anatom_site_general_challenge'].fillna(test_df['anatom_site_general_challenge'].mode()[0], inplace=True)
# train_df['diagnosis'].fillna('unknown', inplace=True)
# train_df['benign_malignant'].fillna('unknown', inplace=True)

# Encode categorical features using one-hot encoding
one_hot_columns = ['sex', 'anatom_site_general_challenge']
test_df = pd.get_dummies(test_df, columns=one_hot_columns)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['age_approx'].fillna(test_df['age_approx'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['sex'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [18]:
test_dataset = create_test_dataset(test_df)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Defined model (using EfficientNetB3)
base_model = tf.keras.applications.EfficientNetB3(
    include_top=False,
    weights='imagenet',
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)

base_model.trainable = False

# Custom classification head
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Adjust for binary classification
])

base_model.trainable = True


# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    train_dataset,
    epochs=4,
    validation_data=val_dataset,
    class_weight=class_weights,
    callbacks=[early_stopping],
    verbose=1
)

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb3_notop.h5
[1m43941136/43941136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step
Epoch 1/4


I0000 00:00:1730052536.998188      64 service.cc:145] XLA service 0x7a1cb40068f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730052537.006231      64 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1730052648.347455      64 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1653/1653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1465s[0m 242ms/step - accuracy: 0.6952 - loss: 0.5840 - val_accuracy: 0.7485 - val_loss: 0.4181
Epoch 2/4
[1m1653/1653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1224s[0m 228ms/step - accuracy: 0.7399 - loss: 0.4625 - val_accuracy: 0.6834 - val_loss: 0.6453
Epoch 3/4
[1m1653/1653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1219s[0m 229ms/step - accuracy: 0.7426 - loss: 0.4489 - val_accuracy: 0.7732 - val_loss: 0.4890
Epoch 4/4
[1m1653/1653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1260s[0m 240ms/step - accuracy: 0.7828 - loss: 0.3921 - val_accuracy: 0.7032 - val_loss: 0.4374


In [20]:
y_pred_proba_cnn = model.predict(val_dataset)

[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 448ms/step


In [None]:
# CNN Model AUC-ROC
auc_cnn = roc_auc_score(y_val, y_pred_proba_cnn)
print(f'CNN Model AUC: {auc_cnn}')

# RandomForest AUC-ROC
y_pred_proba_rf = rf_model.predict_proba(X_val_tabular)[:, 1]
auc_rf = roc_auc_score(y_val, y_pred_proba_rf)
print(f'RandomForest Model AUC: {auc_rf}')

# XGBoost AUC-ROC
y_pred_proba_xgb = xgb_model.predict_proba(X_val_tabular)[:, 1]
auc_xgb = roc_auc_score(y_val, y_pred_proba_xgb)
print(f'XGBoost Model AUC: {auc_xgb}')

CNN Model AUC: 0.8425460218754591
RandomForest Model AUC: 0.665977343076697
XGBoost Model AUC: 0.6755412748953198


In [27]:
# Average the predicted probabilities (Ensemble)
ensemble_proba = (y_pred_proba_cnn.flatten() + y_pred_proba_rf + y_pred_proba_xgb) / 3
auc_ensemble = roc_auc_score(y_val, ensemble_proba)
print(f'Ensemble Model AUC: {auc_ensemble}')

Ensemble Model AUC: 0.8530556086692659


In [28]:
test_predictions = model.predict(test_dataset)

[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 397ms/step


In [29]:
test = test_df.drop(columns=['image_name', 'patient_id'], axis=1)

# Extract tabular features for RandomForest and XGBoost
X_test_tabular = test

y_pred_proba_rf = rf_model.predict_proba(X_test_tabular)[:, 1]

# XGBoost AUC-ROC
y_pred_proba_xgb = xgb_model.predict_proba(X_test_tabular)[:, 1]

ensemble_proba = (test_predictions.flatten() + y_pred_proba_rf + y_pred_proba_xgb) / 3

# Create submission file
submission_df = pd.DataFrame({'image_name': test_df['image_name'], 'target': ensemble_proba})
submission_df.to_csv('submission.csv', index=False)