In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading the data: the training and test tables are stored as two separate
# CSV files under the Kaggle input directory
train_csv_path = "/kaggle/input/joined-isic-optimized-87-df/joined_train_df_optimized.csv"
test_csv_path = "/kaggle/input/joined-test-87-isic/joined_test_df_optimized.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [3]:
# Show (rows, columns) for each freshly loaded table, training table first
for frame in (train_df, test_df):
    print(frame.shape)

(29488, 1563)
(2293, 1563)


In [4]:
# Dropping the one-hot encoded label columns from both tables
label_cols = [
    'Actinic_keratosis',
    'Basal_cell_carcinoma',
    'Benign_keratosis',
    'Dermatofibroma',
    'Melanocytic_nevus',
    'Melanoma',
    'Squamous_cell_carcinoma',
    'Vascular_lesion',
]
# Same drop applied to each table; the names are rebound to the reduced frames
train_df, test_df = (
    frame.drop(label_cols, axis=1) for frame in (train_df, test_df)
)

In [5]:
# Shapes of both tables again, one per line (training table first)
print(train_df.shape, test_df.shape, sep="\n")

(29488, 1555)
(2293, 1555)


In [6]:
# Encoding the labels: the encoder is fitted on the training labels only, and
# that same fitted mapping is then applied to the test labels
label_encoder = LabelEncoder()
train_labels = train_df['disease_label']
test_labels = test_df['disease_label']

y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)

# Saving the inverse mapping (the encoder's class names) for later
class_names = label_encoder.classes_

In [7]:
# Feature matrices: every remaining column except the target and the image path
non_feature_cols = ['disease_label', 'image_path']
X_train = train_df.drop(columns=non_feature_cols)
X_test = test_df.drop(columns=non_feature_cols)

In [8]:
# Sanity check: feature/target shapes per split and the encoded class names,
# emitted as four lines from a single print call
print(
    "Data shapes:",
    f"X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"X_test: {X_test.shape}, y_test: {y_test.shape}",
    f"Classes: {class_names}",
    sep="\n",
)

Data shapes:
X_train: (29488, 1553), y_train: (29488,)
X_test: (2293, 1553), y_test: (2293,)
Classes: ['Actinic_keratosis' 'Basal_cell_carcinoma' 'Benign_keratosis'
 'Dermatofibroma' 'Melanocytic_nevus' 'Melanoma' 'Squamous_cell_carcinoma'
 'Vascular_lesion']


In [9]:
# Baseline model: an XGBoost classifier with no tuning hyperparameters set.
# The GPU is selected through `device="cuda"` alone. The legacy
# `predictor="gpu_predictor"` argument is intentionally not passed: it belongs
# to the pre-`device` API and is superseded by `device` in XGBoost releases
# that accept the `device` parameter.
baseline_xgb = xgb.XGBClassifier(
    tree_method="hist",
    device="cuda",
    random_state=42,              # fixed seed so the baseline is repeatable
    n_jobs=1,
    eval_metric='mlogloss',
    objective='multi:softprob'
)

In [10]:
# Fit the baseline classifier on the full training split
print("Training baseline model")
baseline_xgb.fit(X=X_train, y=y_train)

Training baseline model


In [11]:
# The Baseline predictions and evaluation
y_train_pred_baseline = baseline_xgb.predict(X_train)
y_test_pred_baseline = baseline_xgb.predict(X_test)

train_acc_baseline = accuracy_score(y_train, y_train_pred_baseline)
test_acc_baseline = accuracy_score(y_test, y_test_pred_baseline)

In [12]:
# Report baseline accuracy on both splits (leading blank line, then three lines)
print(
    "\nBaseline Model Performance:",
    f"Training Accuracy: {train_acc_baseline:.4f}",
    f"Test Accuracy: {test_acc_baseline:.4f}",
    sep="\n",
)


Baseline Model Performance:
Training Accuracy: 0.9926
Test Accuracy: 0.8696


In [10]:
# Defining the parameter grid for RandomizedSearchCV
# Each key is a classifier argument; each list holds its candidate values.
_fractions = [0.6, 0.7, 0.8, 0.9, 1.0]   # candidates shared by the three sampling ratios
_penalties = [0, 0.01, 0.1, 1, 10]       # candidates shared by reg_alpha and reg_lambda

param_grid = dict(
    n_estimators=[100, 200, 300, 500, 800],
    max_depth=[3, 4, 5, 6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.15, 0.2, 0.3],
    # list(...) gives every key its own list object, as separate literals would
    subsample=list(_fractions),
    colsample_bytree=list(_fractions),
    colsample_bylevel=list(_fractions),
    reg_alpha=list(_penalties),
    reg_lambda=list(_penalties),
    min_child_weight=[1, 3, 5, 7],
    gamma=[0, 0.1, 0.2, 0.5, 1.0],
)

In [11]:
# Base estimator handed to the hyperparameter search; only the fixed settings
# are given here, the tunable ones are supplied by the search itself.
# The GPU is selected through `device="cuda"` alone. The legacy
# `predictor="gpu_predictor"` argument is intentionally not passed: it belongs
# to the pre-`device` API and is superseded by `device` in XGBoost releases
# that accept the `device` parameter.
xgb_model = xgb.XGBClassifier(
    tree_method="hist",
    device="cuda",
    random_state=42,              # fixed seed so each candidate fit is repeatable
    n_jobs=1,
    eval_metric='mlogloss',
    objective='multi:softprob'
)

In [12]:
# Randomised hyperparameter search: 50 sampled candidates, each scored by
# 5-fold cross-validated accuracy on the training split (250 fits in total).
# NOTE(review): the recorded run reports a CV accuracy of 0.9882 but a test
# accuracy of 0.8713; it may be worth checking whether the CV folds share
# closely related samples -- TODO confirm.
search_settings = dict(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    random_state=42,
    verbose=1,
)
random_search = RandomizedSearchCV(**search_settings)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [13]:
# Keep a handle on the search's best estimator for the evaluation cells below
best_xgb = random_search.best_estimator_
best_params = random_search.best_params_

# Report the winning hyperparameters (one per line) and their mean CV score
print("\nBest parameters found:")
for name in best_params:
    print(f"  {name}: {best_params[name]}")
print(f"\nBest cross-validation score: {random_search.best_score_:.4f}")


Best parameters found:
  subsample: 0.7
  reg_lambda: 0
  reg_alpha: 10
  n_estimators: 100
  min_child_weight: 5
  max_depth: 3
  learning_rate: 0.01
  gamma: 0.2
  colsample_bytree: 0.8
  colsample_bylevel: 1.0

Best cross-validation score: 0.9882


In [14]:
# Predictions with the best model
y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)

In [15]:
# Calculating the accuracies
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

In [16]:
# Report tuned-model accuracy on both splits (leading blank line, then three lines)
optimized_summary = "\n".join([
    "\nOptimized Model Performance:",
    f"Training Accuracy: {train_acc:.4f}",
    f"Test Accuracy: {test_acc:.4f}",
])
print(optimized_summary)


Optimized Model Performance:
Training Accuracy: 0.9910
Test Accuracy: 0.8713


In [18]:
# Improvement of the tuned model over the baseline.
# The deltas are computed from `train_acc_baseline` / `test_acc_baseline`
# (set in the baseline evaluation cell) rather than from hand-copied, rounded
# numbers, so they stay correct whenever the baseline is re-trained.
print("\nImprovement over baseline:")
print(f"Training Accuracy: {train_acc - train_acc_baseline:+.4f}")
print(f"Test Accuracy: {test_acc - test_acc_baseline:+.4f}")


Improvement over baseline:
Training Accuracy: -0.0016
Test Accuracy: +0.0017


In [19]:
# Convert numeric predictions back to string labels for classification report
y_test_true_labels = label_encoder.inverse_transform(y_test)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

In [20]:
# Per-class precision / recall / F1 on the test split, shown to four decimals
test_report = classification_report(
    y_true=y_test_true_labels,
    y_pred=y_test_pred_labels,
    digits=4,
)
print(test_report)

                         precision    recall  f1-score   support

      Actinic_keratosis     0.7434    0.6720    0.7059       125
   Basal_cell_carcinoma     0.9061    0.9302    0.9180       301
       Benign_keratosis     0.8702    0.7510    0.8062       241
         Dermatofibroma     0.9355    0.8529    0.8923        34
      Melanocytic_nevus     0.8779    0.9664    0.9200      1071
               Melanoma     0.9011    0.7559    0.8222       422
Squamous_cell_carcinoma     0.6056    0.6056    0.6056        71
        Vascular_lesion     0.9643    0.9643    0.9643        28

               accuracy                         0.8713      2293
              macro avg     0.8505    0.8123    0.8293      2293
           weighted avg     0.8712    0.8713    0.8685      2293

