In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from skimage import io, color
from skimage.segmentation import active_contour
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image


In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; every data path used below lives under
# /content/drive/MyDrive, so this must run before any file is read or written.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the project metadata CSV from the mounted Drive into `df`.
df=pd.read_csv('/content/drive/MyDrive/mini_proj_data/proj_metadata.csv')

In [4]:
df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,image_path
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/MyDrive/mini_proj_data/HAM10000...
...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,/content/drive/MyDrive/mini_proj_data/HAM10000...
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,/content/drive/MyDrive/mini_proj_data/HAM10000...
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,/content/drive/MyDrive/mini_proj_data/HAM10000...
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,/content/drive/MyDrive/mini_proj_data/HAM10000...


In [5]:
df.columns

Index(['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization',
       'image_path'],
      dtype='object')

In [6]:
df.shape

(10015, 8)

In [7]:
df['dx'].value_counts()

Unnamed: 0_level_0,count
dx,Unnamed: 1_level_1
nv,6705
mel,1113
bkl,1099
bcc,514
akiec,327
vasc,142
df,115


In [10]:
from collections import Counter

# Clean labels (HAM10000 7 classes)
CLASSES = ['akiec','bcc','bkl','df','mel','nv','vasc']  # canonical order

# Keep only rows whose diagnosis is one of the known classes. reset_index
# returns a new frame with a contiguous 0..n-1 index, so the column
# assignments below operate on an independent frame rather than a slice.
df = df[df['dx'].isin(CLASSES)].reset_index(drop=True)

# Integer class id: position of the class name in CLASSES.
label2id = {c:i for i,c in enumerate(CLASSES)}
id2label = {i:c for c,i in label2id.items()}
df['label_id'] = df['dx'].map(label2id).astype(int)

# Optional: binary malignant flag (1 when dx is in MALIGNANT, else 0)
MALIGNANT = {'akiec','bcc','mel'}
df['is_malignant'] = df['dx'].isin(MALIGNANT).astype(int)

# Sanity check: a few encoded rows and the per-class counts after filtering.
print(df[['image_id','dx','label_id']].head())
print("Class counts:", Counter(df['dx']))

       image_id   dx  label_id
0  ISIC_0027419  bkl         2
1  ISIC_0025030  bkl         2
2  ISIC_0026769  bkl         2
3  ISIC_0025661  bkl         2
4  ISIC_0031633  bkl         2
Class counts: Counter({'nv': 6705, 'mel': 1113, 'bkl': 1099, 'bcc': 514, 'akiec': 327, 'vasc': 142, 'df': 115})


In [11]:
df.columns

Index(['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization',
       'image_path', 'label_id', 'is_malignant'],
      dtype='object')

In [13]:
# =========================
# STEP 1: HAIR REMOVAL (DullRazor)
# =========================
def remove_hair(img, kernel_size=17, threshold=10, inpaint_radius=1):
    """Remove thin dark structures (hair) from an image by black-hat + inpainting.

    The grayscale image is black-hat filtered to highlight thin dark structures,
    the response is thresholded into a binary mask, and the masked pixels are
    filled in from their surroundings with Telea inpainting.

    Args:
        img: RGB image as accepted by cv2.cvtColor(..., COLOR_RGB2GRAY) and
            cv2.inpaint (presumably 8-bit, 3-channel -- confirm against callers).
        kernel_size: side length of the cross-shaped structuring element used
            for the black-hat transform. Defaults to 17.
        threshold: black-hat responses strictly above this value are treated
            as hair. Defaults to 10.
        inpaint_radius: neighbourhood radius passed to cv2.inpaint. Defaults to 1.

    Returns:
        The inpainted image, same shape as `img`.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # cv2.MORPH_CROSS is the named constant for the shape value 1.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (kernel_size, kernel_size))

    # Black-hat = closing(gray) - gray: bright where the input has thin dark details.
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
    _, hair_mask = cv2.threshold(blackhat, threshold, 255, cv2.THRESH_BINARY)

    # Non-zero mask pixels are the ones that get reconstructed.
    return cv2.inpaint(img, hair_mask, inpaint_radius, cv2.INPAINT_TELEA)

In [14]:
# =========================
# STEP 2: SEGMENTATION (Crop + Mask option)
# =========================
import cv2
import numpy as np

def segment_lesion(img, mode="crop", pad=10, lesion_is_dark=True):
    """
    Segment the lesion using an Otsu threshold and the largest external contour.

    Args:
        img  : input RGB image
        mode : "crop" (bounding box crop) or "mask" (keep lesion, remove background)
        pad  : pixels of context added on every side of the bounding box in
               "crop" mode (clamped to the image borders)
        lesion_is_dark : when True (default) the pixels *below* the Otsu
               threshold are treated as lesion, i.e. the lesion is assumed to
               be darker than the surrounding skin. Pass False to treat the
               brighter side as foreground instead.
    Returns:
        processed lesion image (same size as original if mask mode, cropped if
        crop mode). If no contour is found, `img` is returned unchanged.
    Raises:
        ValueError: if `mode` is neither "crop" nor "mask".
    """
    # Validate first so a bad mode is reported even when no contour is found.
    if mode not in ("crop", "mask"):
        raise ValueError("mode must be 'crop' or 'mask'")

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 0)

    # Otsu threshold. THRESH_BINARY sets pixels *above* the threshold to 255,
    # which would make the brighter skin the foreground; THRESH_BINARY_INV
    # makes the darker region (the lesion, under the assumption above) the
    # foreground so that findContours outlines the lesion itself.
    polarity = cv2.THRESH_BINARY_INV if lesion_is_dark else cv2.THRESH_BINARY
    _, mask = cv2.threshold(gray, 0, 255, polarity + cv2.THRESH_OTSU)

    # Find largest contour (assume lesion)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        return img  # fallback if nothing found

    c = max(contours, key=cv2.contourArea)

    if mode == "crop":
        # Bounding box crop with padding. Each edge is clamped independently so
        # that clamping the left/top edge does not push extra padding onto the
        # right/bottom edge.
        x, y, w, h = cv2.boundingRect(c)
        img_h, img_w = img.shape[:2]
        x0 = max(0, x - pad)
        y0 = max(0, y - pad)
        x1 = min(img_w, x + w + pad)
        y1 = min(img_h, y + h + pad)
        return img[y0:y1, x0:x1]

    # mode == "mask": keep only the pixels inside the filled contour.
    lesion_mask = np.zeros_like(gray)
    cv2.drawContours(lesion_mask, [c], -1, 255, -1)   # thickness -1 fills the contour
    return cv2.bitwise_and(img, img, mask=lesion_mask)


In [15]:
# def augment_image(img):
#     """Apply random augmentation for minority classes"""
#     # Random flip
#     if np.random.rand() > 0.5:
#         img = cv2.flip(img, 0)  # vertical flip
#     if np.random.rand() > 0.5:
#         img = cv2.flip(img, 1)  # horizontal flip

#     # Random rotation (-30 to 30 degrees)
#     angle = np.random.uniform(-30, 30)
#     M = cv2.getRotationMatrix2D((IM_SIZE // 2, IM_SIZE // 2), angle, 1)
#     img = cv2.warpAffine(img, M, (IM_SIZE, IM_SIZE), borderMode=cv2.BORDER_REFLECT)

#     # Random brightness/contrast
#     if np.random.rand() > 0.5:
#         alpha = np.random.uniform(0.8, 1.2)  # contrast
#         beta = np.random.randint(-20, 20)    # brightness
#         img = cv2.convertScaleAbs(img, alpha=alpha, beta=beta)

#     # Random Gaussian blur
#     if np.random.rand() > 0.7:
#         ksize = np.random.choice([3, 5])  # kernel size
#         img = cv2.GaussianBlur(img, (ksize, ksize), 0)

#     return img


In [16]:
# =========================
# CLAHE OBJECT
# =========================
# Contrast Limited Adaptive Histogram Equalization object, created once and
# reused for every image in the preprocessing loop (applied to the L channel).
# clipLimit=2.0 is the contrast-limiting threshold; tileGridSize=(8, 8) is the
# grid of tiles over which histograms are equalized.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

In [17]:
import os

SAVE_DIR = '/content/drive/MyDrive/mini_proj_data/Processed_data/'

# Create the output folder (no error if it is already there) ...
os.makedirs(SAVE_DIR, exist_ok=True)

# ... then report its resolved location and confirm that it exists.
for caption, value in (
    ("SAVE_DIR absolute path:", os.path.abspath(SAVE_DIR)),
    ("Folder exists?", os.path.exists(SAVE_DIR)),
):
    print(caption, value)


SAVE_DIR absolute path: /content/drive/MyDrive/mini_proj_data/Processed_data
Folder exists? True


In [19]:
# =========================
# CONFIG
# =========================
IM_SIZE = 224
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

SAVE_DIR = '/content/drive/MyDrive/mini_proj_data/Processed_data/'
os.makedirs(SAVE_DIR, exist_ok=True)  # lets this cell run on its own

# The full run takes hours on Drive, so by default images whose two output
# files already exist are not recomputed and an interrupted run can resume.
# Set to False to force regeneration (e.g. after changing any preprocessing step).
SKIP_EXISTING = True

processed_data = []   # one dict per saved file (two per source image)
skipped_ids = []      # image_ids whose source file could not be read


def _write_resized(rgb_img, path):
    """Resize an RGB image to IM_SIZE x IM_SIZE and write it to `path`.

    Raises OSError if OpenCV reports that the file was not written, so that a
    path is never recorded for a file that does not exist.
    """
    resized = cv2.resize(rgb_img, (IM_SIZE, IM_SIZE))
    # cv2.imwrite expects BGR channel order and returns False on failure.
    if not cv2.imwrite(path, cv2.cvtColor(resized, cv2.COLOR_RGB2BGR)):
        raise OSError(f"Failed to write image: {path}")


for _, row in tqdm(df.iterrows(), total=len(df)):
    record = row.to_dict()
    save_path_orig = os.path.join(SAVE_DIR, f"{row['image_id']}_orig.jpg")
    save_path_seg = os.path.join(SAVE_DIR, f"{row['image_id']}_seg.jpg")

    already_done = (
        SKIP_EXISTING
        and os.path.exists(save_path_orig)
        and os.path.exists(save_path_seg)
    )

    if not already_done:
        img = cv2.imread(row["image_path"])
        if img is None:
            # Unreadable/missing file: remember it instead of dropping it silently.
            skipped_ids.append(row["image_id"])
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # ---------- Base Preprocessing ----------
        img = remove_hair(img)

        # Apply CLAHE to the lightness channel only. The image is already
        # 8-bit here, so it is converted directly; scaling to [0,1] and back
        # would be a no-op at best and relies on truncation in astype(uint8).
        lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
        l, a, b = cv2.split(lab)
        lab = cv2.merge((clahe.apply(l), a, b))
        img_enhanced = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)

        # ---------- Save Original ----------
        _write_resized(img_enhanced, save_path_orig)

        # ---------- Save Segmented ----------
        _write_resized(segment_lesion(img_enhanced), save_path_seg)

    # Same record order as before: "orig" row first, then "seg" row.
    processed_data.append({**record, "clean_path": save_path_orig, "variant": "orig"})
    processed_data.append({**record, "clean_path": save_path_seg, "variant": "seg"})

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} unreadable images, e.g. {skipped_ids[:5]}")


  1%|          | 95/10015 [02:10<3:46:49,  1.37s/it]


KeyboardInterrupt: 

In [None]:
# Build the index of saved files from the records collected in the loop.
df_processed = pd.DataFrame(processed_data)
# Each source image contributes two rows ("orig" and "seg" variants); the
# augmentation step is commented out, so no augmented images are counted here.
print("Total processed images (orig + seg variants):", len(df_processed))
df_processed.head()

Total processed images (including augmentations): 11767


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,image_path,label_id,is_malignant,clean_path
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...


In [None]:
df_processed

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,image_path,label_id,is_malignant,clean_path
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/MyDrive/mini_proj_data/HAM10000...,2,0,/content/drive/MyDrive/mini_proj_data/Processe...
...,...,...,...,...,...,...,...,...,...,...,...
11762,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,/content/drive/MyDrive/mini_proj_data/HAM10000...,0,1,/content/drive/MyDrive/mini_proj_data/Processe...
11763,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,/content/drive/MyDrive/mini_proj_data/HAM10000...,0,1,/content/drive/MyDrive/mini_proj_data/Processe...
11764,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,/content/drive/MyDrive/mini_proj_data/HAM10000...,0,1,/content/drive/MyDrive/mini_proj_data/Processe...
11765,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,/content/drive/MyDrive/mini_proj_data/HAM10000...,0,1,/content/drive/MyDrive/mini_proj_data/Processe...


In [None]:
# Persist the processed-file index to Drive; index=False omits the DataFrame's
# row index so no extra unnamed column is written.
df_processed.to_csv('/content/drive/MyDrive/mini_proj_data/processed_data.csv', index=False)