In [37]:
import os
from collections import Counter

import cv2
import numpy as np
import pandas as pd
import torch
from PIL import Image
from skimage import io, color
from skimage.segmentation import active_contour
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm


In [21]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The metadata CSV read in the next cell and the processed-image output
# directory used later both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Load the project metadata CSV from the mounted Drive into `df`.
df=pd.read_csv('/content/drive/MyDrive/mini_proj_data/proj_metadata.csv')

In [23]:
# Display the metadata frame (the notebook elides the middle rows of long frames).
df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,image_path
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/mini_proj_data/HAM10000...
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/MyDrive/mini_proj_data/HAM10000...
...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,/content/drive/MyDrive/mini_proj_data/HAM10000...
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,/content/drive/MyDrive/mini_proj_data/HAM10000...
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,/content/drive/MyDrive/mini_proj_data/HAM10000...
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,/content/drive/MyDrive/mini_proj_data/HAM10000...


In [24]:
# List the column names available in the raw metadata.
df.columns

Index(['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization',
       'image_path'],
      dtype='object')

In [25]:
# (rows, columns) of the metadata frame.
df.shape

(10015, 8)

In [26]:
# Per-diagnosis image counts, sorted descending; shows how imbalanced the classes are.
df['dx'].value_counts()

Unnamed: 0_level_0,count
dx,Unnamed: 1_level_1
nv,6705
mel,1113
bkl,1099
bcc,514
akiec,327
vasc,142
df,115


In [29]:
# Clean labels (HAM10000 7 classes)
# Keep only rows whose diagnosis code is one of the seven known classes.
CLASSES = ['akiec','bcc','bkl','df','mel','nv','vasc']  # canonical order
df = df[df['dx'].isin(CLASSES)].reset_index(drop=True)

# Integer class id: position of the diagnosis code in CLASSES, plus the
# inverse mapping for turning predictions back into diagnosis codes.
label2id = {c:i for i,c in enumerate(CLASSES)}
id2label = {i:c for c,i in label2id.items()}
df['label_id'] = df['dx'].map(label2id).astype(int)

# Optional: binary malignant flag (1 when dx is in MALIGNANT, else 0)
MALIGNANT = {'akiec','bcc','mel'}
df['is_malignant'] = df['dx'].isin(MALIGNANT).astype(int)

print(df[['image_id','dx','label_id']].head())
# Counter comes from `collections` (imported in the top import cell).
print("Class counts:", Counter(df['dx']))

       image_id   dx  label_id
0  ISIC_0027419  bkl         2
1  ISIC_0025030  bkl         2
2  ISIC_0026769  bkl         2
3  ISIC_0025661  bkl         2
4  ISIC_0031633  bkl         2
Class counts: Counter({'nv': 6705, 'mel': 1113, 'bkl': 1099, 'bcc': 514, 'akiec': 327, 'vasc': 142, 'df': 115})


In [31]:
# Re-check the columns after label cleaning ('label_id' and 'is_malignant' were added).
df.columns

Index(['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization',
       'image_path', 'label_id', 'is_malignant'],
      dtype='object')

In [40]:
# =========================
# CONFIG
# =========================
# Side length (pixels) of the square images used throughout the notebook.
IM_SIZE = 224
# Per-channel normalisation statistics passed to transforms.Normalize below.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

# Training pipeline: resize, random flips and a small random rotation,
# then tensor conversion and per-channel normalisation.
train_tfms = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize(size=(IM_SIZE, IM_SIZE)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.1),
        transforms.RandomRotation(degrees=15),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

# Validation pipeline: the same resize/normalise steps with no random augmentation.
val_tfms = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize(size=(IM_SIZE, IM_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)


In [38]:
# =========================
# STEP 1: HAIR REMOVAL (DullRazor)
# =========================
def remove_hair(img, kernel_size=17, threshold=10, inpaint_radius=1):
    """Remove thin dark structures (hair) from an RGB image by inpainting.

    A black-hat morphological filter highlights thin dark details on the
    grayscale image; pixels whose black-hat response exceeds `threshold`
    form the mask that is then filled in from their surroundings.

    Parameters
    ----------
    img : 8-bit, 3-channel RGB array (as required by cv2.inpaint).
    kernel_size : side length of the cross-shaped structuring element.
    threshold : black-hat response above which a pixel is treated as hair.
    inpaint_radius : neighbourhood radius used by the Telea inpainting.

    Returns
    -------
    A new array of the same shape as `img` with the masked pixels inpainted.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # cv2.MORPH_CROSS is the named constant for the shape id 1 used originally.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (kernel_size, kernel_size))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
    _, hair_mask = cv2.threshold(blackhat, threshold, 255, cv2.THRESH_BINARY)
    return cv2.inpaint(img, hair_mask, inpaint_radius, cv2.INPAINT_TELEA)

In [39]:
# =========================
# STEP 2: SEGMENTATION (Otsu threshold + largest-contour bounding-box crop)
# =========================
def segment_lesion(img, min_area_frac=0.01):
    """Crop an RGB image to the bounding box of its largest dark region.

    The image is blurred, binarised with Otsu's threshold and the largest
    external contour of the foreground is located; the returned crop is that
    contour's axis-aligned bounding box.

    The threshold is inverted (THRESH_BINARY_INV) so that pixels *darker*
    than the Otsu threshold become foreground. With the non-inverted flag the
    brighter pixels are foreground, and the largest contour is then usually
    the surrounding skin rather than the lesion.
    NOTE(review): this assumes the lesion is darker than the surrounding
    skin; dark image corners, if present, would also be foreground - confirm
    on a few samples.

    Parameters
    ----------
    img : 3-channel RGB array.
    min_area_frac : minimum contour area, as a fraction of the image area,
        for the crop to be trusted. Smaller contours are treated as noise and
        the full image is returned instead. Pass 0 to disable the check.

    Returns
    -------
    The cropped view of `img`, or `img` itself when no usable contour exists.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    gray = cv2.GaussianBlur(gray, (5,5), 0)

    # Threshold (Otsu); inverted so darker pixels end up as foreground (255)
    _, mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find largest contour (lesion)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        return img  # fallback if no contour found

    largest = max(contours, key=cv2.contourArea)

    # A speck-sized contour would be cropped and later upscaled into a
    # meaningless image, so fall back to the uncropped input instead.
    image_area = gray.shape[0] * gray.shape[1]
    if cv2.contourArea(largest) < min_area_frac * image_area:
        return img

    x, y, w, h = cv2.boundingRect(largest)
    return img[y:y+h, x:x+w]

In [43]:
def augment_image(img, max_angle=30):
    """Apply random augmentation for minority classes.

    Each flip (vertical, then horizontal) is applied independently with
    probability 0.5, followed by a rotation about the image centre by an
    angle drawn uniformly from [-max_angle, max_angle] degrees. Pixels
    rotated in from outside the frame are filled by reflecting the border.

    Randomness comes from NumPy's global RNG; seed it with np.random.seed
    beforehand if reproducible augmentations are needed.

    Parameters
    ----------
    img : image array of shape (H, W) or (H, W, C).
    max_angle : largest rotation magnitude in degrees.

    Returns
    -------
    A new array with the same height and width as `img`.
    """
    # Random flip
    if np.random.rand() > 0.5:
        img = cv2.flip(img, 0)  # vertical flip
    if np.random.rand() > 0.5:
        img = cv2.flip(img, 1)  # horizontal flip

    # Rotate about the actual image centre and keep the input size, instead
    # of relying on the global IM_SIZE (which would crop or pad any image
    # that is not exactly IM_SIZE x IM_SIZE).
    height, width = img.shape[:2]
    angle = np.random.uniform(-max_angle, max_angle)
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1)
    img = cv2.warpAffine(img, rotation, (width, height), borderMode=cv2.BORDER_REFLECT)

    return img

In [44]:
# =========================
# CLAHE OBJECT
# =========================
# Shared CLAHE (contrast-limited adaptive histogram equalisation) operator:
# clip limit 2.0 on an 8x8 tile grid. Created once here and applied to the
# L channel of every image in the processing loop.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

In [46]:
# =========================
# PROCESS IMAGES
# =========================
clean_paths = []   # path of every file written (originals and augmented copies)
labels = []        # diagnosis code for each entry in clean_paths
skipped_ids = []   # image_ids whose source file could not be read
SAVE_DIR='/content/drive/MyDrive/mini_proj_data/Processed_data/'
# cv2.imwrite does not create missing directories; it just returns False.
os.makedirs(SAVE_DIR, exist_ok=True)

# Count samples per class to identify minority classes
class_counts = df['dx'].value_counts()
minority_classes = class_counts[class_counts < 500].index.tolist()  # threshold can be adjusted

# NOTE(review): augmented copies are written before any train/validation
# split is visible in this notebook. When splitting later, keep every
# augmented copy in the same split as its source image to avoid leakage.
for _, row in tqdm(df.iterrows(), total=len(df)):
    img_path = row["image_path"]
    label = row["dx"]

    img = cv2.imread(img_path)
    if img is None:
        # Unreadable or missing file: remember it so the gap is visible later.
        skipped_ids.append(row["image_id"])
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Hair Removal
    img = remove_hair(img)

    # Segmentation
    img = segment_lesion(img)

    # CLAHE Enhancement: equalise only the lightness channel in LAB space so
    # the colour channels (a, b) are left unchanged.
    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
    l, a, b = cv2.split(lab)
    l = clahe.apply(l)
    lab = cv2.merge((l, a, b))
    img = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)

    # Resize
    img = cv2.resize(img, (IM_SIZE, IM_SIZE))

    # Save original processed image (OpenCV writes BGR, hence the conversion)
    save_path = os.path.join(SAVE_DIR, f"{row['image_id']}.jpg")
    if not cv2.imwrite(save_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR)):
        raise OSError(f"Could not write processed image to {save_path}")
    clean_paths.append(save_path)
    labels.append(label)

    # =========================
    # AUGMENT MINORITY CLASSES
    # =========================
    if label in minority_classes:
        for aug_idx in range(3):  # create 3 augmented versions
            aug_img = augment_image(img)
            aug_save_path = os.path.join(SAVE_DIR, f"{row['image_id']}_aug{aug_idx}.jpg")
            if not cv2.imwrite(aug_save_path, cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR)):
                raise OSError(f"Could not write augmented image to {aug_save_path}")
            clean_paths.append(aug_save_path)
            labels.append(label)

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} unreadable images, e.g. {skipped_ids[:5]}")


  3%|▎         | 305/10015 [00:15<08:03, 20.06it/s]


KeyboardInterrupt: 

In [None]:
# =========================
# BUILD PROCESSED-IMAGE TABLE
# =========================
# clean_paths / labels hold one entry per file written by the processing
# loop: originals plus augmented copies, minus any image that was skipped.
# Their length therefore differs from len(df), so they cannot be assigned
# to `df` as columns directly (pandas raises ValueError on length mismatch).
if len(clean_paths) != len(labels):
    raise ValueError(
        f"clean_paths ({len(clean_paths)}) and labels ({len(labels)}) are out of sync"
    )

# File name without directory or extension, e.g. "ISIC_0027419" or
# "ISIC_0027419_aug1" (the naming scheme used when the files were saved).
file_stems = [os.path.splitext(os.path.basename(p))[0] for p in clean_paths]

df_processed = pd.DataFrame({
    "clean_path": clean_paths,
    "label": labels,
    # Source image id, so augmented copies can be grouped with their original.
    "image_id": [stem.split("_aug")[0] for stem in file_stems],
    "is_augmented": ["_aug" in stem for stem in file_stems],
})

# =========================
# UPDATE DATAFRAME
# =========================
# Attach to each metadata row the path of its (non-augmented) processed image,
# matched by image_id. Rows whose image was not processed get NaN.
path_by_image_id = {
    stem: path
    for stem, path in zip(file_stems, clean_paths)
    if "_aug" not in stem
}
df["clean_path"] = df["image_id"].map(path_by_image_id)
df["label"] = df["dx"]

print("Processing complete. Total images saved:", len(clean_paths))
print("Metadata rows without a processed image:", int(df["clean_path"].isna().sum()))