In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("surajghuwalewala/ham1000-segmentation-and-classification")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/ham1000-segmentation-and-classification


In [3]:
!pip install opencv-python pandas numpy matplotlib



In [None]:
import pandas as pd
import cv2
import numpy as np
import os
from tqdm import tqdm

# ===================================================================
# --- 1. Configuration ---
# ===================================================================
IMAGE_DIR = path+'/images'
MASK_DIR = path+'/masks'
CSV_PATH = path+'/GroundTruth.csv'

IMAGE_ID_COLUMN = 'image'  
LABEL_COLUMNS = ['MEL', 'NV', 'BCC','AKIEC','BKL','DF'] 
TARGET_SIZE = (224, 224)

# ===================================================================
# --- 2. Preprocessing Functions ---
# ===================================================================
def remove_hair(image):
    """Removes hair from a skin image using morphological operations."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) #convert to grayscale
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)) #15*15 kernel
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel) #using blackhat morphology for hair detection
    _, hair_mask = cv2.threshold(blackhat, 10, 255, cv2.THRESH_BINARY)#hair masked over intensity 10
    inpainted_image = cv2.inpaint(image, hair_mask, 3, cv2.INPAINT_TELEA)#inpaint the mask
    return inpainted_image

def preprocess_for_cnn(image_path, mask_path, target_size):
    """Full preprocessing pipeline for one image."""
    image = cv2.imread(image_path)
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

    if image is None or mask is None:
        raise ValueError("Image or mask not found.")

    # Hair removal
    image_no_hair = remove_hair(image)

    # Binary mask
    _, binary_mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)

    # Apply mask
    masked_image = cv2.bitwise_and(image_no_hair, image_no_hair, mask=binary_mask)

    # Crop to ROI (if mask is valid)
    if cv2.countNonZero(binary_mask) > 0:
        x, y, w, h = cv2.boundingRect(binary_mask)
        cropped_image = masked_image[y:y+h, x:x+w]
    else:
        cropped_image = image_no_hair  # fallback: use full image

    # Resize + normalize
    resized_image = cv2.resize(cropped_image, target_size)
    normalized_image = resized_image.astype(np.float32) / 255.0

    return normalized_image

# ===================================================================
# --- 3. Data Loading ---
# ===================================================================
print("Loading data from CSV...")
df = pd.read_csv(CSV_PATH)

# Add file paths
def find_image_path(img_id):
    for ext in ['.jpg', '.png', '.jpeg']:
        path = os.path.join(IMAGE_DIR, img_id + ext)
        if os.path.exists(path):
            return path
    return None

def find_mask_path(img_id):
    for ext in ['.png', '.jpg']:
        path = os.path.join(MASK_DIR, img_id + '_segmentation' + ext)
        if os.path.exists(path):
            return path
    return None


df['image_path'] = df[IMAGE_ID_COLUMN].apply(find_image_path)
df['mask_path'] = df[IMAGE_ID_COLUMN].apply(find_mask_path)

# Extract labels
all_labels = df[LABEL_COLUMNS].to_numpy()
print(f"Extracted labels with shape: {all_labels.shape}")

# ===================================================================
# --- 4. Preprocess Images ---
# ===================================================================
processed_images = []
valid_indices = []

print("\nStarting image preprocessing...")
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    try:
        if row['image_path'] and row['mask_path']:
            processed_img = preprocess_for_cnn(row['image_path'], row['mask_path'], TARGET_SIZE)
            processed_images.append(processed_img)
            valid_indices.append(index)
    except Exception as e:
        print(f"Error at index {index} ({row[IMAGE_ID_COLUMN]}): {e}")

# Convert to arrays
X = np.array(processed_images, dtype=np.float32)
y = all_labels[valid_indices]

print("\nProcessing Complete")
print(f"Processed {len(X)} images.")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


Loading data from CSV...
Extracted labels with shape: (10015, 6)

Starting image preprocessing...


 67%|██████▋   | 6673/10015 [11:22<05:11, 10.72it/s]

In [None]:
print(df[['image', 'image_path', 'mask_path']].head(5))


In [None]:
print(X.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=12, stratify=y
)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

num_classes = y.shape[1]

# ==============================
# 1. Define CNN Architecture
# ==============================

model = Sequential([
    # Block 1
    Conv2D(32, (3,3), activation='relu', input_shape=(224, 224, 3)),
    BatchNormalization(),
    MaxPooling2D((2,2)),
    Dropout(0.25),
    
    # Block 2
    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2,2)),
    Dropout(0.25),
    
    # Block 3
    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2,2)),
    Dropout(0.25),
    
    # Flatten + Dense
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    
    # Output Layer
    Dense(num_classes, activation='softmax')
])

# ==============================
# 2. Compile Model
# ==============================

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)



In [None]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=20,
    verbose=1
)


In [None]:
loss, acc = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {acc:.4f}")
