# Data Preprocessing

In [43]:
!pip install kagglehub ultralytics scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl (11.1 MB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [38]:
import kagglehub

while True:
    try:
        path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
    except Exception as e:
        print("ConnectionError, retrying...")
    else:
        break
print("Path to dataset files:", path)

Resuming download from 1629487104 bytes (3953427407 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/kmader/skin-cancer-mnist-ham10000?dataset_version_number=2 (1629487104/5582914511) bytes left.


 38%|███▊      | 1.98G/5.20G [01:00<06:57, 8.27MB/s]


ConnectionError, retrying...
Resuming download from 2126512128 bytes (3456402383 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/kmader/skin-cancer-mnist-ham10000?dataset_version_number=2 (2126512128/5582914511) bytes left.


100%|██████████| 5.20G/5.20G [05:07<00:00, 11.3MB/s]


Extracting files...
Path to dataset files: /Users/iavnukov/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2


In [39]:
!mkdir ./datasets

In [40]:
!mv $path ./datasets/skin-cancer

In [41]:
!mkdir ./datasets/skin-cancer/images 
!cp -r ./datasets/skin-cancer/HAM10000_images_part_1/* ./datasets/skin-cancer/images && cp -r ./datasets/skin-cancer/HAM10000_images_part_2/* ./datasets/skin-cancer/images
!rm -r ./datasets/skin-cancer/HAM10000_images_part_1 ./datasets/skin-cancer/HAM10000_images_part_2 ./datasets/skin-cancer/ham10000_images_part_1 ./datasets/skin-cancer/ham10000_images_part_2

rm: ./datasets/skin-cancer/ham10000_images_part_1: No such file or directory
rm: ./datasets/skin-cancer/ham10000_images_part_2: No such file or directory


In [57]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil
from tqdm import tqdm
import cv2

class HAM10000Preprocessor:
    def __init__(self, dataset_path, output_path, root_path):
        self.dataset_path = root_path + dataset_path
        self.ds_path = f"./{output_path}"
        self.output_path = root_path + output_path
        self.image_path = os.path.join(self.dataset_path, 'images')
        self.metadata_path = os.path.join(self.dataset_path, 'HAM10000_metadata.csv')
        
        # Create label mapping
        self.label_map = {
            'nv': 0,    # Melanocytic nevi
            'mel': 1,   # Melanoma
            'bkl': 2,   # Benign keratosis
            'bcc': 3,   # Basal cell carcinoma
            'akiec': 4, # Actinic keratoses
            'vasc': 5,  # Vascular lesions
            'df': 6     # Dermatofibroma
        }

    def create_directories(self):
        """Create necessary directories for YOLO format"""
        for split in ['train', 'val', 'test']:
            for subdir in ['images', 'labels']:
                os.makedirs(os.path.join(self.output_path, split, subdir), exist_ok=True)

    def load_metadata(self):
        """Load and preprocess metadata"""
        df = pd.read_csv(self.metadata_path)
        df = self.preprocess_metadata(df)
        return df

    def sanitize_bbox(self, bbox):
        """Sanitize bounding box coordinates to ensure they are within [0, 1] range."""
        return [max(0, min(1, coord)) for coord in bbox]

    def create_yolo_annotation(self, image_id, label_id):
        try:
            image_path = os.path.join(self.image_path, f'{image_id}.jpg')
            if not os.path.exists(image_path):
                image_path = os.path.join(self.image_path, f'{image_id}.png')
            
            img = cv2.imread(image_path)
            if img is None:
                return None
            
            height, width = img.shape[:2]
            
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            
            if not contours:
                x_center, y_center = 0.5, 0.5
                box_width, box_height = 0.8, 0.8
            else:
                largest_contour = max(contours, key=cv2.contourArea)
                x, y, w, h = cv2.boundingRect(largest_contour)
                
                x_center = (x + w/2) / width
                y_center = (y + h/2) / height
                box_width = w / width
                box_height = h / height
            
            bbox = self.sanitize_bbox([x_center, y_center, box_width, box_height])
            
            return f"{label_id} {bbox[0]:.6f} {bbox[1]:.6f} {bbox[2]:.6f} {bbox[3]:.6f}"
            
        except Exception as e:
            print(f"Error processing image {image_id}: {str(e)}")
            return None

    def split_dataset(self, df):
        """Split dataset into train, validation, and test sets"""
        # First split: 80% train, 20% temp
        train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, 
                                           stratify=df['label_id'])
        
        # Second split: 10% validation, 10% test (from temp)
        val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42,
                                         stratify=temp_df['label_id'])
        
        return train_df, val_df, test_df

    def process_split(self, df, split_name):
        """Process a dataset split"""
        images_dir = os.path.join(self.output_path, split_name, 'images')
        labels_dir = os.path.join(self.output_path, split_name, 'labels')
        
        for _, row in tqdm(df.iterrows(), total=len(df), desc=f'Processing {split_name}'):
            image_id = row['image_id']
            label_id = row['label_id']
            
            # Copy image
            src_path = os.path.join(self.image_path, f'{image_id}.jpg')
            if not os.path.exists(src_path):
                src_path = os.path.join(self.image_path, f'{image_id}.png')
            
            if os.path.exists(src_path):
                dst_path = os.path.join(images_dir, f'{image_id}{os.path.splitext(src_path)[1]}')
                shutil.copy2(src_path, dst_path)
                
                # Create and save annotation
                yolo_annotation = self.create_yolo_annotation(image_id, label_id)
                if yolo_annotation:
                    with open(os.path.join(labels_dir, f'{image_id}.txt'), 'w') as f:
                        f.write(yolo_annotation)

    def create_data_yaml(self):
        """Create data.yaml file for YOLO"""
        yaml_content = {
            'path': self.ds_path,
            'train': 'train/images',
            'val': 'val/images',
            'test': 'test/images',
            'nc': len(self.label_map),
            'names': list(self.label_map.keys())
        }
        
        import yaml
        with open(os.path.join(self.output_path, 'data.yaml'), 'w') as f:
            yaml.dump(yaml_content, f, default_flow_style=False)

    def preprocess_metadata(self, metadata):
        # Filling na age
        metadata.fillna({'age': metadata['age'].mean()}, inplace=True)
        if metadata["age"].dtype == 'float64':
            metadata["age"] = metadata["age"].astype(int)
        
        # Removing duplicates
        lesion_id_cnt = metadata['lesion_id'].value_counts()
        def check_duplicates_lesions(id):
            if lesion_id_cnt[id] > 1:
                return True
            else:
                return False
        metadata['dup_les'] = metadata['lesion_id'].map(check_duplicates_lesions)
        metadata = metadata[metadata.dup_les == False]

        # Augmentation
        def data_aug(df, col_name, val_name, class_max):
            ''' This function balances the classes by 2 using the overrepresentation of the minor class'''
            given_cols = df[df[col_name].isin([val_name, class_max])]
            class_counts = df[col_name].value_counts()
            class_count_max = class_counts[class_max]
            class_count_val = class_counts[val_name]
            max_class = given_cols[given_cols[col_name]==class_max]
            val_class = given_cols[given_cols[col_name]==val_name]
            val_class_over = val_class.sample(class_count_max,replace= True)
            df = df[df[col_name] != val_name]
            df = pd.concat([df,val_class_over],axis=0)
            return df
        count = metadata['dx'].value_counts().sort_index()
        class_max = count.idxmax()
        for val_name in metadata.dx.unique():
            if val_name != class_max:
                metadata = data_aug(metadata, 'dx', val_name, class_max)

        # Add numerical labels
        metadata['label_id'] = metadata['dx'].map(self.label_map)

        return metadata

    def process_dataset(self):
        """Main processing function"""
        print("Starting dataset preparation...")

        # Create directories
        self.create_directories()

        # Load metadata
        df = self.load_metadata()
        print(f"Loaded {len(df)} images metadata")
        
        # Split dataset
        train_df, val_df, test_df = self.split_dataset(df)
        print(f"Split sizes: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")
        
        # Process each split
        self.process_split(train_df, 'train')
        self.process_split(val_df, 'val')
        self.process_split(test_df, 'test')
        
        # Create data.yaml
        self.create_data_yaml()
        
        print("Dataset preparation completed!")
        
        # Print dataset statistics
        print("\nDataset Statistics:")
        print(f"Total images: {len(df)}")
        print("\nClass distribution:")
        class_dist = df['dx'].value_counts()
        for cls, count in class_dist.items():
            print(f"{cls}: {count} ({count/len(df)*100:.2f}%)")

root_path = "./datasets"
dataset = "/skin-cancer"
yolo_dataset = "/skin-cancer-yolo"

preprocessor = HAM10000Preprocessor(dataset, yolo_dataset, root_path)
preprocessor.process_dataset()

Starting dataset preparation...
Loaded 30905 images metadata
Split sizes: Train=24724, Val=3090, Test=3091


Processing train: 100%|██████████| 24724/24724 [01:53<00:00, 218.24it/s]
Processing val: 100%|██████████| 3090/3090 [00:14<00:00, 213.66it/s]
Processing test: 100%|██████████| 3091/3091 [00:14<00:00, 217.80it/s]

Dataset preparation completed!

Dataset Statistics:
Total images: 30905

Class distribution:
nv: 4415 (14.29%)
bkl: 4415 (14.29%)
df: 4415 (14.29%)
mel: 4415 (14.29%)
vasc: 4415 (14.29%)
bcc: 4415 (14.29%)
akiec: 4415 (14.29%)





# Training & Eval

In [58]:
from ultralytics import YOLO

PRETRAINED_PATH = "yolo11n.pt"
EPOCHS=75

model = YOLO(PRETRAINED_PATH)

results = model.train(
    data=f"{root_path}{yolo_dataset}/data.yaml",
    epochs=EPOCHS
)

Ultralytics 8.3.68 🚀 Python-3.10.14 torch-2.5.1 CPU (Apple M3 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11n.pt, data=./datasets/skin-cancer-yolo/data.yaml, epochs=75, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train7, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, 

[34m[1mtrain: [0mScanning /Users/iavnukov/PycharmProjects/skin-care/datasets/skin-cancer-yolo/train/labels.cache... 4631 images, 0 backgrounds, 0 corrupt: 100%|██████████| 4631/4631 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/iavnukov/PycharmProjects/skin-care/datasets/skin-cancer-yolo/val/labels.cache... 1329 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1329/1329 [00:00<?, ?it/s]

Plotting labels to runs/detect/train7/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000909, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train7[0m
Starting training for 75 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/75         0G     0.4603      3.395      1.167         53        640:  15%|█▌        | 44/290 [03:31<19:44,  4.82s/it]


KeyboardInterrupt: 

In [None]:
from ultralytics import YOLO

BEST_PATH = "./runs/detect/train/weights/best.pt"

model = YOLO(BEST_PATH)
metrics = model.val(data=f"{root_path}{yolo_dataset}/data.yaml")
print("mAP50-95:\t", metrics.box.map)
print("mAP50:\t", metrics.box.map50)
print("mAP75:\t", metrics.box.map75)

In [62]:
from ultralytics import YOLO

BEST_PATH = "../../models/yolo.pt"

model = YOLO(BEST_PATH)
results = model("../../files/processed/canser.jpg")


image 1/1 /Users/iavnukov/PycharmProjects/skin-care-ai/src/skin_cv/../../files/processed/canser.jpg: 640x608 1 bkl, 51.2ms
Speed: 1.4ms preprocess, 51.2ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 608)
