# Data Preprocessing

In [None]:
import kagglehub

# Fetch the dataset through kagglehub. The result stays in `path` because the
# shell cell that moves the files reads that variable.
DATASET_HANDLE = "kmader/skin-cancer-mnist-ham10000"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

In [None]:
# -p keeps this cell re-runnable: no error when ./datasets already exists.
!mkdir -p ./datasets

In [None]:
# Quote the expanded path so a download directory containing spaces is passed
# to mv as a single argument.
!mv "$path" ./datasets/skin-cancer

In [None]:
# Merge both image parts into a single ./datasets/skin-cancer/images folder.
# -p: do not fail when the folder already exists (cell can be re-run).
!mkdir -p ./datasets/skin-cancer/images
# "dir/." copies the directory contents without expanding thousands of file
# names on the command line.
!cp -r ./datasets/skin-cancer/HAM10000_images_part_1/. ./datasets/skin-cancer/images/ && cp -r ./datasets/skin-cancer/HAM10000_images_part_2/. ./datasets/skin-cancer/images/
# -f: ignore part folders that are not present instead of reporting an error.
!rm -rf ./datasets/skin-cancer/HAM10000_images_part_1 ./datasets/skin-cancer/HAM10000_images_part_2 ./datasets/skin-cancer/ham10000_images_part_1 ./datasets/skin-cancer/ham10000_images_part_2

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil
from tqdm import tqdm
import cv2

class HAM10000Preprocessor:
    """Convert the HAM10000 skin-lesion dataset into a YOLO detection dataset.

    The pipeline reads ``HAM10000_metadata.csv`` and the ``images`` folder
    found under ``dataset_path``, balances the classes by oversampling, splits
    the images into train/val/test, derives one bounding box per image from an
    Otsu threshold, and writes images, label files and ``data.yaml`` under
    ``output_path``.
    """

    def __init__(self, dataset_path, output_path):
        """Store the input/output locations and the class-name -> id mapping.

        Args:
            dataset_path: Directory containing ``HAM10000_metadata.csv`` and an
                ``images`` sub-directory.
            output_path: Directory that receives the YOLO dataset.
        """
        self.dataset_path = dataset_path
        self.output_path = output_path
        self.image_path = os.path.join(dataset_path, 'images')
        self.metadata_path = os.path.join(dataset_path, 'HAM10000_metadata.csv')

        # Create label mapping
        self.label_map = {
            'nv': 0,    # Melanocytic nevi
            'mel': 1,   # Melanoma
            'bkl': 2,   # Benign keratosis
            'bcc': 3,   # Basal cell carcinoma
            'akiec': 4, # Actinic keratoses
            'vasc': 5,  # Vascular lesions
            'df': 6     # Dermatofibroma
        }

    def create_directories(self):
        """Create ``<split>/images`` and ``<split>/labels`` for every split."""
        for split in ['train', 'val', 'test']:
            for subdir in ['images', 'labels']:
                os.makedirs(os.path.join(self.output_path, split, subdir), exist_ok=True)

    def load_metadata(self):
        """Read the metadata CSV and return it after ``preprocess_metadata``."""
        df = pd.read_csv(self.metadata_path)
        return self.preprocess_metadata(df)

    def sanitize_bbox(self, bbox):
        """Clamp every bounding-box value into the [0, 1] range."""
        return [max(0, min(1, coord)) for coord in bbox]

    def _find_image(self, image_id):
        """Return the path of ``image_id`` (.jpg preferred, then .png) or None."""
        for ext in ('.jpg', '.png'):
            candidate = os.path.join(self.image_path, f'{image_id}{ext}')
            if os.path.exists(candidate):
                return candidate
        return None

    def create_yolo_annotation(self, image_id, label_id):
        """Build the YOLO label line for one image.

        The lesion is located with an Otsu threshold on the grayscale image;
        the bounding rectangle of the largest contour becomes the box. When no
        contour is found, a centered box covering 80% of the image is used.

        Args:
            image_id: File name of the image without extension.
            label_id: Numeric class id written at the start of the line.

        Returns:
            ``"<label> <x_center> <y_center> <width> <height>"`` with
            coordinates normalized to [0, 1], or ``None`` when the image is
            missing, unreadable, or OpenCV fails on it.
        """
        image_path = self._find_image(image_id)
        if image_path is None:
            return None

        try:
            img = cv2.imread(image_path)
            if img is None:
                return None

            height, width = img.shape[:2]

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # findContours traces white regions. Assuming the lesion is darker
            # than the surrounding skin (typical for pigmented lesions), the
            # threshold has to be inverted so that the lesion, not the skin,
            # becomes the white foreground.
            _, binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
            )
            contours, _ = cv2.findContours(
                binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )
        except cv2.error as e:
            print(f"Error processing image {image_id}: {str(e)}")
            return None

        if not contours:
            x_center, y_center = 0.5, 0.5
            box_width, box_height = 0.8, 0.8
        else:
            largest_contour = max(contours, key=cv2.contourArea)
            x, y, w, h = cv2.boundingRect(largest_contour)

            x_center = (x + w / 2) / width
            y_center = (y + h / 2) / height
            box_width = w / width
            box_height = h / height

        bbox = self.sanitize_bbox([x_center, y_center, box_width, box_height])
        return f"{label_id} {bbox[0]:.6f} {bbox[1]:.6f} {bbox[2]:.6f} {bbox[3]:.6f}"

    def split_dataset(self, df):
        """Split into train (80%), validation (10%) and test (10%) frames.

        ``df`` may contain oversampled (repeated) rows. The split is therefore
        made on unique ``image_id`` values so that no image can end up in more
        than one split. The repeated rows are kept only in the training frame;
        validation and test contain each image once.

        Returns:
            Tuple ``(train_df, val_df, test_df)``.
        """
        unique_images = df.drop_duplicates(subset='image_id')

        # First split: 80% train, 20% held out
        train_unique, holdout = train_test_split(
            unique_images, test_size=0.2, random_state=42,
            stratify=unique_images['label_id'])

        # Second split: halve the held-out part into validation and test
        val_df, test_df = train_test_split(
            holdout, test_size=0.5, random_state=42,
            stratify=holdout['label_id'])

        # Bring back the oversampled rows, but only for training images.
        train_df = df[df['image_id'].isin(train_unique['image_id'])]

        return train_df, val_df, test_df

    def process_split(self, df, split_name):
        """Write the images and label files of one split.

        Rows whose image is missing or cannot be annotated are skipped so that
        no image is written without a label. A repeated ``image_id``
        (oversampling) is written again under ``<image_id>_dup<k>`` so the
        repetition is kept on disk instead of overwriting the first copy.

        Args:
            df: Rows of the split; needs ``image_id`` and ``label_id`` columns.
            split_name: ``'train'``, ``'val'`` or ``'test'``.
        """
        images_dir = os.path.join(self.output_path, split_name, 'images')
        labels_dir = os.path.join(self.output_path, split_name, 'labels')

        prepared = {}  # image_id -> (source path, annotation); computed once
        written = {}   # image_id -> number of copies already written
        skipped = 0

        rows = zip(df['image_id'], df['label_id'])
        for image_id, label_id in tqdm(rows, total=len(df), desc=f'Processing {split_name}'):
            if image_id not in prepared:
                src_path = self._find_image(image_id)
                annotation = None
                if src_path is not None:
                    annotation = self.create_yolo_annotation(image_id, int(label_id))
                prepared[image_id] = (src_path, annotation)

            src_path, annotation = prepared[image_id]
            if not annotation:
                skipped += 1
                continue

            copy_index = written.get(image_id, 0)
            written[image_id] = copy_index + 1
            stem = image_id if copy_index == 0 else f'{image_id}_dup{copy_index}'

            extension = os.path.splitext(src_path)[1]
            shutil.copy2(src_path, os.path.join(images_dir, f'{stem}{extension}'))
            with open(os.path.join(labels_dir, f'{stem}.txt'), 'w') as f:
                f.write(annotation)

        if skipped:
            print(f"Skipped {skipped} rows in {split_name}: image missing or not annotatable")

    def create_data_yaml(self):
        """Write ``data.yaml`` describing the dataset layout and class names."""
        import yaml

        # Order the names by their id so that list position == class id,
        # independent of the order in which label_map was written.
        names = [name for name, _ in sorted(self.label_map.items(), key=lambda item: item[1])]
        yaml_content = {
            # Absolute path: the file stays valid whatever the working
            # directory of the process that reads it.
            'path': os.path.abspath(self.output_path),
            'train': 'train/images',
            'val': 'val/images',
            'test': 'test/images',
            'nc': len(self.label_map),
            'names': names
        }

        with open(os.path.join(self.output_path, 'data.yaml'), 'w') as f:
            yaml.dump(yaml_content, f, default_flow_style=False)

    def preprocess_metadata(self, metadata):
        """Clean the metadata, balance the classes and attach numeric labels.

        Steps:
            1. Fill missing ``age`` with the column mean (cast to int when the
               column is float64).
            2. Drop every row whose ``lesion_id`` occurs more than once.
            3. Oversample each class up to the size of the largest class:
               all original rows are kept and the shortfall is drawn with
               replacement using a fixed seed, so the result is reproducible.
            4. Add ``label_id`` from ``label_map``.

        Args:
            metadata: Frame with ``lesion_id``, ``image_id``, ``dx`` and
                ``age`` columns. It is not modified.

        Returns:
            A new frame with a fresh 0..n-1 index and the extra columns
            ``dup_les`` (always False) and ``label_id``.

        Raises:
            ValueError: If ``dx`` holds a value that is not in ``label_map``.
        """
        # Work on a copy so the caller's frame is left untouched.
        metadata = metadata.copy()

        # An unmapped class would silently become a NaN label further down.
        unmapped = metadata['dx'].map(self.label_map).isna()
        if unmapped.any():
            unknown = sorted(metadata.loc[unmapped, 'dx'].astype(str).unique())
            raise ValueError(f"Unknown dx values not in label_map: {unknown}")

        # Filling na age
        metadata['age'] = metadata['age'].fillna(metadata['age'].mean())
        if metadata['age'].dtype == 'float64':
            metadata['age'] = metadata['age'].astype(int)

        # Removing lesions that have more than one image
        metadata['dup_les'] = metadata['lesion_id'].duplicated(keep=False)
        metadata = metadata[~metadata['dup_les']]

        # Oversampling of the minority classes
        if not metadata.empty:
            counts = metadata['dx'].value_counts().sort_index()
            target = int(counts[counts.idxmax()])
            parts = []
            for _, rows in metadata.groupby('dx', sort=True):
                parts.append(rows)
                shortfall = target - len(rows)
                if shortfall > 0:
                    parts.append(rows.sample(n=shortfall, replace=True, random_state=42))
            metadata = pd.concat(parts, axis=0)

        metadata = metadata.reset_index(drop=True)

        # Add numerical labels
        metadata['label_id'] = metadata['dx'].map(self.label_map).astype(int)

        return metadata

    def process_dataset(self):
        """Run the whole pipeline and print a summary of what was prepared."""
        print("Starting dataset preparation...")

        # Create directories
        self.create_directories()

        # Load metadata
        df = self.load_metadata()
        print(f"Loaded {len(df)} images metadata")

        # Split dataset
        train_df, val_df, test_df = self.split_dataset(df)
        print(f"Split sizes: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

        # Process each split
        self.process_split(train_df, 'train')
        self.process_split(val_df, 'val')
        self.process_split(test_df, 'test')

        # Create data.yaml
        self.create_data_yaml()

        print("Dataset preparation completed!")

        # Print dataset statistics over the rows that make up the three splits
        prepared = pd.concat([train_df, val_df, test_df], axis=0)
        print("\nDataset Statistics:")
        print(f"Total images: {len(prepared)}")
        print("\nClass distribution:")
        class_dist = prepared['dx'].value_counts()
        for cls, count in class_dist.items():
            print(f"{cls}: {count} ({count/len(prepared)*100:.2f}%)")


# Pass the dataset root, not the images folder: HAM10000Preprocessor appends
# "images" and "HAM10000_metadata.csv" to this path itself.
dataset = "./datasets/skin-cancer"
yolo_dataset = "./datasets/skin-cancer-yolo"

preprocessor = HAM10000Preprocessor(dataset, yolo_dataset)
preprocessor.process_dataset()

# Training & Eval

In [None]:
from ultralytics import YOLO

PRETRAINED_PATH = "yolo11n.pt"
EPOCHS = 75

# Load the weights named by the PRETRAINED_PATH variable; the quoted form
# "PRETRAINED_PATH" would be taken as a model name of its own.
model = YOLO(PRETRAINED_PATH)

# Train on the dataset described by the generated data.yaml.
results = model.train(
    data=f"{yolo_dataset}/data.yaml",
    epochs=EPOCHS,
)

In [None]:
from ultralytics import YOLO

# Validate the weights stored at BEST_PATH against the dataset described by
# data.yaml and print the box mAP figures.
BEST_PATH = "./runs/detect/train3/weights/best.pt"

model = YOLO(BEST_PATH)
metrics = model.val(data=f"{yolo_dataset}/data.yaml")

box_metrics = metrics.box
for heading, score in (
    ("mAP50-95:\t", box_metrics.map),
    ("mAP50:\t", box_metrics.map50),
    ("mAP75:\t", box_metrics.map75),
):
    print(heading, score)