In [78]:
import os
import json
import shutil
import random
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, field
from collections import defaultdict, Counter
import warnings

# Import required libraries
try:
    from pycocotools.coco import COCO
    from PIL import Image
    from tqdm import tqdm
except ImportError as e:
    print(f"Missing required library: {e}")
    print("Please install with: pip install pycocotools tqdm pillow")
    exit(1)

# Suppress COCO API warnings
warnings.filterwarnings('ignore')

In [85]:
@dataclass
class DatasetConfig:
    """Configuration for dataset creation"""
    
    # Basic settings
    random_seed: int = 42
    dataset_name: str = "coco_sama"
    
    # COCO dataset path
    coco_path: str = '/kaggle/input/coco-2017-dataset/coco2017'
    
    # Output path
    output_path: str = '/kaggle/working'
    
    # Category mapping - this is exactly what you want
    category_mapping: Dict[str, List[str]] = field(default_factory=lambda: {
        'person': ['person'],
        'pet': ['cat', 'dog'],
        'car': ['bus', 'truck', 'car']
    })
    
    # Dataset split configuration
    dataset_sizes: Dict[str, Dict[str, int]] = field(default_factory=lambda: {
        'train': {'person': 6500, 'pet': 6500, 'car': 6500, 'negative': 3000},
        'val': {'person': 2000, 'pet': 2000, 'car': 2000, 'negative': 1000}
    })
    
    # Merge train2017 and val2017 before splitting
    merge_splits: bool = True
    
    def __post_init__(self):
        self.full_output_path = os.path.join(self.output_path, self.dataset_name)
        self.all_target_categories = [cat for cats in self.category_mapping.values() for cat in cats]
        
        # Create reverse mapping for category ID conversion
        self.category_to_group = {}
        for group, categories in self.category_mapping.items():
            for category in categories:
                self.category_to_group[category] = group
        
        print(f"Dataset config initialized: {self.dataset_name}")
        print(f"Target categories: {self.all_target_categories}")
        print(f"Category mapping: {self.category_mapping}")
        print(f"Output path: {self.full_output_path}")

In [86]:
class COCOProcessor:
    """Main class for processing COCO dataset"""
    
    def __init__(self, config: DatasetConfig):
        self.config = config
        self.coco_train = None
        self.coco_val = None
        self.merged_data = None
        self.category_stats = {}
        self.new_categories = []
        self.old_to_new_cat_id = {}
        
        # Setup random seed
        random.seed(config.random_seed)
        print(f"Random seed set to: {config.random_seed}")
    
    def setup_directories(self):
        """Create necessary output directories"""
        print("Setting up output directories...")
        
        # Create main output directory
        Path(self.config.full_output_path).mkdir(exist_ok=True, parents=True)
        
        # Create split directories
        for split in ['train', 'val']:
            Path(self.config.full_output_path, split).mkdir(exist_ok=True)
        
        # Create annotations directory
        Path(self.config.full_output_path, 'annotations').mkdir(exist_ok=True)
        
        print("Directories created successfully")
    
    def load_coco_data(self):
        """Load COCO train and val datasets"""
        print("Loading COCO datasets...")
        
        try:
            # Load train dataset
            # train_ann_file = os.path.join(self.config.coco_path, 'annotations', 'instances_train2017.json')
            train_ann_file = os.path.join("/kaggle/input/merged-coco/merged_train.json")

            self.coco_train = COCO(train_ann_file)
            print(f"Loaded COCO train: {len(self.coco_train.getImgIds())} images")
            
            # Load val dataset
            # val_ann_file = os.path.join(self.config.coco_path, 'annotations', 'instances_val2017.json')
            val_ann_file = os.path.join("/kaggle/input/merged-coco/merged_val.json")

            self.coco_val = COCO(val_ann_file)
            print(f"Loaded COCO val: {len(self.coco_val.getImgIds())} images")
            
        except Exception as e:
            printr(f"Error loading COCO data: {e}")
            raise
    
    def merge_datasets(self):
        """Merge train and val datasets into one unified dataset"""
        print("Merging train and val datasets...")
        
        try:
            # Get all images from both datasets
            train_images = self.coco_train.dataset['images']
            val_images = self.coco_val.dataset['images']
            
            # Get all annotations from both datasets
            train_annotations = self.coco_train.dataset['annotations']
            val_annotations = self.coco_val.dataset['annotations']
            
            # Adjust IDs to avoid conflicts
            max_img_id = max([img['id'] for img in train_images])
            max_ann_id = max([ann['id'] for ann in train_annotations])
            
            # Update val image IDs
            img_id_mapping = {}
            for img in val_images:
                old_id = img['id']
                new_id = max_img_id + old_id
                img['id'] = new_id
                img_id_mapping[old_id] = new_id
            
            # Update val annotation IDs and image references
            for ann in val_annotations:
                ann['id'] = max_ann_id + ann['id']
                ann['image_id'] = img_id_mapping[ann['image_id']]
            
            # Merge data
            merged_images = train_images + val_images
            merged_annotations = train_annotations + val_annotations
            merged_categories = self.coco_train.dataset['categories']  # Same categories in both
            
            # Create merged dataset structure
            self.merged_data = {
                'images': merged_images,
                'annotations': merged_annotations,
                'categories': merged_categories
            }
            
            print(f"Merged dataset: {len(merged_images)} images, {len(merged_annotations)} annotations")
            
        except Exception as e:
            print(f"Error merging datasets: {e}")
            raise
    
    def setup_category_mapping(self):
        """Setup category mapping for remapping COCO categories to new categories"""
        print("Setting up category mapping...")
        
        try:
            # Create temporary COCO object for merged data
            temp_coco = COCO()
            temp_coco.dataset = self.merged_data
            temp_coco.createIndex()
            
            # Create new category structure
            self.new_categories = []
            self.old_to_new_cat_id = {}
            
            new_cat_id = 1
            for group, categories in self.config.category_mapping.items():
                # Create new category for this group
                new_category = {
                    'id': new_cat_id,
                    'name': group,
                    'supercategory': group
                }
                self.new_categories.append(new_category)
                
                # Map old category IDs to new category ID
                for category in categories:
                    old_cat_ids = temp_coco.getCatIds(catNms=[category])
                    for old_cat_id in old_cat_ids:
                        self.old_to_new_cat_id[old_cat_id] = new_cat_id
                        print(f"Mapping {category} (ID: {old_cat_id}) -> {group} (ID: {new_cat_id})")
                
                new_cat_id += 1
            
            print(f"Created {len(self.new_categories)} new categories")
            
        except Exception as e:
            print(f"Error setting up category mapping: {e}")
            raise
    
    def analyze_category_distribution(self):
        """Analyze distribution of target categories in the merged dataset"""
        print("Analyzing category distribution...")
        
        try:
            # Create temporary COCO object for merged data
            temp_coco = COCO()
            temp_coco.dataset = self.merged_data
            temp_coco.createIndex()
            
            # Get target category IDs
            target_cat_ids = temp_coco.getCatIds(catNms=self.config.all_target_categories)
            
            # Count images per category
            category_counts = {}
            total_target_images = set()
            
            for group, categories in self.config.category_mapping.items():
                group_images = set()
                group_counts = {}
                
                for category in categories:
                    cat_ids = temp_coco.getCatIds(catNms=[category])
                    if cat_ids:
                        img_ids = temp_coco.getImgIds(catIds=cat_ids)
                        group_counts[category] = len(img_ids)
                        group_images.update(img_ids)
                        total_target_images.update(img_ids)
                    else:
                        group_counts[category] = 0
                        print(f"Category '{category}' not found in dataset")
                
                category_counts[group] = {
                    'categories': group_counts,
                    'total_images': len(group_images),
                    'total_annotations': len(temp_coco.getAnnIds(imgIds=list(group_images), catIds=temp_coco.getCatIds(catNms=categories)))
                }
            
            # Count negative images (images without any target categories)
            all_img_ids = temp_coco.getImgIds()
            negative_images = set(all_img_ids) - total_target_images
            category_counts['negative'] = {
                'categories': {'negative': len(negative_images)},
                'total_images': len(negative_images),
                'total_annotations': 0
            }
            
            self.category_stats = category_counts
            
            # Log statistics
            print("=== CATEGORY DISTRIBUTION ANALYSIS ===")
            for group, stats in category_counts.items():
                print(f"{group.upper()}:")
                if group != 'negative':
                    for cat, count in stats['categories'].items():
                        print(f"  {cat}: {count:,} images")
                print(f"  Total {group} images: {stats['total_images']:,}")
                print(f"  Total {group} annotations: {stats['total_annotations']:,}")
                print("")
            
            total_images = len(all_img_ids)
            total_target = len(total_target_images)
            print(f"SUMMARY:")
            print(f"  Total images in dataset: {total_images:,}")
            print(f"  Images with target categories: {total_target:,}")
            print(f"  Negative images: {len(negative_images):,}")
            print(f"  Target coverage: {(total_target/total_images)*100:.1f}%")
            
        except Exception as e:
            print(f"Error analyzing category distribution: {e}")
            raise
    
    def sample_images_for_category(self, temp_coco, categories: List[str], target_count: int, already_sampled: Set[int]) -> List[int]:
        """Sample images for a specific category group"""
        group_img_ids = set()
        
        # Collect all images that contain any of the categories
        for category in categories:
            cat_ids = temp_coco.getCatIds(catNms=[category])
            if cat_ids:
                img_ids = temp_coco.getImgIds(catIds=cat_ids)
                group_img_ids.update(img_ids)
        
        # Remove already sampled images
        available_ids = [img_id for img_id in group_img_ids if img_id not in already_sampled]
        sample_count = min(target_count, len(available_ids))
        
        if sample_count > 0:
            sampled = random.sample(available_ids, sample_count)
            print(f"  {' + '.join(categories)}: {sample_count:,} / {len(available_ids):,} images")
            return sampled
        
        return []
    
    def sample_images_by_category(self, target_counts: Dict[str, int]) -> Dict[str, List[int]]:
        """Sample images from each category according to target counts"""
        print(f"Sampling images with target counts: {target_counts}")
        
        try:
            # Create temporary COCO object for merged data
            temp_coco = COCO()
            temp_coco.dataset = self.merged_data
            temp_coco.createIndex()
            
            sampled_images = {}
            all_sampled = set()
            
            # Sample from each category group
            for group, target_count in target_counts.items():
                if group == 'negative':
                    continue  # Handle negative separately
                
                categories = self.config.category_mapping[group]
                sampled = self.sample_images_for_category(temp_coco, categories, target_count, all_sampled)
                
                if sampled:
                    sampled_images[group] = sampled
                    all_sampled.update(sampled)
            
            # Sample negative images
            if 'negative' in target_counts:
                target_cat_ids = temp_coco.getCatIds(catNms=self.config.all_target_categories)
                all_img_ids = temp_coco.getImgIds()
                
                negative_candidates = []
                for img_id in all_img_ids:
                    if img_id in all_sampled:
                        continue
                    
                    ann_ids = temp_coco.getAnnIds(imgIds=[img_id], catIds=target_cat_ids)
                    if len(ann_ids) == 0:
                        negative_candidates.append(img_id)
                
                neg_count = target_counts['negative']
                sample_count = min(neg_count, len(negative_candidates))
                
                if sample_count > 0:
                    negative_sampled = random.sample(negative_candidates, sample_count)
                    sampled_images['negative'] = negative_sampled
                    print(f"  negative: {sample_count:,} / {len(negative_candidates):,} images")
            
            return sampled_images
            
        except Exception as e:
            print(f"Error sampling images: {e}")
            raise
    
    def create_split_dataset(self, split: str, sampled_images: Dict[str, List[int]]) -> Tuple[int, int]:
        """Create dataset split with sampled images and remapped categories"""
        print(f"Creating {split} dataset...")
        
        try:
            # Create temporary COCO object for merged data
            temp_coco = COCO()
            temp_coco.dataset = self.merged_data
            temp_coco.createIndex()
            
            # Collect all sampled image IDs
            all_img_ids = []
            for group, img_ids in sampled_images.items():
                all_img_ids.extend(img_ids)
            
            unique_img_ids = list(set(all_img_ids))
            print(f"Processing {len(unique_img_ids)} unique images for {split}")
            
            # Load images
            images = temp_coco.loadImgs(unique_img_ids)
            
            # Process annotations with category remapping
            annotations = []
            negative_imgs = set(sampled_images.get('negative', []))
            
            for img_id in tqdm(unique_img_ids, desc=f"Processing {split} annotations"):
                if img_id not in negative_imgs:
                    # Get all annotations for this image
                    ann_ids = temp_coco.getAnnIds(imgIds=[img_id])
                    img_annotations = temp_coco.loadAnns(ann_ids)
                    
                    # Filter and remap annotations
                    for ann in img_annotations:
                        old_cat_id = ann['category_id']
                        if old_cat_id in self.old_to_new_cat_id:
                            # Remap category ID
                            ann['category_id'] = self.old_to_new_cat_id[old_cat_id]
                            annotations.append(ann)
            
            # Copy image files
            print(f"Copying {len(images)} images to {split} directory...")
            
            for img in tqdm(images, desc=f"Copying {split} images"):
                # Determine source directory (train2017 or val2017)
                src_train = os.path.join(self.config.coco_path, 'train2017', img['file_name'])
                src_val = os.path.join(self.config.coco_path, 'val2017', img['file_name'])
                
                if os.path.exists(src_train):
                    src_path = src_train
                elif os.path.exists(src_val):
                    src_path = src_val
                else:
                    print(f"Image not found: {img['file_name']}")
                    continue
                
                dst_path = os.path.join(self.config.full_output_path, split, img['file_name'])
                shutil.copy2(src_path, dst_path)
            
            # Save annotation file with remapped categories
            annotation_data = {
                'images': images,
                'annotations': annotations,
                'categories': self.new_categories
            }
            
            ann_file = os.path.join(self.config.full_output_path, 'annotations', f'instances_{split}.json')
            with open(ann_file, 'w') as f:
                json.dump(annotation_data, f)
            
            print(f"✅ {split} dataset created: {len(images):,} images, {len(annotations):,} annotations")
            print(f"Categories remapped: {len(self.new_categories)} new categories")
            
            return len(images), len(annotations)
            
        except Exception as e:
            print(f"Error creating {split} dataset: {e}")
            raise
    
    def process_dataset(self):
        """Main processing pipeline"""
        print("Starting dataset processing pipeline...")
        
        try:
            # Step 1: Setup
            self.setup_directories()
            
            # Step 2: Load COCO data
            self.load_coco_data()
            
            # Step 3: Merge datasets if configured
            if self.config.merge_splits:
                self.merge_datasets()
            
            # Step 4: Setup category mapping
            self.setup_category_mapping()
            
            # Step 5: Analyze category distribution
            self.analyze_category_distribution()
            
            # Step 6: Process each split
            results = {}
            
            for split in ['train', 'val']:
                if split not in self.config.dataset_sizes:
                    print(f"Split '{split}' not found in dataset_sizes config")
                    continue
                
                print(f"\n{'='*50}")
                print(f"Processing {split.upper()} split")
                print(f"{'='*50}")
                
                # Sample images for this split
                sampled_images = self.sample_images_by_category(self.config.dataset_sizes[split])
                
                # Create dataset
                img_count, ann_count = self.create_split_dataset(split, sampled_images)
                results[split] = {'images': img_count, 'annotations': ann_count}
            
            # Final summary
            print(f"\n{'='*50}")
            print("FINAL DATASET SUMMARY")
            print(f"{'='*50}")
            
            total_images = 0
            total_annotations = 0
            
            for split, stats in results.items():
                print(f"{split.capitalize()}: {stats['images']:,} images, {stats['annotations']:,} annotations")
                total_images += stats['images']
                total_annotations += stats['annotations']
            
            print(f"Total: {total_images:,} images, {total_annotations:,} annotations")
            print(f"New categories: {[cat['name'] for cat in self.new_categories]}")
            print(f"✅ Dataset created successfully at: {self.config.full_output_path}")
            
            return results
            
        except Exception as e:
            print(f"Error in processing pipeline: {e}")
            raise

In [87]:
config = DatasetConfig()

Dataset config initialized: coco_sama
Target categories: ['person', 'cat', 'dog', 'bus', 'truck', 'car']
Category mapping: {'person': ['person'], 'pet': ['cat', 'dog'], 'car': ['bus', 'truck', 'car']}
Output path: /kaggle/working/coco_sama


In [88]:
processor = COCOProcessor(config)

Random seed set to: 42


In [89]:
results = processor.process_dataset()

Starting dataset processing pipeline...
Setting up output directories...
Directories created successfully
Loading COCO datasets...
loading annotations into memory...
Done (t=30.65s)
creating index...
index created!
Loaded COCO train: 118287 images
loading annotations into memory...
Done (t=0.85s)
creating index...
index created!
Loaded COCO val: 5000 images
Merging train and val datasets...
Merged dataset: 123287 images, 1115464 annotations
Setting up category mapping...
creating index...
index created!
Mapping person (ID: 1) -> person (ID: 1)
Mapping cat (ID: 17) -> pet (ID: 2)
Mapping dog (ID: 18) -> pet (ID: 2)
Mapping bus (ID: 6) -> car (ID: 3)
Mapping truck (ID: 8) -> car (ID: 3)
Mapping car (ID: 3) -> car (ID: 3)
Created 3 new categories
Analyzing category distribution...
creating index...
index created!
=== CATEGORY DISTRIBUTION ANALYSIS ===
PERSON:
  person: 68,659 images
  Total person images: 68,659
  Total person annotations: 373,972

PET:
  cat: 4,345 images
  dog: 4,700 im

Processing train annotations: 100%|██████████| 22500/22500 [00:00<00:00, 115485.05it/s]


Copying 22500 images to train directory...


Copying train images: 100%|██████████| 22500/22500 [02:03<00:00, 182.82it/s]


✅ train dataset created: 22,500 images, 112,098 annotations
Categories remapped: 3 new categories

Processing VAL split
Sampling images with target counts: {'person': 2000, 'pet': 2000, 'car': 2000, 'negative': 1000}
creating index...
index created!
  person: 2,000 / 68,659 images
  cat + dog: 1,915 / 1,915 images
  bus + truck + car: 2,000 / 17,234 images
  negative: 1,000 / 48,089 images
Creating val dataset...
creating index...
index created!
Processing 6915 unique images for val


Processing val annotations: 100%|██████████| 6915/6915 [00:00<00:00, 107159.63it/s]


Copying 6915 images to val directory...


Copying val images: 100%|██████████| 6915/6915 [00:36<00:00, 190.89it/s]


✅ val dataset created: 6,915 images, 34,264 annotations
Categories remapped: 3 new categories

FINAL DATASET SUMMARY
Train: 22,500 images, 112,098 annotations
Val: 6,915 images, 34,264 annotations
Total: 29,415 images, 146,362 annotations
New categories: ['person', 'pet', 'car']
✅ Dataset created successfully at: /kaggle/working/coco_sama


In [90]:
import os
from IPython.core.display import display, HTML

os.chdir('/kaggle/working')

# 🧠 Отримуємо ім'я директорії з датасетом
dataset_folder = DatasetConfig.dataset_name  # наприклад, "coco"
zip_filename = f"{dataset_folder}.zip"
print(f"Creating archive: {zip_filename}")

# 🧩 Використовуємо змінну у shell-команді через подвійні дужки
!zip -r -q "$zip_filename" "$dataset_folder"

zip_path = f'/kaggle/working/{zip_filename}'
if os.path.exists(zip_path):
    file_size = os.path.getsize(zip_path) / (1024*1024)  # MB
    print(f"\n✅ Archive created successfully!")
    print(f"📁 File: {zip_filename}")
    print(f"📊 Size: {file_size:.1f} MB")
    print(f"📍 Path: {zip_path}")

    print(f"\n📋 Archive contents:")
    !zipinfo "$zip_filename" | head -20

    display(HTML(f"""
    <div style="background-color: #e8f5e8; padding: 15px; border-radius: 10px; margin: 10px 0;">
        <h3>📥 Download Ready</h3>
        <a href="{zip_filename}" download style="background-color: #4CAF50; color: white; padding: 10px 20px; text-decoration: none; border-radius: 5px; font-weight: bold;">
        📥 Download {zip_filename} ({file_size:.1f} MB)
        </a>
    </div>
    """))
else:
    print("❌ Error: Archive not created!")
    print("Checking working directory contents:")
    !ls -la /kaggle/working/


Creating archive: coco_sama.zip

✅ Archive created successfully!
📁 File: coco_sama.zip
📊 Size: 4571.6 MB
📍 Path: /kaggle/working/coco_sama.zip

📋 Archive contents:
Archive:  coco_sama.zip
Zip file size: 4793667254 bytes, number of entries: 29421
drwxr-xr-x  3.0 unx        0 bx stor 25-Jul-16 12:05 coco_sama/
drwxr-xr-x  3.0 unx        0 bx stor 25-Jul-16 12:09 coco_sama/annotations/
-rw-r--r--  3.0 unx 21151900 tx defN 25-Jul-16 12:09 coco_sama/annotations/instances_val.json
-rw-r--r--  3.0 unx 69501892 tx defN 25-Jul-16 12:08 coco_sama/annotations/instances_train.json
drwxr-xr-x  3.0 unx        0 bx stor 25-Jul-16 12:08 coco_sama/train/
-rw-r--r--  3.0 unx   111650 bx defN 25-Apr-17 20:11 coco_sama/train/000000415990.jpg
-rw-r--r--  3.0 unx   115050 bx defN 25-Apr-17 20:07 coco_sama/train/000000237284.jpg
-rw-r--r--  3.0 unx   240080 bx defN 25-Apr-17 20:05 coco_sama/train/000000092096.jpg
-rw-r--r--  3.0 unx   153119 bx defN 25-Apr-17 20:09 coco_sama/train/000000443397.jpg
-rw-r--r--