# Private Dataset Training - YOLOv8

This notebook trains a YOLOv8 model for private object detection with 16 categories.

Features:
- Uses pretrained weights from non-private dataset
- 16 categories
- Progressive occlusion evaluation (crop type)
- Weights & Biases integration
- Google Drive integration


## Step 1: Setup Environment


In [None]:
# Mount Google Drive inside the Colab VM; later cells read datasets and
# pretrained weights from, and write models to, paths under this mount point.
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)


In [None]:
# Install dependencies into the kernel's environment (-q keeps pip output short).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was developed with — consider pinning.
%pip install -q ultralytics roboflow python-dotenv opencv-python matplotlib numpy pandas pyyaml wandb gdown


In [None]:
# Clone or update your repository
import os
from pathlib import Path

REPO_DIR = "/content/credit_card_yolov12"

if Path(REPO_DIR).exists():
    print("Repository exists, pulling latest changes...")
    %cd {REPO_DIR}
    !git pull origin main
else:
    print("Cloning repository...")
    !git clone https://github.com/Turje/credit_card_yolov12.git
    %cd credit_card_yolov12

# Verify files exist
print("\nVerifying required files...")
required_files = [
    "src/split_dataset.py",
    "src/prepare_progressive_tests.py",
    "src/train.py",
    "src/evaluate_progressive.py"
]

for file in required_files:
    file_path = Path(REPO_DIR) / file
    if file_path.exists():
        print(f"‚úÖ {file}")
    else:
        print(f"‚ùå {file} - NOT FOUND!")


## Step 2: Setup Weights & Biases


In [None]:
import os

import wandb

# Login to Weights & Biases.
# SECURITY: the API key must not live in the notebook. It is read from the
# WANDB_API_KEY environment variable when set; with key=None wandb.login()
# falls back to its own credential lookup / interactive prompt.
# An API key was previously hard-coded in this cell — treat that key as leaked
# and revoke it in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Initialize wandb project
# NOTE(review): these values are recorded for the run only; the training cell
# defines its own MODEL_SIZE / EPOCHS / IMG_SIZE / BATCH / PRETRAINED_WEIGHTS,
# so keep both places in sync when changing hyperparameters.
wandb.init(
    project="private-object-detection",
    name="yolov8-16categories-pretrained",
    config={
        "model_size": "m",
        "epochs": 100,
        "imgsz": 640,
        "batch": 16,
        "num_classes": 16,
        "pretrained_weights": "/content/drive/MyDrive/yolov12_runs/nonprivate/checkpoints/best.pt",
        "occlusion_type": "crop"
    }
)


In [None]:
## Step 3: Set Dataset Paths (Skip if datasets already downloaded)

# If you already have datasets on Drive, set paths here and skip Cell 8
# Otherwise, run Cell 8 to download and merge datasets
#
# Globals produced for later cells:
#   ORIGINAL_DATASET - dataset folder used for training (contains train/_annotations.coco.json)
#   QUERY_DATASET    - dataset folder used for evaluation (falls back to ORIGINAL_DATASET)
#
# NOTE(review): a folder whose name contains "query" also passes the
# valid_folders filter below, so it is merged into the training data and then
# reused as QUERY_DATASET — confirm the evaluation split does not overlap the
# training images.

USE_EXISTING_DATASETS = True  # Set to True if datasets are already on Drive

if USE_EXISTING_DATASETS:
    from pathlib import Path

    DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets/private_dataset"

    # Find merged dataset (for training)
    merged_path = Path(DATASET_BASE) / "merged_dataset"
    if merged_path.exists() and (merged_path / "train" / "_annotations.coco.json").exists():
        ORIGINAL_DATASET = str(merged_path)
        print(f"‚úÖ Found merged dataset: {ORIGINAL_DATASET}")
    else:
        # Find individual datasets and merge them
        print("‚ö†Ô∏è Merged dataset not found. Looking for individual datasets...")

        # A folder is usable only if it ships a COCO annotation file for its train split
        valid_folders = []
        for folder in Path(DATASET_BASE).iterdir():
            if folder.is_dir() and folder.name not in {'__MACOSX', '.DS_Store'}:
                train_check = folder / "train" / "_annotations.coco.json"
                if train_check.exists():
                    valid_folders.append(folder)
                    print(f"   ‚úÖ Found: {folder.name}")

        if len(valid_folders) >= 3:
            # Merge datasets
            print(f"\nüîÑ Merging {len(valid_folders)} datasets...")
            import shutil
            import json

            merged_dir = Path(DATASET_BASE) / "merged_dataset"
            merged_dir.mkdir(exist_ok=True)
            merged_train = merged_dir / "train"
            merged_train.mkdir(exist_ok=True)

            all_images = []
            all_annotations = []
            image_id_offset = 0
            ann_id_offset = 0
            categories_map = {}
            # File names written during THIS merge. Conflicts are detected against
            # this set rather than against the disk, so re-running the cell
            # overwrites the previous copies instead of duplicating every image
            # under a prefixed name.
            used_names = set()

            for folder_path in valid_folders:
                train_folder = folder_path / "train"
                ann_file = train_folder / "_annotations.coco.json"
                if not ann_file.exists():
                    continue

                with open(ann_file, 'r') as f:
                    coco_data = json.load(f)

                # Categories are taken from the first dataset only
                # (assumes every dataset uses the same category ids — TODO confirm).
                if not categories_map:
                    categories_map = {cat['id']: cat for cat in coco_data.get('categories', [])}

                # Shift ids by the running offset so they stay unique across datasets
                kept_image_ids = set()
                for img in coco_data.get('images', []):
                    img['id'] = image_id_offset + img['id']
                    src_img = train_folder / img['file_name']
                    if not src_img.exists():
                        continue  # listed in the annotations but missing on disk
                    if img['file_name'] in used_names:
                        # Same file name already used by another dataset: prefix with the folder name
                        stem = Path(img['file_name']).stem
                        ext = Path(img['file_name']).suffix
                        img['file_name'] = f"{folder_path.name}_{stem}{ext}"
                    dst_img = merged_train / img['file_name']
                    shutil.copy2(src_img, dst_img)
                    used_names.add(img['file_name'])
                    kept_image_ids.add(img['id'])
                    all_images.append(img)

                for ann in coco_data.get('annotations', []):
                    ann['id'] = ann_id_offset + ann['id']
                    ann['image_id'] = image_id_offset + ann['image_id']
                    # Drop annotations whose image was skipped above; otherwise they
                    # would reference an image id that is absent from the merged file.
                    if ann['image_id'] in kept_image_ids:
                        all_annotations.append(ann)

                # default=-1 keeps the offsets at 0 when nothing has been collected yet
                image_id_offset = max((img['id'] for img in all_images), default=-1) + 1
                ann_id_offset = max((ann['id'] for ann in all_annotations), default=-1) + 1

            merged_coco = {
                'images': all_images,
                'annotations': all_annotations,
                'categories': list(categories_map.values())
            }

            merged_ann_file = merged_train / "_annotations.coco.json"
            with open(merged_ann_file, 'w') as f:
                json.dump(merged_coco, f, indent=2)

            ORIGINAL_DATASET = str(merged_dir)
            print(f"   ‚úÖ Merged {len(all_images)} images")
        elif len(valid_folders) == 1:
            ORIGINAL_DATASET = str(valid_folders[0])
            print(f"‚úÖ Using single dataset: {ORIGINAL_DATASET}")
        else:
            # Zero or exactly two usable folders are rejected
            raise ValueError(f"Expected 3+ datasets or 1 merged dataset. Found {len(valid_folders)} folders.")

    # Find query_images (for evaluation)
    query_folders = [f for f in Path(DATASET_BASE).iterdir()
                     if f.is_dir() and 'query' in f.name.lower()
                     and f.name not in {'__MACOSX', '.DS_Store'}]

    if query_folders and (query_folders[0] / "train" / "_annotations.coco.json").exists():
        QUERY_DATASET = str(query_folders[0])
        print(f"‚úÖ Found query_images: {QUERY_DATASET}")
    else:
        QUERY_DATASET = ORIGINAL_DATASET
        print(f"‚ö†Ô∏è query_images not found, using training dataset for evaluation")

    print(f"\n{'='*60}")
    print(f"‚úÖ Training dataset: {ORIGINAL_DATASET}")
    print(f"‚úÖ Query dataset: {QUERY_DATASET}")
    print(f"{'='*60}")
else:
    print("‚ö†Ô∏è USE_EXISTING_DATASETS is False. Run Cell 8 to download datasets.")


## Step 3 (alternative): Download Private Dataset from Google Drive


In [None]:
import gdown
from pathlib import Path
import zipfile
import shutil

# Google Drive file IDs of the zipped datasets
PRIVATE_FILE_IDS = [
    "1ClFqB6vvVXqmru4hA5JhkphyfEEwjyte",  # Original dataset
    "1Y7jh8lTfAuTqIaDkrF9AS8XJaSAfsGwu",  # Additional dataset 1
    "1GEF0-6MVMrwSGdvbZtcIp9rB82-wqkDL"    # Additional dataset 2
]

# Archives are downloaded next to, and extracted into, the same Drive folder tree
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
Path(DATASET_BASE).mkdir(parents=True, exist_ok=True)

# Every archive is unpacked into this single directory
extract_dir = f"{DATASET_BASE}/private_dataset"
Path(extract_dir).mkdir(parents=True, exist_ok=True)

print(f"Downloading {len(PRIVATE_FILE_IDS)} datasets from Google Drive...\n")

for idx, file_id in enumerate(PRIVATE_FILE_IDS, 1):
    zip_path = f"{DATASET_BASE}/private_dataset_{idx}.zip"
    url = f"https://drive.google.com/uc?id={file_id}"
    print(f"[{idx}/{len(PRIVATE_FILE_IDS)}] Downloading dataset {idx}...")
    print(f"   File ID: {file_id}")

    # Best effort: a failure for one archive is reported and the loop moves on
    try:
        gdown.download(url, zip_path, quiet=False)

        archive = Path(zip_path)
        if not archive.exists():
            print(f"   ‚ö†Ô∏è Warning: Download failed for dataset {idx}\n")
            continue

        print(f"   Extracting to: {extract_dir}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"   ‚úÖ Dataset {idx} extracted successfully\n")

        # The archive is only removed after a successful extraction
        archive.unlink()
    except Exception as e:
        print(f"   ‚ùå Error downloading dataset {idx}: {e}\n")

# Find the actual dataset folder(s) under extract_dir and decide which one is
# used for training (ORIGINAL_DATASET) and which for evaluation (QUERY_DATASET).
print(f"\nLooking for dataset folders in: {extract_dir}")
print(f"=" * 60)

# Skip macOS metadata folders
skip_folders = {'__MACOSX', '.DS_Store'}

# Check if there's a train folder directly in extract_dir
train_in_root = Path(extract_dir) / "train"
if train_in_root.exists():
    ORIGINAL_DATASET = extract_dir
    print(f"‚úÖ Found dataset structure in root: {ORIGINAL_DATASET}")
else:
    # Find all folders, excluding macOS metadata
    all_folders = [d for d in Path(extract_dir).iterdir() if d.is_dir() and d.name not in skip_folders]

    if not all_folders:
        ORIGINAL_DATASET = extract_dir
        print(f"‚ö†Ô∏è No dataset folders found, using extract directory: {ORIGINAL_DATASET}")
    else:
        print(f"Found {len(all_folders)} folder(s) (excluding macOS metadata):\n")

        # Each candidate is a 5-tuple:
        #   (folder, kind, has_annotations, image_count, folder_name)
        # where kind is "train", "annotations" or "images".
        dataset_candidates = []
        for folder in sorted(all_folders):  # Sort for consistent ordering
            # Double-check: skip __MACOSX and other metadata folders
            if folder.name in skip_folders or '__MACOSX' in folder.name or '.DS_Store' in folder.name:
                print(f"   ‚ùå SKIP {folder.name} - Metadata folder")
                continue

            train_check = folder / "train"
            if train_check.exists():
                train_images = len(list(train_check.glob("*.jpg"))) + len(list(train_check.glob("*.png")))
                train_anns = len(list(train_check.glob("*.json")))
                print(f"   ‚úÖ {folder.name} - Has 'train' folder ({train_images} images, {train_anns} annotations)")
                dataset_candidates.append((folder, "train", True, train_images, folder.name))
                continue

            # The recursive scans are only needed when there is no train/ folder
            ann_check = list(folder.glob("**/_annotations.coco.json"))
            image_count = len(list(folder.glob("**/*.jpg"))) + len(list(folder.glob("**/*.png")))

            if ann_check:
                print(f"   ‚úÖ {folder.name} - Has COCO annotations ({len(ann_check)} files, {image_count} images)")
                dataset_candidates.append((folder, "annotations", True, image_count, folder.name))
            elif image_count > 0:
                print(f"   üìÅ {folder.name} - Has images ({image_count} images, no annotations found)")
                dataset_candidates.append((folder, "images", False, image_count, folder.name))
            else:
                print(f"   üìÅ {folder.name} - Empty or unknown structure")

        # Handle multiple dataset folders - merge all for training
        if dataset_candidates:
            # Find all datasets with train structure, excluding __MACOSX
            train_candidates = [
                c for c in dataset_candidates
                if c[1] == "train" and '__MACOSX' not in c[4] and '.DS_Store' not in c[4]
            ]

            # The merge output folder left by a previous run must not be merged
            # into itself, otherwise every re-run duplicates the training data.
            merge_sources = [c for c in train_candidates if c[0].name != "merged_dataset"]

            # Find query_images separately (for evaluation), excluding __MACOSX
            query_candidates = [
                c for c in dataset_candidates
                if 'query' in c[4].lower() and c[1] == "train"
                and '__MACOSX' not in c[4] and '.DS_Store' not in c[4]
            ]
            if query_candidates:
                QUERY_DATASET = str(query_candidates[0][0])
                print(f"\n‚úÖ Found query_images for evaluation: {QUERY_DATASET} ({query_candidates[0][3]} images)")
            else:
                # Fallback: use first train dataset as query
                if train_candidates:
                    QUERY_DATASET = str(train_candidates[0][0])
                    print(f"\n‚ö†Ô∏è query_images not found, using first train dataset for evaluation: {QUERY_DATASET}")
                else:
                    QUERY_DATASET = None

            # Merge all train folders for training
            if len(merge_sources) > 1:
                print(f"\nüì¶ Found {len(merge_sources)} dataset folders with train structure:")
                for c in merge_sources:
                    print(f"   - {c[0].name}: {c[3]} images")

                # Create merged dataset directory
                merged_dir = Path(extract_dir) / "merged_dataset"
                merged_dir.mkdir(exist_ok=True)
                merged_train = merged_dir / "train"
                merged_train.mkdir(exist_ok=True)

                print(f"\nüîÑ Merging all datasets for training: {merged_dir}")

                # Merge all train folders
                import shutil
                import json

                all_images = []
                all_annotations = []
                image_id_offset = 0
                ann_id_offset = 0
                categories_map = {}
                # File names written during THIS merge; conflicts are detected
                # against this set (not the disk) so a re-run overwrites the
                # earlier copies instead of duplicating them under new names.
                used_names = set()

                # Candidates are 5-tuples, so take the folder by index rather
                # than unpacking (unpacking into 4 names raised ValueError).
                for candidate in merge_sources:
                    folder_path = candidate[0]
                    train_folder = folder_path / "train"
                    ann_file = train_folder / "_annotations.coco.json"
                    if not ann_file.exists():
                        continue  # a train/ folder without COCO annotations contributes nothing

                    with open(ann_file, 'r') as f:
                        coco_data = json.load(f)

                    # Merge categories (assuming same categories)
                    if not categories_map:
                        categories_map = {cat['id']: cat for cat in coco_data.get('categories', [])}

                    # Copy images and update IDs
                    kept_image_ids = set()
                    for img in coco_data.get('images', []):
                        img['id'] = image_id_offset + img['id']
                        src_img = train_folder / img['file_name']
                        if not src_img.exists():
                            continue  # listed in the annotations but missing on disk
                        # Handle filename conflicts between datasets
                        if img['file_name'] in used_names:
                            stem = Path(img['file_name']).stem
                            ext = Path(img['file_name']).suffix
                            img['file_name'] = f"{folder_path.name}_{stem}{ext}"
                        dst_img = merged_train / img['file_name']
                        shutil.copy2(src_img, dst_img)
                        used_names.add(img['file_name'])
                        kept_image_ids.add(img['id'])
                        all_images.append(img)

                    # Update annotation IDs; drop annotations whose image was skipped
                    for ann in coco_data.get('annotations', []):
                        ann['id'] = ann_id_offset + ann['id']
                        ann['image_id'] = image_id_offset + ann['image_id']
                        if ann['image_id'] in kept_image_ids:
                            all_annotations.append(ann)

                    # default=-1 keeps the offsets at 0 while nothing has been collected
                    image_id_offset = max((img['id'] for img in all_images), default=-1) + 1
                    ann_id_offset = max((ann['id'] for ann in all_annotations), default=-1) + 1

                # Create merged COCO annotation file
                merged_coco = {
                    'images': all_images,
                    'annotations': all_annotations,
                    'categories': list(categories_map.values())
                }

                merged_ann_file = merged_train / "_annotations.coco.json"
                with open(merged_ann_file, 'w') as f:
                    json.dump(merged_coco, f, indent=2)

                print(f"   ‚úÖ Merged {len(all_images)} images and {len(all_annotations)} annotations")
                print(f"   ‚úÖ Created merged dataset: {merged_dir}")

                ORIGINAL_DATASET = str(merged_dir)
            elif train_candidates:
                # Single train folder (prefer a real source over a stale merge output)
                single = merge_sources[0] if merge_sources else train_candidates[0]
                ORIGINAL_DATASET = str(single[0])
                print(f"\n‚úÖ Using single dataset: {ORIGINAL_DATASET} ({single[3]} images)")
                if not QUERY_DATASET:
                    QUERY_DATASET = ORIGINAL_DATASET
            else:
                # Candidates exist but none has a train/ folder; indexing
                # train_candidates[0] here used to raise IndexError.
                ORIGINAL_DATASET = extract_dir
                print(f"\n‚ö†Ô∏è No valid dataset structure found, using extract directory")
        else:
            ORIGINAL_DATASET = extract_dir
            print(f"\n‚ö†Ô∏è No valid dataset structure found, using extract directory")

# FINAL SAFETY CHECK: Ensure ORIGINAL_DATASET is never __MACOSX
# (a substring test also covers the case where the last path component is __MACOSX)
if '__MACOSX' in str(ORIGINAL_DATASET):
    print(f"\n‚ùå CRITICAL ERROR: ORIGINAL_DATASET is set to __MACOSX!")
    print(f"   Attempting to fix...")

    # Search for valid datasets. A local name is used for the search root so the
    # DATASET_BASE string defined at the top of this cell is not overwritten
    # with a Path pointing at a different directory.
    search_root = Path(extract_dir)
    valid_folders = []
    for folder in search_root.iterdir():
        if folder.is_dir() and folder.name not in {'__MACOSX', '.DS_Store'}:
            train_check = folder / "train" / "_annotations.coco.json"
            if train_check.exists():
                valid_folders.append(folder)

    if not valid_folders:
        raise ValueError("No valid dataset folders found! Check Cell 8 output.")

    # Prefer merged_dataset, then query_images, then any valid folder
    merged_folders = [f for f in valid_folders if 'merged' in f.name.lower()]
    query_folders = [f for f in valid_folders if 'query' in f.name.lower()]

    if merged_folders:
        ORIGINAL_DATASET = str(merged_folders[0])
        print(f"   ‚úÖ Fixed: Using merged dataset: {ORIGINAL_DATASET}")
    elif query_folders:
        ORIGINAL_DATASET = str(query_folders[0])
        print(f"   ‚úÖ Fixed: Using query_images: {ORIGINAL_DATASET}")
    else:
        ORIGINAL_DATASET = str(valid_folders[0])
        print(f"   ‚úÖ Fixed: Using first valid dataset: {ORIGINAL_DATASET}")

# Stop here if the selected training dataset has no COCO annotation file
ann_check = Path(ORIGINAL_DATASET) / "train" / "_annotations.coco.json"
if not ann_check.exists():
    raise FileNotFoundError(
        f"Annotation file not found: {ann_check}\n"
        f"ORIGINAL_DATASET: {ORIGINAL_DATASET}\n"
        f"Please check Cell 8 output."
    )

# Summary of the training / evaluation setup
banner = '=' * 60
dataset_root = Path(ORIGINAL_DATASET)
# QUERY_DATASET is not assigned on every path through this cell
query_defined = 'QUERY_DATASET' in locals()

print(f"\n{banner}")
print(f"üìã Training Strategy:")
print(f"   ‚úÖ Training: Merged dataset (query_images + left_rotate + right_rotate)")
print(f"   ‚úÖ Evaluation: query_images test set only")
print(f"   ‚úÖ Occlusion levels: 0%, 25%, 75%, 100%")
print(f"\n{banner}")
print(f"‚úÖ Training dataset: {ORIGINAL_DATASET}")
print(f"‚úÖ Query dataset (for evaluation): {QUERY_DATASET if query_defined else 'Not set'}")
print(f"   Training dataset exists: {dataset_root.exists()}")
print(f"   Annotation file exists: {ann_check.exists()}")
if query_defined and QUERY_DATASET:
    print(f"   Query dataset exists: {Path(QUERY_DATASET).exists()}")

# Show what the training dataset folder contains
if dataset_root.exists():
    items = list(dataset_root.iterdir())
    print(f"   Contains {len(items)} items")
    if items:
        print(f"\n   Folder contents:")
        for item in sorted(items)[:10]:  # Show first 10 items
            if item.is_dir():
                sub_items = len(list(item.iterdir()))
                print(f"      üìÅ {item.name}/ ({sub_items} items)")
                continue
            # Size is only shown for regular files with a non-zero size
            size_mb = item.stat().st_size / (1024 * 1024) if item.is_file() else 0
            if size_mb > 0:
                print(f"      üìÑ {item.name} ({size_mb:.2f} MB)")
            else:
                print(f"      üìÑ {item.name}")

        # Check for train folder specifically
        train_path = dataset_root / "train"
        if train_path.exists():
            train_images = len(list(train_path.glob("*.jpg"))) + len(list(train_path.glob("*.png")))
            train_anns = len(list(train_path.glob("*.json")))
            print(f"\n   ‚úÖ Train folder found: {train_images} images, {train_anns} annotation files")


## Step 4: Prepare Dataset


In [None]:
import sys
import os
from pathlib import Path

# Put the cloned repository on sys.path so its `src` package can be imported
sys.path.insert(0, '/content/credit_card_yolov12')

# Show where the training dataset lives and what it contains
dataset_root = Path(ORIGINAL_DATASET)
print(f"Dataset location: {ORIGINAL_DATASET}")
print(f"Dataset exists: {dataset_root.exists()}")

if dataset_root.exists():
    print(f"\nDataset contents:")
    for item in sorted(dataset_root.iterdir()):
        if item.is_dir():
            item_type = "üìÅ"
        else:
            item_type = "üìÑ"
        print(f"  {item_type} {item.name}")

    # Count annotation files and images directly inside train/
    train_path = dataset_root / "train"
    if train_path.exists():
        train_files = list(train_path.glob("*.json"))
        train_images = [*train_path.glob("*.jpg"), *train_path.glob("*.png")]
        print(f"\n‚úÖ Train folder found!")
        print(f"   - Annotations: {len(train_files)}")
        print(f"   - Images: {len(train_images)}")


In [None]:
# Split merged dataset for training
import os
import sys
from pathlib import Path

# Work from the repository root so the relative `src/...` script paths resolve
REPO_DIR = "/content/credit_card_yolov12"
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# CRITICAL: Verify ORIGINAL_DATASET is valid before splitting
separator = '=' * 60
print(f"\n{separator}")
print(f"VERIFYING DATASET BEFORE SPLITTING")
print(f"{separator}")

# ORIGINAL_DATASET is produced by one of the dataset-selection cells
if 'ORIGINAL_DATASET' not in locals():
    raise ValueError("ORIGINAL_DATASET not set! Run Cell 7 or Cell 8 first.")

print(f"ORIGINAL_DATASET: {ORIGINAL_DATASET}")
print(f"Path exists: {Path(ORIGINAL_DATASET).exists()}")

# A path inside __MACOSX is archive metadata, never a dataset: pick a real one
if '__MACOSX' in str(ORIGINAL_DATASET):
    print(f"\n‚ùå ERROR: ORIGINAL_DATASET contains __MACOSX!")
    print(f"   Searching for valid dataset...")

    DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets/private_dataset"
    valid_folders = []

    if Path(DATASET_BASE).exists():
        for folder in Path(DATASET_BASE).iterdir():
            if not folder.is_dir() or folder.name in {'__MACOSX', '.DS_Store'}:
                continue
            train_check = folder / "train" / "_annotations.coco.json"
            if train_check.exists():
                valid_folders.append(folder)
                print(f"   ‚úÖ Found valid dataset: {folder.name}")

    if not valid_folders:
        raise ValueError("No valid dataset folders found! Please check your Drive folder.")

    # Prefer merged_dataset, then query_images, then any valid
    merged_folders = [f for f in valid_folders if 'merged' in f.name.lower()]
    query_folders = [f for f in valid_folders if 'query' in f.name.lower()]

    if merged_folders:
        ORIGINAL_DATASET = str(merged_folders[0])
        print(f"   ‚úÖ Fixed: Using merged dataset: {ORIGINAL_DATASET}")
    elif query_folders:
        ORIGINAL_DATASET = str(query_folders[0])
        print(f"   ‚úÖ Fixed: Using query_images: {ORIGINAL_DATASET}")
    else:
        ORIGINAL_DATASET = str(valid_folders[0])
        print(f"   ‚úÖ Fixed: Using first valid dataset: {ORIGINAL_DATASET}")

# Make sure train/_annotations.coco.json exists for the selected dataset,
# copying it from an alternative location when it is found there instead.
ann_file = Path(ORIGINAL_DATASET) / "train" / "_annotations.coco.json"
if not ann_file.exists():
    dataset_root = Path(ORIGINAL_DATASET)
    print(f"\n‚ùå Annotation file not found: {ann_file}")
    print(f"   ORIGINAL_DATASET: {ORIGINAL_DATASET}")
    print(f"   Path exists: {dataset_root.exists()}")

    if dataset_root.exists():
        print(f"   Contents of ORIGINAL_DATASET:")
        for item in sorted(dataset_root.iterdir()):
            print(f"      - {item.name} ({'dir' if item.is_dir() else 'file'})")

    # Alternative locations, checked in this order: the dataset root itself,
    # then a train/ folder next to the dataset folder
    alt_locations = [
        dataset_root / "_annotations.coco.json",
        dataset_root.parent / "train" / "_annotations.coco.json",
    ]
    alt_loc = next((candidate for candidate in alt_locations if candidate.exists()), None)

    if alt_loc is None:
        raise FileNotFoundError(
            f"Annotation file not found: {ann_file}\n"
            f"Please check Cell 7/8 output and ensure a valid dataset was selected."
        )

    print(f"   ‚ö†Ô∏è Found annotation at alternative location: {alt_loc}")
    # Copy to expected location
    train_dir = dataset_root / "train"
    train_dir.mkdir(exist_ok=True)
    import shutil
    shutil.copy2(alt_loc, ann_file)
    print(f"   ‚úÖ Copied annotation to: {ann_file}")

print(f"‚úÖ Annotation file found: {ann_file}")
print(f"{'='*60}\n")

# Verify script exists
script_path = Path(REPO_DIR) / "src" / "split_dataset.py"
if script_path.exists():
    print(f"‚úÖ Found script: {script_path}")
    print(f"üìã Splitting dataset for training: {ORIGINAL_DATASET}")
    !python src/split_dataset.py --dataset {ORIGINAL_DATASET} --seed 42
else:
    print(f"‚ùå Script not found at: {script_path}")

# Also split query_images separately for evaluation
if 'QUERY_DATASET' in locals() and QUERY_DATASET and QUERY_DATASET != ORIGINAL_DATASET:
    # Verify query dataset too
    query_ann = Path(QUERY_DATASET) / "train" / "_annotations.coco.json"
    if query_ann.exists():
        print(f"\nüìã Splitting query_images for evaluation: {QUERY_DATASET}")
        !python src/split_dataset.py --dataset {QUERY_DATASET} --seed 42
    else:
        print(f"\n‚ö†Ô∏è Query dataset annotation not found: {query_ann}")
        print(f"   Skipping query dataset split")
else:
    print(f"\n‚úÖ Query dataset same as training dataset, already split")


In [None]:
# Generate progressive occlusion test sets from query_images test set only
import os
import sys
from pathlib import Path

# Ensure we're in the repo directory
REPO_DIR = "/content/credit_card_yolov12"
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Find query_images split directory (for evaluation)
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
if 'QUERY_DATASET' in locals() and QUERY_DATASET:
    # Find the split directory for query_images
    query_name = Path(QUERY_DATASET).name
    query_split_dirs = list(Path(DATASET_BASE).glob(f"{query_name}_split"))
    if query_split_dirs:
        TEST_DATASET = str(query_split_dirs[0] / "test")
        print(f"‚úÖ Found query_images test dataset: {TEST_DATASET}")
    else:
        # Fallback: use first split directory
        split_dirs = list(Path(DATASET_BASE).glob("*_split"))
        if split_dirs:
            TEST_DATASET = str(split_dirs[0] / "test")
            print(f"‚ö†Ô∏è Using first split test dataset: {TEST_DATASET}")
        else:
            raise FileNotFoundError("Split dataset not found. Run split_dataset.py first.")
else:
    # Fallback: use first split directory
    split_dirs = list(Path(DATASET_BASE).glob("*_split"))
    if split_dirs:
        TEST_DATASET = str(split_dirs[0] / "test")
        print(f"‚ö†Ô∏è QUERY_DATASET not set, using first split: {TEST_DATASET}")
    else:
        raise FileNotFoundError("Split dataset not found. Run split_dataset.py first.")

# Verify script exists
script_path = Path(REPO_DIR) / "src" / "prepare_progressive_tests.py"
if script_path.exists():
    print(f"‚úÖ Found script: {script_path}")
    print(f"üìã Generating occlusion levels: 0%, 25%, 75%, 100%")
    !python src/prepare_progressive_tests.py --test-dataset {TEST_DATASET} --type crop --seed 42 --levels 25 75 100
else:
    print(f"‚ùå Script not found at: {script_path}")


## Step 5: Train Model with Pretrained Weights


In [None]:
from ultralytics import YOLO
import yaml
import json
from pathlib import Path

# Model configuration
MODEL_SIZE = "m"  # n, s, m, l, x
EPOCHS = 100
IMG_SIZE = 640
BATCH = 16

# Pretrained weights path
PRETRAINED_WEIGHTS = "/content/drive/MyDrive/yolov12_runs/nonprivate/checkpoints/best.pt"

# Find the actual split dataset directory
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
# Sorted so the fallback choice is the same on every run (glob order is arbitrary)
split_dirs = sorted(Path(DATASET_BASE).glob("*_split"))
if not split_dirs:
    raise FileNotFoundError("Split dataset not found")

# Several "*_split" folders can exist (the query dataset is split separately),
# so prefer the one named after the training dataset instead of whichever
# comes first. Assumes split folders are named "<dataset folder name>_split",
# as the occlusion cell already does for the query dataset — TODO confirm
# against src/split_dataset.py.
split_base = split_dirs[0]
training_dataset = globals().get("ORIGINAL_DATASET")
if training_dataset:
    expected_split = f"{Path(training_dataset).name}_split"
    matching = [d for d in split_dirs if d.name == expected_split]
    if matching:
        split_base = matching[0]
    elif len(split_dirs) > 1:
        print(f"‚ö†Ô∏è {expected_split} not found, using first split: {split_base.name}")

print(f"‚úÖ Found split dataset: {split_base.name}")
TRAIN_DATASET = str(split_base / "train")
VAL_DATASET = str(split_base / "val")

print(f"Training on: {TRAIN_DATASET}")
print(f"Validating on: {VAL_DATASET}")
print(f"Pretrained weights: {PRETRAINED_WEIGHTS}")
print(f"Pretrained weights exists: {Path(PRETRAINED_WEIGHTS).exists()}")


In [None]:
# Prepare dataset config for YOLOv8
def prepare_yolo_config(train_path, val_path, output_config="dataset.yaml"):
    """Create a YOLOv8 dataset config (YAML) from a COCO-annotated train split.

    Class names are read from the ``categories`` list of the COCO annotation
    file and ordered by category id.

    Args:
        train_path: Directory of the training split. The annotation file is
            looked up as ``<train_path>/_annotations.coco.json`` and then as
            ``<train_path>/train/_annotations.coco.json``.
        val_path: Directory of the validation split.
        output_config: Path of the YAML file to write.

    Returns:
        ``output_config``, unchanged.

    Raises:
        FileNotFoundError: If no annotation file exists at either location.
    """
    train_path_obj = Path(train_path)

    # Find annotation file (could be in train_path or train_path/train/)
    candidates = [
        train_path_obj / "_annotations.coco.json",
        train_path_obj / "train" / "_annotations.coco.json",
    ]
    ann_file = next((candidate for candidate in candidates if candidate.exists()), None)
    if ann_file is None:
        raise FileNotFoundError(
            f"Annotation file not found. Checked:\n"
            f"  - {candidates[0]}\n"
            f"  - {candidates[1]}"
        )

    print(f"Loading annotations from: {ann_file}")
    with open(ann_file, 'r') as f:
        coco_data = json.load(f)

    categories = coco_data.get('categories', [])
    class_names = [cat['name'] for cat in sorted(categories, key=lambda x: x['id'])]

    # The dataset root is the parent of the train directory
    base_path = train_path_obj.parent
    base_abs = base_path.absolute()

    def _split_entry(split_path):
        """Return split_path relative to the dataset root, or absolute if it lies outside it."""
        split_abs = Path(split_path).absolute()
        try:
            return split_abs.relative_to(base_abs).as_posix()
        except ValueError:
            return str(split_abs)

    # Create YOLOv8 config. The split entries are derived from the arguments
    # (previously 'train'/'val' were hard-coded and val_path was ignored); for
    # <root>/train and <root>/val the result is still 'train' and 'val'.
    config = {
        'path': str(base_abs),
        'train': _split_entry(train_path_obj),
        'val': _split_entry(val_path),
        'names': {i: name for i, name in enumerate(class_names)},
        'nc': len(class_names)
    }

    with open(output_config, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"Dataset config created: {output_config}")
    print(f"Base path: {base_path}")
    print(f"Classes ({len(class_names)}): {class_names}")
    return output_config

# Write the dataset config to the default path ("dataset.yaml" in the current
# working directory); config_file is what the training cell passes as `data=`.
config_file = prepare_yolo_config(TRAIN_DATASET, VAL_DATASET)


In [None]:
# Convert COCO to YOLO format using the helper from the project's src/train.py
from src.train import convert_coco_to_yolo

# Train split first, then validation split
for split_dir in (TRAIN_DATASET, VAL_DATASET):
    convert_coco_to_yolo(split_dir)


In [None]:
# Start from the pretrained checkpoint when it exists on Drive; otherwise fall
# back to the stock YOLOv8 weights of the configured size.
weights_path = Path(PRETRAINED_WEIGHTS)
if not weights_path.exists():
    print(f"‚ö†Ô∏è Pretrained weights not found, using default YOLOv8{MODEL_SIZE}")
    model = YOLO(f"yolov8{MODEL_SIZE}.pt")
else:
    print(f"Loading pretrained weights from: {PRETRAINED_WEIGHTS}")
    model = YOLO(PRETRAINED_WEIGHTS)

# Training options; outputs are written straight to Drive under project/name.
# WandB is already initialized in Step 2, so it will automatically log training metrics
train_options = dict(
    data=config_file,
    epochs=EPOCHS,
    imgsz=IMG_SIZE,
    batch=BATCH,
    project="/content/drive/MyDrive/credit_card_yolov12/models",
    name=f"private_objects_{MODEL_SIZE}",
    exist_ok=True,
    save=True,
    plots=True,
    val=True,
)
results = model.train(**train_options)


## Step 6: Save Model to Drive


In [None]:
import shutil
from pathlib import Path

# Paths
# MODEL_DIR is the directory the training run wrote to; its checkpoints are
# read from the `weights/` subfolder below.
MODEL_DIR = results.save_dir
DRIVE_MODEL_DIR = f"/content/drive/MyDrive/credit_card_yolov12/models/private_objects_{MODEL_SIZE}"

# Training is configured to write straight to Drive, so normally both names
# refer to the same folder. Compare as paths rather than raw strings so a
# Path-vs-str or trailing-slash difference does not trigger a pointless copy.
# DRIVE_MODEL_DIR is deliberately left as a string in both branches.
if Path(MODEL_DIR) == Path(DRIVE_MODEL_DIR):
    print(f"‚úÖ Models already saved to Drive: {MODEL_DIR}")
    print(f"   Best model: {Path(MODEL_DIR) / 'weights' / 'best.pt'}")
    print(f"   Last model: {Path(MODEL_DIR) / 'weights' / 'last.pt'}")
else:
    # Mirror the run's `weights/` layout on Drive. The summary below and the
    # evaluation step both look for `<DRIVE_MODEL_DIR>/weights/best.pt`, so
    # copying the files to the folder root would leave them pointing at a
    # path that does not exist.
    drive_weights_dir = Path(DRIVE_MODEL_DIR) / "weights"
    drive_weights_dir.mkdir(parents=True, exist_ok=True)

    for label, filename in (("Best", "best.pt"), ("Last", "last.pt")):
        source_weights = Path(MODEL_DIR) / "weights" / filename
        if source_weights.exists():
            target_weights = drive_weights_dir / filename
            shutil.copy2(source_weights, target_weights)
            print(f"‚úÖ {label} model saved to: {target_weights}")
        else:
            # Report the gap instead of skipping silently, so a missing
            # checkpoint is noticed before the evaluation step.
            print(f"WARNING: {label} model not found at: {source_weights}")

# Set DRIVE_MODEL_DIR for use in next cells
print(f"\n‚úÖ Model directory: {DRIVE_MODEL_DIR}")
print(f"‚úÖ Best model path: {Path(DRIVE_MODEL_DIR) / 'weights' / 'best.pt'}")


## Step 7: Evaluate on Progressive Occlusion


In [None]:
# Evaluate on progressive occlusion test sets
import os
import sys
from pathlib import Path

# Ensure we're in the repo directory
REPO_DIR = "/content/credit_card_yolov12"
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Find the actual split directory
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
split_dirs = list(Path(DATASET_BASE).glob("*_split"))
if split_dirs:
    TEST_SETS_BASE = str(split_dirs[0])
    print(f"‚úÖ Found split directory: {TEST_SETS_BASE}")
else:
    TEST_SETS_BASE = f"{DATASET_BASE}/private_dataset_split"
    print(f"‚ö†Ô∏è Using fallback: {TEST_SETS_BASE}")

# Verify test occlusion directories exist
print(f"\nChecking test occlusion directories:")
for level in [0, 25, 50, 75]:
    test_dir = Path(TEST_SETS_BASE) / f"test_occlusion_{level}"
    if test_dir.exists():
        ann_file = test_dir / "train" / "_annotations.coco.json"
        if ann_file.exists():
            print(f"  ‚úÖ test_occlusion_{level}: {ann_file}")
        else:
            print(f"  ‚ö†Ô∏è test_occlusion_{level}: exists but no annotation file")
    else:
        print(f"  ‚ùå test_occlusion_{level}: not found")

BEST_MODEL = f"{DRIVE_MODEL_DIR}/weights/best.pt"
print(f"\nBest model: {BEST_MODEL}")
print(f"Test sets base: {TEST_SETS_BASE}")
print(f"Best model exists: {Path(BEST_MODEL).exists()}")

# Verify script exists
script_path = Path(REPO_DIR) / "src" / "evaluate_progressive.py"
if script_path.exists():
    print(f"‚úÖ Found script: {script_path}")
    !python src/evaluate_progressive.py \
        --model {BEST_MODEL} \
        --test-sets {TEST_SETS_BASE} \
        --output /content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private
else:
    print(f"‚ùå Script not found at: {script_path}")


## Step 8: Log Results to WandB


In [None]:
import json
import pandas as pd
from pathlib import Path

# Location of the JSON results file read by this cell
results_file = "/content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private/progressive_results.json"

if not Path(results_file).exists():
    # Nothing to log: the results file is not there.
    print("‚ö†Ô∏è Results file not found")
else:
    with open(results_file, 'r') as results_fh:
        eval_results = json.load(results_fh)

    # One wandb.log call per occlusion level. Each logged key is the metric
    # name followed by "_occlusion_<level>", and its value is looked up under
    # the bare metric name in that level's entry.
    metric_names = ("mAP50", "mAP50_95", "precision", "recall", "f1")
    for level_key, level_metrics in eval_results.items():
        wandb.log({
            f"{metric_name}_occlusion_{level_key}": level_metrics[metric_name]
            for metric_name in metric_names
        })

    # Attach the summary plot as an image when the file is present
    plot_file = "/content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private/progressive_occlusion_results.png"
    if Path(plot_file).exists():
        wandb.log({"progressive_occlusion_plot": wandb.Image(plot_file)})

    print("‚úÖ Results logged to Weights & Biases")
    print("\nProgressive Occlusion Results:")
    # Transpose so each top-level key of the results becomes a row
    df = pd.DataFrame(eval_results).transpose()
    print(df)


In [None]:
# Close the W&B run first, then print the wrap-up summary
wandb.finish()

closing_messages = [
    "‚úÖ Training complete! Check your Weights & Biases dashboard.",
    f"‚úÖ Model saved to: {DRIVE_MODEL_DIR}",
    "‚úÖ Results saved to: /content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private",
]
print("\n".join(closing_messages))
