# Private Dataset Training - YOLOv8

This notebook trains a YOLOv8 model for private object detection with 16 categories.

Features:
- Fine-tunes from weights pretrained on the non-private dataset (falls back to stock YOLOv8 weights if that checkpoint is not found)
- 16 categories
- Progressive occlusion evaluation (crop type)
- Weights & Biases integration
- Google Drive integration


## Step 1: Setup Environment


In [None]:
# Attach Google Drive to the Colab runtime; later cells read datasets from it
# and write models/results back to it.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
# Install dependencies
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned, so each fresh runtime may resolve different
# releases of these packages — consider pinning for reproducibility.
%pip install -q ultralytics roboflow python-dotenv opencv-python matplotlib numpy pandas pyyaml wandb gdown


In [None]:
# Clone or update your repository
import os
from pathlib import Path

REPO_DIR = "/content/credit_card_yolov12"

if Path(REPO_DIR).exists():
    print("Repository exists, pulling latest changes...")
    %cd {REPO_DIR}
    !git pull origin main
else:
    print("Cloning repository...")
    !git clone https://github.com/Turje/credit_card_yolov12.git
    %cd credit_card_yolov12

# Verify files exist
print("\nVerifying required files...")
required_files = [
    "src/split_dataset.py",
    "src/prepare_progressive_tests.py",
    "src/train.py",
    "src/evaluate_progressive.py"
]

for file in required_files:
    file_path = Path(REPO_DIR) / file
    if file_path.exists():
        print(f"‚úÖ {file}")
    else:
        print(f"‚ùå {file} - NOT FOUND!")


## Step 2: Setup Weights & Biases


In [None]:
import os

import wandb

# Login to Weights & Biases.
# The API key is taken from the WANDB_API_KEY environment variable; when it is
# unset, key=None lets wandb fall back to stored credentials or an interactive
# prompt.
# SECURITY: never paste the key into this notebook. Any key that has ever been
# committed together with the notebook must be treated as leaked and revoked in
# the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Initialize wandb project.
# NOTE: the config values below are recorded for the dashboard only; the values
# actually used for training are the constants defined in Step 5 — keep the two
# in sync.
wandb.init(
    project="private-object-detection",
    name="yolov8-16categories-pretrained",
    config={
        "model_size": "m",
        "epochs": 100,
        "imgsz": 640,
        "batch": 16,
        "num_classes": 16,
        "pretrained_weights": "/content/drive/MyDrive/yolov12_runs/nonprivate/checkpoints/best.pt",
        "occlusion_type": "crop"
    }
)


## Step 3: Download Private Dataset from Google Drive


In [None]:
import gdown
from pathlib import Path
import zipfile
import shutil

# Google Drive file IDs
PRIVATE_FILE_IDS = [
    "1ClFqB6vvVXqmru4hA5JhkphyfEEwjyte",  # Original dataset
    "1Y7jh8lTfAuTqIaDkrF9AS8XJaSAfsGwu",  # Additional dataset 1
    "1GEF0-6MVMrwSGdvbZtcIp9rB82-wqkDL"    # Additional dataset 2
]

# Download location
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
Path(DATASET_BASE).mkdir(parents=True, exist_ok=True)

# Main extract directory (all archives are unpacked into this one folder)
extract_dir = f"{DATASET_BASE}/private_dataset"
Path(extract_dir).mkdir(parents=True, exist_ok=True)

# Download and extract all datasets
print(f"Downloading {len(PRIVATE_FILE_IDS)} datasets from Google Drive...\n")

failed_datasets = []  # 1-based indices of archives that could not be downloaded/extracted

for idx, file_id in enumerate(PRIVATE_FILE_IDS, 1):
    zip_path = f"{DATASET_BASE}/private_dataset_{idx}.zip"
    print(f"[{idx}/{len(PRIVATE_FILE_IDS)}] Downloading dataset {idx}...")
    print(f"   File ID: {file_id}")

    # Download using gdown
    url = f"https://drive.google.com/uc?id={file_id}"
    try:
        gdown.download(url, zip_path, quiet=False)

        # Extract dataset
        if Path(zip_path).exists():
            print(f"   Extracting to: {extract_dir}")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
            print(f"   ✅ Dataset {idx} extracted successfully\n")
        else:
            failed_datasets.append(idx)
            print(f"   ⚠️ Warning: Download failed for dataset {idx}\n")
    except Exception as e:
        # Best-effort: keep going with the remaining archives, but remember the failure.
        failed_datasets.append(idx)
        print(f"   ❌ Error downloading/extracting dataset {idx}: {e}\n")
    finally:
        # Always remove the archive, including after a failed extraction, so that
        # partial/corrupt zips neither pile up on Drive nor get picked up by a later run.
        Path(zip_path).unlink(missing_ok=True)

if failed_datasets:
    print(f"⚠️ {len(failed_datasets)} of {len(PRIVATE_FILE_IDS)} dataset(s) failed: {failed_datasets}. "
          f"The extracted data may be incomplete.")

# Find the actual dataset folder(s)
print(f"\nLooking for dataset folders in: {extract_dir}")
print("=" * 60)

# Skip macOS metadata folders
skip_folders = {'__MACOSX', '.DS_Store'}


def _count_images(folder, recursive=False):
    """Count the .jpg/.png files in `folder`, descending into subfolders when `recursive`."""
    prefix = "**/" if recursive else ""
    return sum(1 for ext in ("jpg", "png") for _ in folder.glob(f"{prefix}*.{ext}"))


def _describe_candidate(folder):
    """Classify `folder` and print a one-line summary.

    Returns a tuple `(folder, kind, has_annotations, image_count)` where `kind` is
    "train", "annotations" or "images", or None when the folder holds nothing usable.
    """
    train_dir = folder / "train"
    if train_dir.exists():
        train_images = _count_images(train_dir)
        train_anns = sum(1 for _ in train_dir.glob("*.json"))
        print(f"   ✅ {folder.name} - Has 'train' folder ({train_images} images, {train_anns} annotations)")
        return (folder, "train", True, train_images)

    # The recursive scans are slow on a Drive mount, so run each of them once and
    # only for folders that have no 'train' subfolder.
    ann_files = list(folder.glob("**/_annotations.coco.json"))
    image_count = _count_images(folder, recursive=True)
    if ann_files:
        print(f"   ✅ {folder.name} - Has COCO annotations ({len(ann_files)} files, {image_count} images)")
        return (folder, "annotations", True, image_count)
    if image_count > 0:
        print(f"   📁 {folder.name} - Has images ({image_count} images, no annotations found)")
        return (folder, "images", False, image_count)

    print(f"   📁 {folder.name} - Empty or unknown structure")
    return None


def _pick_best_candidate(candidates):
    """Pick one candidate: 'train' layout first, then 'annotations', then any; most images wins."""
    for kind in ("train", "annotations"):
        of_kind = [c for c in candidates if c[1] == kind]
        if of_kind:
            return max(of_kind, key=lambda c: c[3])
    return max(candidates, key=lambda c: c[3])


# Check if there's a train folder directly in extract_dir
train_in_root = Path(extract_dir) / "train"
if train_in_root.exists():
    ORIGINAL_DATASET = extract_dir
    print(f"✅ Found dataset structure in root: {ORIGINAL_DATASET}")
else:
    # Find all folders, excluding macOS metadata (sorted for consistent ordering)
    all_folders = sorted(
        d for d in Path(extract_dir).iterdir() if d.is_dir() and d.name not in skip_folders
    )

    if not all_folders:
        ORIGINAL_DATASET = extract_dir
        print(f"⚠️ No dataset folders found, using extract directory: {ORIGINAL_DATASET}")
    else:
        print(f"Found {len(all_folders)} folder(s) (excluding macOS metadata):\n")

        # Check each folder for dataset structure
        dataset_candidates = [
            candidate
            for candidate in (_describe_candidate(folder) for folder in all_folders)
            if candidate is not None
        ]

        # Select the best candidate.
        # NOTE(review): only ONE folder is selected even when several archives were
        # extracted; the other folders are not merged into the training data here.
        if dataset_candidates:
            best_folder, best_kind, _, best_count = _pick_best_candidate(dataset_candidates)
            ORIGINAL_DATASET = str(best_folder)
            if best_kind == "train":
                print(f"\n✅ Selected dataset: {ORIGINAL_DATASET} (has train folder with {best_count} images)")
            elif best_kind == "annotations":
                print(f"\n✅ Selected dataset: {ORIGINAL_DATASET} (has annotations, {best_count} images)")
            else:
                print(f"\n⚠️ Selected dataset: {ORIGINAL_DATASET} (has {best_count} images, no annotations)")
                print("   Note: May need manual verification of structure")
        else:
            ORIGINAL_DATASET = extract_dir
            print("\n⚠️ No valid dataset structure found, using extract directory")

# Summarise the selected dataset folder so its layout can be checked by eye.
print(f"\n{'=' * 60}")
print(f"✅ Final dataset path: {ORIGINAL_DATASET}")
dataset_root = Path(ORIGINAL_DATASET)
print(f"   Path exists: {dataset_root.exists()}")

# Verify the dataset path is accessible and show structure
if dataset_root.exists():
    items = sorted(dataset_root.iterdir())
    print(f"   Contains {len(items)} items")
    if items:
        print("\n   Folder contents:")
        for item in items[:10]:  # Show first 10 items
            if item.is_dir():
                sub_items = sum(1 for _ in item.iterdir())
                print(f"      📁 {item.name}/ ({sub_items} items)")
            else:
                # Show a size only for non-empty regular files.
                size_mb = item.stat().st_size / (1024 * 1024) if item.is_file() else 0
                if size_mb > 0:
                    print(f"      📄 {item.name} ({size_mb:.2f} MB)")
                else:
                    print(f"      📄 {item.name}")

        # Check for train folder specifically
        train_path = dataset_root / "train"
        if train_path.exists():
            train_images = len(list(train_path.glob("*.jpg"))) + len(list(train_path.glob("*.png")))
            train_anns = len(list(train_path.glob("*.json")))
            print(f"\n   ✅ Train folder found: {train_images} images, {train_anns} annotation files")
else:
    print("   ⚠️ Dataset path does not exist; the following steps will fail until it does.")


## Step 4: Prepare Dataset


In [None]:
import sys
import os
from pathlib import Path

# Add the repository root to the import path so `src.*` modules resolve.
# Guarded so that re-running this cell does not stack duplicate entries.
REPO_DIR = "/content/credit_card_yolov12"
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

# Verify dataset structure
print(f"Dataset location: {ORIGINAL_DATASET}")
print(f"Dataset exists: {Path(ORIGINAL_DATASET).exists()}")

if Path(ORIGINAL_DATASET).exists():
    print("\nDataset contents:")
    for item in sorted(Path(ORIGINAL_DATASET).iterdir()):
        item_type = "📁" if item.is_dir() else "📄"
        print(f"  {item_type} {item.name}")

    # Check for train folder
    train_path = Path(ORIGINAL_DATASET) / "train"
    if train_path.exists():
        train_files = list(train_path.glob("*.json"))
        train_images = list(train_path.glob("*.jpg")) + list(train_path.glob("*.png"))
        print("\n✅ Train folder found!")
        print(f"   - Annotations: {len(train_files)}")
        print(f"   - Images: {len(train_images)}")
else:
    print("\n⚠️ Dataset folder is missing; re-run Step 3 before splitting.")


In [None]:
# Split dataset
import os
import sys
from pathlib import Path

# Ensure we're in the repo directory
REPO_DIR = "/content/credit_card_yolov12"
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Verify script exists
script_path = Path(REPO_DIR) / "src" / "split_dataset.py"
if script_path.exists():
    print(f"‚úÖ Found script: {script_path}")
    !python src/split_dataset.py --dataset {ORIGINAL_DATASET} --seed 42
else:
    print(f"‚ùå Script not found at: {script_path}")


In [None]:
# Generate progressive occlusion test sets (crop type)
import os
import sys
from pathlib import Path

# Ensure we're in the repo directory
REPO_DIR = "/content/credit_card_yolov12"
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Find the split directory
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
split_dirs = list(Path(DATASET_BASE).glob("*_split"))
if split_dirs:
    TEST_DATASET = str(split_dirs[0] / "test")
    print(f"‚úÖ Found test dataset: {TEST_DATASET}")
else:
    raise FileNotFoundError("Split dataset not found. Run split_dataset.py first.")

# Verify script exists
script_path = Path(REPO_DIR) / "src" / "prepare_progressive_tests.py"
if script_path.exists():
    print(f"‚úÖ Found script: {script_path}")
    !python src/prepare_progressive_tests.py --test-dataset {TEST_DATASET} --type crop --seed 42
else:
    print(f"‚ùå Script not found at: {script_path}")


## Step 5: Train Model with Pretrained Weights


In [None]:
from ultralytics import YOLO
import yaml
import json
from pathlib import Path

# Model configuration
MODEL_SIZE = "m"  # n, s, m, l, x
EPOCHS = 100
IMG_SIZE = 640
BATCH = 16

# Pretrained weights path (checkpoint trained on the non-private dataset)
PRETRAINED_WEIGHTS = "/content/drive/MyDrive/yolov12_runs/nonprivate/checkpoints/best.pt"

# Find the actual split dataset directory
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
split_dirs = list(Path(DATASET_BASE).glob("*_split"))
if not split_dirs:
    raise FileNotFoundError("Split dataset not found")
if len(split_dirs) > 1:
    # The first match is used, as before; make the ambiguity visible so a stale
    # split from an earlier run is not trained on unnoticed.
    print(f"⚠️ Found {len(split_dirs)} '*_split' directories {[d.name for d in split_dirs]}; "
          f"using {split_dirs[0].name}")
split_base = split_dirs[0]
print(f"✅ Found split dataset: {split_base.name}")
TRAIN_DATASET = str(split_base / "train")
VAL_DATASET = str(split_base / "val")

print(f"Training on: {TRAIN_DATASET}")
print(f"Validating on: {VAL_DATASET}")
print(f"Pretrained weights: {PRETRAINED_WEIGHTS}")
print(f"Pretrained weights exists: {Path(PRETRAINED_WEIGHTS).exists()}")


In [None]:
# Prepare dataset config for YOLOv8
def prepare_yolo_config(train_path, val_path, output_config="dataset.yaml"):
    """Create a YOLOv8 dataset config (YAML) from a COCO-annotated split.

    Args:
        train_path: Training split directory. Its `_annotations.coco.json`
            (directly inside, or inside a nested `train/` folder) supplies the
            class list.
        val_path: Validation split directory. Written to the config relative to
            the parent of `train_path` when it lives under it, otherwise as an
            absolute path.
        output_config: Path of the YAML file to write.

    Returns:
        `output_config`, unchanged.

    Raises:
        FileNotFoundError: If no annotation file is found in either location.
        ValueError: If the annotation file declares no categories.
    """
    train_dir = Path(train_path)
    val_dir = Path(val_path)

    # Annotation file could be in train_path or train_path/train/
    candidates = (
        train_dir / "_annotations.coco.json",
        train_dir / "train" / "_annotations.coco.json",
    )
    ann_file = next((c for c in candidates if c.exists()), None)
    if ann_file is None:
        raise FileNotFoundError(
            f"Annotation file not found. Checked:\n"
            f"  - {candidates[0]}\n"
            f"  - {candidates[1]}"
        )

    print(f"Loading annotations from: {ann_file}")
    with open(ann_file, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    categories = coco_data.get('categories', [])
    if not categories:
        # An empty class list would yield a config with nc: 0 that cannot be trained on.
        raise ValueError(f"No categories found in annotation file: {ann_file}")

    # Class indices follow ascending COCO category id.
    # NOTE(review): this must match the index mapping used when the labels are
    # converted to YOLO format — confirm against the converter.
    class_names = [cat['name'] for cat in sorted(categories, key=lambda x: x['id'])]

    # Split paths are written relative to the parent of the train directory
    base_path = train_dir.parent
    base_abs = base_path.absolute()

    def _entry_for(split_dir):
        """Return `split_dir` relative to the base path, or absolute if outside it."""
        split_abs = split_dir.absolute()
        try:
            return split_abs.relative_to(base_abs).as_posix()
        except ValueError:
            return str(split_abs)

    # Create YOLOv8 config
    config = {
        'path': str(base_abs),
        'train': _entry_for(train_dir),
        'val': _entry_for(val_dir),
        'names': {i: name for i, name in enumerate(class_names)},
        'nc': len(class_names)
    }

    with open(output_config, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"Dataset config created: {output_config}")
    print(f"Base path: {base_path}")
    print(f"Classes ({len(class_names)}): {class_names}")
    return output_config

# Build the dataset config for the split selected above. With the default
# output name the file is written as "dataset.yaml" relative to the current
# working directory; config_file is what the training cell passes as `data=`.
config_file = prepare_yolo_config(TRAIN_DATASET, VAL_DATASET)


In [None]:
# Convert the COCO annotations of both splits to YOLO format
from src.train import convert_coco_to_yolo

# Train split first, then validation split
for split_dataset in (TRAIN_DATASET, VAL_DATASET):
    convert_coco_to_yolo(split_dataset)


In [None]:
# Initialize model with pretrained weights and train it
import wandb
from ultralytics import settings

# Ultralytics only attaches its Weights & Biases callback when its own `wandb`
# setting is enabled; having called wandb.init() in Step 2 is not sufficient by
# itself. Enable it before the trainer is created.
# NOTE: if a training run was already started in this kernel, restart the
# runtime for the setting to take effect.
settings.update({"wandb": True})

# Remember the run opened in Step 2 so it can be re-opened after training.
step2_run = wandb.run
wandb_resume_args = None
if step2_run is not None:
    wandb_resume_args = {
        "project": step2_run.project,
        "entity": step2_run.entity,
        "id": step2_run.id,
    }

if Path(PRETRAINED_WEIGHTS).exists():
    print(f"Loading pretrained weights from: {PRETRAINED_WEIGHTS}")
    model = YOLO(PRETRAINED_WEIGHTS)
else:
    print(f"⚠️ Pretrained weights not found, using default YOLOv8{MODEL_SIZE}")
    model = YOLO(f"yolov8{MODEL_SIZE}.pt")

# Train model (metrics are logged to the active W&B run through the Ultralytics callback)
results = model.train(
    data=config_file,
    epochs=EPOCHS,
    imgsz=IMG_SIZE,
    batch=BATCH,
    project="/content/drive/MyDrive/credit_card_yolov12/models",
    name=f"private_objects_{MODEL_SIZE}",
    exist_ok=True,
    save=True,
    plots=True,
    val=True,
)

# The Ultralytics W&B callback finishes the run when training ends. Re-open the
# same run so the evaluation metrics logged in Step 8 land next to the training
# curves instead of failing for lack of an active run.
if wandb_resume_args is not None and wandb.run is None:
    wandb.init(resume="allow", **wandb_resume_args)


## Step 6: Save Model to Drive


In [None]:
import shutil
from pathlib import Path

# Paths
MODEL_DIR = Path(results.save_dir)
DRIVE_MODEL_DIR = f"/content/drive/MyDrive/credit_card_yolov12/models/private_objects_{MODEL_SIZE}"

# Check if models are already in Drive (they are when training saved directly to Drive).
# Compare resolved paths so a trailing slash or relative form cannot cause a needless copy.
if MODEL_DIR.resolve() == Path(DRIVE_MODEL_DIR).resolve():
    print(f"✅ Models already saved to Drive: {MODEL_DIR}")
    print(f"   Best model: {MODEL_DIR / 'weights' / 'best.pt'}")
    print(f"   Last model: {MODEL_DIR / 'weights' / 'last.pt'}")
else:
    # Copy the checkpoints into DRIVE_MODEL_DIR/weights/ — the location the
    # summary below and the evaluation step read the best model from.
    drive_weights_dir = Path(DRIVE_MODEL_DIR) / "weights"
    drive_weights_dir.mkdir(parents=True, exist_ok=True)

    for weight_name, label in (("best.pt", "Best"), ("last.pt", "Last")):
        source_weights = MODEL_DIR / "weights" / weight_name
        if source_weights.exists():
            shutil.copy2(source_weights, drive_weights_dir / weight_name)
            print(f"✅ {label} model saved to: {drive_weights_dir / weight_name}")
        else:
            print(f"⚠️ {label} model not found at: {source_weights}")

# DRIVE_MODEL_DIR is used by the next cells
print(f"\n✅ Model directory: {DRIVE_MODEL_DIR}")
print(f"✅ Best model path: {Path(DRIVE_MODEL_DIR) / 'weights' / 'best.pt'}")


## Step 7: Evaluate on Progressive Occlusion


In [None]:
# Evaluate on progressive occlusion test sets
import os
import sys
from pathlib import Path

# Ensure we're in the repo directory
REPO_DIR = "/content/credit_card_yolov12"
os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Find the actual split directory
DATASET_BASE = "/content/drive/MyDrive/credit_card_yolov12/datasets"
split_dirs = list(Path(DATASET_BASE).glob("*_split"))
if split_dirs:
    TEST_SETS_BASE = str(split_dirs[0])
    print(f"‚úÖ Found split directory: {TEST_SETS_BASE}")
else:
    TEST_SETS_BASE = f"{DATASET_BASE}/private_dataset_split"
    print(f"‚ö†Ô∏è Using fallback: {TEST_SETS_BASE}")

# Verify test occlusion directories exist
print(f"\nChecking test occlusion directories:")
for level in [0, 25, 50, 75]:
    test_dir = Path(TEST_SETS_BASE) / f"test_occlusion_{level}"
    if test_dir.exists():
        ann_file = test_dir / "train" / "_annotations.coco.json"
        if ann_file.exists():
            print(f"  ‚úÖ test_occlusion_{level}: {ann_file}")
        else:
            print(f"  ‚ö†Ô∏è test_occlusion_{level}: exists but no annotation file")
    else:
        print(f"  ‚ùå test_occlusion_{level}: not found")

BEST_MODEL = f"{DRIVE_MODEL_DIR}/weights/best.pt"
print(f"\nBest model: {BEST_MODEL}")
print(f"Test sets base: {TEST_SETS_BASE}")
print(f"Best model exists: {Path(BEST_MODEL).exists()}")

# Verify script exists
script_path = Path(REPO_DIR) / "src" / "evaluate_progressive.py"
if script_path.exists():
    print(f"‚úÖ Found script: {script_path}")
    !python src/evaluate_progressive.py \
        --model {BEST_MODEL} \
        --test-sets {TEST_SETS_BASE} \
        --output /content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private
else:
    print(f"‚ùå Script not found at: {script_path}")


## Step 8: Log Results to WandB


In [None]:
import json
import pandas as pd
from pathlib import Path

import wandb

# Load progressive evaluation results
results_file = "/content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private/progressive_results.json"

# Metrics expected for every occlusion level in the results file
METRIC_KEYS = ("mAP50", "mAP50_95", "precision", "recall", "f1")

if Path(results_file).exists():
    with open(results_file, 'r', encoding='utf-8') as f:
        eval_results = json.load(f)

    if wandb.run is None:
        # No active run (never initialised, or already finished): skip the
        # upload instead of crashing, but still show the results below.
        print("⚠️ No active Weights & Biases run; skipping upload (re-run Step 2 to log these results)")
    else:
        # Gather all levels into one payload so the whole evaluation is logged
        # as a single step rather than one step per occlusion level.
        log_payload = {}
        for occlusion_level, metrics in eval_results.items():
            for metric_key in METRIC_KEYS:
                if metric_key in metrics:
                    log_payload[f"{metric_key}_occlusion_{occlusion_level}"] = metrics[metric_key]
                else:
                    print(f"⚠️ '{metric_key}' missing for occlusion level {occlusion_level}; not logged")
        if log_payload:
            wandb.log(log_payload)

        # Log visualization
        plot_file = "/content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private/progressive_occlusion_results.png"
        if Path(plot_file).exists():
            wandb.log({"progressive_occlusion_plot": wandb.Image(plot_file)})

        print("✅ Results logged to Weights & Biases")

    print("\nProgressive Occlusion Results:")
    # One row per occlusion level, one column per metric
    df = pd.DataFrame(eval_results).T
    display(df)
else:
    print("⚠️ Results file not found")


In [None]:
# Finish wandb run (flushes pending data and closes the run on the dashboard)
wandb.finish()
print("✅ Training complete! Check your Weights & Biases dashboard.")
print(f"✅ Model saved to: {DRIVE_MODEL_DIR}")
print("✅ Results saved to: /content/drive/MyDrive/credit_card_yolov12/outputs/progressive_evaluation_private")
