In [1]:
# CREATED JAN 22nd 2025 - morning

import os
import json
import shutil
import random
from pathlib import Path
from sklearn.model_selection import train_test_split
from datetime import datetime
from collections import Counter

def load_class_names():
    return [ 'Pinnipedia',  'Z. californianus',  'Else/Other',  'Gelatinous Object',  'Hydromedusae',  
            'P. bachei',  'Ctenophora',  'A. flavidus',  'Cydippida',  'Polyorchis sp.',  'Aequorea sp.',  
            'Hexagrammidae',  'Actinopterygii',  'Drift Algae',  'M. Cellularia',  'Scyphomedusae',  
            'Eutonina sp.',  'Sarsia sp.',  'Cnidaria',  'Background',  'A. Labiata',  'Embiotocidae',  
            'C. aggregata',  'Bolinopsidae',  'C. aggregata (dark morph)',  'R. vacca',  'Fish',  
            'F. Fish (unk)',  'Sch-Embiotocidae',  'Ex-Embiotocidae',  'E. Lateralis',  'Sch-C.pallasii',  
            'Ex-C. pallasii',  'Clupeidae',  'Ex-Clupeidae',  'B. Frenatus',  'Ex-C. aggregata',  
            'Sch-C. aggregata',  'Diving Birds',  'TBD 44',  'Scyphozoa',  'N. breviconis',  'C. pallasii']

def get_binary_category(class_id, class_names):
    if class_id >= len(class_names):
        return 0
    
    forage_fish_classes = [
        'F. Fish (unk)',
        'Sch-C.pallasii',
        'Ex-C. pallasii', 
        'Clupeidae',
        'Ex-Clupeidae',
        'Sch-E. mordax',
        'Ex-E. mordax',
    ]
    class_name = class_names[class_id]
    return 1 if class_name in forage_fish_classes else 0

def read_label_safely(label_path):
    try:
        with open(label_path) as f:
            for line in f:
                if line.strip():
                    parts = line.strip().split()
                    if parts:
                        return int(parts[0])
    except Exception as e:
        print(f"Error reading {label_path}: {e}")
    return None

def create_data_template():
    return {
        "info": {
            "description": "ff_test_data",
            "year": 2025,
            "contributor": "Talen",
            "date_created": datetime.now().strftime("%Y-%m-%d")
        },
        "images": [],
        "annotations": [],
        "categories": [
            {"id": 1, "name": "forage_fish"},
            {"id": 0, "name": "other"}
        ]
    }

def process_dataset():
    # =====================================================================
    # CHANGE IMAGE PATH HERE WHEN MOVING TO NEW DIRECTORY
    # =====================================================================
    img_dir = Path('/mnt/class_data/group4/talen/fish/images/train')
    
    # =====================================================================
    # CHANGE LABEL PATH HERE WHEN MOVING TO NEW DIRECTORY
    # =====================================================================
    label_dir = Path('/mnt/class_data/group4/talen/fish/labels/train')
    
    output_img_dir = Path.cwd() / '/mnt/class_data/group4/talen/eccv_18_all_images_sm'
    print(f"\nOutput directory for images will be created at: {output_img_dir}")
    
    if output_img_dir.exists():
        shutil.rmtree(output_img_dir)
    output_img_dir.mkdir(exist_ok=True)

    class_names = load_class_names()
    all_samples = []
    processed = 0
    unlabeled = 0

    print("\nProcessing images and labels...")
    for img_path in img_dir.glob('*.png'):
        label_path = label_dir / f"{img_path.stem}.txt"
        
        record = {
            'image_id': processed + 1,
            'file_name': img_path.name,
            'category_id': 0,  # Default to 0 for unlabeled images
            'img_path': img_path
        }

        if label_path.exists():
            class_id = read_label_safely(label_path)
            if class_id is not None and class_id < len(class_names):
                record['category_id'] = get_binary_category(class_id, class_names)
        else:
            unlabeled += 1

        all_samples.append(record)
        processed += 1

    print(f"\nDataset Summary:")
    category_counts = Counter(r['category_id'] for r in all_samples)
    print(f"Total images processed: {len(all_samples)}")
    print(f"Category 1 (forage fish): {category_counts[1]}")
    print(f"Category 0 (other/unlabeled): {category_counts[0]}")
    print(f"Unlabeled images: {unlabeled}")

    print("\nCopying images to output directory...")
    copied_images = set()
    for record in all_samples:
        if record['file_name'] not in copied_images:
            shutil.copy2(record['img_path'], output_img_dir / record['file_name'])
            copied_images.add(record['file_name'])

    train_records, temp = train_test_split(all_samples, test_size=0.3, random_state=42)
    val_records, test_records = train_test_split(temp, test_size=1/3, random_state=42)

    splits = {
        'train_annotations.json': train_records,
        'cis_val_annotations.json': val_records,
        'cis_test_annotations.json': test_records
    }

    print("\nCreating JSON annotation files...")
    for name, records in splits.items():
        counts = Counter(r['category_id'] for r in records)
        print(f"\n{name}:")
        print(f"Total images: {len(records)}")
        print(f"Category 1 (forage fish): {counts[1]}")
        print(f"Category 0 (other): {counts[0]}")

        output = create_data_template()
        output["images"] = [{"id": r["image_id"], "file_name": r["file_name"]} for r in records]
        output["annotations"] = [{"image_id": r["image_id"], "category_id": r["category_id"]} for r in records]
        
        output_path = Path.cwd() / name
        with open(output_path, 'w') as f:
            json.dump(output, f, indent=4)
        print(f"Saved annotation file: {output_path}")

if __name__ == "__main__":
    process_dataset()


Output directory for images will be created at: /mnt/class_data/group4/talen/eccv_18_all_images_sm

Processing images and labels...

Dataset Summary:
Total images processed: 212578
Category 1 (forage fish): 446
Category 0 (other/unlabeled): 212132
Unlabeled images: 0

Copying images to output directory...

Creating JSON annotation files...

train_annotations.json:
Total images: 148804
Category 1 (forage fish): 309
Category 0 (other): 148495
Saved annotation file: /home/Talen/train_annotations.json

cis_val_annotations.json:
Total images: 42516
Category 1 (forage fish): 86
Category 0 (other): 42430
Saved annotation file: /home/Talen/cis_val_annotations.json

cis_test_annotations.json:
Total images: 21258
Category 1 (forage fish): 51
Category 0 (other): 21207
Saved annotation file: /home/Talen/cis_test_annotations.json


In [1]:
import os
import json
import shutil
import random
import re
from pathlib import Path
from sklearn.model_selection import train_test_split
from datetime import datetime
from collections import Counter
from itertools import groupby

def load_class_names():
    return [ 'Pinnipedia',  'Z. californianus',  'Else/Other',  'Gelatinous Object',  'Hydromedusae',  
            'P. bachei',  'Ctenophora',  'A. flavidus',  'Cydippida',  'Polyorchis sp.',  'Aequorea sp.',  
            'Hexagrammidae',  'Actinopterygii',  'Drift Algae',  'M. Cellularia',  'Scyphomedusae',  
            'Eutonina sp.',  'Sarsia sp.',  'Cnidaria',  'Background',  'A. Labiata',  'Embiotocidae',  
            'C. aggregata',  'Bolinopsidae',  'C. aggregata (dark morph)',  'R. vacca',  'Fish',  
            'F. Fish (unk)',  'Sch-Embiotocidae',  'Ex-Embiotocidae',  'E. Lateralis',  'Sch-C.pallasii',  
            'Ex-C. pallasii',  'Clupeidae',  'Ex-Clupeidae',  'B. Frenatus',  'Ex-C. aggregata',  
            'Sch-C. aggregata',  'Diving Birds',  'TBD 44',  'Scyphozoa',  'N. breviconis',  'C. pallasii']

def get_binary_category(class_id, class_names):
    if class_id >= len(class_names):
        return 0
    
    forage_fish_classes = [
        'F. Fish (unk)',
        'Sch-C.pallasii',
        'Ex-C. pallasii', 
        'Clupeidae',
        'Ex-Clupeidae',
        # 'Ex-Embiotocidae',
        # 'Sch-Embiotocidae',
        'Sch-E. mordax',
        'Ex-E. mordax',
    ]
    class_name = class_names[class_id]
    return 1 if class_name in forage_fish_classes else 0

def read_label_safely(label_path):
    try:
        with open(label_path) as f:
            for line in f:
                if line.strip():
                    parts = line.strip().split()
                    if parts:
                        return int(parts[0])
    except Exception as e:
        print(f"Error reading {label_path}: {e}")
    return None

def create_data_template():
    return {
        "info": {
            "description": "ff_test_data",
            "year": 2025,
            "contributor": "Talen",
            "date_created": datetime.now().strftime("%Y-%m-%d")
        },
        "images": [],
        "annotations": [],
        "categories": [
            {"id": 1, "name": "forage_fish"},
            {"id": 0, "name": "other"}
        ]
    }

def extract_video_id(filename):
    """Extract timestamp/video ID from filename"""
    match = re.search(r'(\d{8}T\d{6})', filename)
    return match.group(1) if match else None

def process_dataset():
    img_dir = Path('/Users/talenrimmer/Desktop/All_training_data/fish/images/train')
    label_dir = Path('/Users/talenrimmer/Desktop/All_training_data/fish/labels/train')
    output_img_dir = Path.cwd() / 'eccv_18_all_images_sm'
    
    if output_img_dir.exists():
        shutil.rmtree(output_img_dir)
    output_img_dir.mkdir(exist_ok=True)

    class_names = load_class_names()
    video_records = {}
    processed = 0

    # Group by video ID
    for img_path in img_dir.glob('*.png'):
        label_path = label_dir / f"{img_path.stem}.txt"
        if not label_path.exists():
            continue

        class_id = read_label_safely(label_path)
        if class_id is None or class_id >= len(class_names):
            continue

        video_id = extract_video_id(img_path.name)
        if video_id is None:
            continue

        record = {
            'image_id': processed,
            'file_name': img_path.name,
            'category_id': get_binary_category(class_id, class_names),
            'img_path': img_path,
            'video_id': video_id
        }
        
        if video_id not in video_records:
            video_records[video_id] = []
        video_records[video_id].append(record)
        processed += 1

    # Split videos maintaining independence
    video_ids = list(video_records.keys())
    random.shuffle(video_ids)
    
    n_videos = len(video_ids)
    train_idx = int(0.7 * n_videos)
    val_idx = int(0.9 * n_videos)
    
    train_videos = video_ids[:train_idx]
    val_videos = video_ids[train_idx:val_idx]
    test_videos = video_ids[val_idx:]

    # Combine records by split
    train_records = [r for vid in train_videos for r in video_records[vid]]
    val_records = [r for vid in val_videos for r in video_records[vid]]
    test_records = [r for vid in test_videos for r in video_records[vid]]

    # Balance categories within each split
    def balance_split(records):
        cat_1 = [r for r in records if r['category_id'] == 1]
        cat_0 = [r for r in records if r['category_id'] == 0]
        n_samples = min(len(cat_1), len(cat_0))
        if n_samples == 0:
            return []
        return random.sample(cat_1, n_samples) + random.sample(cat_0, n_samples)

    train_records = balance_split(train_records)
    val_records = balance_split(val_records)
    test_records = balance_split(test_records)

    # Copy images
    for record in train_records + val_records + test_records:
        shutil.copy2(record['img_path'], output_img_dir / record['file_name'])

    splits = {
        'train_annotations.json': train_records,
        'cis_val_annotations.json': val_records,
        'cis_test_annotations.json': test_records
    }

    print("\nVideo Distribution in Splits:")
    for name, records in splits.items():
        videos = set(r['video_id'] for r in records)
        cats = Counter(r['category_id'] for r in records)
        print(f"\n{name}:")
        print(f"Unique videos: {len(videos)}")
        print(f"Total images: {len(records)}")
        print(f"Category 1 (forage fish): {cats[1]}")
        print(f"Category 0 (other): {cats[0]}")
        print(f"Videos included: {sorted(list(videos))}")

        output = create_data_template()
        output["images"] = [{"id": r["image_id"], "file_name": r["file_name"]} for r in records]
        output["annotations"] = [{"image_id": r["image_id"], "category_id": r["category_id"]} for r in records]
        
        with open(name, 'w') as f:
            json.dump(output, f, indent=4)

if __name__ == "__main__":
    process_dataset()


Video Distribution in Splits:

train_annotations.json:
Unique videos: 0
Total images: 0
Category 1 (forage fish): 0
Category 0 (other): 0
Videos included: []

cis_val_annotations.json:
Unique videos: 0
Total images: 0
Category 1 (forage fish): 0
Category 0 (other): 0
Videos included: []

cis_test_annotations.json:
Unique videos: 0
Total images: 0
Category 1 (forage fish): 0
Category 0 (other): 0
Videos included: []
