In [7]:
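# This notebook combines the annotation .json files (and the images they reference)
# produced by the kf_vis and farm_data_prep notebooks into a single dataset.
# The string below is the original task description, kept for reference.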

"""
Ok great! 

I now have two datasets that both contain identically-named json and image folders.

The first dataset (called gopro dataset) has the following structure:

annotation files pathnames:

test file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_test_annotations.json
train file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/train_annotations.json
val file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_val_annotations.json

image files pathnames:
/Users/talenrimmer/Desktop/CV4E_Code/images


The second dataset (called farm dataset) has the following structure:

annotation files pathnames:

test file: /Users/talenrimmer/Desktop/CV4E_Code/cis_test_annotations.json
train file: /Users/talenrimmer/Desktop/CV4E_Code/train_annotations.json
val file: /Users/talenrimmer/Desktop/CV4E_Code/cis_val_annotations.json

image files pathnames:
/Users/talenrimmer/Desktop/CV4E_Code/eccv_18_all_images_sm

provide code to combine these two datasets into a single dataset with the following structure:

create a new folder called combined_dataset
create a new folder called eccv_18_annotation_files inside combined_dataset
combine the test json files from the two datasets into a single file called cis_test_annotations.json inside eccv_18_annotation_files
combine the train json files from the two datasets into a single file called train_annotations.json inside eccv_18_annotation_files
combine the val json files from the two datasets into a single file called cis_val_annotations.json inside eccv_18_annotation_files

create a new folder called eccv_18_all_images_sm inside combined_dataset

Then print the total number of files in each file type (json train, test, val, and images) in the combined datasets.
Print a method to verify that the combined dataset is correct.

"""

'\nOk great! \n\nI now have two datasets that both contain identically-named json and image folders.\n\nThe first dataset (called gopro dataset) has the following structure:\n\nannotation files pathnames:\n\ntest file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_test_annotations.json\ntrain file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/train_annotations.json\nval file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_val_annotations.json\n\nimage files pathnames:\n/Users/talenrimmer/Desktop/CV4E_Code/images\n\n\nThe second dataset (called farm dataset) has the following structure:\n\nannotation files pathnames:\n\ntest file: /Users/talenrimmer/Desktop/CV4E_Code/cis_test_annotations.json\ntrain file: /Users/talenrimmer/Desktop/CV4E_Code/train_annotations.json\nval file: /Users/talenrimmer/Desktop/CV4E_Code/cis_val_annotations.json\n\nimage files pathnames:\n/Use

In [10]:
import json
import shutil
from pathlib import Path
from collections import Counter

def setup_directories(base_dir='/Users/talenrimmer/Desktop/CV4E_Code/combined_dataset'):
    """Create a fresh, empty directory tree for the combined dataset.

    WARNING: any existing tree at ``base_dir`` is deleted first, so previous
    merge results are lost.

    Args:
        base_dir: Root folder of the combined dataset (str or Path). Defaults
            to the location this notebook has always used, so calling the
            function without arguments behaves exactly as before.

    Returns:
        Tuple ``(base_dir, ann_dir, img_dir)`` of ``Path`` objects, where
        ``ann_dir`` is ``base_dir/eccv_18_annotation_files`` and ``img_dir``
        is ``base_dir/eccv_18_all_images_sm``.
    """
    base_dir = Path(base_dir)
    ann_dir = base_dir / 'eccv_18_annotation_files'
    img_dir = base_dir / 'eccv_18_all_images_sm'

    # Removing the root also removes both sub-folders, so one rmtree is enough.
    if base_dir.exists():
        shutil.rmtree(base_dir)

    # parents=True recreates base_dir as a side effect of creating ann_dir.
    for directory in (ann_dir, img_dir):
        directory.mkdir(parents=True)

    return base_dir, ann_dir, img_dir

def safe_copy_image(src_path, dst_path):
    """Best-effort copy of one image file.

    Args:
        src_path: ``Path`` of the file to copy.
        dst_path: Destination path handed to ``shutil.copy2``.

    Returns:
        True when the source exists and the copy succeeded. False when the
        source is missing or any exception occurred; exceptions are printed,
        never raised.
    """
    copied = False
    try:
        if not src_path.exists():
            return copied
        shutil.copy2(src_path, dst_path)
        copied = True
    except Exception as err:
        print(f"Error copying {src_path}: {err}")
    return copied

def merge_datasets():
    """Merge the GoPro and Farm datasets, split by split, into ``combined_dataset``.

    For each of the train/val/test annotation files, the same-named JSON is
    loaded from both datasets. Every image whose file can be copied into the
    combined image folder is kept together with the annotation found at the
    same position in the source file; both receive a fresh sequential id that
    restarts at 1 for each split. The merged JSON reuses the GoPro file's
    ``info`` and ``categories`` and is written under the split's file name.

    Images and annotations are paired by position, so a warning is printed
    when the two lists differ in length (the surplus entries are dropped).
    A warning is also printed when a copy replaced a same-named file that an
    earlier record had already put in the output folder.

    Relies on ``setup_directories`` (which wipes and recreates the output
    tree) and ``safe_copy_image`` defined in this notebook.
    """
    # Setup paths
    gopro_ann_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files')
    farm_ann_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code')
    gopro_img_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/images')
    farm_img_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/eccv_18_all_images_sm')

    # (label, annotation folder, image folder). GoPro comes first so its
    # records receive the lowest ids within every split.
    sources = [
        ('gopro', gopro_ann_dir, gopro_img_dir),
        ('farm', farm_ann_dir, farm_img_dir),
    ]

    # Create directories
    base_dir, ann_dir, img_dir = setup_directories()

    # Track skipped items and silently replaced destination files
    total_skipped = {'gopro': 0, 'farm': 0}
    overwritten = 0

    # Process each split
    splits = ['train_annotations.json', 'cis_val_annotations.json', 'cis_test_annotations.json']

    for split in splits:
        print(f"\nProcessing {split}...")

        # Load both JSONs up front (closing the files) so a missing file
        # fails before anything is copied for this split.
        loaded = {}
        for label, source_ann_dir, _ in sources:
            with open(source_ann_dir / split) as f:
                loaded[label] = json.load(f)

        # Track valid images
        valid_images = []
        valid_annotations = []
        next_id = 1
        split_skipped = {'gopro': 0, 'farm': 0}

        for label, _, source_img_dir in sources:
            images = loaded[label]['images']
            annotations = loaded[label]['annotations']

            # zip() stops at the shorter list; make that visible.
            if len(images) != len(annotations):
                print(
                    f"Warning: {label} {split} has {len(images)} images but "
                    f"{len(annotations)} annotations; unpaired entries are dropped"
                )

            for img, ann in zip(images, annotations):
                src_path = source_img_dir / img['file_name']
                dst_path = img_dir / img['file_name']
                # img_dir starts empty, so an existing destination means an
                # earlier record already copied a file with this name.
                replaces_earlier_copy = dst_path.exists()

                if safe_copy_image(src_path, dst_path):
                    img['id'] = next_id
                    ann['image_id'] = next_id
                    valid_images.append(img)
                    valid_annotations.append(ann)
                    next_id += 1
                    if replaces_earlier_copy:
                        overwritten += 1
                else:
                    split_skipped[label] += 1
                    total_skipped[label] += 1

        # Create merged data
        merged_data = {
            "info": loaded['gopro'].get('info', {}),
            "images": valid_images,
            "annotations": valid_annotations,
            "categories": loaded['gopro'].get('categories', [])
        }

        # Save merged JSON
        with open(ann_dir / split, 'w') as f:
            json.dump(merged_data, f, indent=4)

        print(f"Images processed: {len(valid_images)}")
        print(f"Annotations processed: {len(valid_annotations)}")
        print(f"Images skipped in this split - GoPro: {split_skipped['gopro']}, Farm: {split_skipped['farm']}")
        cats = Counter(ann['category_id'] for ann in valid_annotations)
        print(f"Category distribution: {dict(cats)}")

    # Final summary (only *.png files directly inside img_dir are counted)
    print(f"\nTotal images in combined directory: {len(list(img_dir.glob('*.png')))}")
    print(f"Total images skipped - GoPro: {total_skipped['gopro']}, Farm: {total_skipped['farm']}")
    if overwritten:
        print(f"Warning: {overwritten} copies replaced an earlier file with the same name")

# Run the merge when this code is executed directly. In a notebook cell
# __name__ is "__main__", so the merge starts as soon as the cell runs.
if __name__ == "__main__":
    merge_datasets()


Processing train_annotations.json...
Images processed: 968
Annotations processed: 968
Images skipped in this split - GoPro: 0, Farm: 0
Category distribution: {1: 482, 0: 486}

Processing cis_val_annotations.json...
Images processed: 278
Annotations processed: 278
Images skipped in this split - GoPro: 0, Farm: 0
Category distribution: {1: 141, 0: 137}

Processing cis_test_annotations.json...
Images processed: 140
Annotations processed: 140
Images skipped in this split - GoPro: 0, Farm: 0
Category distribution: {1: 70, 0: 70}

Total images in combined directory: 892
Total images skipped - GoPro: 0, Farm: 0


In [9]:
import json
import shutil
from pathlib import Path
from collections import Counter
import random

def setup_directories(base_dir='/Users/talenrimmer/Desktop/CV4E_Code/combined_dataset'):
    """Create a fresh, empty directory tree for the combined dataset.

    WARNING: any existing tree at ``base_dir`` is deleted first, so previous
    merge results are lost.

    Args:
        base_dir: Root folder of the combined dataset (str or Path). Defaults
            to the location this notebook has always used, so calling the
            function without arguments behaves exactly as before.

    Returns:
        Tuple ``(base_dir, ann_dir, img_dir)`` of ``Path`` objects, where
        ``ann_dir`` is ``base_dir/eccv_18_annotation_files`` and ``img_dir``
        is ``base_dir/eccv_18_all_images_sm``.
    """
    base_dir = Path(base_dir)
    ann_dir = base_dir / 'eccv_18_annotation_files'
    img_dir = base_dir / 'eccv_18_all_images_sm'

    # Removing the root also removes both sub-folders, so one rmtree is enough.
    if base_dir.exists():
        shutil.rmtree(base_dir)

    # parents=True recreates base_dir as a side effect of creating ann_dir.
    for directory in (ann_dir, img_dir):
        directory.mkdir(parents=True)

    return base_dir, ann_dir, img_dir

def safe_copy_image(src_path, dst_path):
    """Best-effort copy of one image file.

    Args:
        src_path: ``Path`` of the file to copy.
        dst_path: Destination path handed to ``shutil.copy2``.

    Returns:
        True when the source exists and the copy succeeded. False when the
        source is missing or any exception occurred; exceptions are printed,
        never raised.
    """
    succeeded = False
    try:
        source_present = src_path.exists()
        if source_present:
            shutil.copy2(src_path, dst_path)
            succeeded = True
    except Exception as err:
        print(f"Error copying {src_path}: {err}")
    return succeeded

def split_by_ratio(data, ratios=(0.7, 0.2, 0.1), seed=None):
    """Shuffle ``data`` and cut it into consecutive chunks sized by ``ratios``.

    Each chunk starts with ``int(len(data) * ratio)`` items. When the ratios
    sum to 1, the items that truncation would otherwise leave out are handed
    out one at a time to the chunks with the largest fractional remainder
    (earlier chunks win ties), so every item lands in exactly one chunk.
    When the ratios do not sum to 1 the chunk sizes are left truncated and
    the remaining items are not returned.

    Args:
        data: Sequence of items to split; it is indexed, not modified.
        ratios: Fraction of the data for each chunk, in output order.
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` and the result is reproducible. When
            None, the module-level ``random`` state is used.

    Returns:
        A list with one list of items per entry in ``ratios``.
    """
    n = len(data)
    indices = list(range(n))
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(indices)

    exact_sizes = [n * ratio for ratio in ratios]
    sizes = [int(value) for value in exact_sizes]

    if sizes and abs(sum(ratios) - 1.0) < 1e-9:
        # int() rounds every chunk down; give the lost items back.
        leftover = max(0, n - sum(sizes))
        # sorted() is stable even with reverse=True, so ties keep chunk order.
        by_remainder = sorted(
            range(len(sizes)),
            key=lambda i: exact_sizes[i] - sizes[i],
            reverse=True,
        )
        for i in by_remainder[:leftover]:
            sizes[i] += 1

    chunks = []
    start = 0
    for size in sizes:
        chunks.append([data[i] for i in indices[start:start + size]])
        start += size
    return chunks

def merge_datasets():
    """Pool the GoPro and Farm datasets and re-split them per category.

    Every annotation file listed for each dataset is loaded, and each image
    whose file can be copied into the combined image folder is kept together
    with the annotation found at the same position in the source file. Kept
    records get one running id across both datasets. Records are then
    divided into category 1 and everything else, each group is passed to
    ``split_by_ratio``, and the pieces are written as the train/val/test
    annotation files of the combined dataset.

    Images and annotations are paired by position, so a warning is printed
    when the two lists differ in length (the surplus entries are dropped).

    NOTE: the shuffle inside ``split_by_ratio`` is not seeded here, so each
    run can produce different splits.

    Relies on ``setup_directories`` (which wipes and recreates the output
    tree), ``safe_copy_image`` and ``split_by_ratio`` from this notebook.
    """
    # Setup paths with new farm locations
    gopro_ann_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files')
    farm_ann_dir = Path('/Users/talenrimmer/Desktop/All_training_data')
    gopro_img_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/images')
    farm_img_dir = Path('/Users/talenrimmer/Desktop/All_training_data/eccv_18_all_images_sm')

    base_dir, ann_dir, img_dir = setup_directories()

    # (label, annotation folder, image folder, annotation files to read).
    # Only the train and test files are read for the farm dataset.
    sources = [
        ('gopro', gopro_ann_dir, gopro_img_dir,
         ['train_annotations.json', 'cis_val_annotations.json', 'cis_test_annotations.json']),
        ('farm', farm_ann_dir, farm_img_dir,
         ['train_annotations.json', 'cis_test_annotations.json']),
    ]

    # Collect all valid image/annotation pairs, split by category
    cat_1_records = []
    cat_0_records = []
    next_id = 1

    for label, source_ann_dir, source_img_dir, split_files in sources:
        for split in split_files:
            # Context manager so the annotation file is closed after reading.
            with open(source_ann_dir / split) as f:
                data = json.load(f)

            images = data['images']
            annotations = data['annotations']
            # zip() stops at the shorter list; make that visible.
            if len(images) != len(annotations):
                print(
                    f"Warning: {label} {split} has {len(images)} images but "
                    f"{len(annotations)} annotations; unpaired entries are dropped"
                )

            for img, ann in zip(images, annotations):
                src_path = source_img_dir / img['file_name']
                if safe_copy_image(src_path, img_dir / img['file_name']):
                    img['id'] = next_id
                    ann['image_id'] = next_id
                    next_id += 1
                    record = {'image': img, 'annotation': ann}
                    if ann['category_id'] == 1:
                        cat_1_records.append(record)
                    else:
                        cat_0_records.append(record)

    # Split each category 70/20/10 (category 1 first, then the rest)
    cat_1_splits = split_by_ratio(cat_1_records)
    cat_0_splits = split_by_ratio(cat_0_records)

    # Combine splits
    splits_data = {
        'train_annotations.json': (cat_1_splits[0], cat_0_splits[0]),
        'cis_val_annotations.json': (cat_1_splits[1], cat_0_splits[1]),
        'cis_test_annotations.json': (cat_1_splits[2], cat_0_splits[2])
    }

    # Save splits; distinct loop names avoid rebinding the full record lists
    for filename, (split_cat_1, split_cat_0) in splits_data.items():
        records = split_cat_1 + split_cat_0
        output = {
            "info": {"description": "Combined dataset"},
            "images": [r['image'] for r in records],
            "annotations": [r['annotation'] for r in records],
            "categories": [
                {"id": 1, "name": "forage_fish"},
                {"id": 0, "name": "other"}
            ]
        }

        with open(ann_dir / filename, 'w') as f:
            json.dump(output, f, indent=4)

        print(f"\n{filename}:")
        print(f"Category 1: {len(split_cat_1)}")
        print(f"Category 0: {len(split_cat_0)}")
        print(f"Total: {len(records)}")

    # Only *.png files directly inside img_dir are counted
    print(f"\nTotal images in combined directory: {len(list(img_dir.glob('*.png')))}")

# Run the merge when this code is executed directly. In a notebook cell
# __name__ is "__main__", so the merge starts as soon as the cell runs.
if __name__ == "__main__":
    merge_datasets()

KeyboardInterrupt: 

In [7]:
import json
import shutil
from pathlib import Path
from collections import Counter
import random

def setup_directories(base_dir='/Users/talenrimmer/Desktop/CV4E_Code/combined_dataset'):
    """Create a fresh, empty directory tree for the combined dataset.

    WARNING: any existing tree at ``base_dir`` is deleted first, so previous
    merge results are lost.

    Args:
        base_dir: Root folder of the combined dataset (str or Path). Defaults
            to the location this notebook has always used, so calling the
            function without arguments behaves exactly as before.

    Returns:
        Tuple ``(base_dir, ann_dir, img_dir)`` of ``Path`` objects, where
        ``ann_dir`` is ``base_dir/eccv_18_annotation_files`` and ``img_dir``
        is ``base_dir/eccv_18_all_images_sm``.
    """
    base_dir = Path(base_dir)
    ann_dir = base_dir / 'eccv_18_annotation_files'
    img_dir = base_dir / 'eccv_18_all_images_sm'

    # Removing the root also removes both sub-folders, so one rmtree is enough.
    if base_dir.exists():
        shutil.rmtree(base_dir)

    # parents=True recreates base_dir as a side effect of creating ann_dir.
    for directory in (ann_dir, img_dir):
        directory.mkdir(parents=True)

    return base_dir, ann_dir, img_dir

def count_category_1(json_path):
    """Return how many annotations in a JSON file have ``category_id`` 1.

    Args:
        json_path: Path of an annotation file with a top-level
            ``annotations`` list.

    Returns:
        The number of matching annotations, or 0 when the file does not exist.
    """
    try:
        with open(json_path) as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        return 0

    matches = 0
    for annotation in payload['annotations']:
        if annotation['category_id'] == 1:
            matches += 1
    return matches

def safe_copy_image(src_path, dst_path, copied_files):
    """Copy an image unless a file with the same name was already copied.

    Args:
        src_path: ``Path`` of the file to copy.
        dst_path: ``Path`` of the destination; its ``name`` is the key used
            for duplicate detection.
        copied_files: Set of destination file names copied so far. The name
            is added to it after a successful copy.

    Returns:
        True when the file was copied. False when the source is missing, the
        name is already in ``copied_files``, or any exception occurred;
        exceptions are printed, never raised.
    """
    try:
        if not src_path.exists():
            return False
        target_name = dst_path.name
        if target_name in copied_files:
            return False
        shutil.copy2(src_path, dst_path)
        copied_files.add(target_name)
        return True
    except Exception as err:
        print(f"Error copying {src_path}: {err}")
    return False

def split_by_ratio(data, ratios=(0.7, 0.2, 0.1), seed=None):
    """Shuffle ``data`` and cut it into consecutive chunks sized by ``ratios``.

    Each chunk starts with ``int(len(data) * ratio)`` items. When the ratios
    sum to 1, the items that truncation would otherwise leave out are handed
    out one at a time to the chunks with the largest fractional remainder
    (earlier chunks win ties), so every item lands in exactly one chunk.
    When the ratios do not sum to 1 the chunk sizes are left truncated and
    the remaining items are not returned.

    Args:
        data: Sequence of items to split; it is indexed, not modified.
        ratios: Fraction of the data for each chunk, in output order.
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` and the result is reproducible. When
            None, the module-level ``random`` state is used.

    Returns:
        A list with one list of items per entry in ``ratios``.
    """
    n = len(data)
    indices = list(range(n))
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(indices)

    exact_sizes = [n * ratio for ratio in ratios]
    sizes = [int(value) for value in exact_sizes]

    if sizes and abs(sum(ratios) - 1.0) < 1e-9:
        # int() rounds every chunk down; give the lost items back.
        leftover = max(0, n - sum(sizes))
        # sorted() is stable even with reverse=True, so ties keep chunk order.
        by_remainder = sorted(
            range(len(sizes)),
            key=lambda i: exact_sizes[i] - sizes[i],
            reverse=True,
        )
        for i in by_remainder[:leftover]:
            sizes[i] += 1

    chunks = []
    start = 0
    for size in sizes:
        chunks.append([data[i] for i in indices[start:start + size]])
        start += size
    return chunks

def merge_datasets():
    """Pool the GoPro and Farm datasets, de-duplicated by file name, and re-split.

    Steps:
      1. Print the category-1 count of every source annotation file.
      2. Recreate the output folders via ``setup_directories`` (this deletes
         any previous combined dataset).
      3. For every source annotation file, keep each image that
         ``safe_copy_image`` copies (it refuses missing files and file names
         that were already copied) together with the annotation found at the
         same position; kept records get one running id across both datasets.
      4. Divide the records into category 1 and everything else, pass each
         group to ``split_by_ratio``, and write the train/val/test files.

    A farm annotation file that does not exist is reported and skipped; a
    missing GoPro annotation file raises. Images and annotations are paired
    by position, so a warning is printed when the two lists differ in length
    (the surplus entries are dropped).

    NOTE: the shuffle inside ``split_by_ratio`` is not seeded here, so each
    run can produce different splits.
    """
    # Setup paths
    gopro_ann_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs')
    farm_ann_dir = Path('/Users/talenrimmer/Desktop/All_training_data')
    gopro_img_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/images')
    farm_img_dir = Path('/Users/talenrimmer/Desktop/All_training_data/eccv_18_all_images_sm')

    split_files = ['train_annotations.json', 'cis_val_annotations.json', 'cis_test_annotations.json']

    # Print initial category 1 counts
    print("\nInitial Category 1 Counts:")
    for heading, source_ann_dir in (("\nGoPro Dataset:", gopro_ann_dir),
                                    ("\nFarm Dataset:", farm_ann_dir)):
        print(heading)
        for split in split_files:
            count = count_category_1(source_ann_dir / split)
            print(f"{split}: {count} category 1 examples")

    base_dir, ann_dir, img_dir = setup_directories()

    # Track copied files
    copied_files = set()

    # Kept records, already divided by category
    cat_1_records = []
    cat_0_records = []
    next_id = 1

    def add_records(data, source_img_dir, label, split):
        """Copy the images of one loaded annotation file and keep the copied pairs."""
        nonlocal next_id
        images = data['images']
        annotations = data['annotations']
        # zip() stops at the shorter list; make that visible.
        if len(images) != len(annotations):
            print(
                f"Warning: {label} {split} has {len(images)} images but "
                f"{len(annotations)} annotations; unpaired entries are dropped"
            )
        for img, ann in zip(images, annotations):
            src_path = source_img_dir / img['file_name']
            dst_path = img_dir / img['file_name']
            if safe_copy_image(src_path, dst_path, copied_files):
                img['id'] = next_id
                ann['image_id'] = next_id
                next_id += 1
                record = {'image': img, 'annotation': ann}
                if ann['category_id'] == 1:
                    cat_1_records.append(record)
                else:
                    cat_0_records.append(record)

    # Process GoPro dataset (a missing annotation file is an error here)
    for split in split_files:
        with open(gopro_ann_dir / split) as f:
            data = json.load(f)
        add_records(data, gopro_img_dir, 'gopro', split)

    # Process Farm dataset (a missing annotation file is only reported)
    for split in split_files:
        try:
            with open(farm_ann_dir / split) as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"Warning: {split} not found in farm dataset")
            continue
        add_records(data, farm_img_dir, 'farm', split)

    # Split each category 70/20/10 (category 1 first, then the rest)
    cat_1_splits = split_by_ratio(cat_1_records)
    cat_0_splits = split_by_ratio(cat_0_records)

    # Combine splits
    splits_data = {
        'train_annotations.json': (cat_1_splits[0], cat_0_splits[0]),
        'cis_val_annotations.json': (cat_1_splits[1], cat_0_splits[1]),
        'cis_test_annotations.json': (cat_1_splits[2], cat_0_splits[2])
    }

    # Save splits; distinct loop names avoid rebinding the full record lists
    for filename, (split_cat_1, split_cat_0) in splits_data.items():
        records = split_cat_1 + split_cat_0
        output = {
            "info": {"description": "Combined dataset"},
            "images": [r['image'] for r in records],
            "annotations": [r['annotation'] for r in records],
            "categories": [
                {"id": 1, "name": "forage_fish"},
                {"id": 0, "name": "other"}
            ]
        }

        with open(ann_dir / filename, 'w') as f:
            json.dump(output, f, indent=4)

        print(f"\n{filename}:")
        print(f"Category 1: {len(split_cat_1)}")
        print(f"Category 0: {len(split_cat_0)}")
        print(f"Total: {len(records)}")

    print(f"\nTotal unique images copied: {len(copied_files)}")
    # Only *.png files directly inside img_dir are counted
    print(f"Total images in combined directory: {len(list(img_dir.glob('*.png')))}")

# Run the merge when this code is executed directly. In a notebook cell
# __name__ is "__main__", so the merge starts as soon as the cell runs.
if __name__ == "__main__":
    merge_datasets()


Initial Category 1 Counts:

GoPro Dataset:
train_annotations.json: 172 category 1 examples
cis_val_annotations.json: 50 category 1 examples
cis_test_annotations.json: 25 category 1 examples

Farm Dataset:
train_annotations.json: 306 category 1 examples
cis_val_annotations.json: 95 category 1 examples
cis_test_annotations.json: 45 category 1 examples
Error copying /Users/talenrimmer/Desktop/All_training_data/eccv_18_all_images_sm/KelpCam06_20220822T160156_1600x1200_awb-auto_exp-night_fr-10_q-20_sh-0_b-50_c-0_i-800_sat-0._FRAME_2430_raw.png: [Errno 28] No space left on device: '/Users/talenrimmer/Desktop/All_training_data/eccv_18_all_images_sm/KelpCam06_20220822T160156_1600x1200_awb-auto_exp-night_fr-10_q-20_sh-0_b-50_c-0_i-800_sat-0._FRAME_2430_raw.png' -> '/Users/talenrimmer/Desktop/CV4E_Code/combined_dataset/eccv_18_all_images_sm/KelpCam06_20220822T160156_1600x1200_awb-auto_exp-night_fr-10_q-20_sh-0_b-50_c-0_i-800_sat-0._FRAME_2430_raw.png'
Error copying /Users/talenrimmer/Desktop/Al

KeyboardInterrupt: 