In [None]:
# This Jupyter notebook will combine the .json files from the kf_vis and farm_data_prep notebooks

"""
Ok great! 

I now have two datasets that both contain identically-named json and image folders.

The first dataset (called gopro dataset) has the following structure:

annotation files pathnames:

test file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_test_annotations.json
train file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/train_annotations.json
val file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_val_annotations.json

image files pathnames:
/Users/talenrimmer/Desktop/CV4E_Code/images


The second dataset (called farm dataset) has the following structure:

annotation files pathnames:

test file: /Users/talenrimmer/Desktop/CV4E_Code/cis_test_annotations.json
train file: /Users/talenrimmer/Desktop/CV4E_Code/train_annotations.json
val file: /Users/talenrimmer/Desktop/CV4E_Code/cis_val_annotations.json

image files pathnames:
/Users/talenrimmer/Desktop/CV4E_Code/eccv_18_all_images_sm

provide code to combine these two datasets into a single dataset with the following structure:

create a new folder called combined_dataset
create a new folder called eccv_18_annotation_files inside combined_dataset
combine the test json files from the two datasets into a single file called cis_test_annotations.json inside eccv_18_annotation_files
combine the train json files from the two datasets into a single file called train_annotations.json inside eccv_18_annotation_files
combine the val json files from the two datasets into a single file called cis_val_annotations.json inside eccv_18_annotation_files

create a new folder called eccv_18_all_images_sm inside combined_dataset

Then print the total number of files of each type (train, test, and val JSON annotation files, and images) in the combined dataset.
Also provide a method to verify that the combined dataset is correct.

"""

In [None]:
# Updated Jan 22nd 2:52pm

import json
import shutil
from pathlib import Path
from collections import Counter
import random

def setup_directories():
    """Recreate the output directory tree and return its paths.

    Each of the base, annotation and image directories is deleted if it
    already exists and then created again, so output from an earlier run
    is discarded.

    Returns:
        Tuple ``(base_dir, ann_dir, img_dir)`` of ``Path`` objects.
    """
    # =====================================================================
    # DIRECTORY 1: CHANGE BASE OUTPUT DIRECTORY HERE
    # Default: /mnt/class_data/group4/talen/combined_imbalanced
    # =====================================================================
    base_dir = Path('/mnt/class_data/group4/talen/combined_imbalanced')
    ann_dir = base_dir.joinpath('eccv_18_annotation_files')
    img_dir = base_dir.joinpath('eccv_18_all_images_sm')

    banner_lines = (
        "\n=== OUTPUT DIRECTORIES ===",
        f"Base directory: {base_dir}",
        f"Annotation directory: {ann_dir}",
        f"Image directory: {img_dir}",
        "=========================\n",
    )
    for banner_line in banner_lines:
        print(banner_line)

    output_dirs = (base_dir, ann_dir, img_dir)
    for target in output_dirs:
        # Remove anything left over from a previous run before recreating.
        if target.exists():
            shutil.rmtree(target)
        target.mkdir(parents=True)

    return output_dirs

def count_category_1(json_path):
    """Return the number of annotations in *json_path* with category_id 1.

    A file that does not exist is reported as containing zero examples.
    """
    try:
        with open(json_path) as handle:
            annotations = json.load(handle)['annotations']
    except FileNotFoundError:
        return 0

    matching = [entry for entry in annotations if entry['category_id'] == 1]
    return len(matching)

def safe_copy_image(src_path, dst_path, copied_files):
    """Copy *src_path* to *dst_path* unless that file name was already copied.

    Args:
        src_path: ``Path`` of the image to copy.
        dst_path: ``Path`` the image should be written to.
        copied_files: Set of destination file names copied so far; the
            name of *dst_path* is added to it after a successful copy.

    Returns:
        True if the file was copied. False if the source is missing, the
        name is already in *copied_files*, or the copy raised (the error
        is printed rather than propagated).
    """
    try:
        if not src_path.exists() or dst_path.name in copied_files:
            return False
        shutil.copy2(src_path, dst_path)
        copied_files.add(dst_path.name)
    except Exception as err:
        print(f"Error copying {src_path}: {err}")
        return False
    return True

def split_by_ratio(data, ratios=(0.7, 0.2, 0.1)):
    """Randomly partition *data* into consecutive groups sized by *ratios*.

    Args:
        data: Sequence of items to split. It is indexed, not modified.
        ratios: Fraction of the items assigned to each group, in output
            order.

    Returns:
        A list containing one list per ratio. Each group receives
        ``int(n * ratio)`` items. When the ratios sum to 1, the items left
        over by that truncation are added to the first group so that every
        input item lands in exactly one group. When the ratios sum to less
        than 1, the remainder is intentionally left out.
    """
    n = len(data)
    indices = list(range(n))
    random.shuffle(indices)

    sizes = [int(n * ratio) for ratio in ratios]
    # Truncation can leave a few items unassigned (e.g. n=5 -> 3 + 1 + 0).
    # If the caller asked for a full partition, hand those to the first
    # group instead of silently dropping them.
    if sizes and abs(sum(ratios) - 1.0) < 1e-9:
        sizes[0] += max(0, n - sum(sizes))

    groups = []
    start = 0
    for size in sizes:
        end = start + size
        groups.append([data[i] for i in indices[start:end]])
        start = end
    return groups

def _collect_split_records(ann_path, src_img_dir, dst_img_dir, copied_files, next_id):
    """Copy the images of one annotation file and return re-numbered records.

    Args:
        ann_path: Path of the annotation JSON file to read.
        src_img_dir: Directory holding the images named in that file.
        dst_img_dir: Directory the images are copied into.
        copied_files: Set of file names already copied; updated in place.
        next_id: First id to assign to a newly copied image.

    Returns:
        ``(records, next_id)`` where ``records`` is a list of
        ``{'image': ..., 'annotation': ...}`` dicts for the images that were
        actually copied and ``next_id`` is the next unused id.

    Raises:
        FileNotFoundError: If *ann_path* does not exist.
    """
    # Use a context manager so the file handle is closed right away.
    with open(ann_path) as f:
        data = json.load(f)

    records = []
    # NOTE(review): images and annotations are paired by position, which
    # assumes the two lists are parallel (one annotation per image, in the
    # same order) - confirm against the notebooks that produce these files.
    for img, ann in zip(data['images'], data['annotations']):
        file_name = img['file_name']
        if safe_copy_image(src_img_dir / file_name, dst_img_dir / file_name, copied_files):
            img['id'] = next_id
            ann['image_id'] = next_id
            records.append({'image': img, 'annotation': ann})
            next_id += 1
    return records, next_id


def merge_datasets():
    """Merge the GoPro and Farm datasets and write new train/val/test splits.

    Steps:
        1. Print the category 1 counts of every input annotation file.
        2. Recreate the output directories via ``setup_directories``.
        3. Copy each image once (keyed by file name) into the output image
           directory and give the copied images fresh consecutive ids.
        4. Split category 1 records and all remaining records separately
           with ``split_by_ratio`` and write one JSON file per split.

    A missing GoPro annotation file raises ``FileNotFoundError``; a missing
    Farm annotation file only prints a warning.
    """
    # =====================================================================
    # DIRECTORY 2: CHANGE INPUT DIRECTORIES HERE
    # =====================================================================
    gopro_ann_dir = Path('/mnt/class_data/group4/talen/gopro_preJan21/eccv_18_annotation_files')
    farm_ann_dir = Path('/mnt/class_data/group4/talen/farm_output/eccv_18_annotation_files')
    gopro_img_dir = Path('/mnt/class_data/group4/talen/gopro_preJan21/eccv_18_all_images_sm')
    farm_img_dir = Path('/mnt/class_data/group4/talen/farm_output/eccv_18_all_images_sm')      # Farm images path

    # Input and output datasets use the same three annotation file names.
    split_files = ('train_annotations.json', 'cis_val_annotations.json', 'cis_test_annotations.json')

    print("\n=== INPUT DIRECTORIES ===")
    print(f"GoPro annotations: {gopro_ann_dir}")
    print(f"Farm annotations: {farm_ann_dir}")
    print(f"GoPro images: {gopro_img_dir}")
    print(f"Farm images: {farm_img_dir}")
    print("========================\n")

    # Print initial category 1 counts
    print("\nInitial Category 1 Counts:")
    for label, source_ann_dir in (("GoPro", gopro_ann_dir), ("Farm", farm_ann_dir)):
        print(f"\n{label} Dataset:")
        for split in split_files:
            count = count_category_1(source_ann_dir / split)
            print(f"{split}: {count} category 1 examples")

    _, ann_dir, img_dir = setup_directories()

    # File names already copied; shared across both datasets so an image
    # name that appears twice is only kept once.
    copied_files = set()
    records = []
    next_id = 1

    # Process GoPro dataset (a missing annotation file is an error here).
    for split in split_files:
        new_records, next_id = _collect_split_records(
            gopro_ann_dir / split, gopro_img_dir, img_dir, copied_files, next_id
        )
        records.extend(new_records)

    # Process Farm dataset (a missing annotation file is only a warning).
    for split in split_files:
        try:
            new_records, next_id = _collect_split_records(
                farm_ann_dir / split, farm_img_dir, img_dir, copied_files, next_id
            )
        except FileNotFoundError:
            print(f"Warning: {split} not found in farm dataset")
        else:
            records.extend(new_records)

    # Separate by category: everything that is not category 1 is treated
    # as category 0.
    cat_1_records = [r for r in records if r['annotation']['category_id'] == 1]
    cat_0_records = [r for r in records if r['annotation']['category_id'] != 1]

    # Split each category 70/20/10 (the default ratios of split_by_ratio).
    cat_1_splits = split_by_ratio(cat_1_records)
    cat_0_splits = split_by_ratio(cat_0_records)

    # Save splits: train, val and test take parts 0, 1 and 2 respectively.
    for filename, cat_1_part, cat_0_part in zip(split_files, cat_1_splits, cat_0_splits):
        split_records = cat_1_part + cat_0_part
        output = {
            "info": {"description": "Combined dataset"},
            "images": [r['image'] for r in split_records],
            "annotations": [r['annotation'] for r in split_records],
            "categories": [
                {"id": 1, "name": "forage_fish"},
                {"id": 0, "name": "other"}
            ]
        }

        with open(ann_dir / filename, 'w') as f:
            json.dump(output, f, indent=4)

        print(f"\n{filename}:")
        print(f"Category 1: {len(cat_1_part)}")
        print(f"Category 0: {len(cat_0_part)}")
        print(f"Total: {len(split_records)}")

    print(f"\nTotal unique images copied: {len(copied_files)}")
    # NOTE(review): only *.png files are counted here; images with another
    # extension would not appear in this total.
    print(f"Total images in combined directory: {len(list(img_dir.glob('*.png')))}")

# Entry point: run the merge only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    merge_datasets()

In [None]:
# Updated Jan 29th - NOT YET TESTED. If it does not work, use the code in the cell above.

import json
import shutil
from pathlib import Path
from collections import Counter
import random

def setup_directories():
    """Recreate the output directory tree and return its paths.

    Each of the base, annotation and image directories is deleted if it
    already exists and then created again, so output from an earlier run
    is discarded.

    Returns:
        Tuple ``(base_dir, ann_dir, img_dir)`` of ``Path`` objects.
    """
    # =====================================================================
    # DIRECTORY 1: CHANGE BASE OUTPUT DIRECTORY HERE
    # Default: /mnt/class_data/group4/talen/combined_imbalanced
    # =====================================================================
    base_dir = Path('/mnt/class_data/group4/talen/combined_imbalanced')
    ann_dir = base_dir.joinpath('eccv_18_annotation_files')
    img_dir = base_dir.joinpath('eccv_18_all_images_sm')

    banner_lines = (
        "\n=== OUTPUT DIRECTORIES ===",
        f"Base directory: {base_dir}",
        f"Annotation directory: {ann_dir}",
        f"Image directory: {img_dir}",
        "=========================\n",
    )
    for banner_line in banner_lines:
        print(banner_line)

    output_dirs = (base_dir, ann_dir, img_dir)
    for target in output_dirs:
        # Remove anything left over from a previous run before recreating.
        if target.exists():
            shutil.rmtree(target)
        target.mkdir(parents=True)

    return output_dirs

def count_category_1(json_path):
    """Return the number of annotations in *json_path* with category_id 1.

    A file that does not exist is reported as containing zero examples.
    """
    try:
        with open(json_path) as handle:
            annotations = json.load(handle)['annotations']
    except FileNotFoundError:
        return 0

    matching = [entry for entry in annotations if entry['category_id'] == 1]
    return len(matching)

def count_category_0(json_path):
    """Return the number of annotations in *json_path* with category_id 0.

    A file that does not exist is reported as containing zero examples.
    """
    try:
        with open(json_path) as handle:
            annotations = json.load(handle)['annotations']
    except FileNotFoundError:
        return 0

    matching = [entry for entry in annotations if entry['category_id'] == 0]
    return len(matching)

def safe_copy_image(src_path, dst_path, copied_files):
    """Copy *src_path* to *dst_path* unless that file name was already copied.

    Args:
        src_path: ``Path`` of the image to copy.
        dst_path: ``Path`` the image should be written to.
        copied_files: Set of destination file names copied so far; the
            name of *dst_path* is added to it after a successful copy.

    Returns:
        True if the file was copied. False if the source is missing, the
        name is already in *copied_files*, or the copy raised (the error
        is printed rather than propagated).
    """
    try:
        if not src_path.exists() or dst_path.name in copied_files:
            return False
        shutil.copy2(src_path, dst_path)
        copied_files.add(dst_path.name)
    except Exception as err:
        print(f"Error copying {src_path}: {err}")
        return False
    return True

def split_by_ratio(data, ratios=(0.7, 0.2, 0.1)):
    """Randomly partition *data* into consecutive groups sized by *ratios*.

    Args:
        data: Sequence of items to split. It is indexed, not modified.
        ratios: Fraction of the items assigned to each group, in output
            order.

    Returns:
        A list containing one list per ratio. Each group receives
        ``int(n * ratio)`` items. When the ratios sum to 1, the items left
        over by that truncation are added to the first group so that every
        input item lands in exactly one group. When the ratios sum to less
        than 1, the remainder is intentionally left out.
    """
    n = len(data)
    indices = list(range(n))
    random.shuffle(indices)

    sizes = [int(n * ratio) for ratio in ratios]
    # Truncation can leave a few items unassigned (e.g. n=5 -> 3 + 1 + 0).
    # If the caller asked for a full partition, hand those to the first
    # group instead of silently dropping them.
    if sizes and abs(sum(ratios) - 1.0) < 1e-9:
        sizes[0] += max(0, n - sum(sizes))

    groups = []
    start = 0
    for size in sizes:
        end = start + size
        groups.append([data[i] for i in indices[start:end]])
        start = end
    return groups

def _collect_split_records(ann_path, src_img_dir, dst_img_dir, copied_files, next_id):
    """Copy the images of one annotation file and return re-numbered records.

    Args:
        ann_path: Path of the annotation JSON file to read.
        src_img_dir: Directory holding the images named in that file.
        dst_img_dir: Directory the images are copied into.
        copied_files: Set of file names already copied; updated in place.
        next_id: First id to assign to a newly copied image.

    Returns:
        ``(records, next_id)`` where ``records`` is a list of
        ``{'image': ..., 'annotation': ...}`` dicts for the images that were
        actually copied and ``next_id`` is the next unused id.

    Raises:
        FileNotFoundError: If *ann_path* does not exist.
    """
    # Use a context manager so the file handle is closed right away.
    with open(ann_path) as f:
        data = json.load(f)

    records = []
    # NOTE(review): images and annotations are paired by position, which
    # assumes the two lists are parallel (one annotation per image, in the
    # same order) - confirm against the notebooks that produce these files.
    for img, ann in zip(data['images'], data['annotations']):
        file_name = img['file_name']
        if safe_copy_image(src_img_dir / file_name, dst_img_dir / file_name, copied_files):
            img['id'] = next_id
            ann['image_id'] = next_id
            records.append({'image': img, 'annotation': ann})
            next_id += 1
    return records, next_id


def merge_datasets():
    """Merge the GoPro and Farm datasets and write new train/val/test splits.

    Steps:
        1. Print the category 1 and category 0 counts of every input
           annotation file.
        2. Recreate the output directories via ``setup_directories``.
        3. Copy each image once (keyed by file name) into the output image
           directory and give the copied images fresh consecutive ids.
        4. Split category 1 records and all remaining records separately
           with ``split_by_ratio`` and write one JSON file per split.

    A missing GoPro annotation file raises ``FileNotFoundError``; a missing
    Farm annotation file only prints a warning.
    """
    # =====================================================================
    # DIRECTORY 2: CHANGE INPUT DIRECTORIES HERE
    # =====================================================================
    gopro_ann_dir = Path('/mnt/class_data/group4/talen/gopro_preJan21/eccv_18_annotation_files')
    farm_ann_dir = Path('/mnt/class_data/group4/talen/farm_output/eccv_18_annotation_files')
    gopro_img_dir = Path('/mnt/class_data/group4/talen/gopro_preJan21/eccv_18_all_images_sm')
    farm_img_dir = Path('/mnt/class_data/group4/talen/farm_output/eccv_18_all_images_sm')

    # Input and output datasets use the same three annotation file names.
    split_files = ('train_annotations.json', 'cis_val_annotations.json', 'cis_test_annotations.json')

    print("\n=== INPUT DIRECTORIES ===")
    print(f"GoPro annotations: {gopro_ann_dir}")
    print(f"Farm annotations: {farm_ann_dir}")
    print(f"GoPro images: {gopro_img_dir}")
    print(f"Farm images: {farm_img_dir}")
    print("========================\n")

    print("\nInitial Category Counts:")
    for label, source_ann_dir in (("GoPro", gopro_ann_dir), ("Farm", farm_ann_dir)):
        print(f"\n{label} Dataset:")
        for split in split_files:
            cat_1_count = count_category_1(source_ann_dir / split)
            cat_0_count = count_category_0(source_ann_dir / split)
            print(f"{split}:")
            print(f"  Category 1: {cat_1_count} examples")
            print(f"  Category 0: {cat_0_count} examples")

    _, ann_dir, img_dir = setup_directories()

    # File names already copied; shared across both datasets so an image
    # name that appears twice is only kept once.
    copied_files = set()
    records = []
    next_id = 1

    # Process GoPro dataset (a missing annotation file is an error here).
    for split in split_files:
        new_records, next_id = _collect_split_records(
            gopro_ann_dir / split, gopro_img_dir, img_dir, copied_files, next_id
        )
        records.extend(new_records)

    # Process Farm dataset (a missing annotation file is only a warning).
    for split in split_files:
        try:
            new_records, next_id = _collect_split_records(
                farm_ann_dir / split, farm_img_dir, img_dir, copied_files, next_id
            )
        except FileNotFoundError:
            print(f"Warning: {split} not found in farm dataset")
        else:
            records.extend(new_records)

    # Separate by category: everything that is not category 1 is treated
    # as category 0.
    cat_1_records = [r for r in records if r['annotation']['category_id'] == 1]
    cat_0_records = [r for r in records if r['annotation']['category_id'] != 1]

    # Split each category 70/20/10 (the default ratios of split_by_ratio).
    cat_1_splits = split_by_ratio(cat_1_records)
    cat_0_splits = split_by_ratio(cat_0_records)

    # Save splits: train, val and test take parts 0, 1 and 2 respectively.
    for filename, cat_1_part, cat_0_part in zip(split_files, cat_1_splits, cat_0_splits):
        split_records = cat_1_part + cat_0_part
        output = {
            "info": {"description": "Combined dataset"},
            "images": [r['image'] for r in split_records],
            "annotations": [r['annotation'] for r in split_records],
            "categories": [
                {"id": 1, "name": "forage_fish"},
                {"id": 0, "name": "other"}
            ]
        }

        with open(ann_dir / filename, 'w') as f:
            json.dump(output, f, indent=4)

        print(f"\n{filename}:")
        print(f"Category 1: {len(cat_1_part)}")
        print(f"Category 0: {len(cat_0_part)}")
        print(f"Total: {len(split_records)}")

    print(f"\nTotal unique images copied: {len(copied_files)}")
    # NOTE(review): only *.png files are counted here; images with another
    # extension would not appear in this total.
    print(f"Total images in combined directory: {len(list(img_dir.glob('*.png')))}")

# Entry point: run the merge only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    merge_datasets()