In [None]:
# This Jupyter notebook will combine the .json files from the kf_vis and farm_data_prep notebooks

"""
Ok great! 

I now have two datasets that both contain identically-named json and image folders.

The first dataset (called gopro dataset) has the following structure:

annotation files pathnames:

test file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_test_annotations.json
train file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/train_annotations.json
val file: /Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files/cis_val_annotations.json

image files pathnames:
/Users/talenrimmer/Desktop/CV4E_Code/images


The second dataset (called farm dataset) has the following structure:

annotation files pathnames:

test file: /Users/talenrimmer/Desktop/CV4E_Code/cis_test_annotations.json
train file: /Users/talenrimmer/Desktop/CV4E_Code/train_annotations.json
val file: /Users/talenrimmer/Desktop/CV4E_Code/cis_val_annotations.json

image files pathnames:
/Users/talenrimmer/Desktop/CV4E_Code/eccv_18_all_images_sm

Provide code to combine these two datasets into a single dataset with the following structure:

create a new folder called combined_dataset
create a new folder called eccv_18_annotation_files inside combined_dataset
combine the test json files from the two datasets into a single file called cis_test_annotations.json inside eccv_18_annotation_files
combine the train json files from the two datasets into a single file called train_annotations.json inside eccv_18_annotation_files
combine the val json files from the two datasets into a single file called cis_val_annotations.json inside eccv_18_annotation_files

create a new folder called eccv_18_all_images_sm inside combined_dataset

Then print the total count for each file type (train, test, and val JSON, and images) in the combined dataset.
Provide a method to verify that the combined dataset is correct.

"""

In [None]:
import json
import shutil
from pathlib import Path
from collections import Counter

def setup_directories(base_dir=None):
    """Create a fresh, empty directory tree for the combined dataset.

    Any existing tree at ``base_dir`` is deleted first, so every run starts
    with empty ``eccv_18_annotation_files`` and ``eccv_18_all_images_sm``
    folders.

    Args:
        base_dir: Root folder of the combined dataset (``str`` or ``Path``).
            Defaults to the ``combined_dataset`` folder inside the CV4E_Code
            project directory.

    Returns:
        Tuple ``(base_dir, ann_dir, img_dir)`` of ``Path`` objects.
    """
    if base_dir is None:
        base_dir = '/Users/talenrimmer/Desktop/CV4E_Code/combined_dataset'
    base_dir = Path(base_dir)
    ann_dir = base_dir / 'eccv_18_annotation_files'
    img_dir = base_dir / 'eccv_18_all_images_sm'

    # Removing the root also removes both sub-folders, so one rmtree suffices.
    if base_dir.exists():
        shutil.rmtree(base_dir)

    # parents=True also (re)creates base_dir itself.
    for directory in (ann_dir, img_dir):
        directory.mkdir(parents=True)

    return base_dir, ann_dir, img_dir

def safe_copy_image(src_path, dst_path):
    """Best-effort copy of one image file.

    Returns True when ``src_path`` exists and was copied to ``dst_path``.
    Returns False when the source is missing or when the copy raises; in the
    latter case the error is printed instead of being propagated.
    """
    copied = False
    try:
        if not src_path.exists():
            return copied
        shutil.copy2(src_path, dst_path)
        copied = True
    except Exception as e:
        print(f"Error copying {src_path}: {e}")
    return copied

def _merge_categories(*category_lists):
    """Return the union of several category lists, keeping the first entry per id."""
    # Assumes each category is a dict with an 'id' (and usually a 'name') key,
    # as in COCO-style annotation files -- TODO confirm against the inputs.
    merged = []
    first_seen = {}
    for categories in category_lists:
        for cat in categories:
            cat_id = cat.get('id')
            if cat_id not in first_seen:
                first_seen[cat_id] = cat
                merged.append(cat)
            elif first_seen[cat_id].get('name') != cat.get('name'):
                print(f"Warning: category id {cat_id} has conflicting names: "
                      f"{first_seen[cat_id].get('name')!r} vs {cat.get('name')!r}")
    return merged


def _collect_valid_entries(data, src_img_dir, dst_img_dir, next_id,
                           valid_images, valid_annotations):
    """Copy one dataset's images and append its re-numbered records.

    Annotations are matched to their image through ``image_id`` rather than by
    list position, so images with zero or several annotations, or annotation
    lists in a different order, are handled correctly. Annotations whose image
    could not be copied (or does not exist in ``data['images']``) are dropped.

    Returns:
        Tuple ``(next_id, skipped)``: the next unused image id and the number
        of images that could not be copied.

    Raises:
        KeyError: if an image lacks ``id``/``file_name`` or an annotation
            lacks ``image_id``.
    """
    anns_by_image = {}
    for ann in data['annotations']:
        anns_by_image.setdefault(ann['image_id'], []).append(ann)

    skipped = 0
    for img in data['images']:
        # pop() so a duplicated image id cannot claim the same annotations twice.
        img_anns = anns_by_image.pop(img['id'], [])
        src_path = src_img_dir / img['file_name']
        dst_path = dst_img_dir / img['file_name']

        # The output folder is wiped at the start of the run, so an existing
        # destination means two records share a file name.
        if dst_path.exists():
            print(f"Warning: file name collision for {img['file_name']}; "
                  f"an earlier copy may be overwritten")

        if not safe_copy_image(src_path, dst_path):
            skipped += 1
            continue

        img['id'] = next_id
        for ann in img_anns:
            ann['image_id'] = next_id
        valid_images.append(img)
        valid_annotations.extend(img_anns)
        next_id += 1

    return next_id, skipped


def merge_datasets():
    """Merge the GoPro and farm datasets into the combined dataset folder.

    For each split (train / val / test) this loads both annotation files,
    copies every image that exists on disk into the combined image folder,
    re-numbers image ids from 1 within the split, re-points each annotation at
    its image's new id, and writes the merged JSON. Images that cannot be
    copied are skipped together with their annotations. Per-split and overall
    counts are printed.

    NOTE(review): annotation ``id`` values are left untouched; if both inputs
    number their annotations independently they may collide -- confirm whether
    downstream code relies on unique annotation ids.
    """
    # Setup paths
    gopro_ann_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_annotation_files')
    farm_ann_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code')
    gopro_img_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/images')
    farm_img_dir = Path('/Users/talenrimmer/Desktop/CV4E_Code/eccv_18_all_images_sm')

    # Create (and empty) the output directories
    _, ann_dir, img_dir = setup_directories()

    # Track skipped items
    total_skipped = {'gopro': 0, 'farm': 0}

    # Process each split
    splits = ['train_annotations.json', 'cis_val_annotations.json', 'cis_test_annotations.json']

    for split in splits:
        print(f"\nProcessing {split}...")

        # Load both JSONs up front so a missing file fails before any copying
        with open(gopro_ann_dir / split, encoding='utf-8') as f:
            gopro_data = json.load(f)
        with open(farm_ann_dir / split, encoding='utf-8') as f:
            farm_data = json.load(f)

        valid_images = []
        valid_annotations = []
        next_id = 1
        split_skipped = {'gopro': 0, 'farm': 0}

        sources = (
            ('gopro', gopro_data, gopro_img_dir),
            ('farm', farm_data, farm_img_dir),
        )
        for name, data, src_img_dir in sources:
            next_id, skipped = _collect_valid_entries(
                data, src_img_dir, img_dir, next_id,
                valid_images, valid_annotations,
            )
            split_skipped[name] += skipped
            total_skipped[name] += skipped

        # Create merged data; categories come from both datasets so that
        # farm-only category ids are not left undefined.
        merged_data = {
            "info": gopro_data.get('info', {}),
            "images": valid_images,
            "annotations": valid_annotations,
            "categories": _merge_categories(
                gopro_data.get('categories', []),
                farm_data.get('categories', []),
            ),
        }

        # Save merged JSON
        with open(ann_dir / split, 'w', encoding='utf-8') as f:
            json.dump(merged_data, f, indent=4)

        print(f"Images processed: {len(valid_images)}")
        print(f"Annotations processed: {len(valid_annotations)}")
        print(f"Images skipped in this split - GoPro: {split_skipped['gopro']}, Farm: {split_skipped['farm']}")
        cats = Counter(ann['category_id'] for ann in valid_annotations)
        print(f"Category distribution: {dict(cats)}")

    # Final summary: count every copied file, not only *.png, so other image
    # extensions are not under-reported.
    total_images = sum(1 for path in img_dir.iterdir() if path.is_file())
    print(f"\nTotal images in combined directory: {total_images}")
    print(f"Total images skipped - GoPro: {total_skipped['gopro']}, Farm: {total_skipped['farm']}")

# Run the merge only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    merge_datasets()