In [None]:
"""
CV4E Project 1: Preparing Data for Machine Learning

This project is a first iteration of preparing data for use in a ResNet machine learning model.
The data are in the form of a .json file, which was downloaded from BIIGLE and is located at the following path:
/Users/talenrimmer/Desktop/CV4E_Code/data/21209-lazo-1-2024-05-22.json

The goals of this project are to:
- Teach a whole-image classifier to identify images that contain a specific class (forage fish)
(If the above is successful, then):
a) quantify forage fish into a density estimate (small, medium, large schools)
b) identify species of forage fish

"""

In [None]:
import os
import json
import shutil
import random
import re
from pathlib import Path
from sklearn.model_selection import train_test_split
from datetime import datetime
from collections import Counter
from itertools import groupby

def load_class_names():
    """Return the annotation label names in class-id order.

    The list is indexed by integer class id, so the order of the names
    matters.  A fresh list is built on every call.
    """
    names_by_id = (
        'Pinnipedia',
        'Z. californianus',
        'Else/Other',
        'Gelatinous Object',
        'Hydromedusae',
        'P. bachei',
        'Ctenophora',
        'A. flavidus',
        'Cydippida',
        'Polyorchis sp.',
        'Aequorea sp.',
        'Hexagrammidae',
        'Actinopterygii',
        'Drift Algae',
        'M. Cellularia',
        'Scyphomedusae',
        'Eutonina sp.',
        'Sarsia sp.',
        'Cnidaria',
        'Background',
        'A. Labiata',
        'Embiotocidae',
        'C. aggregata',
        'Bolinopsidae',
        'C. aggregata (dark morph)',
        'R. vacca',
        'Fish',
        'F. Fish (unk)',
        'Sch-Embiotocidae',
        'Ex-Embiotocidae',
        'E. Lateralis',
        'Sch-C.pallasii',
        'Ex-C. pallasii',
        'Clupeidae',
        'Ex-Clupeidae',
        'B. Frenatus',
        'Ex-C. aggregata',
        'Sch-C. aggregata',
        'Diving Birds',
        'TBD 44',
        'Scyphozoa',
        'N. breviconis',
        'C. pallasii',
    )
    return list(names_by_id)

def get_binary_category(class_id, class_names):
    """Map a class id to the binary forage-fish category.

    Args:
        class_id: Integer index into ``class_names``.
        class_names: Sequence of label names ordered by class id.

    Returns:
        1 if the name at ``class_id`` is one of the forage-fish labels,
        otherwise 0.  Ids outside ``0 .. len(class_names) - 1`` map to 0;
        this includes negative ids, which would otherwise wrap around and
        silently pick a name from the end of the list.
    """
    if not 0 <= class_id < len(class_names):
        return 0

    # NOTE(review): the two 'E. mordax' labels do not appear in the list
    # returned by load_class_names in this file, so they only take effect
    # with a different name list -- confirm whether that is intended.
    forage_fish_classes = {
        'F. Fish (unk)',
        'Sch-C.pallasii',
        'Ex-C. pallasii',
        'Clupeidae',
        'Ex-Clupeidae',
        'Ex-Embiotocidae',
        'Sch-Embiotocidae',
        'Sch-E. mordax',
        'Ex-E. mordax',
    }
    return 1 if class_names[class_id] in forage_fish_classes else 0

def read_label_safely(label_path):
    """Read the class id stored at the start of a label file.

    Only the first non-blank line is looked at; its first
    whitespace-separated token is converted to ``int``.  Any later lines
    are ignored.

    Args:
        label_path: Path of the label text file.

    Returns:
        The class id as an int, or None when the file has no non-blank
        line or when reading/parsing fails (the error is printed).
    """
    try:
        with open(label_path) as handle:
            for raw_line in handle:
                tokens = raw_line.split()
                if not tokens:
                    # Blank or whitespace-only line: keep looking.
                    continue
                return int(tokens[0])
    except Exception as e:
        print(f"Error reading {label_path}: {e}")
    return None

def create_data_template():
    """Return an empty annotation document ready to be filled in.

    The result has an ``info`` header (with today's date as
    ``date_created``), empty ``images`` and ``annotations`` lists, and the
    two binary categories (1 = forage_fish, 0 = other).
    """
    today = datetime.now().strftime("%Y-%m-%d")

    info = {
        "description": "ff_test_data",
        "year": 2025,
        "contributor": "Talen",
        "date_created": today,
    }
    categories = [
        {"id": 1, "name": "forage_fish"},
        {"id": 0, "name": "other"},
    ]

    template = {"info": info}
    template["images"] = []
    template["annotations"] = []
    template["categories"] = categories
    return template

def extract_video_id(filename):
    """Return the timestamp-like token used as the video id, or None.

    The token is the first run of eight digits, a literal ``T`` and six
    digits found anywhere in ``filename`` (e.g. ``20220709T210155``).
    """
    found = re.search(r'(\d{8}T\d{6})', filename)
    if found is None:
        return None
    return found.group(1)

def process_dataset(seed=None):
    """Build video-independent, class-balanced train/val/test annotation files.

    Steps:
      1. Pair every ``*.png`` in the image directory with its same-named
         ``.txt`` label file and turn the label's class id into the binary
         forage-fish category.  Images without a usable label or without a
         video id in their file name are skipped.
      2. Group images by video id and assign whole videos to train (roughly
         70%), validation (roughly 20%) or test (the rest), so frames of
         one video never end up in different splits.
      3. Inside each split, downsample the larger category so both
         categories have the same number of images.
      4. Copy the selected images into ``eccv_18_all_images_sm`` in the
         current working directory (the directory is deleted and recreated
         first) and write one JSON annotation file per split into the
         current working directory.

    Args:
        seed: Optional seed for the video shuffle and the balancing
            samples.  ``None`` (the default) uses the module-level
            ``random`` state, i.e. a different split on every run.
    """
    img_dir = Path('/Users/talenrimmer/Desktop/All_training_data/fish/images/train')
    label_dir = Path('/Users/talenrimmer/Desktop/All_training_data/fish/labels/train')
    output_img_dir = Path.cwd() / 'eccv_18_all_images_sm'

    # Start from an empty output directory so images from an earlier run
    # cannot linger next to the new selection.
    if output_img_dir.exists():
        shutil.rmtree(output_img_dir)
    output_img_dir.mkdir(exist_ok=True)

    # A private generator keeps a seeded run reproducible without touching
    # the global random state; without a seed fall back to the module.
    rng = random.Random(seed) if seed is not None else random

    class_names = load_class_names()
    video_records = {}
    processed = 0
    skipped = 0

    # Group by video ID.  sorted() makes image ids and the grouping
    # independent of the order in which the filesystem lists the files.
    for img_path in sorted(img_dir.glob('*.png')):
        label_path = label_dir / f"{img_path.stem}.txt"
        if not label_path.exists():
            skipped += 1
            continue

        class_id = read_label_safely(label_path)
        # Reject negative ids too: they would index class_names from the end.
        if class_id is None or not 0 <= class_id < len(class_names):
            skipped += 1
            continue

        video_id = extract_video_id(img_path.name)
        if video_id is None:
            skipped += 1
            continue

        video_records.setdefault(video_id, []).append({
            'image_id': processed,
            'file_name': img_path.name,
            'category_id': get_binary_category(class_id, class_names),
            'img_path': img_path,
            'video_id': video_id
        })
        processed += 1

    print(f"Usable images: {processed} (skipped: {skipped})")

    # Split videos maintaining independence
    video_ids = list(video_records)
    rng.shuffle(video_ids)

    n_videos = len(video_ids)
    train_idx = int(0.7 * n_videos)
    val_idx = int(0.9 * n_videos)

    videos_by_split = {
        'train_annotations.json': video_ids[:train_idx],
        'cis_val_annotations.json': video_ids[train_idx:val_idx],
        'cis_test_annotations.json': video_ids[val_idx:],
    }

    def balance_split(records):
        """Downsample so both categories contribute the same number of records."""
        cat_1 = [r for r in records if r['category_id'] == 1]
        cat_0 = [r for r in records if r['category_id'] == 0]
        n_samples = min(len(cat_1), len(cat_0))
        if n_samples == 0:
            return []
        return rng.sample(cat_1, n_samples) + rng.sample(cat_0, n_samples)

    # Combine records by split, then balance categories within each split.
    splits = {
        name: balance_split([r for vid in vids for r in video_records[vid]])
        for name, vids in videos_by_split.items()
    }

    # Copy images
    for records in splits.values():
        for record in records:
            shutil.copy2(record['img_path'], output_img_dir / record['file_name'])

    print("\nVideo Distribution in Splits:")
    for name, records in splits.items():
        videos = set(r['video_id'] for r in records)
        cats = Counter(r['category_id'] for r in records)
        print(f"\n{name}:")
        if not records:
            # Happens with very few videos or when a split has only one
            # category; make it visible instead of writing it silently.
            print("WARNING: split is empty after video assignment and balancing")
        print(f"Unique videos: {len(videos)}")
        print(f"Total images: {len(records)}")
        print(f"Category 1 (forage fish): {cats[1]}")
        print(f"Category 0 (other): {cats[0]}")
        print(f"Videos included: {sorted(videos)}")

        output = create_data_template()
        output["images"] = [{"id": r["image_id"], "file_name": r["file_name"]} for r in records]
        output["annotations"] = [{"image_id": r["image_id"], "category_id": r["category_id"]} for r in records]

        with open(name, 'w') as f:
            json.dump(output, f, indent=4)

# Build the video-level splits only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    process_dataset()

In [None]:
import os
import json
import shutil
import random
from pathlib import Path
from sklearn.model_selection import train_test_split
from datetime import datetime
from collections import Counter

def load_class_names():
    """Return the label names, where list position equals class id.

    Callers look names up by integer class id, so entries must stay in
    this order.  Every call returns a newly built list.
    """
    labels = []
    labels += ['Pinnipedia', 'Z. californianus', 'Else/Other', 'Gelatinous Object']
    labels += ['Hydromedusae', 'P. bachei', 'Ctenophora', 'A. flavidus']
    labels += ['Cydippida', 'Polyorchis sp.', 'Aequorea sp.', 'Hexagrammidae']
    labels += ['Actinopterygii', 'Drift Algae', 'M. Cellularia', 'Scyphomedusae']
    labels += ['Eutonina sp.', 'Sarsia sp.', 'Cnidaria', 'Background']
    labels += ['A. Labiata', 'Embiotocidae', 'C. aggregata', 'Bolinopsidae']
    labels += ['C. aggregata (dark morph)', 'R. vacca', 'Fish', 'F. Fish (unk)']
    labels += ['Sch-Embiotocidae', 'Ex-Embiotocidae', 'E. Lateralis', 'Sch-C.pallasii']
    labels += ['Ex-C. pallasii', 'Clupeidae', 'Ex-Clupeidae', 'B. Frenatus']
    labels += ['Ex-C. aggregata', 'Sch-C. aggregata', 'Diving Birds', 'TBD 44']
    labels += ['Scyphozoa', 'N. breviconis', 'C. pallasii']
    return labels

def get_binary_category(class_id, class_names):
    """Map a class id to the binary forage-fish category.

    Args:
        class_id: Integer index into ``class_names``.
        class_names: Sequence of label names ordered by class id.

    Returns:
        1 if the name at ``class_id`` is one of the forage-fish labels,
        otherwise 0.  Ids outside ``0 .. len(class_names) - 1`` map to 0;
        this includes negative ids, which would otherwise wrap around and
        silently pick a name from the end of the list.
    """
    if not 0 <= class_id < len(class_names):
        return 0

    # NOTE(review): the two 'E. mordax' labels do not appear in the list
    # returned by load_class_names in this file, so they only take effect
    # with a different name list -- confirm whether that is intended.
    forage_fish_classes = {
        'F. Fish (unk)',
        'Sch-C.pallasii',
        'Ex-C. pallasii',
        'Clupeidae',
        'Ex-Clupeidae',
        'Sch-E. mordax',
        'Ex-E. mordax',
    }
    # Labels currently commented out of the positive class:
    #   'Ex-C. aggregata', 'Sch-C. aggregata',
    #   'Ex-Embiotocidae', 'Sch-Embiotocidae',
    #   'Ex-R. vacca', 'Sch-R. vacca'
    return 1 if class_names[class_id] in forage_fish_classes else 0

def read_label_safely(label_path):
    """Return the class id from the first non-blank line of a label file.

    The first whitespace-separated token of that line is parsed as an
    ``int``; lines after it are never read.

    Args:
        label_path: Location of the label text file.

    Returns:
        The parsed class id, or None if the file holds no non-blank line
        or if opening/reading/parsing fails (the failure is printed).
    """
    try:
        with open(label_path) as label_file:
            for text in label_file:
                fields = text.split()
                if fields:
                    return int(fields[0])
    except Exception as e:
        print(f"Error reading {label_path}: {e}")
    return None

def create_data_template():
    """Build the skeleton of an annotation file.

    Returns a dict with an ``info`` header stamped with today's date,
    empty ``images`` / ``annotations`` lists for the caller to fill, and
    the fixed two-entry category table (1 = forage_fish, 0 = other).
    """
    created_on = datetime.now().strftime("%Y-%m-%d")
    header = dict(
        description="ff_test_data",
        year=2025,
        contributor="Talen",
        date_created=created_on,
    )
    category_table = [
        dict(id=1, name="forage_fish"),
        dict(id=0, name="other"),
    ]
    return dict(
        info=header,
        images=[],
        annotations=[],
        categories=category_table,
    )

def process_dataset(max_positive=652):
    """Build class-balanced train/val/test annotation files (image-level split).

    Every ``*.png`` in the image directory is paired with its same-named
    ``.txt`` label file and assigned the binary forage-fish category.  An
    equal number of images is then drawn from each category, the selection
    is copied into ``eccv_18_all_images_sm`` in the current working
    directory (deleted and recreated first), and the records are split
    70/20/10 into three JSON annotation files written to the current
    working directory.

    NOTE(review): the split is made per image, so frames whose file names
    share a timestamp can land in different splits -- confirm that this is
    acceptable for evaluation.

    Args:
        max_positive: Upper bound on the number of forage-fish images (and
            therefore on the number of images per category).

    Raises:
        ValueError: If either category has no usable images, so no
            balanced dataset can be built.
    """
    img_dir = Path('/Users/talenrimmer/Desktop/All_training_data/fish/images/train')
    label_dir = Path('/Users/talenrimmer/Desktop/All_training_data/fish/labels/train')
    output_img_dir = Path.cwd() / 'eccv_18_all_images_sm'

    # Start from an empty output directory so images from an earlier run
    # cannot linger next to the new selection.
    if output_img_dir.exists():
        shutil.rmtree(output_img_dir)
    output_img_dir.mkdir(exist_ok=True)

    class_names = load_class_names()
    category_1_samples = []
    category_0_samples = []
    processed = 0
    skipped = 0

    for img_path in img_dir.glob('*.png'):
        label_path = label_dir / f"{img_path.stem}.txt"
        if not label_path.exists():
            skipped += 1
            continue

        class_id = read_label_safely(label_path)
        # Reject negative ids too: they would index class_names from the end.
        if class_id is None or not 0 <= class_id < len(class_names):
            skipped += 1
            continue

        record = {
            'image_id': processed + 1,
            'file_name': img_path.name,
            'category_id': get_binary_category(class_id, class_names),
            'img_path': img_path
        }

        if record['category_id'] == 1:
            category_1_samples.append(record)
        else:
            category_0_samples.append(record)

        processed += 1

    # Images per category: capped by max_positive and by the scarcer
    # category, so sampling can never ask for more records than exist.
    n_per_class = min(max_positive, len(category_1_samples), len(category_0_samples))
    if n_per_class <= 0:
        raise ValueError(
            "Cannot build a balanced dataset: "
            f"{len(category_1_samples)} forage-fish and "
            f"{len(category_0_samples)} other images available "
            f"(skipped {skipped})"
        )

    # Limit category 1 samples to target size
    if len(category_1_samples) > n_per_class:
        category_1_samples = random.sample(category_1_samples, n_per_class)

    selected_category_0 = random.sample(category_0_samples, n_per_class)
    balanced_dataset = category_1_samples + selected_category_0
    random.shuffle(balanced_dataset)

    print(f"\nInitial Dataset Summary:")
    print(f"Total category 1 (forage fish): {len(category_1_samples)}")
    print(f"Total category 0 (other) available: {len(category_0_samples)}")
    print(f"Category 0 selected for balance: {len(selected_category_0)}")
    print(f"Total balanced samples: {len(balanced_dataset)}")
    print(f"Skipped images (no usable label): {skipped}")

    # File names come from a single directory listing and the samples are
    # drawn without replacement, so every record is a distinct file.
    for record in balanced_dataset:
        shutil.copy2(record['img_path'], output_img_dir / record['file_name'])

    # 70% train, then the remaining 30% is divided 2:1 into val and test.
    train_records, temp = train_test_split(balanced_dataset, test_size=0.3, random_state=42)
    val_records, test_records = train_test_split(temp, test_size=1/3, random_state=42)

    splits = {
        'train_annotations.json': train_records,
        'cis_val_annotations.json': val_records,
        'cis_test_annotations.json': test_records
    }

    print("\nSplit Distribution:")
    for name, records in splits.items():
        counts = Counter(r['category_id'] for r in records)
        print(f"\n{name}:")
        print(f"Total images: {len(records)}")
        print(f"Category 1 (forage fish): {counts[1]}")
        print(f"Category 0 (other): {counts[0]}")

        output = create_data_template()
        output["images"] = [{"id": r["image_id"], "file_name": r["file_name"]} for r in records]
        output["annotations"] = [{"image_id": r["image_id"], "category_id": r["category_id"]} for r in records]

        with open(name, 'w') as f:
            json.dump(output, f, indent=4)

# Build the balanced image-level splits only when this code is executed
# directly, not when it is imported.
if __name__ == "__main__":
    process_dataset()

In [None]:
import json
import random
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from PIL import Image

def visualize_samples():
    """Show up to five random training images per category in a 2x5 grid.

    Reads ``train_annotations.json`` from the current working directory
    and loads the images from ``eccv_18_all_images_sm`` next to it.  The
    top row shows category 0, the bottom row every other category (1).
    When a category has fewer than five images, the remaining cells are
    left blank instead of raising an error.
    """
    # Load JSON file (using train set as example)
    with open('train_annotations.json', 'r') as f:
        data = json.load(f)

    # Get image directory
    img_dir = Path.cwd() / 'eccv_18_all_images_sm'

    # Resolve each annotation through its image id rather than relying on
    # the two lists being in the same order.
    file_name_by_id = {img['id']: img['file_name'] for img in data['images']}

    # Separate by category: row 0 = category 0, row 1 = everything else.
    names_by_row = {0: [], 1: []}
    for ann in data['annotations']:
        file_name = file_name_by_id.get(ann['image_id'])
        if file_name is None:
            continue
        row = 0 if ann['category_id'] == 0 else 1
        names_by_row[row].append(file_name)

    # Create subplot grid
    n_cols = 5
    fig, axes = plt.subplots(2, n_cols, figsize=(20, 8))

    for row, names in names_by_row.items():
        # Never ask for more samples than the category holds.
        chosen = random.sample(names, min(n_cols, len(names)))
        for col in range(n_cols):
            ax = axes[row, col]
            ax.axis('off')
            if col >= len(chosen):
                continue
            img_name = chosen[col]
            # The context manager closes the image file once it is drawn.
            with Image.open(img_dir / img_name) as img:
                ax.imshow(img)
            ax.set_title(f'Category {row}\n{img_name[:20]}...')

    plt.suptitle('Sample Images by Category', fontsize=16)
    plt.tight_layout()
    plt.show()

# Show the sample grid only when this code is executed directly, not when
# it is imported.
if __name__ == "__main__":
    visualize_samples()

In [None]:
import json
import pandas as pd
import re
from pathlib import Path

def load_json_data(filename):
    """Read an annotation JSON file and return its image and annotation lists.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A ``(images, annotations)`` tuple taken from the file's
        ``"images"`` and ``"annotations"`` entries.
    """
    with open(filename, 'r') as handle:
        payload = json.load(handle)

    images = payload['images']
    annotations = payload['annotations']
    return images, annotations

def extract_timestamp(filename):
    """Return the first ``<8 digits>T<6 digits>`` token in ``filename``.

    Returns None when the file name contains no such token.
    """
    hit = re.search(r'(\d{8}T\d{6})', filename)
    if not hit:
        return None
    return hit.group(1)

def analyze_video_data():
    """Summarise forage-fish images per video across all three splits.

    Reads the train, validation and test annotation files from the current
    working directory, derives a video id from the timestamp in each image
    file name, and prints/returns one row per video.

    Returns:
        A DataFrame with the columns ``video_id``, ``forage_fish_count``
        (sum of the category ids, i.e. number of category-1 images),
        ``number_of_images`` and ``percentage_forage_fish``, sorted by
        ``forage_fish_count`` in descending order.
    """
    # Load all annotation files
    json_files = ['train_annotations.json',
                  'cis_val_annotations.json',
                  'cis_test_annotations.json']

    # Store all data
    all_timestamps = []
    all_categories = []

    # Process each JSON file
    for json_file in json_files:
        images, annotations = load_json_data(json_file)

        # Resolve each annotation through its image id rather than relying
        # on the two lists being in the same order.
        file_name_by_id = {img['id']: img['file_name'] for img in images}

        for ann in annotations:
            file_name = file_name_by_id.get(ann['image_id'])
            if file_name is None:
                continue
            timestamp = extract_timestamp(file_name)
            if timestamp:
                all_timestamps.append(timestamp)
                all_categories.append(ann['category_id'])

    # Create and process dataframe
    df = pd.DataFrame({
        'timestamp': all_timestamps,
        'category_id': all_categories
    })

    # Group and aggregate data
    grouped_df = df.groupby('timestamp').agg({
        'category_id': ['sum', 'count']
    }).reset_index()

    # Format columns
    grouped_df.columns = ['video_id', 'forage_fish_count', 'number_of_images']

    # Share of forage-fish images per video, in percent
    grouped_df['percentage_forage_fish'] = (
        grouped_df['forage_fish_count'] / grouped_df['number_of_images'] * 100
    ).round(2)

    # Sort by count
    grouped_df = grouped_df.sort_values('forage_fish_count', ascending=False)

    # Display results (show every row, however many videos there are)
    pd.set_option('display.max_rows', None)
    print("\nForage Fish Analysis by Video:")
    print(grouped_df)

    return grouped_df

# Run the per-video analysis only when this code is executed directly, and
# keep the resulting table in ``df``.
if __name__ == "__main__":
    df = analyze_video_data()

In [None]:
# Now we need to group the images by video to visualize the distribution of forage fish across videos:
import json
import pandas as pd
import re
from pathlib import Path

def analyze_annotations():
    """Count forage-fish images per video in the training annotations.

    Reads ``train_annotations.json`` from the current working directory,
    derives a video id from the timestamp in each image file name
    (pattern like ``20220709T210155``) and prints/returns one row per
    video.  Images whose file name has no timestamp are left out.

    Returns:
        A DataFrame with the columns ``video_id``, ``forage_fish_count``
        (sum of the category ids, i.e. number of category-1 images) and
        ``number_of_images``, sorted by ``forage_fish_count`` in
        descending order.
    """
    # Load JSON data from train annotations (contains most data)
    with open('train_annotations.json', 'r') as f:
        data = json.load(f)

    # Resolve each annotation through its image id rather than relying on
    # the two lists being in the same order.
    file_name_by_id = {img['id']: img['file_name'] for img in data['images']}

    timestamps = []
    categories = []

    # Extract timestamp and category for each annotated image
    for ann in data['annotations']:
        file_name = file_name_by_id.get(ann['image_id'])
        if file_name is None:
            continue
        match = re.search(r'(\d{8}T\d{6})', file_name)
        if match:
            timestamps.append(match.group(1))
            categories.append(ann['category_id'])

    # Create initial dataframe
    df = pd.DataFrame({
        'timestamp': timestamps,
        'category_id': categories
    })

    # Group by timestamp and aggregate data
    grouped_df = df.groupby('timestamp').agg({
        'category_id': ['sum', 'count']  # sum for forage fish count, count for total images
    }).reset_index()

    # Clean up column names
    grouped_df.columns = ['video_id', 'forage_fish_count', 'number_of_images']

    # Sort by forage fish count
    grouped_df = grouped_df.sort_values('forage_fish_count', ascending=False)

    print("\nForage Fish Counts by Video:")
    print(grouped_df)

    return grouped_df

# Run the training-set summary only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    analyze_annotations()