## After generating `clip_manifest.parquet`

### Inspecting the clip_manifest

In [10]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# === Absolute roots ===
# Path.home() resolves the user's home directory even when the HOME
# environment variable is not set, where os.environ["HOME"] raises KeyError.
HOME = Path.home()
REPO_ROOT = HOME / "Uni-stuff/semester-2/applied_Ml/reef_zmsc"

# === Paths ===
# Clip manifest (parquet) inspected by the functions below.
IN_MANIFEST = REPO_ROOT / "data/manifests/clip_manifest.parquet"

In [12]:
def load_manifest(manifest_path):
    """Read the clip manifest and print a short overview of it.

    The overview lists the number of clips, the column names, the column
    dtypes and the deep memory footprint in MB.

    Args:
        manifest_path: Location of the parquet manifest; passed straight to
            ``pd.read_parquet``.

    Returns:
        The manifest as a ``pandas.DataFrame``.
    """
    banner = "=" * 80
    print(banner)
    print("LOADING MANIFEST")
    print(banner)

    manifest = pd.read_parquet(manifest_path)

    n_clips = len(manifest)
    column_names = list(manifest.columns)

    print(f"\n✓ Loaded manifest with {n_clips:,} clips")
    print(f"\nColumns: {column_names}")
    print(f"\nData types:\n{manifest.dtypes}")

    # deep=True also counts the payload of object (string) columns.
    size_mb = manifest.memory_usage(deep=True).sum() / 1024**2
    print(f"\nMemory usage: {size_mb:.2f} MB")

    return manifest

def check_required_columns(df):
    """Report which of the expected manifest columns are present.

    Prints the missing required columns (with a short description of each
    expected column) or a confirmation, followed by any optional columns
    that were found.

    Args:
        df: Manifest DataFrame to check.

    Returns:
        True when every required column exists in ``df``, otherwise False.
    """
    required_cols = ('clip_id', 'filepath', 'start_time')
    optional_cols = ('end_time', 'duration', 'logger_id', 'date', 'hour')

    rule = "=" * 80
    print("\n" + rule)
    print("CHECKING REQUIRED COLUMNS")
    print(rule)

    available = df.columns
    absent = [name for name in required_cols if name not in available]
    found_optional = [name for name in optional_cols if name in available]

    if not absent:
        print("\n✓ All required columns present")
    else:
        print(f"\n✗ Missing required columns: {absent}")
        print("\nExpected columns for this analysis:")
        print("  - clip_id: Unique identifier")
        print("  - filepath: Path to audio file")
        print("  - start_time: Timestamp (for temporal stratification)")

    if found_optional:
        print(f"\n✓ Optional columns found: {found_optional}")

    return not absent

def inspect_timestamps(df):
    """Analyze the temporal distribution of the clips.

    Parses ``start_time`` into datetimes, adds the derived columns ``date``,
    ``hour`` and ``day_of_week`` to ``df`` (in place), and prints the date
    range, an hourly histogram and the share of clips in the dawn
    (04:00-06:59) and dusk (17:00-19:59) windows.

    Rows whose ``start_time`` is missing are kept; they get NaT/NaN in the
    derived columns and are ignored by the hourly histogram.

    Args:
        df: Manifest DataFrame. Modified in place when parsing succeeds.

    Returns:
        ``df`` with the derived columns added, or None when ``start_time`` is
        absent, cannot be parsed, or contains no valid timestamp.
    """
    print("\n" + "=" * 80)
    print("TEMPORAL ANALYSIS")
    print("=" * 80)

    if 'start_time' not in df.columns:
        print("\n✗ No 'start_time' column - cannot analyze temporal patterns")
        return None

    # Try to parse timestamps
    try:
        parsed = pd.to_datetime(df['start_time'])
    except (ValueError, TypeError, OverflowError) as e:
        print(f"\n✗ Error parsing timestamps: {e}")
        print(f"\nSample values:\n{df['start_time'].head()}")
        return None

    # The .dt accessor used below needs a real datetime dtype.
    if not pd.api.types.is_datetime64_any_dtype(parsed):
        print("\n✗ Error parsing timestamps: result is not a datetime column")
        print(f"\nSample values:\n{df['start_time'].head()}")
        return None

    valid_times = parsed.dropna()
    if valid_times.empty:
        # Covers both an empty manifest and a column holding only nulls;
        # every statistic below would otherwise divide by zero or hit NaT.
        print("\n✗ No valid timestamps in 'start_time' - cannot analyze temporal patterns")
        return None

    df['start_time'] = parsed
    print("\n✓ Timestamps parsed successfully")

    n_missing = len(parsed) - len(valid_times)
    if n_missing:
        print(f"⚠ {n_missing:,} clips have no timestamp and are excluded from hourly counts")

    # Extract temporal features
    df['date'] = parsed.dt.date
    df['hour'] = parsed.dt.hour
    df['day_of_week'] = parsed.dt.day_name()

    # Basic stats, taken from the valid timestamps so that NaT rows cannot
    # break the min/max comparison on the object-typed 'date' column.
    first_ts = valid_times.min()
    last_ts = valid_times.max()
    print(f"\nDate range: {first_ts.date()} to {last_ts.date()}")
    print(f"Total days: {df['date'].nunique()}")
    print(f"Total hours covered: {(last_ts - first_ts).total_seconds() / 3600:.1f}")

    # Hourly distribution
    print("\n" + "-" * 80)
    print("HOURLY DISTRIBUTION")
    print("-" * 80)

    hourly_counts = df['hour'].value_counts().sort_index()
    peak = hourly_counts.max()
    print("\nClips per hour:")
    for hour, count in hourly_counts.items():
        bar = '█' * int(count / peak * 50)
        # 'hour' is a float column when any timestamp is missing; int()
        # keeps the ':02d' format valid in that case.
        print(f"  {int(hour):02d}:00  {count:>7,}  {bar}")

    # Identify biological windows (between() is inclusive on both ends)
    total = len(df)
    dawn_clips = int(df['hour'].between(4, 6).sum())
    dusk_clips = int(df['hour'].between(17, 19).sum())
    total_bio_windows = dawn_clips + dusk_clips

    print("\n" + "-" * 80)
    print("BIOLOGICAL ACTIVITY WINDOWS")
    print("-" * 80)
    print(f"\nDawn chorus (04:00-06:59): {dawn_clips:,} clips ({dawn_clips/total*100:.1f}%)")
    print(f"Dusk chorus (17:00-19:59): {dusk_clips:,} clips ({dusk_clips/total*100:.1f}%)")
    print(f"Total biological windows:   {total_bio_windows:,} clips ({total_bio_windows/total*100:.1f}%)")

    if total_bio_windows / total < 0.15:
        print("\n⚠ WARNING: <15% of clips in dawn/dusk windows")
        print("  Random sampling will likely miss biological activity!")

    return df

def check_file_paths(df, sample_size=10):
    """Spot-check that a random sample of manifest file paths exists on disk.

    Draws up to ``sample_size`` entries from the ``filepath`` column, tests
    each with ``Path.exists`` and prints how many were found. Counts are
    reported against the number of paths actually sampled, which is smaller
    than ``sample_size`` when the manifest has fewer rows. Null paths are
    counted as missing.

    Args:
        df: Manifest DataFrame with a ``filepath`` column.
        sample_size: Maximum number of paths to check.

    Returns:
        None. Results are printed.
    """
    print("\n" + "=" * 80)
    print("FILE PATH VALIDATION")
    print("=" * 80)

    if 'filepath' not in df.columns:
        print("\n✗ No 'filepath' column found")
        return

    n_checked = min(sample_size, len(df))
    if n_checked <= 0:
        # Nothing to sample; avoid claiming that all files exist.
        print("\n⚠ No file paths to check")
        return

    print(f"\nChecking sample of {n_checked} file paths...")

    sample_paths = df['filepath'].sample(n_checked)
    missing_paths = [
        path for path in sample_paths
        if pd.isna(path) or not Path(path).exists()
    ]
    missing = len(missing_paths)
    existing = n_checked - missing

    if missing_paths:
        # Show first missing file as example
        print(f"\nExample path: {missing_paths[0]}")
        print("Exists: False")

    print(f"\n✓ Existing files: {existing}/{n_checked}")
    if missing > 0:
        print(f"✗ Missing files: {missing}/{n_checked}")
        print("\nNote: Files may be on a different machine/mount")
    else:
        print("\n✓ All sampled files exist")

def analyze_logger_distribution(df):
    """Summarise columns that identify a logger or deployment.

    Every column whose name contains "logger" or "deployment"
    (case-insensitive) is reported with its number of unique values; the
    full value counts are printed when there are fewer than 10 of them.

    Args:
        df: Manifest DataFrame.

    Returns:
        None. Results are printed.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("LOGGER/DEPLOYMENT ANALYSIS")
    print(rule)

    logger_cols = []
    for name in df.columns:
        lowered = name.lower()
        if 'logger' in lowered or 'deployment' in lowered:
            logger_cols.append(name)

    if len(logger_cols) == 0:
        print("\n⚠ No logger/deployment columns found")
        print("  Assuming single deployment")
        return

    for name in logger_cols:
        values = df[name]
        n_unique = values.nunique()
        print(f"\n{name}:")
        print(f"  Unique values: {n_unique}")
        if n_unique >= 10:
            # Too many categories to list in full.
            continue
        print(f"  Distribution:\n{values.value_counts()}")

def calculate_sampling_recommendations(df, sample_size=20000):
    """Compare random and stratified sampling across time-of-day bins.

    For each bin the table shows how many clips are available, how many a
    proportional random sample of ``sample_size`` clips would contain, and
    the stratified target that oversamples the dawn/dusk chorus windows.

    The bins are half-open hour ranges ``[start, end)`` that together cover
    all 24 hours, so the "Available" column adds up to the number of clips
    with a known hour.

    Args:
        df: Manifest DataFrame with an ``hour`` column.
        sample_size: Total number of clips in the planned sample.

    Returns:
        None. Results are printed.
    """
    print("\n" + "=" * 80)
    print("SAMPLING RECOMMENDATIONS")
    print("=" * 80)

    if 'hour' not in df.columns:
        print("\n⚠ Cannot provide temporal recommendations without hour data")
        return

    total_clips = len(df)
    if total_clips == 0:
        # The proportional estimates below would divide by zero.
        print("\n⚠ Manifest is empty - no sampling recommendations available")
        return

    # Temporal bins: (start hour, end hour exclusive, stratified share in %).
    # Shares oversample the chorus windows and sum to 100.
    bins = {
        'dawn_chorus': (4, 7, 25),
        'morning': (7, 12, 15),
        'midday': (12, 17, 10),  # ends at 17 so hour 16 is not left out
        'dusk_chorus': (17, 20, 25),
        'night': (20, 24, 15),
        'pre_dawn': (0, 4, 10),
    }

    print(f"\nFor {sample_size:,} clip sample:")
    print("\n" + "-" * 60)
    print(f"{'Temporal Bin':<20} {'Available':>12} {'Random':>10} {'Stratified':>10}")
    print("-" * 60)

    hours = df['hour']
    chorus_available = 0
    chorus_share_pct = 0

    for bin_name, (start, end, share_pct) in bins.items():
        available = int(((hours >= start) & (hours < end)).sum())
        random_expected = int(sample_size * (available / total_clips))
        # Integer arithmetic avoids float rounding in the target counts.
        stratified = sample_size * share_pct // 100

        if 'chorus' in bin_name:
            chorus_available += available
            chorus_share_pct += share_pct

        print(f"{bin_name:<20} {available:>12,} {random_expected:>10,} {stratified:>10,}")

    # Share of dawn/dusk clips a proportional random sample would contain,
    # computed from this manifest rather than assumed.
    random_chorus_pct = chorus_available / total_clips * 100

    print("-" * 60)
    print("\nKey insight:")
    print(f"  • Random sampling: ~{random_chorus_pct:.0f}% in dawn/dusk (proportional to data)")
    print(f"  • Stratified sampling: {chorus_share_pct}% in dawn/dusk (biological focus)")
    print("\nRecommendation: Use STRATIFIED sampling to capture biological activity")

def generate_summary_report(df):
    """Print summary statistics for the manifest.

    Reports the clip count, the time span and clips per day (when
    ``start_time`` is a datetime column), the total audio duration, and two
    data-quality figures: total missing values and duplicate ``clip_id``s.

    The audio total is summed from a duration column in seconds, looked up
    as ``duration`` or ``duration_s``. Without either column it is estimated
    as 10 seconds per clip, provided ``start_time`` exists.

    Args:
        df: Manifest DataFrame.

    Returns:
        None. Results are printed.
    """
    print("\n" + "=" * 80)
    print("SUMMARY REPORT")
    print("=" * 80)

    print(f"\nTotal clips: {len(df):,}")

    if 'start_time' in df.columns and pd.api.types.is_datetime64_any_dtype(df['start_time']):
        span = df['start_time'].max() - df['start_time'].min()
        # The span is NaT when there are no valid timestamps.
        if pd.notna(span):
            duration_days = span.days
            print(f"Time span: {duration_days} days")
            # max(..., 1) avoids dividing by zero for spans under one day.
            print(f"Clips per day: {len(df) / max(duration_days, 1):,.0f}")

    # The manifest schema names the clip length column 'duration_s';
    # 'duration' is still accepted.
    duration_col = next(
        (name for name in ('duration', 'duration_s') if name in df.columns),
        None,
    )
    if duration_col is not None:
        total_hours = df[duration_col].sum() / 3600
        print(f"Total audio: {total_hours:,.1f} hours")
    elif 'start_time' in df.columns:
        # Assume 10s clips
        total_hours = len(df) * 10 / 3600
        print(f"Total audio (estimated): {total_hours:,.1f} hours")

    print("\nData quality:")
    print(f"  Missing values: {df.isnull().sum().sum()}")
    print(f"  Duplicate clip_ids: {df['clip_id'].duplicated().sum() if 'clip_id' in df.columns else 'N/A'}")


In [None]:
def main():
    """Run the full manifest inspection pipeline on ``IN_MANIFEST``.

    Exits with status 1 when the manifest file does not exist. Otherwise
    loads it and runs the column check, temporal analysis, file path
    validation, logger analysis, sampling recommendations and summary report
    in that order.

    When the temporal analysis cannot run (``inspect_timestamps`` returns
    None), the remaining steps continue on the manifest as loaded and the
    sampling recommendations are skipped.
    """
    print(f"Inspecting manifest at: {IN_MANIFEST}")
    print()

    if not IN_MANIFEST.exists():
        print(f"✗ Error: File not found: {IN_MANIFEST}")
        print(f"\nExpected location: {IN_MANIFEST}")
        print(f"Repository root: {REPO_ROOT}")
        print(f"\nPlease verify:")
        print(f"  1. Repository is at: {REPO_ROOT}")
        print(f"  2. Manifest exists at: data/manifests/clip_manifest.parquet")
        sys.exit(1)

    # Run inspection pipeline
    manifest_df = load_manifest(IN_MANIFEST)

    if not check_required_columns(manifest_df):
        print("\n⚠ WARNING: Missing required columns. Analysis may be incomplete.")

    # inspect_timestamps returns None on failure; keep the loaded manifest so
    # the later steps are never handed None.
    timed_df = inspect_timestamps(manifest_df)
    df = timed_df if timed_df is not None else manifest_df

    check_file_paths(df, sample_size=10)
    analyze_logger_distribution(df)

    if timed_df is not None and 'hour' in timed_df.columns:
        calculate_sampling_recommendations(timed_df)

    generate_summary_report(df)

if __name__ == "__main__":
    main()

Inspecting manifest at: /home/sparch/Uni-stuff/semester-2/applied_Ml/reef_zmsc/data/manifests/clip_manifest.parquet

LOADING MANIFEST

✓ Loaded manifest with 1,053,610 clips

Columns: ['clip_id', 'filepath', 'logger', 'date', 'clip_index', 'start_time', 'timestamp_source', 'start_s', 'end_s', 'duration_s', 'sr', 'channels', 'start_frame', 'end_frame', 'n_frames', 'overload_flags', 'first_data_time_utc', 'finalised_time_utc', 'rec_start_time_utc', 'rms']

Data types:
clip_id                 object
filepath                object
logger                  object
date                    object
clip_index               int64
start_time              object
timestamp_source        object
start_s                float64
end_s                  float64
duration_s             float64
sr                       int64
channels                 int64
start_frame              int64
end_frame                int64
n_frames                 int64
overload_flags          object
first_data_time_utc     object
fi