### Verifying the paths are working

In [1]:
from pathlib import Path
import os

# Resolve the project root: step up one level when running from inside /notebooks
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    base_dir = current_dir.parent
else:
    base_dir = current_dir

# Derive every location this notebook relies on from the project root
data_dir = base_dir / 'data'
raw_data_dir = data_dir / 'raw'
processed_data_dir = data_dir / 'processed'
features_dir = data_dir / 'features'
metadata_path = raw_data_dir / 'Birds Voice.csv'   # metadata CSV
audio_files_dir = raw_data_dir / 'Voice of Birds'  # sub-directories here are treated as bird folders

# Report each location together with whether it is present on disk
print(f"Base directory: {base_dir}\nExists: {base_dir.exists()}\n")

print(f"Data directory: {data_dir}\nExists: {data_dir.exists()}")
print(f"Raw data directory: {raw_data_dir}\nExists: {raw_data_dir.exists()}")
print(f"Processed data directory: {processed_data_dir}\nExists: {processed_data_dir.exists()}")
print(f"Features directory: {features_dir}\nExists: {features_dir.exists()}\n")

print(f"Metadata path: {metadata_path}\nExists: {metadata_path.exists()}\n")

print(f"Audio files directory: {audio_files_dir}\nExists: {audio_files_dir.exists()}\n")

# Peek into the audio directory: a few bird folders, then a few MP3s of the first one
if not audio_files_dir.exists():
    print("Warning: Audio files directory not found!")
else:
    bird_folders = [entry for entry in audio_files_dir.iterdir() if entry.is_dir()]
    print(f"Found {len(bird_folders)} bird folders")
    if bird_folders:
        print("\nFirst few bird folders:")
        for bird_folder in bird_folders[:3]:  # only the first 3 as a sample
            print(f"- {bird_folder.name}")

        # Use the first bird folder as a spot check for MP3 content
        sample_bird = bird_folders[0]
        mp3_files = list(sample_bird.glob('*.mp3'))
        print(f"\nFound {len(mp3_files)} MP3 files in {sample_bird.name}")
        if mp3_files:
            print("\nFirst few MP3 files:")
            for mp3 in mp3_files[:3]:  # only the first 3 as a sample
                print(f"- {mp3.name}")

Base directory: /Users/arnaumartin/Birdify
Exists: True

Data directory: /Users/arnaumartin/Birdify/data
Exists: True
Raw data directory: /Users/arnaumartin/Birdify/data/raw
Exists: False
Processed data directory: /Users/arnaumartin/Birdify/data/processed
Exists: True
Features directory: /Users/arnaumartin/Birdify/data/features
Exists: True

Metadata path: /Users/arnaumartin/Birdify/data/raw/Birds Voice.csv
Exists: False

Audio files directory: /Users/arnaumartin/Birdify/data/raw/Voice of Birds
Exists: False



In [2]:
import os
import essentia.standard as ess
import numpy as np
import pandas as pd
import glob
import random
from tqdm import tqdm
import warnings
from pathlib import Path
from collections import defaultdict
from audiomentations import Compose, TimeStretch

# Hide RuntimeWarning messages for the rest of the session
warnings.filterwarnings("ignore", category=RuntimeWarning)

# --- Project layout -------------------------------------------------------
# The project root is the parent folder when the kernel runs inside /notebooks,
# otherwise the current working directory itself.
_working_dir = Path.cwd()
BASE_DIR = _working_dir.parent if _working_dir.name == 'notebooks' else _working_dir
DATA_DIR = BASE_DIR / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
METADATA_PATH = RAW_DATA_DIR / 'Birds Voice.csv'
AUDIO_FILES_DIR = RAW_DATA_DIR / 'Voice of Birds'
# NOTE(review): the folder name says "3s" but segment_duration below is 5 seconds —
# confirm which one is intended before renaming anything downstream depends on.
OUTPUT_DIR = PROCESSED_DATA_DIR / 'birdcall_segments_3s'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder (and parents) up front

# --- Audio processing parameters ------------------------------------------
fs = 22050                                    # sample rate used for loading and writing audio
segment_duration = 5                          # seconds per extracted segment
segment_samples = int(fs * segment_duration)  # segment length in samples
windowSize = 4 * 4096                         # analysis frame length in samples
hopSize = 2 * 4096                            # step between frames (half a window)
NRG_threshold_ratio = 0.01                    # frames above this fraction of the peak energy count as active
required_segments = 50                        # segments to keep per bird
target_bird_count = 10                        # number of bird folders to process

def amplitude_normalization(x):
    """Scale a signal so that its largest absolute sample becomes 1.

    Parameters
    ----------
    x : array-like
        Audio samples (any numeric dtype).

    Returns
    -------
    numpy.ndarray
        ``x`` divided by its peak absolute value. An all-zero or empty input
        is returned unscaled (as a new array) instead of producing NaNs or
        raising, since there is no peak to normalize against.
    """
    samples = np.asarray(x)
    # np.max raises on an empty array, so treat "no samples" like "no peak".
    peak = np.max(np.abs(samples)) if samples.size else 0
    # Dividing by a zero peak would turn silence into NaNs; divide by 1 instead.
    scale = 1 if peak == 0 else peak
    return samples / scale
    
def safe_mono_loader(filepath, sample_rate):
    """Load an audio file with Essentia's ``MonoLoader``, tolerating failures.

    Parameters
    ----------
    filepath : str or pathlib.Path
        Location of the audio file.
    sample_rate : int
        Value passed to ``MonoLoader`` as ``sampleRate``.

    Returns
    -------
    The loader's output on success, or ``None`` (after printing a warning)
    when loading raises any exception.
    """
    try:
        return ess.MonoLoader(filename=str(filepath), sampleRate=sample_rate)()
    except Exception as e:
        # Wrap in Path so the warning itself cannot fail when a plain string
        # path was passed (str has no ``.name`` attribute).
        print(f"Warning: Could not load {Path(filepath).name} - {e}")
        return None

def extract_birdcall_segments(x, params):
    """Cut fixed-length, peak-normalized segments around high-energy frames.

    The signal is framed, each frame's energy is expressed relative to the
    loudest frame, and every frame above ``params['NRG_threshold_ratio']``
    proposes one segment of ``params['segment_samples']`` samples centred on
    ``frame_idx * params['hopSize']`` (clamped so it never starts before 0).

    Parameters
    ----------
    x : numpy.ndarray or None
        Mono audio samples.
    params : dict
        Must contain ``'windowSize'``, ``'hopSize'``, ``'NRG_threshold_ratio'``
        and ``'segment_samples'``.

    Returns
    -------
    list of numpy.ndarray
        Normalized segments, in frame order. Empty when ``x`` is ``None``,
        shorter than one segment, or has no active frame. Each start position
        is used at most once, so clamping to 0 no longer yields duplicates.
    """
    if x is None or len(x) < params['segment_samples']:
        return []

    # Build the energy algorithm once instead of once per frame.
    energy = ess.Energy()
    frame_energies = np.array([
        energy(frame)
        for frame in ess.FrameGenerator(x, frameSize=params['windowSize'],
                                        hopSize=params['hopSize'], startFromZero=True)
    ])
    if frame_energies.size == 0:
        return []

    # Relative energy; the epsilon guards against an all-silent signal.
    frame_energies = frame_energies / (np.max(frame_energies) + 1e-10)
    active_frames = np.where(frame_energies > params['NRG_threshold_ratio'])[0]

    half_segment = params['segment_samples'] // 2
    used_starts = set()
    segments = []
    for frame_idx in active_frames:
        sample_idx = int(frame_idx) * params['hopSize']
        start = max(0, sample_idx - half_segment)
        end = start + params['segment_samples']
        if end > len(x):
            continue  # segment would run past the end of the signal
        if start in used_starts:
            # Several early frames clamp to start 0 and would otherwise emit
            # the exact same segment again.
            continue
        used_starts.add(start)
        segment = x[start:end]
        if np.max(np.abs(segment)) > 0.05:  # Minimum amplitude threshold
            segments.append(amplitude_normalization(segment))

    return segments

def find_best_segments(all_segments, required_count):
    """Pick ``required_count`` segments spread evenly across the energy ranking.

    Segments are ranked by mean squared amplitude and sampled at evenly spaced
    positions of that ranking, from the quietest to the loudest.

    Parameters
    ----------
    all_segments : list of numpy.ndarray
        Candidate segments.
    required_count : int
        Number of segments to keep.

    Returns
    -------
    list of numpy.ndarray
        A new list. When there are no more candidates than requested, it is a
        shallow copy of ``all_segments``, so extending the result never
        modifies the caller's list. A non-positive ``required_count`` gives
        an empty list when there are candidates.
    """
    if len(all_segments) <= required_count:
        return list(all_segments)
    if required_count <= 0:
        # np.linspace rejects a negative sample count; nothing was requested.
        return []

    # Mean power of every candidate
    energies = [np.mean(np.abs(seg) ** 2) for seg in all_segments]

    # Rank by energy, then take evenly spaced ranks
    ranking = np.argsort(energies)
    chosen_ranks = np.linspace(0, len(ranking) - 1, required_count, dtype=int)
    return [all_segments[i] for i in ranking[chosen_ranks]]

# Time-stretch augmentation pipeline. Both rate bounds are drawn at random once,
# when this statement runs, and are then fixed for every later call.
_stretch_min_rate = random.uniform(0.2, 0.6)
_stretch_max_rate = random.uniform(0.6, 1)
augmentation = Compose([
    TimeStretch(min_rate=_stretch_min_rate, max_rate=_stretch_max_rate),
])

def process_bird_species(bird_folders, limit=10):
    """Extract, top up and save a fixed number of segments per bird.

    For each of the first ``limit`` entries of ``bird_folders`` this loads all
    ``.mp3``/``.wav``/``.flac`` files of the folder, extracts candidate
    segments, keeps at most ``required_segments`` of them, fills any shortfall
    with time-stretched copies produced by ``augmentation``, and writes the
    result to ``OUTPUT_DIR`` as ``<bird>_<index>.wav``.

    Parameters
    ----------
    bird_folders : list of (str, str)
        ``(bird_name, folder_name)`` pairs; ``folder_name`` is relative to
        ``AUDIO_FILES_DIR``.
    limit : int, optional
        Maximum number of birds to process.

    Returns
    -------
    dict
        Maps each processed bird name to a dict with ``'total_files'``,
        ``'segments_found'`` (extracted before selection), ``'final_segments'``
        (kept for saving) and ``'augmented'`` (augmented segments actually
        created). Birds without audio files or segments are left out.
    """
    bird_stats = {}

    # The extraction parameters are identical for every bird, so build them once.
    params = {
        'windowSize': windowSize,
        'hopSize': hopSize,
        'NRG_threshold_ratio': NRG_threshold_ratio,
        'segment_samples': segment_samples
    }

    for bird_name, folder_name in tqdm(bird_folders[:limit], desc="Processing birds"):
        full_folder_path = AUDIO_FILES_DIR / folder_name
        audio_files = [
            audio_path
            for pattern in ("*.mp3", "*.wav", "*.flac")
            for audio_path in full_folder_path.glob(pattern)
        ]

        if not audio_files:
            print(f"\nWarning: No audio files found for {bird_name}")
            continue

        print(f"\nProcessing {bird_name} ({len(audio_files)} audio files)...")

        # Process all audio files for this bird
        all_segments = []
        for file_path in tqdm(audio_files, desc=f"Processing {bird_name[:15]}...", leave=False):
            audio = safe_mono_loader(file_path, fs)
            if audio is not None:
                all_segments.extend(extract_birdcall_segments(audio, params))

        if not all_segments:
            print(f"Warning: No segments extracted for {bird_name}")
            continue

        # Copy the selection so that topping it up below can never modify
        # all_segments (the 'segments_found' statistic relies on that).
        final_segments = list(find_best_segments(all_segments, required_segments))

        # If we don't have enough, top up with time-stretched copies.
        # NOTE(review): whether every call really alters the audio depends on how
        # `augmentation` is configured (e.g. the transform's probability) — confirm.
        augmented_segments = []
        failed_attempts = 0
        last_error = None
        needed = required_segments - len(final_segments)
        if needed > 0:
            source_segments = final_segments if final_segments else all_segments
            for _ in range(needed):
                seg = random.choice(source_segments)
                try:
                    stretched = augmentation(samples=seg, sample_rate=fs)
                except Exception as exc:
                    # Keep going, but remember the failure so it can be reported.
                    failed_attempts += 1
                    last_error = exc
                    continue
                augmented_segments.append(stretched[:segment_samples])
            if failed_attempts:
                print(f"Warning: {failed_attempts} augmentation attempt(s) failed "
                      f"for {bird_name} (last error: {last_error})")
            final_segments.extend(augmented_segments)

        # Save segments
        final_segments = final_segments[:required_segments]
        safe_name = bird_name.replace("/", "-").replace(" ", "_")
        for i, segment in enumerate(final_segments):
            filename = OUTPUT_DIR / f"{safe_name}_{i:03d}.wav"
            ess.MonoWriter(filename=str(filename), format='wav', sampleRate=fs)(segment)

        # Record stats
        bird_stats[bird_name] = {
            'total_files': len(audio_files),
            'segments_found': len(all_segments),
            'final_segments': len(final_segments),
            'augmented': len(augmented_segments)
        }

    return bird_stats

def find_all_bird_folders(audio_dir=None):
    """List the ``*_sound`` directories and the bird name each one encodes.

    Parameters
    ----------
    audio_dir : str or pathlib.Path, optional
        Directory to scan. Defaults to ``AUDIO_FILES_DIR``.

    Returns
    -------
    list of (str, str)
        ``(bird_name, folder_name)`` pairs sorted by folder path, so that
        slicing the list (e.g. the first N birds) is reproducible between
        runs. ``bird_name`` is the folder name without its trailing
        ``_sound``, with ``_`` turned into spaces and ``-`` into ``/``.
    """
    root = Path(audio_dir) if audio_dir is not None else AUDIO_FILES_DIR
    suffix = "_sound"

    bird_folders = []
    # glob() yields entries in arbitrary order; sort for deterministic output.
    for folder in sorted(root.glob(f"*{suffix}")):
        if not folder.is_dir():
            continue
        # Use .name rather than .stem: .stem would cut the name at its last dot
        # (e.g. "St._Lucia_Warbler_sound"). Strip only the trailing suffix.
        base_name = folder.name[:-len(suffix)]
        bird_name = base_name.replace("_", " ").replace("-", "/")
        bird_folders.append((bird_name, folder.name))
    return bird_folders

# Main execution: discover bird folders, process them, then print a summary table
if __name__ == "__main__":
    print("Starting bird audio processing...")

    # Find all available bird folders
    all_bird_folders = find_all_bird_folders()
    if not all_bird_folders:
        raise RuntimeError("No bird folders found! Check your data directory.")

    # Preview at most the first 10 discovered birds
    print(f"\nFound {len(all_bird_folders)} bird folders:")
    for position, (name, _) in enumerate(all_bird_folders[:10], start=1):
        print(f"{position}. {name}")

    # Process birds (up to target count)
    stats = process_bird_species(all_bird_folders, limit=target_bird_count)

    # Print summary
    name_width = 30
    row_format = "{:<30} {:<15} {:<15} {:<15} {:<15}"
    print("\n" + "="*80)
    print("Processing Summary:")
    print(row_format.format(
        "Bird Species", "Audio Files", "Segments Found", "Final Segments", "Augmented"))

    for bird, data in stats.items():
        # Shorten long names so that name + "..." still fits the 30-character
        # column; otherwise the remaining columns of that row shift right.
        if len(bird) > name_width:
            shown_name = bird[:name_width - 3] + "..."
        else:
            shown_name = bird
        print(row_format.format(
            shown_name,
            data['total_files'],
            data['segments_found'],
            data['final_segments'],
            data['augmented']
        ))

    print(f"\n✅ Successfully processed {len(stats)} birds")
    print(f"📁 Output saved to: {OUTPUT_DIR}")

: 

: 