### Preprocessing audio files

In [None]:
import os
import essentia.standard as ess
import numpy as np
import pandas as pd
import glob
import random
from tqdm import tqdm
import warnings
from pathlib import Path
from collections import defaultdict
from audiomentations import Compose, TimeStretch, AddGaussianNoise, Shift
from scipy.signal import butter, lfilter
import noisereduce as nr
import requests
from itertools import combinations
import time
import psutil
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    import GPUtil
    GPU_MONITORING = True
except ImportError:
    GPU_MONITORING = False
try:
    from pyspectator.processor import Cpu
    CPU_MONITORING = True
except ImportError:
    CPU_MONITORING = False

# Silence RuntimeWarning noise for the whole run.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# --- Thermal guard settings -------------------------------------------------
MAX_CPU_TEMP = 85  # °C - CPU reading at or above this counts as overheating
MAX_GPU_TEMP = 85  # °C - GPU reading at or above this counts as overheating
COOL_DOWN_TEMP = 75  # °C - cool-down may end early once readings drop to this
COOL_DOWN_TIME = 60  # seconds - upper bound on a single cool-down pause
MONITOR_INTERVAL = 10  # seconds - minimum spacing between temperature checks
PROCESSING_BATCH_SIZE = 3  # audio files handled between temperature checks

# --- Project layout ---------------------------------------------------------
# When launched from a "notebooks" directory the project root is one level up;
# otherwise the current working directory is taken as the root.
_cwd = Path.cwd()
BASE_DIR = _cwd.parent if _cwd.name == 'notebooks' else _cwd
DATA_DIR = BASE_DIR / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
METADATA_PATH = RAW_DATA_DIR / 'Birds_Voice.csv'
AUDIO_FILES_DIR = RAW_DATA_DIR / 'Voice of Birds'
OUTPUT_DIR = PROCESSED_DATA_DIR / 'birdcall_segments_5s_113'
# Create the output directory (and any missing parents) up front.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# CSV that receives one row per exported segment.
METADATA_CSV_PATH = PROCESSED_DATA_DIR / 'birdcall_metadata_113.csv'

# Vocalisation types (lower-cased TYPE column values) kept by the filters.
VALID_VOCALIZATIONS = {'song', 'call'}

def get_system_temperatures():
    """Read the current CPU and GPU temperatures.

    Returns:
        dict: ``{'cpu': <reading or None>, 'gpu': <reading or None>}``.
        A value stays ``None`` when the matching monitoring package was not
        importable, when reading the sensor raised, or (GPU only) when
        ``GPUtil.getGPUs()`` returned nothing. Only the first GPU is queried.
    """
    cpu_reading = None
    gpu_reading = None

    if CPU_MONITORING:
        try:
            cpu_reading = Cpu(monitoring_latency=1).temperature
        except Exception as err:
            # Best effort: a sensor failure must not stop the pipeline.
            print(f"Could not read CPU temperature: {err}")

    if GPU_MONITORING:
        try:
            detected_gpus = GPUtil.getGPUs()
            if detected_gpus:
                gpu_reading = detected_gpus[0].temperature
        except Exception as err:
            print(f"Could not read GPU temperature: {err}")

    return {'cpu': cpu_reading, 'gpu': gpu_reading}

def check_overheating():
    """Compare the current temperatures against the configured maxima.

    Prints one warning line per component whose reading is at or above its
    threshold (``MAX_CPU_TEMP`` / ``MAX_GPU_TEMP``).

    Returns:
        tuple: ``(overheating, temps)`` where ``overheating`` is ``True`` if
        any component reached its threshold and ``temps`` is the dict
        returned by ``get_system_temperatures()``.
    """
    temps = get_system_temperatures()
    limits = (
        ('cpu', 'CPU', MAX_CPU_TEMP),
        ('gpu', 'GPU', MAX_GPU_TEMP),
    )

    overheating = False
    for key, label, limit in limits:
        reading = temps[key]
        # A missing (None) reading is skipped rather than treated as hot.
        if reading and reading >= limit:
            print(f"⚠️ {label} overheating: {reading}°C")
            overheating = True

    return overheating, temps

def cool_down():
    """Pause processing so the system can cool down.

    Sleeps in one-second steps for at most ``COOL_DOWN_TIME`` seconds and
    re-reads the temperatures after every step. The pause ends early as soon
    as every sensor that reports a value is at or below ``COOL_DOWN_TEMP``.

    A sensor that yields no reading (``None``) is ignored, so a machine with
    only CPU or only GPU monitoring can still resume early. If no sensor
    reports anything, the full ``COOL_DOWN_TIME`` is waited.
    """
    print(f"\n🚨 System overheating! Cooling down for {COOL_DOWN_TIME} seconds...")
    start_time = time.time()

    with tqdm(total=COOL_DOWN_TIME, desc="Cooling down") as pbar:
        while time.time() - start_time < COOL_DOWN_TIME:
            time.sleep(1)
            pbar.update(1)

            # Check temperatures during cooldown
            _, temps = check_overheating()
            readings = [
                value for value in (temps['cpu'], temps['gpu'])
                if value is not None
            ]
            # Require at least one real reading before declaring the system
            # cool; with none available we cannot tell, so keep waiting.
            if readings and all(value <= COOL_DOWN_TEMP for value in readings):
                break

    print("✅ Resuming processing")

def monitor_temperature():
    """Build a rate-limited temperature check.

    Returns:
        callable: A zero-argument function. Each call runs
        ``check_overheating()`` only if at least ``MONITOR_INTERVAL`` seconds
        have passed since the previous check made through the same callable,
        and triggers ``cool_down()`` when overheating is reported. The
        callable always returns ``True``.
    """
    previous_check = 0  # epoch seconds; 0 makes the very first call check

    def wrapper():
        nonlocal previous_check
        now = time.time()
        if now - previous_check < MONITOR_INTERVAL:
            # Checked recently enough; nothing to do.
            return True

        previous_check = now
        is_hot, _ = check_overheating()
        if is_hot:
            cool_down()
        return True

    return wrapper

# Load the raw metadata, normalise the two text columns used for filtering
# (TYPE lower-cased, Country stripped of surrounding whitespace), then keep
# only the rows whose vocalisation type is listed in VALID_VOCALIZATIONS.
df = pd.read_csv(METADATA_PATH)
df = df.assign(
    TYPE=df['TYPE'].str.lower(),
    Country=df['Country'].str.strip(),
)
df = df.loc[df['TYPE'].isin(VALID_VOCALIZATIONS)]

def select_diverse_birds(df, target_count=113):
    """Greedily pick bird names that are as dissimilar from each other as possible.

    Names are embedded with TF-IDF and compared by cosine similarity. The
    selection starts with the name that is least similar to all others on
    average, then repeatedly adds the remaining name whose *highest*
    similarity to any already selected name is *lowest* (farthest-point
    selection). Ties are broken by the order of first appearance in ``df``.

    Args:
        df: DataFrame with a ``common_name`` column. Missing names are ignored.
        target_count: Maximum number of names to return.

    Returns:
        list: Up to ``target_count`` distinct names in selection order. Empty
        when there are no names or ``target_count`` is not positive.
    """
    bird_names = df['common_name'].dropna().unique().tolist()
    if not bird_names or target_count <= 0:
        return []

    # Create TF-IDF vectors for bird names and compare every pair.
    vectorizer = TfidfVectorizer()
    name_vectors = vectorizer.fit_transform(bird_names)
    similarity_matrix = np.asarray(cosine_similarity(name_vectors))

    # Start with the bird that's least similar to others on average.
    first_bird = int(np.argmin(similarity_matrix.mean(axis=1)))
    selected_indices = [first_bird]

    available = np.ones(len(bird_names), dtype=bool)
    available[first_bird] = False

    # closest_selected[i] = highest similarity between name i and any name
    # chosen so far; it is updated incrementally after each pick.
    closest_selected = similarity_matrix[first_bird].copy()

    while available.any() and len(selected_indices) < target_count:
        # Mask out names already taken, then pick the one that is farthest
        # (least similar) from its nearest selected neighbour.
        scores = np.where(available, closest_selected, np.inf)
        candidate = int(np.argmin(scores))

        selected_indices.append(candidate)
        available[candidate] = False
        np.maximum(closest_selected, similarity_matrix[candidate],
                   out=closest_selected)

    return [bird_names[i] for i in selected_indices]

# Restrict the metadata to the diverse subset of species.
diverse_birds = select_diverse_birds(df)
df = df[df['common_name'].isin(diverse_birds)]

# Fetch each recording into "<species>_sound/<xc_id>.mp3", skipping files
# that are already on disk. Failures are reported and do not stop the loop.
for _, row in df.iterrows():
    folder_name = f"{row['common_name'].strip().replace(' ', '_').lower()}_sound"
    folder_path = AUDIO_FILES_DIR / folder_name
    os.makedirs(folder_path, exist_ok=True)

    audio_url = row['Download_link']
    audio_filename = f"{row['xc_id']}.mp3"
    audio_path = folder_path / audio_filename

    if audio_path.exists():
        print(f"Already exists: {audio_filename} in {folder_name}")
        continue

    try:
        r = requests.get(audio_url, timeout=20)
        if r.status_code != 200:
            print(f"Error downloading {audio_url}: status {r.status_code}")
        else:
            with open(audio_path, 'wb') as f:
                f.write(r.content)
            print(f"Downloading: {audio_filename} in {folder_name}")
    except Exception as e:
        print(f"Error downloading {audio_url}: {e}")

# Audio processing parameters
# These module-level values are read by the segmentation, preprocessing and
# export code further down in this file.
fs = 22050  # sample rate handed to MonoLoader / MonoWriter and the filters
segment_duration = 5  # seconds
segment_samples = int(fs * segment_duration)  # samples in one exported segment
windowSize = 4096  # frameSize passed to ess.FrameGenerator
hopSize = 2048  # hopSize passed to ess.FrameGenerator
# A frame counts as active when its energy, divided by the loudest frame's
# energy in the same recording, exceeds this ratio.
NRG_threshold_ratio = 0.01
required_segments = 100  # segments exported per species (raised from 50)
# NOTE(review): not referenced anywhere in the visible code; the species
# limits used below are the 113 defaults of select_diverse_birds() and
# process_bird_species() - confirm whether this constant is still needed.
target_bird_count = 113

# Define functions

def load_and_filter_metadata():
    """Load the metadata CSV and keep only the rows of interest.

    Reads ``METADATA_PATH``, lower-cases ``TYPE``, strips ``Country``, keeps
    rows whose ``TYPE`` is in ``VALID_VOCALIZATIONS`` and whose
    ``common_name`` is one of the module-level ``diverse_birds``.

    Returns:
        pandas.DataFrame: The filtered metadata.
    """
    metadata = pd.read_csv(METADATA_PATH, sep=",")
    metadata['TYPE'] = metadata['TYPE'].str.lower()
    metadata['Country'] = metadata['Country'].str.strip()

    # First narrow by vocalisation type (on an explicit copy) ...
    by_type = metadata[metadata['TYPE'].isin(VALID_VOCALIZATIONS)].copy()
    # ... then by the selected species.
    is_selected_species = by_type['common_name'].isin(diverse_birds)
    return by_type[is_selected_species]

def get_bird_folders_for_species(metadata_df):
    """Map each species in the metadata to its audio folder, if it exists.

    The expected folder name is the common name stripped of surrounding
    whitespace, with spaces replaced by underscores, lower-cased and suffixed
    with ``_sound``. This is the same normalisation the download step uses
    when it creates the folders.

    Side effect: adds (or overwrites) a ``folder_name`` column on
    ``metadata_df``. The expected and actual folder lists are printed.

    Args:
        metadata_df: DataFrame with a ``common_name`` column.

    Returns:
        list[tuple[str, str]]: ``(bird_name, folder_name)`` for every expected
        folder found under ``AUDIO_FILES_DIR``; ``bird_name`` is the folder
        name without the trailing ``_sound``.
    """
    suffix = '_sound'
    metadata_df['folder_name'] = (
        metadata_df['common_name'].str.strip().str.replace(' ', '_').str.lower()
        + suffix
    )
    expected_folders = metadata_df['folder_name'].unique()

    print("Expected folders:")
    print(expected_folders)
    print("\nActual folders in Voice of Birds:")
    print([f.name for f in AUDIO_FILES_DIR.glob("*") if f.is_dir()])

    existing_folders = []
    for folder in expected_folders:
        # Rows with a missing common_name yield NaN here; skip them.
        if not isinstance(folder, str):
            continue
        if (AUDIO_FILES_DIR / folder).exists():
            # Drop only the trailing suffix so that a name which itself
            # contains "_sound" is left intact.
            existing_folders.append((folder[:-len(suffix)], folder))
    return existing_folders

# Audio processing functions
def amplitude_normalization(x):
    """Scale a signal so that its largest absolute sample equals 1.

    Args:
        x: Array-like of samples.

    Returns:
        numpy.ndarray: ``x`` divided by its peak absolute value. An empty or
        all-zero input is returned as an unscaled copy, because dividing by a
        zero peak would fill the signal with NaN.
    """
    x = np.asarray(x)
    if x.size == 0:
        return x.copy()

    peak = np.max(np.abs(x))
    if peak == 0:
        # Silent signal: nothing to scale.
        return x.copy()
    return x / peak

def bandpass_filter(x, fs, low_freq=500, high_freq=9000, order=4):
    """Apply a Butterworth band-pass filter to a signal.

    Args:
        x: Input samples.
        fs: Sample rate of ``x``; the cut-offs are normalised by ``fs / 2``.
        low_freq: Lower cut-off, in the same unit as ``fs``.
        high_freq: Upper cut-off, in the same unit as ``fs``.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        The output of ``scipy.signal.lfilter`` for the designed filter.
    """
    half_rate = 0.5 * fs
    band_edges = [low_freq / half_rate, high_freq / half_rate]
    numerator, denominator = butter(order, band_edges, btype='band')
    return lfilter(numerator, denominator, x)

def audio_preprocessing(x, fs):
    """Run the three-stage clean-up chain on a signal.

    Stages, in order: band-pass filtering (``bandpass_filter`` with its
    default cut-offs), noise reduction (``noisereduce.reduce_noise``), and
    peak normalisation (``amplitude_normalization``).

    Args:
        x: Input samples.
        fs: Sample rate of ``x``.

    Returns:
        The cleaned signal.
    """
    band_limited = bandpass_filter(x, fs)
    denoised = nr.reduce_noise(y=band_limited, sr=fs)
    return amplitude_normalization(denoised)

def safe_mono_loader(filepath, sample_rate):
    """Load an audio file with Essentia's ``MonoLoader``, tolerating failures.

    Args:
        filepath: Path object of the file; its ``.name`` is used in the
            warning message.
        sample_rate: Value passed as ``sampleRate`` to the loader.

    Returns:
        The loaded samples, or ``None`` (after printing a warning) if
        loading raised.
    """
    try:
        loader = ess.MonoLoader(filename=str(filepath), sampleRate=sample_rate)
        samples = loader()
    except Exception as err:
        print(f"Warning: Could not load {filepath.name} - {err}")
        return None
    return samples

# Segmentation and preprocessing
def extract_non_overlapping_segments(x, params):
    """Cut fixed-length, non-overlapping segments out of the active parts of a signal.

    Frame energies are normalised by the loudest frame; runs of consecutive
    frames above ``NRG_threshold_ratio`` form active regions. Every region is
    tiled with back-to-back windows of ``segment_samples`` samples. A window
    is kept when it fits inside the region and the signal, does not overlap a
    window already kept, and has a peak absolute amplitude above 0.05. Kept
    windows are passed through ``audio_preprocessing``.

    Args:
        x: Mono samples, or ``None``.
        params: Mapping with keys ``segment_samples``, ``windowSize``,
            ``hopSize`` and ``NRG_threshold_ratio``. An optional ``fs`` key
            overrides the module-level sample rate used for preprocessing.

    Returns:
        list: Preprocessed segments, in order of position. Empty when ``x``
        is ``None``, shorter than one segment, or has no usable region.
    """
    segment_len = params['segment_samples']
    if x is None or len(x) < segment_len:
        return []

    frame_size = params['windowSize']
    hop = params['hopSize']
    sample_rate = params.get('fs', fs)

    # One Energy instance is reused for every frame.
    energy = ess.Energy()
    frame_energy = np.array([
        energy(frame)
        for frame in ess.FrameGenerator(x, frameSize=frame_size,
                                        hopSize=hop, startFromZero=True)
    ])
    if frame_energy.size == 0:
        return []

    # Normalise so the threshold is relative to the loudest frame.
    frame_energy = frame_energy / (np.max(frame_energy) + 1e-10)

    active_frames = np.where(frame_energy > params['NRG_threshold_ratio'])[0]
    if len(active_frames) == 0:
        return []

    # Split the active frame indices wherever two neighbours are not
    # consecutive; each piece is one contiguous active region.
    run_breaks = np.where(np.diff(active_frames) > 1)[0] + 1
    regions = np.split(active_frames, run_breaks)

    segments = []
    taken = []  # half-open (start, end) sample ranges already exported
    total_samples = len(x)

    for region in regions:
        # Convert frame indices to sample indices.
        region_start = int(region[0]) * hop
        region_end = int(region[-1]) * hop + frame_size

        # "+ 1" keeps a window that ends exactly at the region boundary.
        for start in range(region_start, region_end - segment_len + 1, segment_len):
            end = start + segment_len
            if end > total_samples:
                # Later windows in this region would run past the signal too.
                break

            # Interval intersection test instead of tracking every sample.
            if any(start < used_end and used_start < end
                   for used_start, used_end in taken):
                continue

            segment = x[start:end]
            if np.max(np.abs(segment)) > 0.05:  # Minimum amplitude threshold
                segments.append(audio_preprocessing(segment, sample_rate))
                taken.append((start, end))

    return segments

def find_best_segments(all_segments, required_count):
    """Pick segments spread evenly across the energy range.

    Segments are ranked by mean squared amplitude and ``required_count`` of
    them are taken at evenly spaced ranks, from the quietest to the loudest.

    Args:
        all_segments: Sequence of array-like segments.
        required_count: Number of segments wanted.

    Returns:
        list: A new list. It holds every segment when there are no more than
        ``required_count`` of them, otherwise the sampled subset in ascending
        energy order. Empty when ``required_count`` is not positive and there
        are segments to choose from. The input is never returned itself, so
        the caller may extend the result without altering ``all_segments``.
    """
    if len(all_segments) <= required_count:
        return list(all_segments)
    if required_count <= 0:
        return []

    # Calculate energy for each segment
    energies = [np.mean(np.abs(seg) ** 2) for seg in all_segments]

    # Stable sort keeps equal-energy segments in input order.
    ranked = np.argsort(energies, kind='stable')
    picks = np.linspace(0, len(ranked) - 1, required_count, dtype=int)
    return [all_segments[i] for i in ranked[picks]]

# Data augmentation
# Augmentation chain used to top up species that have too few real segments.
# Each step is applied independently with probability 0.5.
_augmentation_steps = [
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    Shift(min_shift=-0.5, max_shift=0.5, p=0.5),
]
augmentation = Compose(_augmentation_steps)

def process_bird_species(bird_folders, limit=113):
    """Segment, top up and export audio for each species folder.

    For every ``(bird_name, folder_name)`` pair (at most ``limit`` of them):
    load all ``.mp3`` / ``.wav`` / ``.flac`` files in the folder, extract
    active segments, keep up to ``required_segments`` of them, fill any
    shortfall with augmented copies of the kept segments, and write the
    result to ``OUTPUT_DIR`` as ``<name>_<index>.wav``. Temperature checks run
    before each species and before each batch of ``PROCESSING_BATCH_SIZE``
    files. One metadata row per written file is saved to
    ``METADATA_CSV_PATH``.

    The recorded ``call_type`` is the ``TYPE`` of the first metadata row for
    the species in the module-level ``df`` (``'unknown'`` if there is none),
    so it is shared by all of that species' segments.

    Args:
        bird_folders: Sequence of ``(bird_name, folder_name)`` tuples.
        limit: Maximum number of entries of ``bird_folders`` to process.

    Returns:
        dict: ``bird_name -> {'total_files', 'segments_found',
        'final_segments', 'augmented'}``, where ``segments_found`` counts
        real extracted segments and ``augmented`` counts augmented segments
        actually produced.
    """
    bird_stats = {}
    metadata_records = []  # one dict per exported segment
    temp_check = monitor_temperature()

    # Segmentation parameters are the same for every species.
    params = {
        'windowSize': windowSize,
        'hopSize': hopSize,
        'NRG_threshold_ratio': NRG_threshold_ratio,
        'segment_samples': segment_samples
    }

    # Species names normalised the same way folder names are built
    # (stripped, spaces -> underscores, lower-case) for the call-type lookup.
    normalized_names = (
        df['common_name'].str.strip().str.replace(' ', '_').str.lower()
    )

    for bird_name, folder_name in tqdm(bird_folders[:limit], desc="Processing birds"):
        # Check temperature before processing each bird
        temp_check()

        full_folder_path = AUDIO_FILES_DIR / folder_name
        audio_files = [
            path
            for pattern in ("*.mp3", "*.wav", "*.flac")
            for path in full_folder_path.glob(pattern)
        ]

        if not audio_files:
            print(f"\nWarning: No audio files found for {bird_name}")
            continue

        print(f"\nProcessing {bird_name} ({len(audio_files)} audio files)...")

        # Get the call type for this bird from the original metadata
        lookup_key = bird_name.strip().replace(' ', '_').lower()
        bird_metadata = df[normalized_names == lookup_key]
        if not bird_metadata.empty:
            call_type = bird_metadata.iloc[0]['TYPE']
        else:
            call_type = 'unknown'

        # Process all audio files for this bird in batches with temperature checks
        all_segments = []
        for i in range(0, len(audio_files), PROCESSING_BATCH_SIZE):
            batch_files = audio_files[i:i+PROCESSING_BATCH_SIZE]

            # Check temperature before each batch
            temp_check()

            for file_path in tqdm(batch_files, desc=f"Processing {bird_name[:15]}...", leave=False):
                audio = safe_mono_loader(file_path, fs)
                if audio is not None:
                    segments = extract_non_overlapping_segments(audio, params)
                    all_segments.extend(segments)

        if not all_segments:
            print(f"Warning: No segments extracted for {bird_name}")
            continue

        # Capture the real-segment count now, and work on a private copy of
        # the selection so augmentation can never leak into all_segments.
        segments_found = len(all_segments)
        final_segments = list(find_best_segments(all_segments, required_segments))

        # If we don't have enough, augment with time stretch, shift, and noise
        augmented_count = 0
        needed = required_segments - len(final_segments)
        if needed > 0:
            # Sample only from real segments, never from augmented ones.
            source_segments = list(final_segments)
            failed_attempts = 0
            for _ in range(needed):
                seg = random.choice(source_segments)
                try:
                    augmented_segment = augmentation(seg, fs)
                except Exception:
                    # Best effort: skip this attempt but report it below.
                    failed_attempts += 1
                    continue
                final_segments.append(augmented_segment[:segment_samples])
                augmented_count += 1
            if failed_attempts:
                print(f"Warning: {failed_attempts} augmentation attempt(s) "
                      f"failed for {bird_name}; exporting "
                      f"{len(final_segments)} segments")

        # Save segments and record metadata
        safe_name = bird_name.replace("/", "-").replace(" ", "_")
        for i, segment in enumerate(final_segments[:required_segments]):
            filename = f"{safe_name}_{i:03d}.wav"
            filepath = OUTPUT_DIR / filename
            # Essentia's Python bindings work on single-precision arrays,
            # while the SciPy filtering upstream may yield float64; cast
            # defensively (a no-op for data that is already float32).
            samples = np.asarray(segment, dtype=np.float32)
            ess.MonoWriter(filename=str(filepath), format='wav', sampleRate=fs)(samples)

            # Add to metadata records
            metadata_records.append({
                'filename': filename,
                'species': bird_name,
                'call_type': call_type
            })

        # Record stats
        bird_stats[bird_name] = {
            'total_files': len(audio_files),
            'segments_found': segments_found,
            'final_segments': len(final_segments),
            'augmented': augmented_count
        }

    # Save metadata to CSV
    metadata_df = pd.DataFrame(metadata_records)
    metadata_df.to_csv(METADATA_CSV_PATH, index=False)
    print(f"\nMetadata saved to: {METADATA_CSV_PATH}")

    return bird_stats

def find_all_bird_folders():
    """List every ``*_sound`` directory under ``AUDIO_FILES_DIR``.

    The bird name is derived from the full directory name by dropping the
    trailing ``_sound``, turning underscores into spaces and hyphens into
    slashes. The full name is used rather than the path stem so that a
    directory name containing a dot is not truncated.

    Returns:
        list[tuple[str, str]]: ``(bird_name, folder_name)`` pairs, sorted by
        path so the order is the same on every run.
    """
    suffix = "_sound"
    bird_folders = []
    for folder in sorted(AUDIO_FILES_DIR.glob("*_sound")):
        if not folder.is_dir():
            continue
        # The glob guarantees the name ends with the suffix; remove only
        # that trailing occurrence.
        base_name = folder.name[:-len(suffix)]
        bird_name = base_name.replace("_", " ").replace("-", "/")
        bird_folders.append((bird_name, folder.name))
    return bird_folders

# Main
def main():
    """Run the pipeline end to end and print a per-species summary.

    Steps: announce the temperature thresholds (warning if no monitoring
    package is available), load and filter the metadata, resolve the species
    folders, process them, and print the resulting statistics table.

    Raises:
        RuntimeError: If none of the expected species folders exist.
    """
    print("Starting diverse bird audio processing...")
    print(f"Temperature thresholds - CPU: {MAX_CPU_TEMP}°C, GPU: {MAX_GPU_TEMP}°C")

    # Check if temperature monitoring is available
    monitoring_available = CPU_MONITORING or GPU_MONITORING
    if not monitoring_available:
        for notice in (
            "⚠️ Warning: Temperature monitoring not available. Install packages:",
            "For CPU: pip install pyspectator",
            "For GPU: pip install gputil",
            "Continuing without temperature monitoring...",
        ):
            print(notice)

    # 1. Load and filter metadata
    print("\nLoading and filtering metadata...")
    filtered_metadata = load_and_filter_metadata()

    # 2. Get folders for selected species
    bird_folders = get_bird_folders_for_species(filtered_metadata)
    if not bird_folders:
        raise RuntimeError("No valid bird folders found for the selected species!")

    species_total = len(bird_folders)
    print(f"\nFound {species_total} bird species to process:")
    # Only the first ten names are listed to keep the output short.
    for position, (name, _) in enumerate(bird_folders[:10], start=1):
        print(f"{position}. {name}")
    if species_total > 10:
        print(f"... and {species_total-10} more")

    # 3. Process the species
    stats = process_bird_species(bird_folders)

    # 4. Generate summary
    row_format = "{:<30} {:<15} {:<15} {:<15} {:<15}"
    print("\n" + "="*80)
    print("Processing Summary:")
    print(row_format.format(
        "Bird Species", "Audio Files", "Segments Found", "Final Segments", "Augmented"))

    for bird, data in stats.items():
        # Long names are cut to 28 characters and marked with an ellipsis.
        label = bird[:28] + ("..." if len(bird) > 28 else "")
        print(row_format.format(
            label,
            data['total_files'],
            data['segments_found'],
            data['final_segments'],
            data['augmented'],
        ))

    print(f"\n✅ Successfully processed {len(stats)} diverse bird species")
    print(f"📁 Output saved to: {OUTPUT_DIR}")
    print(f"📄 Metadata saved to: {METADATA_CSV_PATH}")


if __name__ == "__main__":
    main()