### Performing Feature Extraction 

In [None]:
import os
import essentia.standard as ess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
import warnings

# Silence every warning raised from here on.
warnings.filterwarnings("ignore")

# Configuration
# Project root: the parent folder when the working directory is named
# 'notebooks', otherwise the working directory itself.
if Path.cwd().name == 'notebooks':
    BASE_DIR = Path.cwd().parent
else:
    BASE_DIR = Path.cwd()

DATA_DIR = BASE_DIR.joinpath('data')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed', 'birdcall_segments_5s_113')
METADATA_PATH = DATA_DIR.joinpath('processed', 'birdcall_metadata_113.csv')
FEATURES_DIR = DATA_DIR.joinpath('features')
# Create the output folder up front so later writes cannot fail on it.
FEATURES_DIR.mkdir(parents=True, exist_ok=True)

def load_metadata():
    """Read the metadata CSV and list the audio files that exist on disk.

    Returns:
        A list of dicts, one per audio file found under PROCESSED_DATA_DIR,
        each with the keys 'full_path', 'species' and 'call_type'.

    Raises:
        ValueError: The CSV lacks 'filename', 'species' or 'call_type'.
        FileNotFoundError: None of the listed files exist.
    """
    table = pd.read_csv(METADATA_PATH)

    # Validate required columns
    absent = {'filename', 'species', 'call_type'}.difference(table.columns)
    if absent:
        raise ValueError(f"Metadata missing required columns: {absent}")

    # Resolve each filename against the segments folder ...
    table['full_path'] = table['filename'].apply(
        lambda name: str(PROCESSED_DATA_DIR / name)
    )
    # ... and flag the ones that are really there.
    table['file_exists'] = table['full_path'].apply(
        lambda location: Path(location).exists()
    )

    present = table[table['file_exists']]
    if present.empty:
        raise FileNotFoundError("No audio files found matching metadata records")

    print(f"Loaded metadata for {len(table)} records")
    print(f"Found {len(present)} matching audio files")

    wanted = ['full_path', 'species', 'call_type']
    return present[wanted].to_dict(orient='records')

def initialize_extractor():
    """Build the Essentia FreesoundExtractor used for every audio segment.

    Low-level, tonal, MFCC and GFCC descriptors are each summarised by
    their mean and standard deviation. Low-level analysis uses a
    2048-sample frame with a 1024-sample hop, and the silent-frame policy
    is set to "drop".
    """
    summary = ("mean", "stdev")
    settings = {
        "lowlevelStats": list(summary),
        "tonalStats": list(summary),
        "mfccStats": list(summary),
        "gfccStats": list(summary),
        "lowlevelFrameSize": 2048,
        "lowlevelHopSize": 1024,
        "lowlevelSilentFrames": "drop",
    }
    return ess.FreesoundExtractor(**settings)

def _next_free_path(path):
    """Return *path* if unused, else the first free '<stem>_v<N><suffix>' sibling."""
    if not path.exists():
        return path
    version = 1
    while True:
        # Always derive the candidate from the ORIGINAL stem so suffixes do
        # not pile up (x_v1_v2.csv); the sequence is x_v1, x_v2, x_v3, ...
        candidate = path.with_name(f"{path.stem}_v{version}{path.suffix}")
        if not candidate.exists():
            return candidate
        version += 1


def extract_features(audio_files, output_csv):
    """Extract scalar audio descriptors for every file and save them as CSV.

    Args:
        audio_files: Sequence of dicts with the keys 'full_path', 'species'
            and 'call_type' (the shape returned by load_metadata).
        output_csv: Destination path. If it already exists the data is
            written to '<stem>_v1<suffix>', '<stem>_v2<suffix>', ... instead
            and the chosen path is printed.

    Returns:
        A DataFrame with one row per successfully processed file: the
        selected descriptor columns plus 'species', 'call_type' and
        'filename'.

    Raises:
        RuntimeError: None of the first five files could be processed. The
            last extraction error is attached as the cause.
    """
    extractor = initialize_extractor()
    features_data = []
    failed_files = []

    # Probe the first few files to learn which descriptors are available.
    sample_features = None
    probe_error = None
    for file in audio_files[:5]:
        try:
            sample_features, _ = extractor(file['full_path'])
            break
        except Exception as exc:
            # Remember why the probe failed so the final error is diagnosable.
            probe_error = exc

    if sample_features is None:
        raise RuntimeError(
            "Could not extract features from any sample file"
        ) from probe_error

    # Keep only scalar descriptors (arrays are skipped) from the families
    # of interest; sorting makes the column order stable.
    feature_names = sorted([
        desc for desc in sample_features.descriptorNames()
        if isinstance(sample_features[desc], (float, int)) and
        any(x in desc for x in ["lowlevel", "mfcc", "gfcc", "tonal"])
    ])

    # Batch processing: a failing file is recorded and skipped, not fatal.
    for file in tqdm(audio_files, desc="Extracting features"):
        try:
            features, _ = extractor(file['full_path'])
            row = {name: features[name] for name in feature_names}
            row.update({
                'species': file['species'],
                'call_type': file['call_type'],
                'filename': Path(file['full_path']).name
            })
            features_data.append(row)
        except Exception as e:
            failed_files.append((file['full_path'], str(e)))

    # Error reporting (only the first three failures are listed)
    if failed_files:
        print(f"\nFailed to process {len(failed_files)} files ({(len(failed_files)/len(audio_files)*100):.1f}%)")
        for path, error in failed_files[:3]:
            print(f"- {Path(path).name}: {error}")

    features_df = pd.DataFrame(features_data)

    # Versioned output: never overwrite an existing CSV.
    requested_path = Path(output_csv)
    saved_path = _next_free_path(requested_path)
    if saved_path != requested_path:
        print(f"Output file already exists; saving to {saved_path} instead")

    features_df.to_csv(saved_path, index=False)
    return features_df


def main():
    """Run the extraction pipeline: load metadata, extract features, report.

    Any exception raised along the way is caught and reported as a single
    failure line instead of propagating.
    """
    print("🚀 Starting Bird Call Feature Extraction Pipeline")

    try:
        # Step 1: metadata plus on-disk audio check
        print("\n🔍 Loading metadata and audio files...")
        records = load_metadata()

        # Step 2: descriptor extraction, written under FEATURES_DIR
        print("\n⚙️ Extracting audio features...")
        target_csv = FEATURES_DIR / 'birdcall_features_113.csv'
        table = extract_features(records, target_csv)
        # species, call_type and filename are labels, not features.
        n_features = len(table.columns) - 3
        print(f"✅ Extracted {n_features} features from {len(table)} files")

        print(f"\n🎉 Pipeline completed! Features saved to:\n{target_csv}")

    except Exception as err:
        print(f"\n❌ Pipeline failed: {err!s}")


if __name__ == "__main__":
    main()

### Top 50 Bird Selection (users can choose the subset of species they want to train with)

In [None]:
import os
import shutil
import pandas as pd
from pathlib import Path

# Configuration
# Project root: the parent folder when the working directory is named
# 'notebooks', otherwise the working directory itself.
if Path.cwd().name == 'notebooks':
    BASE_DIR = Path.cwd().parent
else:
    BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath('data')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed')

# Source: the full 113-species segments folder and its metadata file
ORIGINAL_SEGMENTS_DIR = PROCESSED_DATA_DIR.joinpath('birdcall_segments_5s_113')
ORIGINAL_METADATA = PROCESSED_DATA_DIR.joinpath('birdcall_metadata_113.csv')

# Destination: the TOP50 segments folder and its metadata file
TOP50_SEGMENTS_DIR = PROCESSED_DATA_DIR.joinpath('birdcall_segments_5s_TOP50')
TOP50_METADATA = PROCESSED_DATA_DIR.joinpath('birdcall_metadata_TOP50.csv')

# The 50 species kept for the TOP50 subset, keyed by display name.
# Each value is a two-element list [csv_name, filename_prefix].
# create_top50_dataset reads only the first element (csv_name) to match the
# 'species' column of the metadata; filename_prefix is not read there.
TOP50_SPECIES = {
    # Format: "Display Name": ["csv_name", "filename_prefix"]
    "Curve-billed Tinamou": ["curve-billed_tinamou", "curve-billed_tinamou"],
    "Tongan Megapode": ["tongan_megapode", "tongan_megapode"],
    "Micronesian Megapode": ["micronesian_megapode", "micronesian_megapode"],
    "Elegant Crested Tinamou": ["elegant_crested_tinamou", "elegant_crested_tinamou"],
    "Tepui Tinamou": ["tepui_tinamou", "tepui_tinamou"],
    "Cinereous Tinamou": ["cinereous_tinamou", "cinereous_tinamou"],
    "Lesser Nothura": ["lesser_nothura", "lesser_nothura"],
    "Puna Tinamou": ["puna_tinamou", "puna_tinamou"],
    "Lesser Rhea": ["lesser_rhea", "lesser_rhea"],
    "Barred Tinamou": ["barred_tinamou", "barred_tinamou"],
    "Dwarf Tinamou": ["dwarf_tinamou", "dwarf_tinamou"],
    "Little Spotted Kiwi": ["little_spotted_kiwi", "little_spotted_kiwi"],
    "Sula Megapode": ["sula_megapode", "sula_megapode"],
    "Vanuatu Megapode": ["vanuatu_megapode", "vanuatu_megapode"],
    "Baudo Guan": ["baudo_guan", "baudo_guan"],
    "Undulated Tinamou": ["undulated_tinamou", "undulated_tinamou"],
    "Biak Scrubfowl": ["biak_scrubfowl", "biak_scrubfowl"],
    # NOTE(review): this is the only entry whose filename_prefix differs from
    # its csv_name ("bartletts'_tinamou" vs "bartlett's_tinamou"); the other
    # apostrophe names (Berlepsch's, Spix's) use identical strings. Confirm
    # whether this mirrors the real file names or is a typo.
    "Bartlett's Tinamou": ["bartlett's_tinamou", "bartletts'_tinamou"],
    "Huayco Tinamou": ["huayco_tinamou", "huayco_tinamou"],
    "Wattled Brushturkey": ["wattled_brushturkey", "wattled_brushturkey"],
    "Chestnut-headed Chachalaca": ["chestnut-headed_chachalaca", "chestnut-headed_chachalaca"],
    "Common Ostrich": ["common_ostrich", "common_ostrich"],
    "New Guinea Scrubfowl": ["new_guinea_scrubfowl", "new_guinea_scrubfowl"],
    "Ornate Tinamou": ["ornate_tinamou", "ornate_tinamou"],
    "Trinidad Piping Guan": ["trinidad_piping_guan", "trinidad_piping_guan"],
    "Red-billed Brushturkey": ["red-billed_brushturkey", "red-billed_brushturkey"],
    "Andean Guan": ["andean_guan", "andean_guan"],
    "Quebracho Crested Tinamou": ["quebracho_crested_tinamou", "quebracho_crested_tinamou"],
    "Berlepsch's Tinamou": ["berlepsch's_tinamou", "berlepsch's_tinamou"],
    "White-winged Guan": ["white-winged_guan", "white-winged_guan"],
    "Hooded Tinamou": ["hooded_tinamou", "hooded_tinamou"],
    "Southern Brown Kiwi": ["southern_brown_kiwi", "southern_brown_kiwi"],
    "Great Spotted Kiwi": ["great_spotted_kiwi", "great_spotted_kiwi"],
    "Chilean Tinamou": ["chilean_tinamou", "chilean_tinamou"],
    "Band-tailed Guan": ["band-tailed_guan", "band-tailed_guan"],
    "Highland Tinamou": ["highland_tinamou", "highland_tinamou"],
    "Malleefowl": ["malleefowl", "malleefowl"],
    "Bearded Guan": ["bearded_guan", "bearded_guan"],
    "Brown Tinamou": ["brown_tinamou", "brown_tinamou"],
    "White-bellied Nothura": ["white-bellied_nothura", "white-bellied_nothura"],
    "Spix's Guan": ["spix's_guan", "spix's_guan"],
    "Choco Tinamou": ["choco_tinamou", "choco_tinamou"],
    "Grey-headed Chachalaca": ["grey-headed_chachalaca", "grey-headed_chachalaca"],
    "Black-fronted Piping Guan": ["black-fronted_piping_guan", "black-fronted_piping_guan"],
    "Cauca Guan": ["cauca_guan", "cauca_guan"],
    "Thicket Tinamou": ["thicket_tinamou", "thicket_tinamou"],
    "Great Tinamou": ["great_tinamou", "great_tinamou"],
    "Dusky-legged Guan": ["dusky-legged_guan", "dusky-legged_guan"],
    "Chaco Chachalaca": ["chaco_chachalaca", "chaco_chachalaca"],
    "Black-capped Tinamou": ["black-capped_tinamou", "black-capped_tinamou"]
}

def create_top50_dataset():
    """Build the TOP50 subset: a filtered metadata CSV plus copied audio.

    Reads ORIGINAL_METADATA, keeps the rows whose 'species' equals a
    csv_name in TOP50_SPECIES, rewrites 'species' to the display name and
    saves the result to TOP50_METADATA. Each kept row's audio file is then
    copied from ORIGINAL_SEGMENTS_DIR to TOP50_SEGMENTS_DIR.

    Nothing is raised for incomplete data: source files that do not exist
    and requested species that never occur in the metadata are reported as
    printed warnings.
    """
    # Create output directory if it doesn't exist
    TOP50_SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)

    original_metadata_df = pd.read_csv(ORIGINAL_METADATA)

    # csv_name -> display name
    species_mapping = {names[0]: display for display, names in TOP50_SPECIES.items()}

    # Filter metadata for top 50 species
    top50_metadata_df = original_metadata_df[
        original_metadata_df['species'].isin(list(species_mapping))
    ].copy()

    # Surface requested species that matched nothing (e.g. a spelling or
    # apostrophe mismatch); otherwise they are dropped silently.
    matched = set(top50_metadata_df['species'])
    unmatched = [
        display for csv_name, display in species_mapping.items()
        if csv_name not in matched
    ]
    if unmatched:
        print(
            f"Warning: {len(unmatched)} requested species not found in "
            f"metadata: {', '.join(unmatched)}"
        )

    # Map the species names to their display names
    top50_metadata_df['species'] = top50_metadata_df['species'].map(species_mapping)

    # Save filtered metadata
    top50_metadata_df.to_csv(TOP50_METADATA, index=False)
    print(f"Saved metadata for {len(top50_metadata_df)} segments to {TOP50_METADATA}")

    # Copy corresponding audio files; only the filename column is needed.
    copied_files = 0
    for filename in top50_metadata_df['filename']:
        src_path = ORIGINAL_SEGMENTS_DIR / filename
        dst_path = TOP50_SEGMENTS_DIR / filename

        if src_path.exists():
            shutil.copy2(src_path, dst_path)
            copied_files += 1
        else:
            print(f"Warning: File not found - {src_path}")

    print(f"Copied {copied_files} audio files to {TOP50_SEGMENTS_DIR}")

    # Summary of what ended up in the subset
    unique_species = top50_metadata_df['species'].nunique()
    print(f"\nDataset contains {unique_species} species and {len(top50_metadata_df)} segments")
    print("Top 50 species distribution:")
    print(top50_metadata_df['species'].value_counts().head(10))
    print("...")  # Only the ten most frequent species are shown


if __name__ == "__main__":
    print("Creating dataset for top 50 performing bird species...")
    create_top50_dataset()
    print("\nDone!")

### FEATURE ENGINEERING VISUALIZATION

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (SelectKBest, mutual_info_classif, 
                                     RFE, SelectFromModel)
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import pairwise_distances
from scipy.cluster.hierarchy import linkage, dendrogram
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path


# Configuration: folders
# Project root: the parent folder when the working directory is named
# 'notebooks', otherwise the working directory itself.
if Path.cwd().name == 'notebooks':
    BASE_DIR = Path.cwd().parent
else:
    BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath('data')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed', 'birdcall_segments_5s_TOP50')
METADATA_PATH = DATA_DIR.joinpath('processed', 'birdcall_metadata_TOP50.csv')
FEATURES_DIR = DATA_DIR.joinpath('features')
FEATURES_DIR.mkdir(parents=True, exist_ok=True)

# Configuration: inputs, outputs and shared model settings
# NOTE(review): the extraction cell visible in this file writes
# 'birdcall_features_113.csv'; confirm what produces the TOP50 feature file.
FEATURES_PATH = FEATURES_DIR.joinpath('birdcall_features_TOP50.csv')
OUTPUT_DIR = FEATURES_DIR.joinpath('selected_feature_TOP50')
RANDOM_STATE = 42
N_JOBS = -1  # Use all available cores

from sklearn.preprocessing import LabelEncoder

def load_and_preprocess_features(features_path):
    """Load the feature CSV and return scaled features with encoded labels.

    Args:
        features_path: CSV holding feature columns plus the 'species',
            'call_type' and 'filename' columns.

    Returns:
        A 4-tuple (X_scaled, y_encoded, df, le):
        the standardised feature DataFrame, the integer-encoded species
        labels, the raw DataFrame as read, and the fitted LabelEncoder.
    """
    raw = pd.read_csv(features_path)

    # Everything except the three label columns is a feature.
    label_columns = ['species', 'call_type', 'filename']
    feature_table = raw.drop(columns=label_columns)

    # Species names -> integer class ids
    encoder = LabelEncoder()
    species_codes = encoder.fit_transform(raw['species'])

    # Standardise each feature column
    standardized = StandardScaler().fit_transform(feature_table)
    X_scaled = pd.DataFrame(standardized, columns=feature_table.columns)

    return X_scaled, species_codes, raw, encoder

def comprehensive_feature_selection(X, y, n_top=20):
    """Rank features with four methods and return the consensus top set.

    Scores every column of X with mutual information, recursive feature
    elimination (linear SVC), random-forest importance and XGBoost
    importance, then sums the four per-method ranks into a consensus score.

    Args:
        X: Feature DataFrame (one column per feature).
        y: Class labels, one per row of X.
        n_top: Number of features to return and to keep in RFE. Values
            larger than the number of columns are clamped. Defaults to 20.

    Returns:
        A tuple (top_features, feature_scores): the names of the n_top
        features with the highest consensus score, and a DataFrame with the
        columns 'feature', 'MI_score', 'RFE_rank', 'RF_importance',
        'XGB_importance' and 'consensus_score'.
    """
    print("\n🔍 Running comprehensive feature selection...")

    # Never ask for more features than exist.
    n_top = min(n_top, X.shape[1])

    # 1. Mutual Information (Filter method)
    # Seeded explicitly: the estimator is randomised, so without
    # random_state the ranking changes between runs.
    print("  - Mutual Information selection...")
    mi_scores = mutual_info_classif(X, y, random_state=RANDOM_STATE)

    # 2. Recursive Feature Elimination (Wrapper method)
    print("  - Recursive Feature Elimination...")
    rfe_selector = RFE(
        estimator=SVC(kernel="linear", random_state=RANDOM_STATE),
        n_features_to_select=n_top,
        step=5
    )
    rfe_selector.fit(X, y)

    # 3. Random Forest Importance (Embedded method)
    print("  - Random Forest Importance...")
    rf = RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE, n_jobs=N_JOBS)
    rf.fit(X, y)

    # 4. XGBoost Importance (Embedded method)
    print("  - XGBoost Importance...")
    xgb = XGBClassifier(n_estimators=500, random_state=RANDOM_STATE, n_jobs=N_JOBS)
    xgb.fit(X, y)

    # Combine results
    feature_scores = pd.DataFrame({
        'feature': X.columns,
        'MI_score': mi_scores,
        'RFE_rank': rfe_selector.ranking_,
        'RF_importance': rf.feature_importances_,
        'XGB_importance': xgb.feature_importances_
    })

    # Consensus = sum of per-method ranks. RFE rank 1 is best, so it is
    # inverted first to make "higher is better" hold for every term.
    feature_scores['consensus_score'] = (
        feature_scores['MI_score'].rank() +
        (X.shape[1] - feature_scores['RFE_rank']).rank() +
        feature_scores['RF_importance'].rank() +
        feature_scores['XGB_importance'].rank()
    )

    top_features = feature_scores.nlargest(n_top, 'consensus_score')['feature'].tolist()

    return top_features, feature_scores

def visualize_feature_space(X, y, features, method='PCA'):
    """Project the selected features to 2-D and show an interactive scatter.

    Args:
        X: Feature DataFrame.
        y: Labels used for point colour and hover text, one per row of X.
        features: Column names of X to project.
        method: 'PCA' for principal components; any other value selects
            t-SNE.

    Returns:
        The 2-D projection, one row per sample.
    """
    print(f"\n📊 Visualizing feature space with {method}...")

    # Select top features
    X_top = X[features]

    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=RANDOM_STATE)
        components = reducer.fit_transform(X_top)
        x_label, y_label = 'PC1', 'PC2'
    else:  # t-SNE
        # Perplexity must stay below the sample count, so cap it the same
        # way plot_grouped_dimensionality does instead of hard-coding 30.
        perplexity = min(30, len(X_top) - 1)
        reducer = TSNE(n_components=2, random_state=RANDOM_STATE, perplexity=perplexity)
        components = reducer.fit_transform(X_top)
        x_label, y_label = 't-SNE1', 't-SNE2'

    # Create interactive plot
    fig = px.scatter(
        x=components[:, 0], y=components[:, 1],
        color=y, hover_name=y,
        labels={'color': 'Species'},
        title=f"{method} Projection of Top Features"
    )

    fig.update_layout(
        xaxis_title=x_label,
        yaxis_title=y_label,
        legend_title_text='Species'
    )

    fig.show()

    return components

def plot_feature_importance(feature_scores, top_features):
    """Show a 2x2 grid of bar charts for the selected features.

    Args:
        feature_scores: DataFrame with the columns 'feature', 'MI_score',
            'RF_importance', 'XGB_importance' and 'consensus_score'.
        top_features: Names of the features to include.
    """
    print("\n📈 Plotting feature importance...")

    # Keep only the selected features, best consensus first.
    chosen = feature_scores['feature'].isin(top_features)
    ranked = feature_scores[chosen].sort_values('consensus_score', ascending=False)

    fig = make_subplots(rows=2, cols=2, subplot_titles=(
        "Mutual Information Scores",
        "Random Forest Importance",
        "XGBoost Importance",
        "Consensus Ranking"
    ))

    # (score column, trace name, grid row, grid column) - one bar chart each,
    # in the same order as the subplot titles above.
    panels = (
        ('MI_score', "MI Score", 1, 1),
        ('RF_importance', "RF Importance", 1, 2),
        ('XGB_importance', "XGB Importance", 2, 1),
        ('consensus_score', "Consensus", 2, 2),
    )
    for score_column, trace_name, grid_row, grid_col in panels:
        bars = go.Bar(
            x=ranked['feature'],
            y=ranked[score_column],
            name=trace_name
        )
        fig.add_trace(bars, row=grid_row, col=grid_col)

    fig.update_layout(
        height=800,
        width=1200,
        showlegend=False,
        title_text="Feature Importance Metrics"
    )

    fig.update_xaxes(tickangle=45)
    fig.show()

def plot_feature_clustering(X, y, features):
    """Draw a dendrogram of the selected features clustered by correlation.

    The distance between two features is 1 - |Pearson correlation|.

    Args:
        X: Feature DataFrame.
        y: Unused; kept so the signature matches the other plot helpers.
        features: Column names of X to cluster.
    """
    # Imported here because the file's import block does not provide it.
    from scipy.spatial.distance import squareform

    print("\n🌳 Plotting feature clustering...")

    features = list(features)
    if len(features) < 2:
        print("Need at least two features to build a dendrogram; skipping.")
        return

    # Calculate correlations
    corr = X[features].corr()

    # Distance matrix; undefined correlations (NaN) become distance 0,
    # as before.
    dist_matrix = np.nan_to_num(1.0 - corr.abs().to_numpy())
    # Remove floating-point residue so the diagonal is exactly zero.
    np.fill_diagonal(dist_matrix, 0.0)

    # linkage() reads a 2-D array as raw observations, not as distances,
    # so the square matrix must be passed in condensed form.
    condensed = squareform(dist_matrix, checks=False)

    # 'ward' is only defined for Euclidean distances; 'average' is valid
    # for an arbitrary precomputed distance such as this one.
    linkage_matrix = linkage(condensed, 'average')

    # Plot dendrogram
    plt.figure(figsize=(15, 8))
    dendrogram(
        linkage_matrix,
        labels=features,
        orientation='top',
        leaf_rotation=90
    )
    plt.title('Hierarchical Clustering of Features')
    plt.xlabel('Features')
    plt.ylabel('Distance')
    plt.tight_layout()
    plt.show()

def plot_feature_distributions(X, y, features):
    """Draw per-species violin plots for the first four selected features.

    Args:
        X: Feature DataFrame.
        y: Species label per row of X; assigned as a column, so a Series
            must share X's index.
        features: Ordered feature names; only the first four are plotted.
    """
    print("\n🎻 Plotting feature distributions...")

    # Select top 4 features
    top4 = features[:4]
    df = X[top4].copy()
    df['species'] = y

    # Melt for seaborn
    df_melt = df.melt(id_vars='species', var_name='feature')

    # Split (half-and-half) violins only make sense for exactly two hue
    # levels, and seaborn releases that enforce this raise otherwise.
    split_violins = bool(df['species'].nunique() == 2)

    # Create plot
    plt.figure(figsize=(15, 8))
    sns.violinplot(
        data=df_melt,
        x='feature',
        y='value',
        hue='species',
        split=split_violins,
        inner='quartile',
        palette='Set3'
    )
    plt.title('Distribution of Top Features Across Species')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

def save_selected_features(df, features, output_dir):
    """Write the selected feature columns and their ranking to *output_dir*.

    Two CSV files are produced (the folder is created if needed):
    'selected_features.csv' with 'filename', 'species', 'call_type' and the
    selected columns, and 'feature_importance.csv' listing each feature
    with its 1-based position in *features*.

    Args:
        df: DataFrame holding the label columns and every selected feature.
        features: List of selected feature names, best first.
        output_dir: Destination folder.
    """
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    # Label columns first, then the features in their given order.
    label_columns = ['filename', 'species', 'call_type']
    subset = df[label_columns + features]
    subset.to_csv(target / "selected_features.csv", index=False)

    # Position in the list doubles as the rank (1 = best).
    ranking = pd.DataFrame({
        'feature': features,
        'rank': [position for position, _ in enumerate(features, start=1)]
    })
    ranking.to_csv(target / "feature_importance.csv", index=False)

    print(f"\n💾 Saved selected features to {output_dir}")
def plot_grouped_dimensionality(X, y, features, method='PCA', n_groups=5, species_per_group=10):
    """Plot a 2-D PCA or t-SNE projection separately for groups of species.

    The sorted unique species are split into near-equal groups and one
    scatter plot is drawn per group.

    Args:
        X: Feature DataFrame.
        y: Species label per row of X (a pandas Series sharing X's index).
        features: Column names of X to project.
        method: 'PCA' for principal components; any other value selects
            t-SNE.
        n_groups: Number of groups to split the species into. Pass None to
            derive it from species_per_group.
        species_per_group: Target group size; only used when n_groups is
            None.
    """
    unique_species = sorted(y.unique())

    if n_groups is None:
        n_groups = max(1, int(np.ceil(len(unique_species) / species_per_group)))

    species_groups = np.array_split(unique_species, n_groups)

    for i, group in enumerate(species_groups):
        # array_split pads with empty groups when n_groups exceeds the
        # number of species; there is nothing to plot for those.
        if len(group) == 0:
            continue

        print(f"\n📊 {method} - Group {i+1}: {', '.join(map(str, group))}")

        # Filter data for current group
        mask = y.isin(group)
        X_group = X[mask][features]
        y_group = y[mask]

        # Both reducers need at least two samples for a 2-D projection.
        if len(X_group) < 2:
            print(f"Skipping group {i+1}: only {len(X_group)} sample(s)")
            continue

        if method == 'PCA':
            reducer = PCA(n_components=2, random_state=RANDOM_STATE)
        else:  # t-SNE; perplexity must stay below the sample count
            reducer = TSNE(n_components=2, random_state=RANDOM_STATE, perplexity=min(30, len(X_group)-1))

        components = reducer.fit_transform(X_group)

        # Plot
        plt.figure(figsize=(10, 8))
        sns.scatterplot(
            x=components[:, 0], y=components[:, 1],
            hue=y_group,
            palette='tab20',
            s=100,
            alpha=0.8
        )
        plt.title(f"{method} - Group {i+1} ({len(group)} species)")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

def main():
    """Run feature selection, draw the diagnostic plots and save the result.

    Any exception raised along the way is caught and reported as a single
    failure line instead of propagating.
    """
    print("🔬 Starting Scientific Feature Engineering Pipeline")

    try:
        # Step 1: scaled features, encoded labels, raw table, label encoder
        X, y_encoded, df, label_encoder = load_and_preprocess_features(FEATURES_PATH)
        species_names = df['species']

        # Step 2: consensus feature ranking
        top_features, feature_scores = comprehensive_feature_selection(X, y_encoded)
        print("\n✅ Selected top 20 features:")
        print("\n".join(f"- {f}" for f in top_features))

        # Step 3: importance bars, feature dendrogram, violin plots
        plot_feature_importance(feature_scores, top_features)
        plot_feature_clustering(X, species_names, top_features)
        plot_feature_distributions(X, species_names, top_features)

        # Step 4: grouped projections (only the t-SNE variant is drawn here)
        print("\n📊 Generating Grouped Dimensionality Reduction Plots...")
        plot_grouped_dimensionality(X, species_names, top_features, method='t-SNE')

        # Step 5: persist the selected columns and their ranking
        save_selected_features(df, top_features, OUTPUT_DIR)

        print("\n🎉 Feature engineering completed successfully!")

    except Exception as err:
        print(f"\n❌ Pipeline failed: {err!s}")


if __name__ == "__main__":
    main()