In [None]:
# ================================================================
# ADVANCED AUDIO FEATURE EXTRACTION + MULTI-FOLDER MERGE + RESUME
# ================================================================
import random
import os, tempfile, time
import numpy as np
import pandas as pd
import librosa, noisereduce as nr
from pydub import AudioSegment
from pydub.utils import which
from scipy.stats import entropy
from scipy.signal import find_peaks

# --- Verify FFmpeg ---
# Both executables must be discoverable on PATH before any audio is processed.
ffmpeg_path = which("ffmpeg")
ffprobe_path = which("ffprobe")
if ffmpeg_path is not None and ffprobe_path is not None:
    print(f"‚úÖ FFmpeg: {ffmpeg_path}")
else:
    raise EnvironmentError("‚ùå FFmpeg/ffprobe not found. Add them to PATH.")

# --- numpy fix for librosa ---
# Recreate the `np.complex` alias when the installed NumPy does not expose it.
try:
    np.complex
except AttributeError:
    np.complex = np.complex128

# ================================================================
# PATHS (EDIT THIS PART)
# ================================================================
# CSV that receives one row per extracted segment.
OUTPUT_CSV = r"projectFile.csv"

# Root folders scanned recursively for audio; add more entries as needed.
AUDIO_FOLDERS = [
    r"E:\Insect459\Train\Train",
]

# ================================================================
# AUDIO LOADING + CONVERSION
# ================================================================
def load_audio_any_format(file_path, target_sr=22050):
    """Load an audio file as a mono waveform resampled to ``target_sr``.

    Files whose extension is not ``.wav`` are first converted to a temporary
    WAV through pydub. That temporary file is removed afterwards, even when
    conversion or decoding raises.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate passed to ``librosa.load``.

    Returns:
        Tuple ``(y, sr)`` as produced by ``librosa.load``.
    """
    tmp_path = None
    try:
        if os.path.splitext(file_path)[1].lower() != ".wav":
            # Reserve a unique name and close the handle first so the
            # exporter can open the path itself.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp_path = tmp.name
            AudioSegment.from_file(file_path).export(tmp_path, format="wav")
        y, sr = librosa.load(tmp_path or file_path, sr=target_sr, mono=True)
        return y, sr
    finally:
        # Delete only the file created here. The earlier substring test
        # ("tmp" in path) could remove a caller's original recording whose
        # path merely contained "tmp".
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

# ================================================================
# AUDIO CLEANING
# ================================================================
def clean_audio(y, sr, segment_duration=3.0, num_segments=5):
    """Trim, denoise and peak-normalize ``y``, then cut fixed-length segments.

    Args:
        y: Input waveform.
        sr: Sample rate of ``y``.
        segment_duration: Length of each returned segment, in seconds.
        num_segments: Number of random segments drawn from long audio.

    Returns:
        A list of arrays, each ``int(sr * segment_duration)`` samples long.
        Audio no longer than one segment yields a single zero-padded segment;
        otherwise ``num_segments`` randomly positioned slices are returned
        (start positions may repeat).

    Raises:
        ValueError: If fewer than ``0.2 * sr`` samples remain after trimming.
    """
    # Strip leading/trailing silence
    trimmed, _ = librosa.effects.trim(y, top_db=20)
    if len(trimmed) < 0.2 * sr:
        raise ValueError("Too short after trimming")

    # Noise reduction followed by peak normalization
    denoised = nr.reduce_noise(y=trimmed, sr=sr)
    peak = np.max(np.abs(denoised))
    signal = denoised / (peak + 1e-6)

    window = int(sr * segment_duration)
    spare = len(signal) - window

    # Not longer than one window: right-pad with zeros and return it alone
    if spare <= 0:
        return [np.pad(signal, (0, -spare))]

    # Longer audio: draw random window start offsets (inclusive of `spare`)
    offsets = [random.randint(0, spare) for _ in range(num_segments)]
    return [signal[begin:begin + window] for begin in offsets]

# ================================================================
# FEATURE EXTRACTION (FULL SET)
# ================================================================
def extract_features(y, sr):
    """Compute the fixed-length feature vector for one audio segment.

    Layout (265 values, in order): per-coefficient mean/std over frames of
    40 MFCCs, their deltas and delta-deltas (240); mean/std of spectral
    centroid, bandwidth, contrast, rolloff, flatness, zero-crossing rate and
    RMS (14); spectral entropy and crest factor (2); four mel-band ratios
    (4); frequencies of the three largest PSD peaks (3); librosa's tempo
    estimate and an SNR estimate (2).

    Args:
        y: Waveform of the segment.
        sr: Sample rate of ``y``.

    Returns:
        1-D array built with ``np.hstack`` in the order described above.
    """
    # Core spectral features
    # NOTE(review): fmax=12000 exceeds sr / 2 when sr is 22050; the mel
    # spectrogram below clamps it, the MFCC call does not. Left unchanged so
    # values stay comparable with rows that were already extracted.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, n_fft=2048, hop_length=512, fmin=50, fmax=12000)
    delta, delta2 = librosa.feature.delta(mfcc), librosa.feature.delta(mfcc, order=2)
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    flatness = librosa.feature.spectral_flatness(y=y)
    zcr = librosa.feature.zero_crossing_rate(y)
    rms = librosa.feature.rms(y=y)

    # Advanced metrics
    S = np.abs(librosa.stft(y))
    power_spec = S ** 2
    spec_entropy = entropy(np.mean(power_spec, axis=1))
    crest_factor = np.max(np.abs(y)) / (np.sqrt(np.mean(y**2)) + 1e-8)

    # Band energy ratios
    fmax = min(12000, sr // 2)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=fmax)
    mel_db = librosa.power_to_db(mel_spec)
    bands = [(200, 1000), (1000, 3000), (3000, 8000), (8000, 12000)]
    mel_freqs = librosa.mel_frequencies(n_mels=128, fmin=0, fmax=fmax)
    band_means = []
    for low, high in bands:
        mask = (mel_freqs >= low) & (mel_freqs < high)
        # A band with no mel bins (low sample rates) would otherwise give NaN.
        band_means.append(np.mean(mel_db[mask]) if np.any(mask) else 0.0)
    total = np.sum(band_means) + 1e-6
    ratios = [band_mean / total for band_mean in band_means]

    # Peak frequencies: rank local PSD maxima by power, keep the top three
    freqs = np.fft.rfftfreq(len(y), 1/sr)
    psd = np.abs(np.fft.rfft(y))**2
    peaks, _ = find_peaks(psd)
    top_peaks = sorted(zip(psd[peaks], freqs[peaks]), reverse=True)[:3]
    peaks_vals = [p[1] for p in top_peaks] + [0]*(3-len(top_peaks))

    # Onset rate: the tempo estimator lives in librosa.feature.rhythm in newer
    # librosa releases and in librosa.beat in older ones; use whichever exists.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    rhythm_module = getattr(librosa.feature, "rhythm", None)
    tempo_fn = getattr(rhythm_module, "tempo", None) or librosa.beat.tempo
    onset_rate = tempo_fn(onset_envelope=onset_env, sr=sr)[0]

    # SNR estimate: signal power versus the power removed by pre-emphasis
    signal_power = np.mean(y**2)
    noise_floor = np.mean((y - librosa.effects.preemphasis(y))**2)
    snr = 10 * np.log10((signal_power + 1e-6) / (noise_floor + 1e-6))

    features = np.hstack([
        np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
        np.mean(delta, axis=1), np.std(delta, axis=1),
        np.mean(delta2, axis=1), np.std(delta2, axis=1),
        np.mean(centroid), np.std(centroid),
        np.mean(bandwidth), np.std(bandwidth),
        np.mean(contrast), np.std(contrast),
        np.mean(rolloff), np.std(rolloff),
        np.mean(flatness), np.std(flatness),
        np.mean(zcr), np.std(zcr),
        np.mean(rms), np.std(rms),
        spec_entropy, crest_factor,
        *ratios, *peaks_vals,
        onset_rate, snr
    ])
    return features

# ================================================================
# PROCESS ONE FOLDER (USED INTERNALLY)
# ================================================================
def process_folder(folder_path, processed_files, output_csv):
    """Extract segment features for every audio file under ``folder_path``.

    Each file is loaded, cleaned into segments and turned into one CSV row
    per segment; the row's ``file`` value is ``"<file name>_seg<N>"`` and its
    label is the name of the directory containing the file. Rows are flushed
    to ``output_csv`` once at least 10 are buffered, and again at the end.

    Args:
        folder_path: Root directory, walked recursively.
        processed_files: Set of ``file`` values already present in the CSV.
            Keys written during this call are added to it.
        output_csv: Destination CSV path.
    """
    data = []
    print(f"\nüìÇ Processing folder: {folder_path}")

    for root, _, files in os.walk(folder_path):
        label = os.path.basename(root)
        for file in files:
            if not file.lower().endswith((".wav", ".mp3", ".flac")):
                continue

            # Skip duplicates based on filename. Stored keys look like
            # "<file>_seg<N>", so the bare name alone never matches; the
            # first segment key is what marks a file as already done.
            if file in processed_files or f"{file}_seg1" in processed_files:
                continue

            full_path = os.path.join(root, file)

            try:
                y, sr = load_audio_any_format(full_path)
                segments = clean_audio(y, sr)

                # Build all rows for this file first so a failure part-way
                # through does not leave a partial set of segments behind.
                rows = []
                for segment_index, seg in enumerate(segments, start=1):
                    feats = extract_features(seg, sr)
                    new_filename = f"{file}_seg{segment_index}"
                    rows.append(list(feats) + [label, new_filename])

                data.extend(rows)
                processed_files.update(row[-1] for row in rows)

                print(f"‚úÖ {file} ‚Üí {label}  ({len(segments)} segments)")

                # Save every 10 rows
                if len(data) >= 10:
                    safe_append_to_csv(data, output_csv)
                    data.clear()

            except Exception as e:
                print(f"‚ö†Ô∏è Skipped {file}: {e}")

    if data:
        safe_append_to_csv(data, output_csv)


# ================================================================
# SAFE CSV APPEND (auto create + retry)
# ================================================================
def safe_append_to_csv(data, output_csv, max_retries=5):
    """Append feature rows to ``output_csv``, creating the file if needed.

    Args:
        data: Sequence of rows; each row holds the 265 feature values
            followed by the label and the file key (267 columns in total).
        output_csv: Destination CSV path. A bare file name (no directory
            part) is written to the current working directory.
        max_retries: Number of attempts made while the file is locked.

    Returns:
        True when the rows were written, False otherwise.
    """
    n_mfcc = 40
    columns = [
        f"{family}_{stat}_{i}"
        for family in ("mfcc", "delta", "delta2")
        for stat in ("mean", "std")
        for i in range(n_mfcc)
    ] + [
        "spectral_centroid_mean", "spectral_centroid_std",
        "spectral_bandwidth_mean", "spectral_bandwidth_std",
        "spectral_contrast_mean", "spectral_contrast_std",
        "rolloff_mean", "rolloff_std",
        "flatness_mean", "flatness_std",
        "zcr_mean", "zcr_std",
        "rms_mean", "rms_std",
        "spectral_entropy", "crest_factor",
        "band_low_ratio", "band_midlow_ratio", "band_midhigh_ratio", "band_high_ratio",
        "peak_freq_1", "peak_freq_2", "peak_freq_3",
        "onset_rate", "snr", "label", "file",
    ]
    df = pd.DataFrame(data, columns=columns)
    write_header = not os.path.exists(output_csv)
    if write_header:
        # os.makedirs("") raises, so only create a directory when the path
        # actually has a directory component.
        parent_dir = os.path.dirname(output_csv)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        print(f"üÜï Creating new CSV file at: {output_csv}")

    for attempt in range(max_retries):
        try:
            df.to_csv(output_csv, mode='a', index=False, header=write_header)
            return True
        except PermissionError:
            # Typically the CSV is open in another program; wait and retry.
            print(f"‚ö†Ô∏è File locked. Retry {attempt+1}/{max_retries} in 3s...")
            time.sleep(3)
        except Exception as e:
            print(f"‚ö†Ô∏è Error writing to CSV: {e}")
            break
    print("‚ùå Failed to append after multiple retries.")
    return False

# ================================================================
# MAIN EXECUTION
# ================================================================
if __name__ == "__main__":
    # Collect the keys already stored in the CSV so a rerun can resume.
    if not os.path.exists(OUTPUT_CSV):
        processed_files = set()
        print(f"üÜï No CSV found, will create a new one at:\n   {OUTPUT_CSV}")
    else:
        df = pd.read_csv(OUTPUT_CSV)
        has_file_column = 'file' in df.columns
        processed_files = set(df['file'].values) if has_file_column else set()
        print(f"üîÑ Resuming from existing CSV ({len(processed_files)} files found).")

    # Walk every configured folder, reporting the ones that do not exist.
    for folder in AUDIO_FOLDERS:
        if not os.path.exists(folder):
            print(f"‚ö†Ô∏è Skipped (folder not found): {folder}")
            continue
        process_folder(folder, processed_files, OUTPUT_CSV)

    print("\nüéâ All folders processed successfully!")


In [None]:
# ================================================================
# MEMORY-SAFE FEATURE EXTRACTOR (FULL-SCHEMA) ‚Äî Segment-by-segment
# Keeps SAME columns as your previous CSV; safe resume + low RAM
# ================================================================
import os, tempfile, time, random
import numpy as np
import pandas as pd
import librosa, noisereduce as nr
from pydub import AudioSegment
from pydub.utils import which
from scipy.stats import entropy
from scipy.signal import find_peaks

# ----------------- CONFIG (edit) -----------------
# Destination CSV: one row per extracted segment.
OUTPUT_CSV = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\insect459seg_audio_features.csv"
# Root folders scanned recursively for audio files.
AUDIO_FOLDERS = [r"E:\Insect459\Train\Train"]
# Length of every segment, in seconds.
SEGMENT_DURATION = 3.0
# Segments drawn per file (1..5); 1 gives the lightest load.
NUM_SEGMENTS = 5
# True: denoise everything. False: ESC-like paths are left un-denoised.
DENR_NOISE_FOR_ALL = False
# Directory names whose files are not processed.
SKIP_FOLDERS = {"other_insects"}
# -------------------------------------------------

# Report where FFmpeg was found (None when it is not on PATH).
ffmpeg_path = which("ffmpeg")
ffprobe_path = which("ffprobe")
print("FFmpeg:", ffmpeg_path)

# Recreate the `np.complex` alias when the installed NumPy does not expose it.
try:
    np.complex
except AttributeError:
    np.complex = np.complex128

# --- helper: detect esc-like folder (no denoise) ---
def is_esc_folder(path):
    """Return True when ``path`` contains "esc", ignoring case."""
    # "esc-50" always contains "esc", so a single substring test covers both.
    return path.lower().find("esc") != -1

# ---------------- audio load ----------------
def load_audio_any_format(file_path, target_sr=22050):
    """Load an audio file as a mono waveform resampled to ``target_sr``.

    Files whose extension is not ``.wav`` are converted to a temporary WAV
    through pydub before loading. The temporary file is removed in a
    ``finally`` block, so it does not accumulate when conversion or decoding
    fails.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate passed to ``librosa.load``.

    Returns:
        Tuple ``(y, sr)`` as produced by ``librosa.load``.
    """
    tmp_path = None
    try:
        if os.path.splitext(file_path)[1].lower() != ".wav":
            # Reserve a unique file name, then release the handle so the
            # exporter can write to the path.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
                tmp_path = tf.name
            AudioSegment.from_file(file_path).export(tmp_path, format="wav")
            file_path = tmp_path
        y, sr = librosa.load(file_path, sr=target_sr, mono=True)
        return y, sr
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best effort: a file that cannot be removed must not mask
                # the result (or the original error) of the load.
                pass

# ---------------- get random segment(s) ----------------
def get_random_segments(y, sr, duration=3.0, num_segments=5):
    """Cut up to ``num_segments`` distinct random windows out of ``y``.

    Args:
        y: 1-D waveform.
        sr: Sample rate of ``y``.
        duration: Window length in seconds.
        num_segments: Number of windows requested.

    Returns:
        A list of arrays of ``int(sr * duration)`` samples each. Input no
        longer than one window yields a single zero-padded window. Otherwise
        the windows start at distinct positions and are returned in
        ascending start order; fewer than ``num_segments`` are returned only
        when the audio does not offer that many distinct start positions.
    """
    seg_len = int(sr * duration)
    if len(y) <= seg_len:
        # pad and return single segment
        return [np.pad(y, (0, seg_len - len(y)))]

    max_start = len(y) - seg_len
    # Sampling without replacement always yields the requested number of
    # distinct starts when they exist; a retry-limited rejection loop could
    # come up short by chance.
    count = min(max(num_segments, 0), max_start + 1)
    starts = sorted(random.sample(range(max_start + 1), count))
    return [y[s:s + seg_len] for s in starts]

# ---------------- feature extractor (same schema) ----------------
def extract_features(y, sr):
    """Compute the 265-value feature vector for one audio segment.

    Order of the output: per-coefficient mean/std of 40 MFCCs, their deltas
    and delta-deltas; mean/std of spectral centroid, bandwidth, contrast,
    rolloff, flatness, zero-crossing rate and RMS; spectral entropy; crest
    factor; four mel-band ratios; frequencies of the three largest PSD
    peaks; onset rate; SNR estimate.

    Args:
        y: Waveform of the segment.
        sr: Sample rate of ``y``.

    Returns:
        1-D float array.
    """
    upper_freq = min(12000, sr // 2)

    # Cepstral block: MFCCs plus first- and second-order deltas
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=40, n_fft=2048, hop_length=512, fmin=50, fmax=upper_freq
    )
    cepstral = [
        mfcc,
        librosa.feature.delta(mfcc),
        librosa.feature.delta(mfcc, order=2),
    ]

    # Frame-wise spectral descriptors, in output order
    frame_features = [
        librosa.feature.spectral_centroid(y=y, sr=sr),
        librosa.feature.spectral_bandwidth(y=y, sr=sr),
        librosa.feature.spectral_contrast(y=y, sr=sr),
        librosa.feature.spectral_rolloff(y=y, sr=sr),
        librosa.feature.spectral_flatness(y=y),
        librosa.feature.zero_crossing_rate(y),
        librosa.feature.rms(y=y),
    ]

    # Entropy of the time-averaged power spectrum, and crest factor
    magnitude = np.abs(librosa.stft(y))
    spec_entropy = float(entropy(np.mean(magnitude ** 2, axis=1)))
    rms_level = np.sqrt(np.mean(y**2))
    crest_factor = float(np.max(np.abs(y)) / (rms_level + 1e-8))

    # Mean dB level in four mel bands, each divided by the sum of the four
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=upper_freq)
    )
    mel_freqs = librosa.mel_frequencies(n_mels=128, fmin=0, fmax=upper_freq)
    band_means = []
    for low, high in [(200, 1000), (1000, 3000), (3000, 8000), (8000, 12000)]:
        in_band = (mel_freqs >= low) & (mel_freqs < high)
        # A band without any mel bin contributes 0.0 instead of NaN
        band_means.append(np.mean(mel_db[in_band]) if np.any(in_band) else 0.0)
    denom = np.sum(band_means) + 1e-9
    ratios = [band_mean / denom for band_mean in band_means]

    # Frequencies of the three largest local maxima of the PSD
    freqs = np.fft.rfftfreq(len(y), 1/sr)
    psd = np.abs(np.fft.rfft(y))**2
    peaks, _ = find_peaks(psd)
    if len(peaks) > 0:
        ranked = sorted(zip(psd[peaks], freqs[peaks]), reverse=True)[:3]
    else:
        ranked = []
    peaks_vals = [freq for _, freq in ranked] + [0.0] * (3 - len(ranked))

    # Onset rate: reciprocal of the mean gap between detected onsets
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr)
    onset_rate = 0.0
    if len(onset_frames) > 1:
        onset_times = librosa.frames_to_time(onset_frames, sr=sr)
        onset_rate = float(1.0 / np.mean(np.diff(onset_times)))

    # SNR estimate: signal power versus the power removed by pre-emphasis
    signal_power = np.mean(y**2)
    noise_floor = np.mean((y - librosa.effects.preemphasis(y))**2)
    snr = float(10 * np.log10((signal_power + 1e-9) / (noise_floor + 1e-9)))

    # Assemble in the exact column order of the CSV schema
    parts = []
    for block in cepstral:
        parts += [np.mean(block, axis=1), np.std(block, axis=1)]
    for frames in frame_features:
        parts += [np.mean(frames), np.std(frames)]
    parts += [spec_entropy, crest_factor, *ratios, *peaks_vals, onset_rate, snr]

    return np.hstack(parts).astype(float)

# ---------------- build columns (exact schema) ----------------
def build_columns(n_mfcc=40):
    """Return the CSV column names in schema order.

    The list holds ``6 * n_mfcc`` cepstral statistic names (mean then std for
    ``mfcc``, ``delta`` and ``delta2``), followed by 25 scalar feature names
    and finally ``"label"`` and ``"file"``.
    """
    cols = []
    for family in ("mfcc", "delta", "delta2"):
        for stat in ("mean", "std"):
            cols.extend(f"{family}_{stat}_{i}" for i in range(n_mfcc))

    # Mean/std pairs for the frame-wise spectral descriptors
    for base in (
        "spectral_centroid", "spectral_bandwidth", "spectral_contrast",
        "rolloff", "flatness", "zcr", "rms",
    ):
        cols.append(f"{base}_mean")
        cols.append(f"{base}_std")

    cols.extend([
        "spectral_entropy", "crest_factor",
        "band_low_ratio", "band_midlow_ratio", "band_midhigh_ratio", "band_high_ratio",
        "peak_freq_1", "peak_freq_2", "peak_freq_3",
        "onset_rate", "snr", "label", "file",
    ])
    return cols

# ---------------- append single row safely ----------------
def append_row_to_csv(row_values, output_csv, retry=5):
    """Append one row to ``output_csv`` using the ``build_columns()`` schema.

    Args:
        row_values: Values for a single row, in ``build_columns()`` order.
        output_csv: Destination CSV path.
        retry: Number of attempts made while the file is locked.

    Returns:
        True when the row was written, False otherwise.
    """
    cols = build_columns()
    df = pd.DataFrame([row_values], columns=cols)

    # Write the header for a missing file and also for an existing but empty
    # one; otherwise the first data row would later be parsed as the header.
    try:
        write_header = os.path.getsize(output_csv) == 0
    except OSError:
        write_header = True

    for attempt in range(retry):
        try:
            df.to_csv(output_csv, mode='a', index=False, header=write_header)
            return True
        except PermissionError:
            # File is locked by another program: back off a little longer
            # on each attempt.
            time.sleep(1 + attempt)
        except Exception as e:
            print("CSV write error:", e)
            return False
    print("Failed to write after retries.")
    return False

# ---------------- main processing loop (memory-safe) ----------------
def process_folder(folder_path, processed_files, output_csv, segment_duration, num_segments):
    """Extract features segment by segment and append each row immediately.

    Every audio file below ``folder_path`` is loaded, cut into random
    segments, optionally denoised, peak-normalized and written to
    ``output_csv`` one row at a time. Rows are keyed ``"<file>_seg<N>"`` and
    labelled with the name of the directory containing the file.

    Args:
        folder_path: Root directory, walked recursively.
        processed_files: Set of keys already in the CSV; updated in place as
            rows are saved.
        output_csv: Destination CSV path.
        segment_duration: Segment length in seconds.
        num_segments: Segments requested per file.
    """
    print("\nProcessing folder:", folder_path)
    for root, dirs, files in os.walk(folder_path):
        label = os.path.basename(root)
        if label in SKIP_FOLDERS:
            print("Skipping folder (by config):", root)
            # Prune the walk so sub-folders of a skipped folder are not
            # processed under their own names.
            dirs[:] = []
            continue

        for file in files:
            if not file.lower().endswith((".wav", ".mp3", ".flac", ".m4a")):
                continue

            # Keys are stored in the CSV exactly as "<file>_seg<N>".
            base_filename = file
            seg_keys = [f"{base_filename}_seg{i}" for i in range(1, num_segments + 1)]

            # if all segments already processed, skip
            # NOTE(review): a file shorter than one segment only ever yields
            # seg1, so it is reloaded on every resume and then skipped below.
            if all(k in processed_files for k in seg_keys):
                continue

            full_path = os.path.join(root, file)
            apply_denoise = DENR_NOISE_FOR_ALL or (not is_esc_folder(full_path))

            try:
                y, sr = load_audio_any_format(full_path)
            except Exception as e:
                print("Failed to load:", file, "|", e)
                continue

            segments = get_random_segments(y, sr, duration=segment_duration, num_segments=num_segments)

            for idx, seg in enumerate(segments, start=1):
                seg_key = f"{base_filename}_seg{idx}"
                if seg_key in processed_files:
                    continue

                # optionally denoise the segment only (small memory)
                if apply_denoise:
                    try:
                        seg = nr.reduce_noise(y=seg, sr=sr)
                    except Exception:
                        # fallback: keep the un-denoised segment
                        pass

                # normalize (an all-zero segment is left untouched)
                if np.max(np.abs(seg)) > 0:
                    seg = seg / (np.max(np.abs(seg)) + 1e-9)

                try:
                    feats = extract_features(seg, sr)
                except Exception as e:
                    print("Feature extraction failed for", seg_key, "|", e)
                    continue

                row = list(feats) + [label, seg_key]

                ok = append_row_to_csv(row, output_csv)
                if ok:
                    processed_files.add(seg_key)
                    print("Saved:", seg_key, "->", label)
                else:
                    print("Failed to save:", seg_key)

# ---------------- ENTRY POINT ----------------
if __name__ == "__main__":
    # Keys already present in the CSV let an interrupted run resume.
    processed_files = set()
    csv_present = os.path.exists(OUTPUT_CSV)
    if csv_present:
        try:
            df_done = pd.read_csv(OUTPUT_CSV)
            if "file" in df_done.columns:
                processed_files = set(df_done["file"].astype(str))
            print("Resuming. Already processed rows:", len(processed_files))
        except Exception as e:
            print("Could not read existing CSV for resume:", e)

    # Process each configured root folder, reporting the missing ones.
    for folder in AUDIO_FOLDERS:
        if not os.path.exists(folder):
            print("Folder not found:", folder)
            continue
        process_folder(folder, processed_files, OUTPUT_CSV, SEGMENT_DURATION, NUM_SEGMENTS)

    print("\nALL DONE.")


In [None]:
# Quick sanity check: row count versus number of distinct `file` keys.
df = pd.read_csv(OUTPUT_CSV)
total_rows = len(df)
unique_files = df['file'].nunique()
print("Total rows:", total_rows)
print("Unique file names:", unique_files)
print("Duplicates:", total_rows - unique_files)


In [None]:

# Notebook-style command (not valid in a plain .py script): upgrades librosa via pip.
pip install --upgrade librosa




In [None]:
import pandas as pd
import numpy as np
import os

# Diagnostic report for a feature CSV: missing values, duplicate keys, label
# distribution, numeric summary statistics and constant columns.
CSV_PATH = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\xenoinsect_audio_features.csv"   # <<< change this

df = pd.read_csv(CSV_PATH)
print("Rows:", len(df), "Columns:", len(df.columns))
# print("Columns:", df.columns.tolist())

# 1. Missing values
missing = df.isna().sum()
print("\nMissing values per column (showing >0):")
print(missing[missing > 0])

# 2. Duplicate files
if 'file' in df.columns:
    dup = df['file'].duplicated().sum()
    print("\nDuplicate file entries:", dup)

# 3. Label counts
pd.set_option('display.max_rows', None)  # show all rows
pd.set_option('display.max_columns', None)  # show all columns (if needed)
pd.set_option('display.width', None)  # do not wrap lines
pd.set_option('display.max_colwidth', None)  # do not cut long labels

if 'label' in df.columns:
    counts = df['label'].value_counts()
    print("\nLabel distribution:")
    print(counts)


# 4. Numeric feature stats
num = df.select_dtypes(include=[np.number])
print("\nNumeric feature count:", num.shape[1])
stats = num.describe().T[['count','mean','std','min','25%','50%','75%','max']]
print("\nFeature summary (first 10 rows):")
print(stats.head(10))

# 5. Constant features
const = [c for c in num.columns if num[c].nunique()<=1]
print("\nConstant features (should be none):", const)

# 6. NaN rate threshold
nan_rate = (missing / len(df)).max()
print("\nMax NaN rate across columns:", nan_rate)

# Save simple report next to the input CSV
report_path = os.path.splitext(CSV_PATH)[0] + "_diagnostic_report.csv"
stats.to_csv(report_path)
print("\nSaved numeric stats to:", report_path)


In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns   # optional if available

# Visual diagnostics for the feature CSV: class balance, one feature
# histogram, a correlation heatmap and a 2-D PCA scatter coloured by label.
CSV_PATH = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\insect459seg_audio_features.csv"
df = pd.read_csv(CSV_PATH)
num = df.select_dtypes(include=[np.number]).fillna(0)

# 1. Class balance bar chart
if 'label' in df.columns:
    vc = df['label'].value_counts()
    plt.figure(figsize=(16,9)); vc.plot(kind='bar'); plt.title("Class counts"); plt.show()

# 2. Histogram of a sample feature: mfcc_mean_0 when present, otherwise the
#    first numeric column (the earlier `or True` filter always took the first).
col = 'mfcc_mean_0' if 'mfcc_mean_0' in num.columns else num.columns[0]
plt.figure(figsize=(18,9)); plt.hist(num[col], bins=50); plt.title(col); plt.show()

# 3. Correlation heatmap (top 50 features by variance to keep plot readable)
var_sorted = num.var().sort_values(ascending=False)
top = var_sorted.index[:50]
plt.figure(figsize=(20,16))
corr = num[top].corr()
sns.heatmap(corr, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Top-50 feature correlation"); plt.show()

# 4. PCA 2D colored by label
labels = df['label'] if 'label' in df.columns else None
pca = PCA(n_components=2)
# Sample once and look the labels up by the sampled index, so points and
# labels are aligned by construction rather than by two matching seeds.
sampled = num.sample(n=min(len(num),2000), random_state=0)  # sample for speed
Xp = pca.fit_transform(sampled)
if labels is not None:
    lab_sample = labels.loc[sampled.index].values
else:
    lab_sample = None

plt.figure(figsize=(16,16))
if lab_sample is not None:
    for lab in np.unique(lab_sample):
        mask = (lab_sample==lab)
        plt.scatter(Xp[mask,0], Xp[mask,1], label=str(lab), s=10)
    plt.legend(fontsize=6, loc='best')
else:
    plt.scatter(Xp[:,0], Xp[:,1], s=10)
plt.title("PCA 2D"); plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import warnings

# Baseline RandomForest evaluation on the segment-level feature CSV.
# Several rows come from the same recording ("<file>_seg<N>"), so all splits
# below keep the segments of one recording on the same side; otherwise the
# test score is inflated by near-duplicates of training rows.
from sklearn.model_selection import StratifiedGroupKFold

warnings.filterwarnings("ignore")

# ================================
# LOAD DATA
# ================================
CSV = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\insect459seg_audio_features.csv"
df = pd.read_csv(CSV)

# Extract features and labels; inf/NaN feature values are replaced by 0 so
# the classifier does not fail on them.
X = (
    df.select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .values
)
y = df['label'].values

# One group per source recording: label plus the file key without "_seg<N>".
if 'file' in df.columns:
    source_file = df['file'].astype(str).str.replace(r"_seg\d+$", "", regex=True)
    groups = (df['label'].astype(str) + "/" + source_file).values
else:
    groups = np.arange(len(df))

print("Total samples:", X.shape[0])
print("Total features:", X.shape[1])
print("Classes:", len(np.unique(y)))

# ================================
# OPTIONAL: SAMPLE FOR SPEED
# ================================
MAX_SAMPLES = 25000    # <-- you can increase or decrease
if X.shape[0] > MAX_SAMPLES:
    # Seeded generator so the subsample is the same on every run.
    rng = np.random.default_rng(42)
    idx = rng.choice(X.shape[0], MAX_SAMPLES, replace=False)
    X, y, groups = X[idx], y[idx], groups[idx]
    print(f"‚ö†Ô∏è Dataset reduced to {MAX_SAMPLES} samples for faster training.")

# ================================
# REMOVE CONSTANT FEATURES
# ================================
stds = X.std(axis=0)
constant_cols = np.where(stds == 0)[0]

if len(constant_cols) > 0:
    print("Removing constant features:", constant_cols)
    X = np.delete(X, constant_cols, axis=1)
else:
    print("No constant features detected.")

# ================================
# RANDOM FOREST MODEL
# ================================
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    class_weight="balanced_subsample",   # <-- IMPORTANT for imbalanced insects
    random_state=42
)

# ================================
# 5-FOLD STRATIFIED, GROUP-AWARE CROSS-VALIDATION
# ================================
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    clf,
    X,
    y,
    groups=groups,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1
)

print("\n========== CROSS-VALIDATION RESULTS ==========")
print("Fold scores:", scores)
print("Mean CV accuracy:", scores.mean())

# ================================
# TRAIN/TEST SPLIT FOR DETAILED REPORT
# ================================
# The first of the five group-aware folds serves as a ~80/20 hold-out split.
train_idx, test_idx = next(cv.split(X, y, groups))
Xtr, Xte, ytr, yte = X[train_idx], X[test_idx], y[train_idx], y[test_idx]

clf.fit(Xtr, ytr)
yp = clf.predict(Xte)

print("\n========== CLASSIFICATION REPORT (TEST SET) ==========")
print(classification_report(yte, yp, zero_division=0))

# Confusion matrix (optional)
cm = confusion_matrix(yte, yp)
print("Confusion matrix shape:", cm.shape)


In [None]:
# ================================================================
# ADVANCED AUDIO FEATURE EXTRACTION + MULTI-FOLDER MERGE + RESUME
# WITH ESC-50 COMPATIBILITY + ENVIRONMENT LABEL
# ================================================================
import random
import os, tempfile, time
import numpy as np
import pandas as pd
import librosa, noisereduce as nr
from pydub import AudioSegment
from pydub.utils import which
from scipy.stats import entropy
from scipy.signal import find_peaks

# --- Verify FFmpeg ---
# Both executables must be discoverable on PATH before any audio is processed.
ffmpeg_path = which("ffmpeg")
ffprobe_path = which("ffprobe")
if ffmpeg_path is not None and ffprobe_path is not None:
    print(f"‚úÖ FFmpeg: {ffmpeg_path}")
else:
    raise EnvironmentError("‚ùå FFmpeg/ffprobe not found. Add them to PATH.")

# --- numpy fix for librosa ---
# Recreate the `np.complex` alias when the installed NumPy does not expose it.
try:
    np.complex
except AttributeError:
    np.complex = np.complex128


# ================================================================
# PATHS - EDIT THESE
# ================================================================
# CSV that receives the extracted feature rows.
OUTPUT_CSV = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\env_audio_features.csv"

# Root folders to scan; the ESC-50 audio directory is listed here.
AUDIO_FOLDERS = [
    r"D:\Projects\Minor\Audio\InsectsXeno\Audio\ESC-50-master\ESC-50-master\audio"
]


# ================================================================
# HELPER ‚Äî DETECT IF FILE BELONGS TO ESC-50
# ================================================================
def is_esc50(path):
    """Return True when ``path`` contains "esc", ignoring case."""
    # Any path containing "esc-50" also contains "esc", so one test suffices.
    lowered = path.lower()
    return "esc" in lowered


# ================================================================
# AUDIO LOADING + CONVERSION
# ================================================================
def load_audio_any_format(file_path, target_sr=22050):
    """Load an audio file as a mono waveform resampled to ``target_sr``.

    Non-``.wav`` input is converted to a temporary WAV through pydub first.
    Only that temporary file is ever deleted, and it is deleted even when
    conversion or decoding raises.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate passed to ``librosa.load``.

    Returns:
        Tuple ``(y, sr)`` as produced by ``librosa.load``.
    """
    ext = os.path.splitext(file_path)[1].lower()
    converted_path = None

    try:
        if ext != ".wav":
            # Reserve a unique name and close the handle before exporting.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                converted_path = tmp.name
            AudioSegment.from_file(file_path).export(converted_path, format="wav")

        source = converted_path if converted_path is not None else file_path
        y, sr = librosa.load(source, sr=target_sr, mono=True)
        return y, sr
    finally:
        # Track the temp file explicitly: testing for "tmp" in the path could
        # delete an original WAV stored under a directory named e.g. "tmp".
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                pass


# ================================================================
# AUDIO CLEANING ‚Äî ESC50 gets no denoise (üîß MODIFIED)
# ================================================================
def clean_audio(y, sr, segment_duration=3.0, num_segments=5, apply_denoise=True):
    """Trim, optionally denoise and peak-normalize ``y``, then cut segments.

    Args:
        y: Input waveform.
        sr: Sample rate of ``y``.
        segment_duration: Length of each returned segment, in seconds.
        num_segments: Number of random segments drawn from long audio.
        apply_denoise: When False the noise-reduction step is skipped.

    Returns:
        A list of arrays, each ``int(sr * segment_duration)`` samples long:
        one zero-padded segment for audio no longer than a segment,
        otherwise ``num_segments`` randomly positioned slices (start
        positions may repeat).

    Raises:
        ValueError: If fewer than ``0.2 * sr`` samples remain after trimming.
    """
    # Remove leading/trailing silence
    trimmed, _ = librosa.effects.trim(y, top_db=20)
    if len(trimmed) < 0.2 * sr:
        raise ValueError("Too short after trimming")

    # Denoising is optional (callers disable it for ESC-50 material)
    cleaned = nr.reduce_noise(y=trimmed, sr=sr) if apply_denoise else trimmed

    # Peak normalization
    cleaned = cleaned / (np.max(np.abs(cleaned)) + 1e-6)

    window = int(sr * segment_duration)
    last_start = len(cleaned) - window

    # Not longer than one window: zero-pad on the right
    if last_start <= 0:
        return [np.pad(cleaned, (0, -last_start))]

    # Longer audio: random window starts in [0, last_start]
    picks = []
    while len(picks) < num_segments:
        begin = random.randint(0, last_start)
        picks.append(cleaned[begin:begin + window])
    return picks


# ================================================================
# FEATURE EXTRACTION (unchanged)
# ================================================================
def extract_features(y, sr):
    """Compute the flat feature vector for one audio segment.

    Args:
        y: Audio samples of the segment.
        sr: Sample rate of ``y``.

    Returns:
        The ``np.hstack`` of, in order: mean and std over axis 1 of the MFCCs,
        their deltas and their second-order deltas; mean and std of spectral
        centroid, bandwidth, contrast, rolloff, flatness, zero-crossing rate
        and RMS; spectral entropy; crest factor; four band ratios; three peak
        frequencies; the tempo estimate; and the SNR proxy. This order is the
        one the column list in ``safe_append_to_csv`` names.
    """
    # NOTE(review): fmax=12000 is above sr / 2 whenever sr < 24000, while the
    # band-energy section below clamps to sr // 2 - confirm this is intended.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, n_fft=2048, hop_length=512, fmin=50, fmax=12000)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)

    # Frame-wise spectral descriptors; each is reduced to mean/std below.
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    flatness = librosa.feature.spectral_flatness(y=y)
    zcr = librosa.feature.zero_crossing_rate(y)
    rms = librosa.feature.rms(y=y)

    # Entropy of the power spectrum averaged over axis 1 of the STFT magnitude.
    S = np.abs(librosa.stft(y))
    power_spec = S ** 2
    spec_entropy = entropy(np.mean(power_spec, axis=1))
    # Peak amplitude over RMS; the epsilon avoids division by zero.
    crest_factor = np.max(np.abs(y)) / (np.sqrt(np.mean(y**2)) + 1e-8)

    # band energy
    fmax = min(12000, sr // 2)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=fmax)
    mel_db = librosa.power_to_db(mel_spec)
    mel_freqs = librosa.mel_frequencies(n_mels=128, fmin=0, fmax=fmax)

    # Band edges in Hz; each band is [low, high).
    bands = [(200,1000), (1000,3000), (3000,8000), (8000,12000)]
    band_means = []

    for low, high in bands:
        mask = (mel_freqs >= low) & (mel_freqs < high)
        band_means.append(np.mean(mel_db[mask]))

    # Each band's mean dB value divided by the sum of all four band means.
    ratios = [band_means[i] / (np.sum(band_means) + 1e-6) for i in range(4)]

    # peaks
    freqs = np.fft.rfftfreq(len(y), 1/sr)
    psd = np.abs(np.fft.rfft(y))**2
    peaks, _ = find_peaks(psd)
    # Keep the frequencies of the three strongest peaks, strongest first;
    # pad with zeros when fewer than three peaks were found.
    top_peaks = sorted(zip(psd[peaks], freqs[peaks]), reverse=True)[:3]
    peak_vals = [p[1] for p in top_peaks] + [0]*(3 - len(top_peaks))

    # onset
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    # The value kept as onset_rate is the first element returned by the
    # tempo estimator.
    onset_rate = librosa.feature.rhythm.tempo(onset_envelope=onset_env, sr=sr)[0]

    # snr
    signal_power = np.mean(y**2)
    # NOTE(review): the "noise floor" is the mean squared difference between
    # y and its pre-emphasized version, not a measured noise level - confirm.
    noise_floor = np.mean((y - librosa.effects.preemphasis(y))**2)
    snr = 10 * np.log10((signal_power + 1e-6) / (noise_floor + 1e-6))

    # Concatenation order must stay in sync with the CSV column list.
    features = np.hstack([
        np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
        np.mean(delta, axis=1), np.std(delta, axis=1),
        np.mean(delta2, axis=1), np.std(delta2, axis=1),
        np.mean(centroid), np.std(centroid),
        np.mean(bandwidth), np.std(bandwidth),
        np.mean(contrast), np.std(contrast),
        np.mean(rolloff), np.std(rolloff),
        np.mean(flatness), np.std(flatness),
        np.mean(zcr), np.std(zcr),
        np.mean(rms), np.std(rms),
        spec_entropy, crest_factor,
        *ratios,
        *peak_vals,
        onset_rate, snr
    ])
    return features


# ================================================================
# PROCESS A FOLDER
# ================================================================
def _strip_segment_suffix(name):
    """Return ``name`` without a trailing ``_seg<digits>`` marker, if present."""
    text = str(name)
    stem, sep, index = text.rpartition("_seg")
    return stem if sep and index.isdigit() else text


def process_folder(folder_path, processed_files, output_csv):
    """Extract features for every audio file under ``folder_path``.

    Walks the folder recursively, skips files that were already processed,
    splits each remaining file into segments, extracts features per segment
    and appends the rows to ``output_csv`` in batches.

    Args:
        folder_path: Root folder to walk.
        processed_files: Names already present in the CSV ``file`` column.
            Entries may be bare file names or ``<file>_seg<N>`` segment names;
            both forms are recognised.
        output_csv: Path of the CSV the rows are appended to.
    """
    data = []

    # The CSV stores one row per segment as "<file>_seg<N>", so compare on the
    # source file name; otherwise nothing is ever skipped on resume.
    already_done = {_strip_segment_suffix(name) for name in processed_files}

    print(f"\nüìÇ Processing folder: {folder_path}")

    for root, _, files in os.walk(folder_path):
        for file in files:

            if not file.lower().endswith((".wav", ".mp3", ".flac")):
                continue

            if file in already_done:
                continue

            full_path = os.path.join(root, file)

            # ESC-50 files get the fixed label "environment" and no denoising;
            # every other file is labelled with its parent folder name.
            if is_esc50(full_path):
                label = "environment"
                apply_denoise = False
            else:
                label = os.path.basename(root)
                apply_denoise = True

            try:
                y, sr = load_audio_any_format(full_path)

                # get 3-second segments
                segments = clean_audio(y, sr, apply_denoise=apply_denoise)

                for seg_idx, seg in enumerate(segments, start=1):
                    feats = extract_features(seg, sr)
                    new_filename = f"{file}_seg{seg_idx}"
                    data.append(np.append(feats, [label, new_filename]))

                print(f"‚úÖ {file} ‚Üí {label} ({len(segments)} segments)")

                # Flush only after a whole file, so a file's segments are
                # never split across a resume boundary.
                if len(data) >= 10:
                    safe_append_to_csv(data, output_csv)
                    data.clear()

            except Exception as e:
                print(f"‚ö†Ô∏è Skipped {file}: {e}")

    if data:
        safe_append_to_csv(data, output_csv)


# ================================================================
# CSV APPENDER
# ================================================================
def safe_append_to_csv(data, output_csv, max_retries=5):
    """Append feature rows to ``output_csv``, retrying while the file is locked.

    Args:
        data: Rows to write; each row holds the 265 feature values followed
            by the label and the segment file name.
        output_csv: Destination CSV path. A header is written only when the
            file does not exist yet.
        max_retries: Number of write attempts made on ``PermissionError``.

    Returns:
        None. Write failures are reported on stdout, not raised.
    """
    n_mfcc = 40
    # mfcc_mean_*, mfcc_std_*, delta_mean_*, delta_std_*, delta2_mean_*, delta2_std_*
    columns = [
        f"{prefix}_{stat}_{i}"
        for prefix in ("mfcc", "delta", "delta2")
        for stat in ("mean", "std")
        for i in range(n_mfcc)
    ]
    columns += [
        "spectral_centroid_mean", "spectral_centroid_std",
        "spectral_bandwidth_mean", "spectral_bandwidth_std",
        "spectral_contrast_mean", "spectral_contrast_std",
        "rolloff_mean", "rolloff_std",
        "flatness_mean", "flatness_std",
        "zcr_mean", "zcr_std",
        "rms_mean", "rms_std",
        "spectral_entropy", "crest_factor",
        "band_low_ratio", "band_midlow_ratio", "band_midhigh_ratio", "band_high_ratio",
        "peak_freq_1", "peak_freq_2", "peak_freq_3",
        "onset_rate", "snr", "label", "file",
    ]

    df = pd.DataFrame(data, columns=columns)
    write_header = not os.path.exists(output_csv)

    if write_header:
        # A bare file name has an empty dirname, and os.makedirs("") raises.
        parent = os.path.dirname(output_csv)
        if parent:
            os.makedirs(parent, exist_ok=True)
        print(f"üÜï Creating CSV at: {output_csv}")

    for attempt in range(max_retries):
        try:
            df.to_csv(output_csv, mode='a', index=False, header=write_header)
            return
        except PermissionError:
            print(f"‚ö†Ô∏è CSV locked. Retry {attempt+1}/{max_retries}...")
            # No point in waiting after the final attempt.
            if attempt + 1 < max_retries:
                time.sleep(3)
        except Exception as e:
            print(f"‚ùå CSV write error: {e}")
            return

    # Make the data loss visible instead of returning silently.
    print(f"CSV still locked after {max_retries} attempts; {len(df)} rows were NOT written.")


# ================================================================
# MAIN
# ================================================================
if __name__ == "__main__":

    # Names read from the "file" column of an existing CSV; handed to
    # process_folder so a re-run can pick up where it stopped.
    processed_files = set()

    if not os.path.exists(OUTPUT_CSV):
        print(f"üÜï No CSV found ‚Äî will create: {OUTPUT_CSV}")
    else:
        df = pd.read_csv(OUTPUT_CSV)
        if "file" in df.columns:
            processed_files = set(df["file"].values)
            print(f"üîÑ Resuming ‚Äî found {len(processed_files)} processed files.")

    # Folders that do not exist are reported and skipped.
    for folder in AUDIO_FOLDERS:
        if not os.path.exists(folder):
            print(f"‚ö†Ô∏è Folder not found: {folder}")
            continue
        process_folder(folder, processed_files, OUTPUT_CSV)

    print("\nüéâ ALL DONE! CSV READY.")


In [None]:
import pandas as pd
import numpy as np

# ===============================================================
# 1. LOAD CSV
# ===============================================================

CSV_PATH = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\env_audio_features.csv"

df = pd.read_csv(CSV_PATH)
print("üü¢ CSV Loaded:", CSV_PATH)
print("Shape:", df.shape)



# ===============================================================
# 2. BASIC STRUCTURE CHECKS
# ===============================================================

print("\n===== COLUMN CHECK =====")
# Must add up to the number of feature columns the extractor writes (265).
expected_feature_count = (
    40*6 +  # mfcc_mean, mfcc_std, delta_mean, delta_std, delta2_mean, delta2_std
    2*7 +   # mean/std of centroid, bandwidth, contrast, rolloff, flatness, zcr, rms
    2 +     # spectral_entropy, crest_factor
    4 +     # band ratios
    3 +     # peak freqs
    2       # onset_rate, snr
)

expected_total = expected_feature_count + 2  # + label + file

print(f"Columns: {len(df.columns)} (expected ~{expected_total})")

if len(df.columns) != expected_total:
    print("‚ö†Ô∏è Column count mismatch!")
else:
    print("‚úÖ Column count correct.")



# ===============================================================
# 3. CHECK FOR MISSING VALUES
# ===============================================================

print("\n===== MISSING VALUE CHECK =====")
# Per-column null counts, computed once and reused for the report.
null_counts = df.isnull().sum()
missing = null_counts.sum()
print(f"Total missing values: {missing}")

if missing == 0:
    print("‚úÖ No missing values.")
else:
    print(null_counts[null_counts > 0])
    print("‚ö†Ô∏è There are missing values!")



# ===============================================================
# 4. CHECK FOR INFINITE VALUES
# ===============================================================

print("\n===== INFINITY CHECK =====")

# np.isinf only works on numbers, so restrict to the numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
has_inf = np.isinf(numeric_df.values).any()

if not has_inf:
    print("‚úÖ No infinite values found.")
else:
    print("‚ö†Ô∏è CSV contains infinite values!")




# ===============================================================
# 5. CHECK LABEL DISTRIBUTION (Class Imbalance)
# ===============================================================

print("\n===== LABEL DISTRIBUTION =====")
if "label" not in df.columns:
    print("‚ùå No 'label' column found!")
else:
    # Row count per class.
    print(df["label"].value_counts())



# ===============================================================
# 6. CHECK DUPLICATE FILES + ROWS
# ===============================================================

print("\n===== DUPLICATE CHECK =====")

if "file" not in df.columns:
    print("‚ö†Ô∏è No 'file' column found.")
else:
    # Number of rows whose "file" value already appeared in an earlier row.
    dup_files = df["file"].duplicated().sum()
    print("Duplicate files:", dup_files)

# Number of rows that repeat an earlier row in every column.
dup_rows = df.duplicated().sum()
print("Duplicate rows:", dup_rows)



# ===============================================================
# 7. FEATURE DISTRIBUTION SUMMARY (detect broken data)
# ===============================================================

print("\n===== FEATURE STATISTICS =====")
stats = df.describe()
print(stats)



# ===============================================================
# 8. CHECK FOR ZERO-VARIANCE COLUMNS (bad features)
# ===============================================================

print("\n===== ZERO VARIANCE FEATURE CHECK =====")
# Boolean Series, True for columns whose standard deviation is exactly 0.
zero_var = stats.loc["std"] == 0
if not zero_var.any():
    print("‚úÖ No zero-variance columns.")
else:
    print("‚ö†Ô∏è Zero-variance columns found:")
    print(zero_var[zero_var])



# ===============================================================
# 9. CHECK MFCC RANGES (detect extraction bugs)
# ===============================================================

print("\n===== MFCC RANGE CHECK =====")
mfcc_cols = [col for col in df.columns if "mfcc_mean" in col]

if len(mfcc_cols) == 40:
    # Overall min and max across all MFCC-mean columns.
    mean_range = df[mfcc_cols].describe()
    print("MFCC mean min/max:")
    print("Min:", mean_range.loc["min"].min())
    print("Max:", mean_range.loc["max"].max())
    print("Range looks normal.")
else:
    print("‚ö†Ô∏è Expected 40 MFCC mean features, found:", len(mfcc_cols))


# Compare the CSV's columns against the schema written by the extractor.
expected_cols = {
    f"{prefix}_{stat}_{i}"
    for prefix in ("mfcc", "delta", "delta2")
    for stat in ("mean", "std")
    for i in range(40)
} | {
    "spectral_centroid_mean", "spectral_centroid_std",
    "spectral_bandwidth_mean", "spectral_bandwidth_std",
    "spectral_contrast_mean", "spectral_contrast_std",
    "rolloff_mean", "rolloff_std",
    "flatness_mean", "flatness_std",
    "zcr_mean", "zcr_std",
    "rms_mean", "rms_std",
    "spectral_entropy", "crest_factor",
    "band_low_ratio", "band_midlow_ratio", "band_midhigh_ratio", "band_high_ratio",
    "peak_freq_1", "peak_freq_2", "peak_freq_3",
    "onset_rate", "snr",
    "label", "file",
}

actual_cols = set(df.columns)

extra = actual_cols - expected_cols
# Kept under its own name: rebinding `missing` (the missing-VALUE count from
# step 3) to this set made the final `missing == 0` test always False.
missing_cols = expected_cols - actual_cols

print("EXTRA COLUMNS:", extra)
print("MISSING COLUMNS:", missing_cols)


# ===============================================================
# 10. PRINT CONCLUSION
# ===============================================================

print("\n===== FINAL VALIDATION =====")
# Clean means: no missing values, no duplicate rows, no schema column absent.
if missing == 0 and dup_rows == 0 and not missing_cols:
    print("üéâ CSV looks clean and ready for ML!")
else:
    print("‚ö†Ô∏è CSV contains issues. Review warnings above.")



In [None]:
"""
Clean ESC-50, relabel insect datasets (TOP3 + Others), and merge all datasets.
Fully corrected version.
"""

import os
import pandas as pd
from pathlib import Path

# -------------------------
# CONFIG - EDIT THESE PATHS
# -------------------------
# Input CSVs; each one is loaded with safe_read_csv further down.
INSECT459_CSV = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\insect459seg_audio_features.csv"
XENO_CSV      = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\xenoinsect_audio_features.csv"
# ESC-50 metadata; the script reads its "filename" and "category" columns.
ESC50_META    = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\esc50.csv"
ESC50_FEATURES_CSV = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\env_audio_features.csv"

# Folder that receives every CSV written by this script (created on start).
OUTPUT_DIR = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\merged_output"

# Species that keep their own label; every other insect label becomes "Others".
TOP3 = [
    "Chorthippus_biguttulus",
    "Gryllus_bimaculatus",
    "Ruspolia_nitidula"
]

# ESC-50 rows whose lower-cased category contains any of these substrings
# are dropped before the remaining rows are labelled "Env".
ESC_INSECT_KEYWORDS = {
    "insect", "insects", "cricket", "crickets", "grasshopper",
    "bee", "mosquito", "fly", "flies", "buzz", "cicada"
}

# -------------------------
def ensure_dir(p):
    """Create directory ``p`` and any missing parents; do nothing if it exists."""
    target = Path(p)
    target.mkdir(parents=True, exist_ok=True)

def safe_read_csv(path):
    """Read ``path`` with pandas.

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path is the
            exception's only argument.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(path)

# -------------------------
# LOAD FILES
# -------------------------
ensure_dir(OUTPUT_DIR)
print("Loading datasets...")

# Each read raises FileNotFoundError straight away if its CSV is absent.
df_insect459 = safe_read_csv(INSECT459_CSV)
df_xeno = safe_read_csv(XENO_CSV)
df_esc_meta = safe_read_csv(ESC50_META)
df_esc_feats = safe_read_csv(ESC50_FEATURES_CSV)

print("Loaded shapes:")
for caption, loaded in (
    ("  insect459:", df_insect459),
    ("  xenocanto:", df_xeno),
    ("  ESC meta:", df_esc_meta),
    ("  ESC feats:", df_esc_feats),
):
    print(caption, loaded.shape)

# -------------------------
# CLEAN ESC METADATA
# -------------------------
print("\nCleaning ESC-50...")

# Join key on the metadata side: the lower-cased file name.
meta_fname_col = "filename"
df_esc_meta["file_clean"] = df_esc_meta[meta_fname_col].str.lower()

# Join key on the feature side: lower-cased name minus the "_seg<N>" suffix.
lowered_names = df_esc_feats["file"].str.lower()
df_esc_feats["file_clean"] = lowered_names.str.replace(r"_seg\d+$", "", regex=True)

# Attach each segment's category; segments without a match are dropped.
category_lookup = df_esc_meta[["file_clean", "category"]]
esc_merged = df_esc_feats.merge(category_lookup, how="left", on="file_clean")
esc_merged = esc_merged.dropna(subset=["category"])

esc_merged["category_lc"] = esc_merged["category"].str.lower()


def _mentions_insect(category):
    """Return True if any insect keyword occurs inside ``category``."""
    return any(kw in category for kw in ESC_INSECT_KEYWORDS)


# Keep only the rows whose category does NOT look insect-related.
mask_env = ~esc_merged["category_lc"].apply(_mentions_insect)

esc_env = esc_merged[mask_env].copy()
esc_env["label"] = "Env"

print("ESC cleaned rows kept:", esc_env.shape[0])

esc_env_out = os.path.join(OUTPUT_DIR, "esc50_env_cleaned.csv")
esc_env.to_csv(esc_env_out, index=False)
print("Saved:", esc_env_out)

# -------------------------
# RELABEL INSECT459
# -------------------------
print("\nRelabeling insect459...")

# Keep the original species name next to the collapsed label.
df_insect459["orig_label"] = df_insect459["label"]

# Labels listed in TOP3 stay as they are; everything else becomes "Others".
is_top3 = df_insect459["orig_label"].isin(TOP3)
df_insect459["label"] = df_insect459["orig_label"].where(is_top3, "Others")

print(df_insect459["label"].value_counts())

insect_out = os.path.join(OUTPUT_DIR, "insect459_relabeled.csv")
df_insect459.to_csv(insect_out, index=False)
print("Saved:", insect_out)

# -------------------------
# RELABEL XENOCANTO
# -------------------------
print("\nRelabeling xenocanto...")

def relabel_xeno(row):
    """Map a row to the first TOP3 species named in its file or label.

    The comparison is case-insensitive substring matching against
    ``row["file"]`` and ``row["label"]``. Rows matching no species get
    "Others".
    """
    haystacks = (str(row["file"]).lower(), str(row["label"]).lower())
    for species in TOP3:
        needle = species.lower()
        if any(needle in text for text in haystacks):
            return species
    return "Others"

# Preserve the source label, then overwrite "label" row by row.
df_xeno["orig_label"] = df_xeno["label"]
relabeled = df_xeno.apply(relabel_xeno, axis=1)
df_xeno["label"] = relabeled

xeno_counts = df_xeno["label"].value_counts()
print(xeno_counts)

xeno_out = os.path.join(OUTPUT_DIR, "xeno_relabeled.csv")
df_xeno.to_csv(xeno_out, index=False)
print("Saved:", xeno_out)

# -------------------------
# ALIGN COLUMNS AND MERGE
# -------------------------
print("\nMerging datasets...")

dfs = [df_insect459, df_xeno, esc_env]

# Columns present in every frame, in alphabetical order. Helper columns that
# exist in only some frames (e.g. orig_label, file_clean) drop out here.
shared = set(dfs[0].columns)
shared.intersection_update(*(frame.columns for frame in dfs[1:]))
common_cols = sorted(shared)

print("Common cols:", len(common_cols))

# Restrict every frame to the shared columns, then stack them.
dfs = [frame[common_cols] for frame in dfs]
final = pd.concat(dfs, ignore_index=True)

print("Final merged shape:", final.shape)
print("Final label distribution:\n", final["label"].value_counts())

final_out = os.path.join(OUTPUT_DIR, "final_insect_env_merged.csv")
final.to_csv(final_out, index=False)
print("Saved final merged dataset:", final_out)

print("\nDONE.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier

# ----------------------------
# CONFIG
# ----------------------------
CSV_PATH = r"D:\Projects\Minor\Audio\InsectsXeno\Audio\merged_output\final_insect_env_merged.csv"
N_FOLDS = 5
SEED = 42

# ----------------------------
# LOAD DATA
# ----------------------------
df = pd.read_csv(CSV_PATH)
print("Loaded:", df.shape)

# Target column first, then every remaining column as a feature; "file" is
# dropped too when present.
y = df["label"]
X = df.drop(columns=["label", "file"], errors="ignore")

# Integer-encode the class names; le.classes_ maps the codes back to names.
le = LabelEncoder()
y_enc = le.fit(y).transform(y)

print("Classes:", list(le.classes_))

# ----------------------------
# K-Fold Cross-Validation
# ----------------------------
# NOTE(review): the split works on rows, not recordings. Since rows are
# per-segment ("<file>_seg<N>"), segments of one recording can end up in both
# train and test folds - confirm whether a group-aware split is wanted.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

fold_accuracies = []

# The same hyper-parameters are used for every fold.
lgbm_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "num_leaves": 55,
    "max_depth": -1,
    "random_state": SEED,
    "class_weight": "balanced",  # important for imbalance
}

print("\n========== CROSS VALIDATION ==========")

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y_enc), start=1):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y_enc[train_idx]
    y_test = y_enc[test_idx]

    # A fresh model per fold, so nothing carries over between folds.
    model = LGBMClassifier(**lgbm_params)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    fold_accuracies.append(acc)

    print(f"\n--- Fold {fold} Accuracy: {acc:.4f} ---")
    print(classification_report(y_test, preds, target_names=le.classes_))

print("\n========== FINAL RESULTS ==========")
print("Fold Accuracies:", np.round(fold_accuracies, 4))
print("Mean Accuracy:", np.mean(fold_accuracies))


In [None]:
pip install lightgbm

In [None]:
"""
Ultra-fast Mel Extractor v3 (Fixed for nested folders + Windows)
 - GPU batched mel-spectrogram extraction
 - Multiprocess audio loading + segmentation
 - Correctly detects ALL nested subfolder audio files
 - Save .pt tensors per segment (resume-safe)
Requirements:
  pip install torch torchaudio soundfile pydub tqdm
  (ffmpeg required for non-wav files)
"""

import os
import math
import time
import glob
import random
import argparse
import multiprocessing as mp
from pathlib import Path
from functools import partial
from collections import Counter

import numpy as np
import torch
import torchaudio
import soundfile as sf
from pydub import AudioSegment
from tqdm import tqdm


# ---------------- CONFIG ----------------
# Root folders that are walked recursively for audio files.
INPUT_FOLDERS = [
    r"E:\Insect459\Train\Train",
    r"D:\Projects\Minor\Audio\InsectsXeno\Audio",
    r"C:\Users\DEV\OneDrive\Desktop\Macaulay\Macaulary\macaulay_categorized",
]

# Spectrograms are written below OUTPUT_DIR/spectrograms/<label>/.
OUTPUT_DIR = r"E:\MelSpectros\fast_mels_v3"

# Mel-spectrogram parameters.
SAMPLE_RATE = 22050
N_MELS = 128
N_FFT = 2048
HOP_LENGTH = 512
FMIN = 50
FMAX = 12000

# Segmentation and batching.
SEGMENT_DURATION = 3.0
NUM_SEGMENTS = 3
BATCH_SIZE = 128

USE_FP16 = True
ENABLE_LOGMEL = True
NORMALIZE_PER_SPEC = True
# Leave one core free, but never drop to zero: Pool rejects processes=0,
# which is what `cpu_count() - 1` gives on a single-core machine.
NUM_WORKERS = max(1, min(8, mp.cpu_count() - 1))
ALLOWED_EXTS = (".wav", ".mp3", ".flac", ".m4a", ".ogg", ".aiff", ".aif")
# ----------------------------------------

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision is only used when running on CUDA.
DTYPE = torch.float16 if (USE_FP16 and DEVICE.type == "cuda") else torch.float32


# ------- FIXED: Nested-folder collector -------
def collect_files(input_folders, allowed_exts):
    """Recursively list audio files below each input folder.

    Args:
        input_folders: Folders to walk. Entries that are not directories are
            reported with a warning and skipped.
        allowed_exts: File extensions to accept, compared case-insensitively.

    Returns:
        A list of ``(absolute_folder, path_relative_to_that_folder)`` tuples.
    """
    suffixes = tuple(ext.lower() for ext in allowed_exts)
    found = []

    for folder in input_folders:
        base = os.path.abspath(folder)

        if os.path.isdir(base):
            for dirpath, _subdirs, names in os.walk(base):
                found.extend(
                    (base, os.path.relpath(os.path.join(dirpath, name), base))
                    for name in names
                    if name.lower().endswith(suffixes)
                )
        else:
            print(f"[WARNING] Not a directory: {base}")

    return found


# ---------------- AUDIO LOADING ----------------
def _resample_mono(samples, orig_sr, target_sr):
    """Resample a 1-D float array to ``target_sr``; no-op when rates match."""
    if orig_sr == target_sr:
        return samples
    wav = torch.from_numpy(samples).unsqueeze(0)
    wav = torchaudio.transforms.Resample(orig_sr, target_sr)(wav)
    return wav.squeeze(0).numpy()


def load_audio_file(path, target_sr):
    """Load ``path`` as mono float32 audio at ``target_sr``.

    Three decoders are tried in order: soundfile, torchaudio, then pydub.
    A failure in one simply moves on to the next; only a failure of the last
    one (pydub) propagates to the caller.

    Args:
        path: Audio file to read.
        target_sr: Sample rate of the returned signal.

    Returns:
        Tuple ``(samples, target_sr)`` with ``samples`` a float32 array.
    """
    # 1) soundfile
    try:
        y, sr = sf.read(path, dtype='float32')
        if y.ndim > 1:
            # Average the channels (axis 1) down to mono.
            y = y.mean(axis=1)
        y = _resample_mono(y, sr, target_sr)
        return y.astype(np.float32), target_sr
    except Exception:
        pass  # deliberate: fall through to the next decoder

    # 2) torchaudio
    try:
        wav, sr = torchaudio.load(path)
        wav = wav.mean(dim=0).numpy() if wav.ndim > 1 else wav.numpy()
        wav = _resample_mono(wav, sr, target_sr)
        return wav.astype(np.float32), target_sr
    except Exception:
        pass  # deliberate: fall through to the pydub decoder

    # 3) pydub: convert to mono 16-bit at the target rate, then scale the
    # integer samples by 2**15.
    audio = AudioSegment.from_file(path)
    audio = audio.set_frame_rate(target_sr).set_channels(1).set_sample_width(2)
    samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15)
    return samples.astype(np.float32), target_sr


# ---------------- SEGMENT LOGIC ----------------
def generate_segment_starts(total_len, seg_len, num_segments):
    """Pick distinct random start offsets for fixed-length segments.

    Args:
        total_len: Number of samples in the signal.
        seg_len: Number of samples per segment.
        num_segments: Number of segments wanted.

    Returns:
        ``[0]`` when the signal is no longer than one segment. Otherwise a
        sorted list of distinct offsets in ``[0, total_len - seg_len]``. The
        list is shorter than ``num_segments`` when fewer distinct offsets
        exist, and empty when ``num_segments`` is zero or negative.
    """
    if total_len <= seg_len:
        return [0]

    max_start = total_len - seg_len

    if num_segments == 1:
        return [random.randint(0, max_start)]

    # Only max_start + 1 distinct offsets exist. Asking a draw-until-unique
    # loop for more than that would never terminate, so cap the request.
    count = max(0, min(num_segments, max_start + 1))
    return sorted(random.sample(range(max_start + 1), count))


# ------------ WORKER: load + segment --------------
def worker_load_and_segment(root_rel, seg_len_samples, num_segments, sample_rate):
    """Load one audio file and cut it into fixed-length segments.

    Args:
        root_rel: ``(root_folder, relative_path)`` pair naming the file.
        seg_len_samples: Segment length in samples.
        num_segments: Number of segments requested per file.
        sample_rate: Sample rate the audio is loaded at.

    Returns:
        A list of ``(label, basename, key, samples)`` tuples, one per segment,
        where ``key`` is ``"<basename>_seg<N>"`` and ``samples`` is float32.
        The list is empty when the file cannot be loaded or has no samples.
    """
    root, rel_path = root_rel

    # NOTE(review): the label is the name of the input root folder, not of
    # the file's immediate parent - confirm this is the intended class.
    label = os.path.basename(root)
    # NOTE(review): keys are built from the file stem only, so files with the
    # same stem in different subfolders share a key - verify against caller.
    basename = Path(rel_path).stem

    try:
        y, sr = load_audio_file(os.path.join(root, rel_path), sample_rate)
    except Exception:
        return []

    if len(y) == 0:
        return []

    segments = []
    starts = generate_segment_starts(len(y), seg_len_samples, num_segments)

    for number, begin in enumerate(starts, start=1):
        chunk = y[begin:begin + seg_len_samples]
        shortfall = seg_len_samples - len(chunk)
        if shortfall > 0:
            # Zero-pad a short slice up to the full segment length.
            chunk = np.pad(chunk, (0, shortfall))
        segments.append(
            (label, basename, f"{basename}_seg{number}", chunk.astype(np.float32))
        )

    return segments


# ---------- SAVE TENSOR ----------
def save_tensor(tensor, out_path):
    """Save ``tensor`` (moved to CPU) at ``out_path``, creating parent folders.

    Args:
        tensor: Tensor to store.
        out_path: Destination file path; a bare file name is written to the
            current working directory.
    """
    parent = os.path.dirname(out_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    if parent:
        os.makedirs(parent, exist_ok=True)
    torch.save(tensor.cpu(), out_path)


# ---------- PROCESS & SAVE BATCH ----------
def process_and_save_batch(batch_seg_arrays, batch_meta, mel_transform, amp_to_db, spect_dir):
    """Turn a batch of segments into mel-spectrograms and save one file each.

    Args:
        batch_seg_arrays: Equal-length sample arrays, one per segment.
        batch_meta: ``(label, basename, key)`` tuples aligned with the arrays.
        mel_transform: Callable applied to the stacked waveform batch.
        amp_to_db: Callable applied to the result when ENABLE_LOGMEL is set.
        spect_dir: Path-like root; files go to ``spect_dir / label / key.pt``.
    """
    stacked = np.stack(batch_seg_arrays)
    # unsqueeze(1) inserts a singleton dimension after the batch dimension.
    wav_batch = torch.from_numpy(stacked).to(DEVICE).unsqueeze(1)

    with torch.no_grad():
        mels = mel_transform(wav_batch)
        if ENABLE_LOGMEL:
            mels = amp_to_db(mels)
        mels = mels.to(DTYPE)

    for i, (label, _basename, key) in enumerate(batch_meta):
        out_dir = spect_dir / label
        out_dir.mkdir(parents=True, exist_ok=True)

        spec = mels[i]

        if NORMALIZE_PER_SPEC:
            # Min-max scale each spectrogram on its own; a (near-)constant
            # spectrogram is left untouched.
            lo = float(spec.min().cpu())
            hi = float(spec.max().cpu())
            span = hi - lo
            if span > 1e-6:
                spec = (spec - lo) / span

        save_tensor(spec, str(out_dir / f"{key}.pt"))


# =================== MAIN ===================
def main():
    """Run the extraction: collect files, segment them in worker processes,
    compute mel-spectrograms in batches and save one ``.pt`` file per segment.

    Segments whose output file already exists are skipped, so an interrupted
    run can be restarted. The worker pool and the progress bar are released
    even when processing fails part-way through.
    """
    print("Device:", DEVICE, "dtype:", DTYPE)

    seg_len_samples = int(SEGMENT_DURATION * SAMPLE_RATE)
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        f_min=FMIN,
        # Never ask for frequencies above Nyquist.
        f_max=min(FMAX, SAMPLE_RATE // 2),
        power=2.0,
        norm='slaney',
        mel_scale='htk'
    ).to(DEVICE)

    amp_to_db = torchaudio.transforms.AmplitudeToDB(stype='power').to(DEVICE)

    file_list = collect_files(INPUT_FOLDERS, ALLOWED_EXTS)

    print("Files found:", len(file_list))
    print("Estimated segments:", len(file_list) * NUM_SEGMENTS)

    if not file_list:
        print("No audio files found!")
        return

    # -------- Make output dirs --------
    spect_dir = Path(OUTPUT_DIR) / "spectrograms"
    spect_dir.mkdir(parents=True, exist_ok=True)

    loader_fn = partial(
        worker_load_and_segment,
        seg_len_samples=seg_len_samples,
        num_segments=NUM_SEGMENTS,
        sample_rate=SAMPLE_RATE
    )

    batch_seg_arrays = []
    batch_meta = []
    processed = 0
    failed = 0

    # -------- Windows-safe multiprocessing --------
    ctx = mp.get_context("spawn")
    # Pool rejects a worker count below 1.
    pool = ctx.Pool(processes=max(1, NUM_WORKERS))

    try:
        with tqdm(total=len(file_list), desc="Loading & segmenting files") as pbar:
            for file_segments in pool.imap_unordered(loader_fn, file_list, chunksize=1):
                pbar.update(1)

                # An empty result means the file failed to load or was empty.
                if not file_segments:
                    failed += 1
                    continue

                for label, basename, key, seg in file_segments:

                    # Resume support: skip segments that were already saved.
                    out_path = spect_dir / label / f"{key}.pt"
                    if out_path.exists():
                        continue

                    batch_seg_arrays.append(seg)
                    batch_meta.append((label, basename, key))

                    if len(batch_seg_arrays) >= BATCH_SIZE:
                        process_and_save_batch(batch_seg_arrays, batch_meta, mel_transform, amp_to_db, spect_dir)
                        processed += len(batch_seg_arrays)
                        batch_seg_arrays = []
                        batch_meta = []
        pool.close()
    except BaseException:
        # Stop the workers at once rather than leaving them running.
        pool.terminate()
        raise
    finally:
        # join() is valid here: close() or terminate() has always run first.
        pool.join()

    # Flush the last, partially filled batch.
    if batch_seg_arrays:
        process_and_save_batch(batch_seg_arrays, batch_meta, mel_transform, amp_to_db, spect_dir)
        processed += len(batch_seg_arrays)

    print(f"\nDone. Processed: {processed} segments, Failed files: {failed}")
    print("Saved to:", spect_dir)


if __name__ == "__main__":
    # Called before any worker process is started.
    mp.freeze_support()
    start = time.time()
    main()
    elapsed = time.time() - start
    print("Total time (s):", elapsed)
