In [1]:
import os
import pandas as pd
import soundfile as sf
import numpy as np
import librosa
from scipy.stats import gmean
from tqdm.notebook import tqdm
import warnings
import gc

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# ==================== PARAMETERS ====================
# Aggregation window geometry, counted in STFT frames (rows of the
# per-frame feature matrix).
WIN_LEN = 20
HOP_LEN = 12

# Audio / STFT settings.
SAMPLE_RATE = 16000   # target sample rate in Hz
HOP_LENGTH = 160      # STFT hop, in samples
N_FFT = 512           # FFT size, in samples

# FFT-bin indices bounding the 2-3 kHz band: bin = freq * N_FFT / SAMPLE_RATE.
BIN_MIN = int(2000 * N_FFT / SAMPLE_RATE)      # 64
BIN_MAX = int(3000 * N_FFT / SAMPLE_RATE)      # 96

# Valid fault labels (same as in the mel dataset): MF1-MF4, PC1-PC4 and 'N'.
VALID_FAULTS = {'N'} | {
    f"{prefix}{level}" for prefix in ('MF', 'PC') for level in range(1, 5)
}

def parse_filename(filename):
    """
    Parse a recording file name into (model_type, maneuvering_direction, fault).

    Expected name format: MODEL_MANEUVER_FAULT_... .wav

    The ``.wav`` extension is removed case-insensitively and only from the
    end of the name, so ``X.WAV`` (which the folder scan in this file also
    accepts) parses the same way as ``x.wav``.

    Args:
        filename: Bare file name, without a directory part.

    Returns:
        A tuple of three strings. All three are ``'unknown'`` when the name
        has fewer than three ``_``-separated parts. ``fault`` alone is
        ``'unknown'`` when no part from the third one onward is a member of
        ``VALID_FAULTS``.
    """
    stem, ext = os.path.splitext(filename)
    # Strip only a genuine .wav suffix. For an extension-less name such as
    # "..._snr=10.5" splitext would report ".5" as the extension; keep the
    # whole name in that case.
    basename = stem if ext.lower() == '.wav' else filename
    parts = basename.split('_')

    if len(parts) < 3:
        return 'unknown', 'unknown', 'unknown'

    model_type, maneuvering_direction = parts[0], parts[1]
    # The fault label is normally the third part; if it is not there, take
    # the first valid label found further along the name.
    fault = next((part for part in parts[2:] if part in VALID_FAULTS), 'unknown')
    return model_type, maneuvering_direction, fault

def extract_frame_features(y, sr, band_hz=(2000.0, 3000.0)):
    """Extract eight per-frame features from an audio signal.

    The signal is transformed with an STFT (``N_FFT`` / ``HOP_LENGTH``), and
    narrow-band features are taken from the ``band_hz`` frequency range while
    the remaining features use the whole signal/spectrum.

    Args:
        y: 1-D audio signal.
        sr: Sample rate of ``y`` in Hz. The band's bin range is derived from
            this value, so it stays consistent with the bin-to-Hz conversion
            used for the peak frequency.
        band_hz: ``(low, high)`` band limits in Hz; the upper bin is
            exclusive. With the default and sr=16000, N_FFT=512 this selects
            bins 64..95, the same range as ``BIN_MIN``/``BIN_MAX``.

    Returns:
        Array of shape ``(n_frames, 8)`` with columns, in order:
        band energy, band spectral flatness, band peak frequency (Hz),
        zero-crossing rate, spectral centroid, spectral bandwidth,
        spectral roll-off (85 %), RMS.

    Raises:
        ValueError: If the requested band contains no FFT bins for this
            ``sr`` (for example when it lies above the Nyquist frequency).
    """
    # Magnitude spectrogram is computed once and shared by all spectral features.
    magnitude = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH))
    power_spec = magnitude ** 2

    # Narrow-band features (2-3 kHz by default).
    hz_per_bin = sr / N_FFT
    bin_lo = int(band_hz[0] / hz_per_bin)
    bin_hi = int(band_hz[1] / hz_per_bin)
    band_spec = power_spec[bin_lo:bin_hi, :]
    if band_spec.shape[0] == 0:
        raise ValueError(
            f"Band {band_hz} Hz contains no FFT bins at sr={sr}, n_fft={N_FFT}"
        )
    band_energy = np.sum(band_spec, axis=0)

    # Flatness = geometric mean / arithmetic mean; the small offset keeps the
    # geometric mean defined for zero-power bins and avoids division by zero.
    band_offset = band_spec + 1e-10
    band_flatness = gmean(band_offset, axis=0) / np.mean(band_offset, axis=0)

    # argmax is relative to the band slice, so shift back to absolute bins.
    peak_bins = np.argmax(band_spec, axis=0) + bin_lo
    band_peak_freq = peak_bins * hz_per_bin

    # Wide-band features.
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=N_FFT, hop_length=HOP_LENGTH)[0]
    centroid = librosa.feature.spectral_centroid(S=magnitude, sr=sr)[0]
    bandwidth = librosa.feature.spectral_bandwidth(S=magnitude, sr=sr)[0]
    rolloff = librosa.feature.spectral_rolloff(S=magnitude, sr=sr, roll_percent=0.85)[0]
    rms = librosa.feature.rms(y=y, frame_length=N_FFT, hop_length=HOP_LENGTH)[0]

    # Truncate every series to the shortest one so they can be stacked.
    tracks = [band_energy, band_flatness, band_peak_freq,
              zcr, centroid, bandwidth, rolloff, rms]
    min_len = min(len(band_energy), len(zcr), len(centroid),
                  len(bandwidth), len(rolloff), len(rms))

    return np.vstack([track[:min_len] for track in tracks]).T

def split_and_aggregate(frame_features, win_len=WIN_LEN, hop_len=HOP_LEN):
    """Split per-frame features into sliding windows and compute statistics.

    Each window of ``win_len`` consecutive frames is reduced to the
    per-feature mean, std, max and min, concatenated in that order.
    Input shorter than ``win_len`` is zero-padded at the end to exactly one
    window. Trailing frames that do not fill a whole window are dropped.

    Args:
        frame_features: 2-D array of shape ``(n_frames, n_features)``. The
            returned column names assume the 8 features listed in
            ``base_names`` below, in that order.
        win_len: Window length in frames; must be positive.
        hop_len: Step between window starts in frames; must be positive.

    Returns:
        ``(windows_stats, col_names)`` where ``windows_stats`` is a list of
        1-D arrays (one per window, 4 * n_features values each) and
        ``col_names`` is the list of 32 names ``custom_<feature>_<stat>``.

    Raises:
        ValueError: If ``win_len`` or ``hop_len`` is not positive. A
            non-positive hop would otherwise never advance the window.
    """
    if win_len <= 0 or hop_len <= 0:
        raise ValueError(
            f"win_len and hop_len must be positive, got win_len={win_len}, hop_len={hop_len}"
        )

    n_frames = frame_features.shape[0]
    if n_frames < win_len:
        # Zero-pad along the frame axis only, up to a single full window.
        frame_features = np.pad(
            frame_features, ((0, win_len - n_frames), (0, 0)), mode='constant'
        )
        n_frames = win_len

    base_names = ['band_energy', 'band_flatness', 'band_freq', 'zcr',
                  'centroid', 'bandwidth', 'rolloff', 'rms']

    windows_stats = []
    for start in range(0, n_frames - win_len + 1, hop_len):
        window = frame_features[start:start + win_len]
        windows_stats.append(np.concatenate([
            np.mean(window, axis=0),
            np.std(window, axis=0),
            np.max(window, axis=0),
            np.min(window, axis=0),
        ]))

    # Statistic-major order, matching the concatenation above.
    col_names = [
        f"custom_{name}_{stat}"
        for stat in ['mean', 'std', 'max', 'min']
        for name in base_names
    ]

    return windows_stats, col_names

def process_file_custom(audio_path):
    """Process one audio file: extract windowed features and attach metadata.

    Args:
        audio_path: Path to the audio file to read.

    Returns:
        ``(df, None)`` on success, where ``df`` has one row per window with
        the feature columns plus ``filename``, ``window_id``, ``model_type``,
        ``maneuvering_direction`` and ``fault``.
        ``(None, message)`` on failure; any exception raised while reading or
        processing the file is caught and returned as its string form.
    """
    try:
        samples, native_sr = sf.read(audio_path)

        # Multi-channel input is down-mixed by averaging over the second axis.
        if samples.ndim > 1:
            samples = np.mean(samples, axis=1)

        # Bring everything to the common sample rate before feature extraction.
        if native_sr != SAMPLE_RATE:
            samples = librosa.resample(samples, orig_sr=native_sr, target_sr=SAMPLE_RATE)

        per_frame = extract_frame_features(samples, SAMPLE_RATE)
        window_rows, columns = split_and_aggregate(per_frame)

        # Labels are parsed from the file name.
        filename = os.path.basename(audio_path)
        model_type, maneuvering_direction, fault = parse_filename(filename)

        if not window_rows:
            return None, "–ù–µ —É–¥–∞–ª–æ—Å—å —Å–æ–∑–¥–∞—Ç—å –æ–∫–Ω–∞"

        table = pd.DataFrame(window_rows, columns=columns)
        table['filename'] = filename
        table['window_id'] = range(len(table))  # ordinal number of the window within the file
        table['model_type'] = model_type
        table['maneuvering_direction'] = maneuvering_direction
        table['fault'] = fault
        return table, None

    except Exception as exc:
        return None, str(exc)

def process_single_folder(folder_path, folder_id, output_dir):
    """Process one folder of WAV files, using a per-folder parquet cache.

    Args:
        folder_path: Directory that is scanned (non-recursively) for files
            whose name ends in ``.wav``, case-insensitively.
        folder_id: Identifier used for the cache file name and in messages.
        output_dir: Root output directory; the cache lives in
            ``<output_dir>/by_folder/<folder_id>.parquet``.

    Returns:
        ``(df, bad_files)``. ``df`` is the concatenated per-window DataFrame
        for the folder, or ``None`` when the folder has no WAV files or no
        file could be processed. ``bad_files`` is a list of dicts with keys
        ``filename``, ``folder`` and ``error``; it is empty when the result
        was loaded from the cache.
    """
    cache_dir = os.path.join(output_dir, "by_folder")
    cache_file = os.path.join(cache_dir, f"{folder_id}.parquet")

    # Already processed: load from the cache.
    if os.path.exists(cache_file):
        print(f"‚è≠Ô∏è –ü–∞–ø–∫–∞ {folder_id} —É–∂–µ –æ–±—Ä–∞–±–æ—Ç–∞–Ω–∞. –ó–∞–≥—Ä—É–∂–∞–µ–º –∏–∑ –∫—ç—à–∞...")
        return pd.read_parquet(cache_file), []

    print(f"\nüìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: {folder_id}")

    # Sorted so the row order in the cache does not depend on the arbitrary
    # order in which os.listdir returns entries.
    wav_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.wav'))
    if not wav_files:
        print("   ‚ö†Ô∏è –ù–µ—Ç WAV —Ñ–∞–π–ª–æ–≤.")
        return None, []

    folder_dfs = []
    folder_bad_files = []

    for wav_file in tqdm(wav_files, desc=f"–§–∞–π–ª—ã –≤ {folder_id}", leave=False):
        full_path = os.path.join(folder_path, wav_file)
        df, error_msg = process_file_custom(full_path)

        if df is not None:
            folder_dfs.append(df)
        else:
            folder_bad_files.append({"filename": wav_file, "folder": folder_id, "error": error_msg})

    if not folder_dfs:
        print("   ‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –∏–∑–≤–ª–µ—á—å –ø—Ä–∏–∑–Ω–∞–∫–∏ –∏–∑ —Ñ–∞–π–ª–æ–≤ –ø–∞–ø–∫–∏.")
        return None, folder_bad_files

    folder_combined_df = pd.concat(folder_dfs, ignore_index=True)

    # Do not rely on the caller having created the cache directory.
    os.makedirs(cache_dir, exist_ok=True)
    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated file under the name the cache check trusts.
    tmp_file = cache_file + ".tmp"
    folder_combined_df.to_parquet(tmp_file, engine='pyarrow', compression='snappy', index=False)
    os.replace(tmp_file, cache_file)

    print(f"   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: {len(folder_combined_df)} –æ–∫–æ–Ω.")
    return folder_combined_df, folder_bad_files

# ==================== MAIN BLOCK ====================
# Walks <audio_folder>/<A|B|C>/<train|valid|test>/<mic1|mic2>, extracts the
# windowed features folder by folder (each folder is cached separately) and
# writes one combined parquet file at the end.
audio_folder = r"C:\Users\–ï5\Documents\olesya\vkr\drone"
output_dir = os.path.join(audio_folder, "features_win20_hop12_stft_feat32")

# Create the output directory structure.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(os.path.join(output_dir, "by_folder"), exist_ok=True)
os.makedirs(os.path.join(output_dir, "combined"), exist_ok=True)

all_dfs = []
all_bad_files = []

# Collect every folder to process (same layout as in the mel script).
folders_to_process = []
for main_folder in ['A', 'B', 'C']:
    for data_split in ['train', 'valid', 'test']:
        for mic_folder in ['mic1', 'mic2']:
            mic_path = os.path.join(audio_folder, main_folder, data_split, mic_folder)
            # isdir is False when any parent level is missing, and also
            # rejects a plain file that happens to carry a folder name.
            if not os.path.isdir(mic_path):
                continue

            folders_to_process.append({
                'path': mic_path,
                'id': f"{main_folder}_{data_split}_{mic_folder}",
            })

print(f"üöÄ –ù–∞–π–¥–µ–Ω–æ –ø–∞–ø–æ–∫ –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏: {len(folders_to_process)}")

# Process each folder.
for folder_info in folders_to_process:
    df, bad_files = process_single_folder(folder_info['path'], folder_info['id'], output_dir)

    if df is not None:
        all_dfs.append(df)
    if bad_files:
        all_bad_files.extend(bad_files)

    gc.collect()

# Final concatenation.
if all_dfs:
    print("\nüîÑ –°–æ–∑–¥–∞–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –æ–±—ä–µ–¥–∏–Ω–µ–Ω–Ω–æ–≥–æ —Ñ–∞–π–ª–∞...")
    final_combined_df = pd.concat(all_dfs, ignore_index=True)

    out_file = os.path.join(output_dir, "combined", "custom_data_v1.parquet")
    final_combined_df.to_parquet(out_file, engine='pyarrow', compression='snappy', index=False)

    print("‚úÖ –£–°–ü–ï–®–ù–û –ó–ê–í–ï–†–®–ï–ù–û!")
    print(f"üíæ –ò—Ç–æ–≥–æ–≤—ã–π —Ä–∞–∑–º–µ—Ä: {len(final_combined_df)} –æ–∫–æ–Ω.")
    print(f"üìÅ –ü—É—Ç—å: {out_file}")
else:
    print("‚ùå –ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –æ–±—ä–µ–¥–∏–Ω–µ–Ω–∏—è.")

# Report unreadable files regardless of whether anything was combined: when
# every file failed, this list is the only diagnostic available.
if all_bad_files:
    print(f"\n‚ö†Ô∏è –í–Ω–∏–º–∞–Ω–∏–µ! –ù–∞–π–¥–µ–Ω–æ –±–∏—Ç—ã—Ö —Ñ–∞–π–ª–æ–≤: {len(all_bad_files)}")
    # Only the first five entries are shown, each error cut to 50 characters.
    for bf in all_bad_files[:5]:
        print(f" - [{bf['folder']}] {bf['filename']}: {bf['error'][:50]}...")

üöÄ –ù–∞–π–¥–µ–Ω–æ –ø–∞–ø–æ–∫ –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏: 18

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: A_train_mic1


–§–∞–π–ª—ã –≤ A_train_mic1:   0%|          | 0/32400 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 97200 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: A_train_mic2


–§–∞–π–ª—ã –≤ A_train_mic2:   0%|          | 0/32400 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 97200 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: A_valid_mic1


–§–∞–π–ª—ã –≤ A_valid_mic1:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32328 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: A_valid_mic2


–§–∞–π–ª—ã –≤ A_valid_mic2:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32400 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: A_test_mic1


–§–∞–π–ª—ã –≤ A_test_mic1:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32319 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: A_test_mic2


–§–∞–π–ª—ã –≤ A_test_mic2:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32388 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: B_train_mic1


–§–∞–π–ª—ã –≤ B_train_mic1:   0%|          | 0/32400 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 97200 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: B_train_mic2


–§–∞–π–ª—ã –≤ B_train_mic2:   0%|          | 0/32400 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 97200 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: B_valid_mic1


–§–∞–π–ª—ã –≤ B_valid_mic1:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32358 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: B_valid_mic2


–§–∞–π–ª—ã –≤ B_valid_mic2:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32358 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: B_test_mic1


–§–∞–π–ª—ã –≤ B_test_mic1:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32379 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: B_test_mic2


–§–∞–π–ª—ã –≤ B_test_mic2:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32391 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: C_train_mic1


–§–∞–π–ª—ã –≤ C_train_mic1:   0%|          | 0/32400 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 97200 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: C_train_mic2


–§–∞–π–ª—ã –≤ C_train_mic2:   0%|          | 0/32400 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 97200 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: C_valid_mic1


–§–∞–π–ª—ã –≤ C_valid_mic1:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32400 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: C_valid_mic2


–§–∞–π–ª—ã –≤ C_valid_mic2:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32400 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: C_test_mic1


–§–∞–π–ª—ã –≤ C_test_mic1:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32400 –æ–∫–æ–Ω.

üìÅ –û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–∞–ø–∫–∏: C_test_mic2


–§–∞–π–ª—ã –≤ C_test_mic2:   0%|          | 0/10800 [00:00<?, ?it/s]

   ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ –∫—ç—à: 32400 –æ–∫–æ–Ω.

üîÑ –°–æ–∑–¥–∞–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ –æ–±—ä–µ–¥–∏–Ω–µ–Ω–Ω–æ–≥–æ —Ñ–∞–π–ª–∞...
‚úÖ –£–°–ü–ï–®–ù–û –ó–ê–í–ï–†–®–ï–ù–û!
üíæ –ò—Ç–æ–≥–æ–≤—ã–π —Ä–∞–∑–º–µ—Ä: 971721 –æ–∫–æ–Ω.
üìÅ –ü—É—Ç—å: C:\Users\–ï5\Documents\olesya\vkr\drone\features_win20_hop12_stft_feat32\combined\custom_data_v1.parquet

‚ö†Ô∏è –í–Ω–∏–º–∞–Ω–∏–µ! –ù–∞–π–¥–µ–Ω–æ –±–∏—Ç—ã—Ö —Ñ–∞–π–ª–æ–≤: 93
 - [A_valid_mic1] A_B_MF2_788_WestDoor_728_snr=10.737970939546484.wav: Internal psf_fseek() failed....
 - [A_valid_mic1] A_B_MF3_719_SportsComplex_361_snr=12.020708415650434.wav: Error : unknown error in flac decoder....
 - [A_valid_mic1] A_B_MF3_768_ConstructionSite_286_snr=10.842793445321204.wav: Error : unknown error in flac decoder....
 - [A_valid_mic1] A_B_PC2_731_ConstructionSite_253_snr=11.166238417763065.wav: Error : unknown error in flac decoder....
 - [A_valid_mic1] A_B_PC4_651_WestDoor_535_snr=11.978523582907995.wav: Internal psf_fseek() failed....
