In [1]:
import pandas as pd
from pathlib import Path
import soundfile as sf
import subprocess
from tqdm import tqdm
import os
import re
import math
from sklearn.model_selection import train_test_split # Use simple random split
import numpy as np
import uuid # For unique IDs
import shutil # For moving final files

# --- Configuration ---
# Everything a reader might tune lives in this cell: the target audio format,
# the segmentation window, and the directory layout used by the later cells.

# Audio Processing Settings
TARGET_SAMPLE_RATE = 16000
TARGET_FORMAT = "wav"

# Segmentation Settings (all values in seconds)
SEGMENT_DURATION_S = 20      # length of one full segment
MIN_SEGMENT_DURATION_S = 2   # anything shorter is discarded
OVERLAP_S = 1                # overlap between consecutive segments

# Directory layout, resolved relative to the current working directory
root_dir = Path('../')
data_dir = root_dir / 'data'

# Audio is processed in place inside 'combined', one sub-folder per class
combined_base_dir = data_dir / 'combined'
dementia_dir = combined_base_dir / 'dementia'
nodem_dir = combined_base_dir / 'nodementia'

# Scratch space for intermediate files
temp_dir = combined_base_dir / 'temp_processing'

# Final metadata splits are saved here
metadata_dir = data_dir / 'processed'

# --- Ensure Directories Exist ---
for _required_dir in (dementia_dir, nodem_dir, temp_dir, metadata_dir):
    _required_dir.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration, one setting per line
print(
    f"Working within Base Dir: {combined_base_dir}",
    f"Temporary Processing Dir: {temp_dir}",
    f"Final Metadata Dir: {metadata_dir}",
    f"Target Segment Duration: {SEGMENT_DURATION_S}s",
    f"Minimum Segment Duration: {MIN_SEGMENT_DURATION_S}s",
    f"Segment Overlap: {OVERLAP_S}s",
    sep="\n",
)

Working within Base Dir: ..\data\combined
Temporary Processing Dir: ..\data\combined\temp_processing
Final Metadata Dir: ..\data\processed
Target Segment Duration: 20s
Minimum Segment Duration: 2s
Segment Overlap: 1s


In [2]:
# # Cell 2: Cleanup Non-Audio Files

# print("\n--- Cleaning up non-audio files ---")
# allowed_extensions = ['.wav', '.mp3', '.flac']
# files_deleted = 0

# for directory in [dementia_dir, nodem_dir]:
#     print(f"Scanning {directory}...")
#     if not directory.is_dir():
#         print(f"Warning: Directory not found: {directory}")
#         continue
#     for item_path in list(directory.iterdir()): # Use list to avoid issues while deleting
#         if item_path.is_file():
#             if item_path.suffix.lower() not in allowed_extensions:
#                 print(f"  Deleting non-audio file: {item_path.name}")
#                 try:
#                     item_path.unlink()
#                     files_deleted += 1
#                 except Exception as e:
#                     print(f"    Error deleting {item_path.name}: {e}")
#         # Optional: remove empty subdirectories if any exist? For now, only files.

# print(f"\nCleanup complete. Deleted {files_deleted} non-audio files.")

In [3]:
# # Cell 3: Standardize, Segment, Replace/Delete Originals
# #
# # NOTE(review): this cell is destructive and works in place. Each duration branch
# # below either calls input_path.unlink() on the original or overwrites it via
# # shutil.move(), so re-enable it only with a backup of the 'combined' folder.
# # On any exception the original file is kept as-is (segments already written for
# # it are not rolled back), so such files stay in their source format and are
# # still matched by the later '*.wav' globs if they are .wav files.

# print("\n--- Standardizing format and segmenting long files ---")

# all_processed_paths = [] # Keep track of final files generated in this step
# all_errors = []
# files_processed_count = 0
# segments_created_count = 0
# short_files_deleted_count = 0
# long_files_replaced_count = 0
# valid_short_files_kept_count = 0


# # Use a temporary location for ffmpeg output before segmentation/moving
# temp_dir.mkdir(parents=True, exist_ok=True)

# for directory in [dementia_dir, nodem_dir]:
#     print(f"\nProcessing directory: {directory}")
#     if not directory.is_dir():
#         print(f"Warning: Directory not found: {directory}")
#         continue

#     # Get list of current audio files before processing starts
#     current_audio_files = [p for p in directory.iterdir() if p.is_file() and p.suffix.lower() in ['.wav', '.mp3', '.flac']]

#     for input_path in tqdm(current_audio_files, desc=f"Processing {directory.name}"):
#         files_processed_count += 1
#         original_name_stem = input_path.stem
#         original_suffix = input_path.suffix
#         temp_standardized_wav_path = temp_dir / f"{original_name_stem}_tempstandard.wav"
#         processed_successfully = False

#         try:
#             # 1. Standardize to 16kHz mono WAV in temp dir
#             convert_command = [
#                 "ffmpeg", "-i", str(input_path),
#                 "-ar", str(TARGET_SAMPLE_RATE), "-ac", "1",
#                 "-vn", "-loglevel", "error", "-y",
#                 str(temp_standardized_wav_path)
#             ]
#             subprocess.run(convert_command, check=True, capture_output=True, timeout=300)

#             # 2. Get duration of standardized file
#             info = sf.info(temp_standardized_wav_path)
#             duration = info.duration
#             total_frames = info.frames
#             if info.samplerate != TARGET_SAMPLE_RATE or info.channels != 1:
#                  raise ValueError(f"ffmpeg conversion incorrect format: {info.samplerate}Hz, {info.channels}ch")

#             # 3. Decide action based on duration
#             if duration < MIN_SEGMENT_DURATION_S:
#                 # Delete original file, discard temp
#                 print(f"  Discarding {input_path.name} (duration {duration:.2f}s < {MIN_SEGMENT_DURATION_S}s)")
#                 input_path.unlink() # Delete original
#                 short_files_deleted_count += 1
#                 processed_successfully = True # Processed in the sense of deciding its fate

#             elif duration <= SEGMENT_DURATION_S:
#                  # Keep this file, replace original with standardized temp version
#                  final_path = directory / f"{original_name_stem}.wav" # Ensure final name is .wav
#                  shutil.move(str(temp_standardized_wav_path), str(final_path))
#                  # If original was not .wav, delete it
#                  if input_path.suffix.lower() != '.wav' and input_path.exists():
#                        input_path.unlink()
#                  all_processed_paths.append(final_path)
#                  valid_short_files_kept_count += 1
#                  processed_successfully = True

#             else: # Duration > SEGMENT_DURATION_S -> Segment
#                  segment_len_frames = int(SEGMENT_DURATION_S * TARGET_SAMPLE_RATE)
#                  min_segment_len_frames = int(MIN_SEGMENT_DURATION_S * TARGET_SAMPLE_RATE)
#                  overlap_frames = int(OVERLAP_S * TARGET_SAMPLE_RATE)
#                  step_frames = max(1, segment_len_frames - overlap_frames) # Ensure step > 0

#                  current_pos_frames = 0
#                  segment_index = 0
#                  segments_saved_for_this_file = 0

#                  with sf.SoundFile(temp_standardized_wav_path, 'r') as sndfile:
#                      while current_pos_frames < total_frames:
#                          read_start = current_pos_frames
#                          read_frames = segment_len_frames
#                          if read_start + read_frames > total_frames:
#                              read_frames = total_frames - read_start

#                          if read_frames >= min_segment_len_frames:
#                              sndfile.seek(read_start)
#                              segment_data = sndfile.read(frames=read_frames, dtype='float32')

#                              segment_filename = f"{original_name_stem}_seg{segment_index:04d}.wav"
#                              segment_output_path = directory / segment_filename # Save directly in original dir

#                              sf.write(segment_output_path, segment_data, TARGET_SAMPLE_RATE)
#                              all_processed_paths.append(segment_output_path)
#                              segments_saved_for_this_file += 1
#                              segments_created_count += 1
#                              segment_index += 1

#                          current_pos_frames += step_frames
#                          if step_frames == 0: break # Safety break

#                  if segments_saved_for_this_file > 0:
#                      input_path.unlink() # Delete original file only if segments were saved
#                      long_files_replaced_count += 1
#                      processed_successfully = True
#                  else:
#                      # No valid segments created (maybe original was slightly > 20s but no segment >= 2s?)
#                      # NOTE(review): with SEGMENT_DURATION_S=20 and MIN_SEGMENT_DURATION_S=2 this
#                      # branch looks unreachable: it is only entered for files longer than one full
#                      # window, so the first window (position 0) always meets the minimum length
#                      # and is saved. A read/write failure raises and goes to the except handler
#                      # instead of reaching this point.
#                      print(f"  Warning: No valid segments created for {input_path.name} (duration {duration:.2f}s). Discarding.")
#                      input_path.unlink() # Delete original
#                      short_files_deleted_count +=1 # Count as discarded short file
#                      processed_successfully = True


#         except Exception as e:
#             error_msg = f"Failed processing {input_path.name}: {type(e).__name__} - {e}"
#             print(f"  ERROR: {error_msg}")
#             all_errors.append(error_msg)
#             # Don't delete original if processing failed, but delete temp
#             temp_standardized_wav_path.unlink(missing_ok=True)

#         finally:
#             # Ensure temp file is always deleted unless moved
#              if temp_standardized_wav_path.exists():
#                   temp_standardized_wav_path.unlink(missing_ok=True)


# # Cleanup empty temp directory
# try:
#     temp_dir.rmdir()
# except OSError:
#      print(f"Warning: Temp directory {temp_dir} not empty or could not be removed.")


# print("\n--- Standardization and Segmentation Summary ---")
# print(f"Attempted to process {files_processed_count} original audio files.")
# print(f"Kept/Standardized {valid_short_files_kept_count} files (duration <= {SEGMENT_DURATION_S}s).")
# print(f"Segmented {long_files_replaced_count} long files into {segments_created_count} segments.")
# print(f"Discarded {short_files_deleted_count} files (duration < {MIN_SEGMENT_DURATION_S}s or no valid segments).")
# print(f"Encountered {len(all_errors)} errors.")
# if all_errors:
#     print("Sample Errors:")
#     for err in all_errors[:10]: print(f"  - {err}")

In [4]:
# # Cell 4: Rename Processed Files to Unique IDs and Add Duration

# import uuid # Ensure uuid is imported
# import hashlib # Re-import if needed

# print("\n--- Renaming processed files to unique IDs and recording duration ---")

# uuid_map = [] # To store mapping {uuid_str: {final_path, label, original_name, duration_seconds}}
# rename_errors = []
# files_renamed_count = 0
# duration_read_errors = 0

# # Scan directories for final .wav files to rename
# print("Scanning directories for final .wav files...")
# final_wav_files = []
# for directory in [dementia_dir, nodem_dir]:
#      if directory.is_dir():
#           final_wav_files.extend([p for p in directory.glob('*.wav') if p.is_file()])

# print(f"Found {len(final_wav_files)} .wav files.")

# for current_path in tqdm(final_wav_files, desc="Renaming and Reading Duration"):
#     original_name_for_map = current_path.name # Store name before potential rename
#     duration_seconds = -1.0 # Default duration if error
#     new_path = current_path # Assume no rename needed initially if already UUID

#     try:
#         # Check if already looks like a UUID
#         # NOTE(review): this is a heuristic (36 characters containing '-'), not a real
#         # parse; uuid.UUID(current_path.stem) inside try/except ValueError would be a
#         # stricter check. For files that already pass this test no rename happens, so
#         # 'original_name' in the map records the UUID file name itself. Re-running this
#         # cell therefore rewrites the mapping CSV without the pre-rename names.
#         is_already_uuid = len(current_path.stem) == 36 and '-' in current_path.stem

#         if not is_already_uuid:
#              # Generate new UUID name only if needed
#              unique_id = str(uuid.uuid4())
#              new_filename = f"{unique_id}.wav"
#              new_path = current_path.parent / new_filename
#              if new_path.exists(): # Extremely unlikely collision
#                   unique_id = str(uuid.uuid4())
#                   new_filename = f"{unique_id}.wav"
#                   new_path = current_path.parent / new_filename
#              # Rename the file
#              current_path.rename(new_path)
#              files_renamed_count += 1
#         # else: use current_path as new_path

#         # Get duration from the file (potentially renamed)
#         try:
#             info = sf.info(new_path)
#             duration_seconds = info.duration
#         except Exception as sf_err:
#             print(f"  Warning: Could not read duration for {new_path.name}: {sf_err}")
#             duration_read_errors += 1

#         # Add info to map
#         label = 1 if new_path.parent.name == 'dementia' else 0
#         uuid_map.append({
#             'uuid': new_path.stem, # UUID is the stem of the final path
#             'final_path': str(new_path.relative_to(combined_base_dir)),
#             'label': label,
#             'original_name': original_name_for_map, # Name before rename
#             'duration_seconds': duration_seconds
#         })

#     except Exception as e:
#         error_msg = f"Error processing {original_name_for_map}: {e}"
#         print(f"  ERROR: {error_msg}")
#         rename_errors.append(error_msg)

# # Save the mapping with duration
# df_uuid_map = pd.DataFrame(uuid_map)
# map_path = metadata_dir / "uuid_mapping_with_duration.csv" # New filename
# df_uuid_map.to_csv(map_path, index=False)

# print("\n--- Renaming and Duration Summary ---")
# print(f"Processed {len(final_wav_files)} .wav files.")
# print(f"Renamed {files_renamed_count} files to unique IDs.")
# print(f"Encountered {duration_read_errors} errors reading duration.")
# print(f"Encountered {len(rename_errors)} other errors.")
# if rename_errors:
#      print("Sample Other Errors:")
#      for err in rename_errors[:5]: print(f"  - {err}")
# print(f"UUID mapping with duration saved to: {map_path}")
# if not df_uuid_map.empty:
#     print(df_uuid_map.head())
# else:
#     print("Warning: UUID map is empty. Check file processing.")

In [5]:
# # Cell 5: Filter Controls by Duration, Undersample Controls, and Split Data

# import pandas as pd
# from pathlib import Path
# from sklearn.model_selection import train_test_split
# import numpy as np

# print("\n--- Creating Final Balanced Metadata and Splitting ---")

# # --- Define Paths ---
# metadata_dir = data_dir / 'processed' # Defined in Cell 1
# combined_base_dir = data_dir / 'combined' # Defined in Cell 1
# map_path = metadata_dir / "uuid_mapping_with_duration.csv" # LOAD THIS FILE

# # Output Paths for final splits
# final_train_meta_path = metadata_dir / 'train_meta_len_filtered_balanced.csv' # New descriptive name
# final_val_meta_path = metadata_dir / 'val_meta_len_filtered_balanced.csv'   # New descriptive name
# final_test_meta_path = metadata_dir / 'test_meta_len_filtered_balanced.csv'  # New descriptive name

# # --- Load Mapping File with Duration ---
# try:
#     print(f"Loading UUID map with duration from: {map_path}")
#     df_map = pd.read_csv(map_path)
#     # Drop rows where duration could not be read
#     df_map = df_map[df_map['duration_seconds'] >= 0].copy()
#     if df_map.empty: raise ValueError("No valid entries after loading map.")
#     print(f"Loaded {len(df_map)} samples with valid duration.")
# except FileNotFoundError:
#     print(f"ERROR: Mapping file not found: {map_path}. Please run Cell 4 first.")
#     raise
# except ValueError as e:
#      print(f"ERROR: {e}")
#      raise
# except Exception as e:
#     print(f"ERROR loading mapping file: {e}")
#     raise


# # --- Separate Dementia and Control Samples ---
# df_dementia = df_map[df_map['label'] == 1].copy()
# df_controls = df_map[df_map['label'] == 0].copy()
# print(f"Total Dementia Samples: {len(df_dementia)}")
# print(f"Total Control Samples: {len(df_controls)}")


# # --- Filter Controls by Duration ---
# # NOTE(review): the duration floor is applied to control samples only; every
# # dementia sample is kept regardless of length (see the concat further down).
# # Selected controls are therefore all >= 10 s while dementia clips may be
# # shorter. Presumably intentional - confirm that clip length is not meant to be
# # comparable between the two labels.
# min_duration_filter = 10.0 # Keep controls >= 10 seconds
# print(f"\nFiltering control samples to keep duration >= {min_duration_filter}s...")
# df_controls_filtered = df_controls[df_controls['duration_seconds'] >= min_duration_filter].copy()
# print(f"Found {len(df_controls_filtered)} control samples >= {min_duration_filter}s (discarded {len(df_controls) - len(df_controls_filtered)}).")

# if df_controls_filtered.empty:
#      raise ValueError("No control samples remaining after duration filter!")


# # --- Undersample Filtered Controls to Match Dementia Count ---
# n_dementia = len(df_dementia)
# n_controls_filtered = len(df_controls_filtered)

# if n_controls_filtered >= n_dementia:
#     print(f"\nUndersampling filtered controls from {n_controls_filtered} to match {n_dementia} dementia samples...")
#     df_controls_sampled = df_controls_filtered.sample(n=n_dementia, random_state=42)
# else:
#     # This case is unlikely given previous counts, but handle it
#     print(f"\nWarning: Fewer filtered controls ({n_controls_filtered}) than dementia samples ({n_dementia}). Using all {n_controls_filtered} filtered controls.")
#     # To maintain balance here, we would have to undersample dementia, but the user wants all dementia.
#     # So, we proceed with the imbalance if controls are the minority after filtering.
#     df_controls_sampled = df_controls_filtered
#     print("The final dataset will NOT be perfectly balanced by count.")

# # Combine final selected controls and all dementia samples
# df_balanced_final = pd.concat([df_controls_sampled, df_dementia], ignore_index=True)
# print(f"\nCreated final dataset with {len(df_balanced_final)} total samples:")
# print(f"  Controls (>= {min_duration_filter}s): {len(df_controls_sampled)}")
# print(f"  Dementia (All): {len(df_dementia)}")


# # --- Select final columns and Rename for Training ---
# df_final_metadata = df_balanced_final[['final_path', 'label']].copy()
# df_final_metadata.rename(columns={'final_path': 'relative_audio_path'}, inplace=True)


# # --- Perform Random Split (Ignoring Speakers) ---
# # NOTE(review): the split is done per segment. Cell 3 cuts long recordings into
# # windows that overlap by OVERLAP_S, so segments of one recording (including the
# # shared overlap audio) can land in different splits. The 'original_name' column
# # of the mapping CSV is dropped just above; if it still holds the
# # "<stem>_segNNNN.wav" names it could serve as a grouping key for a group-aware
# # split (e.g. sklearn GroupShuffleSplit) - confirm whether this matters here.
# print("\nPerforming random 80/10/10 split on the final dataset...")

# min_samples_per_class = df_final_metadata['label'].value_counts().min()
# n_splits_possible = min(min_samples_per_class, 2) # Need at least 2 for stratify

# if df_final_metadata.empty or len(df_final_metadata) < 4 or n_splits_possible < 2: # Need enough for train/val/test splits
#     print("Error: Not enough data or classes for stratified splitting after filtering/sampling.")
# else:
#     try:
#         # Split off 20% for validation + test
#         df_train, df_valtest = train_test_split(
#             df_final_metadata,
#             test_size=0.20,
#             random_state=42,
#             stratify=df_final_metadata['label']
#         )

#         # Check if valtest is sufficient for another stratified split
#         min_samples_valtest = df_valtest['label'].value_counts().min() if not df_valtest.empty else 0
#         if len(df_valtest) < 2 or min_samples_valtest < 2 :
#             print("Warning: Not enough data/classes in val/test set for further stratified split. Splitting randomly.")
#             df_val, df_test = train_test_split(df_valtest, test_size=0.50, random_state=42) # No stratify if too small
#         else:
#             # Split the 20% into 10% validation and 10% test
#             df_val, df_test = train_test_split(
#                 df_valtest,
#                 test_size=0.50,
#                 random_state=42,
#                 stratify=df_valtest['label']
#             )

#         # --- Save Final Metadata Splits ---
#         df_train.to_csv(final_train_meta_path, index=False)
#         df_val.to_csv(final_val_meta_path, index=False)
#         df_test.to_csv(final_test_meta_path, index=False)

#         print("\n--- Final Split Summary ---")
#         print(f"Train set: {len(df_train)} segments ({df_train['label'].sum()} dementia, {len(df_train)-df_train['label'].sum()} control).")
#         print(f"Validation set: {len(df_val)} segments ({df_val['label'].sum()} dementia, {len(df_val)-df_val['label'].sum()} control).")
#         print(f"Test set: {len(df_test)} segments ({df_test['label'].sum()} dementia, {len(df_test)-df_test['label'].sum()} control).")
#         print(f"Saved Train metadata to: {final_train_meta_path}")
#         print(f"Saved Validation metadata to: {final_val_meta_path}")
#         print(f"Saved Test metadata to: {final_test_meta_path}")

#     except Exception as e:
#         # NOTE(review): the failure is only printed, never re-raised. The
#         # "Script Finished" banner after this block is still shown, and because the
#         # three CSVs are written one after another, some may be missing.
#         print(f"Error during splitting or saving: {e}")


# print("\n--- Final Data Preparation Script Finished ---")

In [6]:
# # Cell: Verify Audio Sample Rates

# import soundfile as sf
# from pathlib import Path
# from tqdm import tqdm

# print("\n--- Verifying Sample Rates in Final Audio Directories ---")

# # Get paths from Cell 1 (ensure Cell 1 was run)
# # NOTE(review): when combined_base_dir is missing this only prints a message;
# # nothing stops the code further down, which uses dementia_dir / nodem_dir /
# # combined_base_dir unconditionally. The else-branch re-assigns
# # TARGET_SAMPLE_RATE to a literal 16000 instead of reusing the Cell 1 value, and
# # the final success message hard-codes "16000 Hz" as well. The loop below
# # compares info.samplerate only; the channel count is not checked in this cell.
# if 'combined_base_dir' not in locals():
#      print("ERROR: combined_base_dir not defined. Run Cell 1 first.")
#      # Define fallback if running standalone
#      # combined_base_dir = Path('../data/combined')
#      # TARGET_SAMPLE_RATE = 16000
# else:
#      dementia_dir = combined_base_dir / 'dementia'
#      nodem_dir = combined_base_dir / 'nodementia'
#      TARGET_SAMPLE_RATE = 16000 # Defined in Cell 1

# mismatched_files = []
# error_reading_files = []
# total_files_checked = 0

# # Combine file lists from both directories
# all_wav_files = []
# if dementia_dir.is_dir():
#     all_wav_files.extend(list(dementia_dir.glob('*.wav')))
# if nodem_dir.is_dir():
#     all_wav_files.extend(list(nodem_dir.glob('*.wav')))

# print(f"Found {len(all_wav_files)} total .wav files to check...")

# for file_path in tqdm(all_wav_files, desc="Checking Sample Rates"):
#     if not file_path.is_file(): # Skip if it's somehow a directory
#         continue

#     total_files_checked += 1
#     try:
#         info = sf.info(file_path)
#         if info.samplerate != TARGET_SAMPLE_RATE:
#             mismatched_files.append({
#                 'path': str(file_path.relative_to(combined_base_dir)),
#                 'found_rate': info.samplerate,
#                 'expected_rate': TARGET_SAMPLE_RATE
#             })
#     except Exception as e:
#         error_reading_files.append({
#             'path': str(file_path.relative_to(combined_base_dir)),
#             'error': str(e)
#         })

# print("\n--- Verification Summary ---")
# print(f"Total .wav files checked: {total_files_checked}")
# print(f"Files with mismatched sample rate: {len(mismatched_files)}")
# print(f"Files that failed to read: {len(error_reading_files)}")

# if mismatched_files:
#     print("\nFiles with Mismatched Sample Rates:")
#     limit = 20 # Show first N mismatches
#     for i, f_info in enumerate(mismatched_files):
#         if i >= limit:
#             print(f"... and {len(mismatched_files) - limit} more.")
#             break
#         print(f"  - Path: {f_info['path']}, Found: {f_info['found_rate']} Hz")

# if error_reading_files:
#     print("\nFiles That Failed to Read:")
#     limit = 20 # Show first N errors
#     for i, f_info in enumerate(error_reading_files):
#          if i >= limit:
#             print(f"... and {len(error_reading_files) - limit} more.")
#             break
#          print(f"  - Path: {f_info['path']}, Error: {f_info['error']}")

# if not mismatched_files and not error_reading_files:
#     print("\nAll checked .wav files have the correct sample rate (16000 Hz).")
# else:
#     print("\nIssues found. You may need to re-run the preprocessing (Cell 3) or manually fix/delete problematic files.")


--- Verifying Sample Rates in Final Audio Directories ---
Found 94489 total .wav files to check...


Checking Sample Rates: 100%|█| 94489/94489 [11:21<00:00,


--- Verification Summary ---
Total .wav files checked: 94489
Files with mismatched sample rate: 69
Files that failed to read: 0

Files with Mismatched Sample Rates:
  - Path: nodementia\04e77862-e76e-410a-bd32-aeb6e71c9e12.wav, Found: 44100 Hz
  - Path: nodementia\10e7f964-13b8-45b8-af02-e2c1b27aef6a.wav, Found: 44100 Hz
  - Path: nodementia\1221d439-f3b8-4800-b98f-31d21e652634.wav, Found: 44100 Hz
  - Path: nodementia\135d96ae-e170-4e22-bc80-7d66cd3208ed.wav, Found: 44100 Hz
  - Path: nodementia\15532063-fb79-4f63-833a-440680f1194f.wav, Found: 44100 Hz
  - Path: nodementia\15a0e730-4df9-4049-97c4-5cf9a88a29c9.wav, Found: 44100 Hz
  - Path: nodementia\18b42b10-874c-4022-b8d4-db3c4f73da7c.wav, Found: 44100 Hz
  - Path: nodementia\1953aeac-1cf4-4ec3-a67d-f3864334738d.wav, Found: 44100 Hz
  - Path: nodementia\24a7d59d-f21f-41d1-b059-ff5ef88d1f08.wav, Found: 44100 Hz
  - Path: nodementia\2eef3fd0-2b8d-46f4-a582-9a6de380d8f2.wav, Found: 44100 Hz
  - Path: nodementia\325a50c3-0f16-4998-9f2c




In [7]:
# # Cell: Targeted Resampling of Mismatched Files

# import soundfile as sf
# from pathlib import Path
# import subprocess
# from tqdm import tqdm
# import shutil
# import os

# print("\n--- Resampling Mismatched Files ---")

# # Ensure variables from previous cells are available
# # NOTE(review): 'mismatched_files' exists only in kernel memory after the
# # verification cell has run; it is not saved to disk, so after a kernel restart
# # that cell must be re-run before this one. An empty list (nothing left to fix)
# # is also reported as "ERROR" by the check below.
# if 'mismatched_files' not in locals() or not mismatched_files:
#     print("ERROR: 'mismatched_files' list not found or empty. Run the verification cell first.")
# elif 'combined_base_dir' not in locals() or 'TARGET_SAMPLE_RATE' not in locals():
#     print("ERROR: 'combined_base_dir' or 'TARGET_SAMPLE_RATE' not defined. Run Cell 1 first.")
# else:
#     print(f"Found {len(mismatched_files)} files to attempt resampling.")
#     resample_errors = []
#     files_resampled_count = 0

#     # Define a temporary directory for conversion output
#     temp_resample_dir = combined_base_dir / "temp_resampling"
#     temp_resample_dir.mkdir(exist_ok=True)

#     for file_info in tqdm(mismatched_files, desc="Resampling Files"):
#         relative_path = file_info['path']
#         input_path = combined_base_dir / relative_path
#         temp_output_path = temp_resample_dir / input_path.name # Use same name in temp dir

#         if not input_path.is_file():
#             print(f"  Skipping (File not found): {input_path}")
#             resample_errors.append(f"File not found: {relative_path}")
#             continue

#         try:
#             # Construct ffmpeg command to overwrite temp file
#             command = [
#                 "ffmpeg",
#                 "-i", str(input_path),   # Input file
#                 "-ar", str(TARGET_SAMPLE_RATE), # Target sample rate
#                 "-ac", "1",            # Target channels (mono)
#                 "-vn",                 # No video output
#                 "-loglevel", "error",  # Show only errors
#                 "-y",                  # Overwrite output without asking
#                 str(temp_output_path)  # Output to temp file
#             ]

#             # Run ffmpeg
#             subprocess.run(command, check=True, capture_output=True, timeout=300) # 5 min timeout

#             # Verify the temporary output file
#             try:
#                  info = sf.info(temp_output_path)
#                  if info.samplerate == TARGET_SAMPLE_RATE and info.channels == 1:
#                       # Verification successful, replace original with temp file
#                       shutil.move(str(temp_output_path), str(input_path))
#                       files_resampled_count += 1
#                  else:
#                       raise ValueError(f"Verification failed: Rate={info.samplerate}, Channels={info.channels}")

#             except Exception as verify_err:
#                  error_msg = f"Verification failed for {input_path.name} after conversion: {verify_err}"
#                  print(f"  ERROR: {error_msg}")
#                  resample_errors.append(error_msg)
#                  # Don't move the failed file, clean up temp
#                  temp_output_path.unlink(missing_ok=True)


#         except subprocess.CalledProcessError as e:
#             error_msg = f"ffmpeg failed for {input_path.name}: {e.stderr.decode('utf-8', errors='replace') if e.stderr else 'Unknown ffmpeg error'}"
#             print(f"  ERROR: {error_msg}")
#             resample_errors.append(error_msg)
#             temp_output_path.unlink(missing_ok=True) # Clean up failed temp file
#         except Exception as e:
#             error_msg = f"Unexpected error resampling {input_path.name}: {e}"
#             print(f"  ERROR: {error_msg}")
#             resample_errors.append(error_msg)
#             temp_output_path.unlink(missing_ok=True) # Clean up failed temp file


#     # Cleanup empty temp directory
#     try:
#         if not any(temp_resample_dir.iterdir()): # Only remove if empty
#             temp_resample_dir.rmdir()
#     except OSError:
#          print(f"Warning: Temp directory {temp_resample_dir} not empty or could not be removed.")


#     print("\n--- Resampling Summary ---")
#     print(f"Attempted to resample {len(mismatched_files)} files.")
#     print(f"Successfully resampled and replaced {files_resampled_count} files.")
#     print(f"Encountered {len(resample_errors)} errors during resampling/verification.")
#     if resample_errors:
#         print("Sample Errors:")
#         for err in resample_errors[:10]: print(f"  - {err}")


--- Resampling Mismatched Files ---
Found 69 files to attempt resampling.


Resampling Files: 100%|█| 69/69 [00:06<00:00, 10.50it/s]


--- Resampling Summary ---
Attempted to resample 69 files.
Successfully resampled and replaced 69 files.
Encountered 0 errors during resampling/verification.



