In [1]:
# Cell 1: Imports and Setup (People's Speech Focus + Cache Redirect)
import subprocess
import pandas as pd
from pathlib import Path
import glob
import os
from tqdm import tqdm
from datasets import load_dataset, Audio # Import datasets library
import soundfile as sf # Use soundfile for saving WAV
import shutil # Import shutil for moving files
import numpy as np # For audio array check
import random # For shuffling

# --- Configuration ---

# Base directory (assuming this notebook is in the 'ml' folder)
root_dir = Path('../')
data_dir = root_dir / 'data'

# --- !!! HF Cache Directory !!! ---
# Ensure this is set correctly to where your Parquet files are!
new_cache_dir = root_dir / "hf_cache"
os.environ['HF_HOME'] = str(new_cache_dir.resolve())
print(f"INFO: Using Hugging Face cache directory (HF_HOME): {os.environ['HF_HOME']}")
# Path where the actual downloaded parquet files likely are
# Adjust 'clean' and 'train' if you used a different config/split
ps_cache_data_path = new_cache_dir / "hub" / "datasets--MLCommons--peoples_speech" / "snapshots" / "f10597c5d3d3a63f8b6827701297c3afdf178272" / "clean"
print(f"INFO: Expecting Parquet files for People's Speech in: {ps_cache_data_path}")


# Processed output location for People's Speech controls
processed_ps_dir = data_dir / 'processed' / 'controls' / 'peoples_speech'
processed_audio_dir = processed_ps_dir / 'audio'
processed_metadata_dir = processed_ps_dir / 'metadata'

# Target audio format
TARGET_SAMPLE_RATE = 16000
AUDIO_FORMAT = "wav"
BYTES_PER_SAMPLE = 2 # For 16-bit WAV

# Target total size for the control dataset (in GB)
TARGET_SIZE_GB = 30
TARGET_SIZE_BYTES = TARGET_SIZE_GB * (1024**3)

# People's Speech config and split used during download attempt
PS_CONFIG = "clean"
PS_SPLIT = "train"

# --- Ensure directories exist ---
processed_audio_dir.mkdir(parents=True, exist_ok=True)
processed_metadata_dir.mkdir(parents=True, exist_ok=True)

print(f"Processed People's Speech Output Dir: {processed_ps_dir}")
print(f"Target Control Data Size: {TARGET_SIZE_GB} GB ({TARGET_SIZE_BYTES} bytes)")
print(f"Using People's Speech Config: '{PS_CONFIG}', Split: '{PS_SPLIT}'")

INFO: Using Hugging Face cache directory (HF_HOME): D:\Cornell\Academic\Spring 2025\Startup Studio\MVP\neurotone\ml\hf_cache
INFO: Expecting Parquet files for People's Speech in: ..\hf_cache\hub\datasets--MLCommons--peoples_speech\snapshots\f10597c5d3d3a63f8b6827701297c3afdf178272\clean
Processed People's Speech Output Dir: ..\data\processed\controls\peoples_speech
Target Control Data Size: 30 GB (32212254720 bytes)
Using People's Speech Config: 'clean', Split: 'train'


In [2]:
# Cell 2: Find Available Parquet Files and Estimate Sample Sizes

print("\n--- Finding available Parquet files and estimating potential WAV sizes ---")

# Find the parquet files that were actually downloaded in the cache
available_parquet_files = glob.glob(str(ps_cache_data_path / "*.parquet"))

if not available_parquet_files:
    raise FileNotFoundError(f"No Parquet files found in the expected cache location: {ps_cache_data_path}. Cannot proceed.")

print(f"Found {len(available_parquet_files)} Parquet files to process.")

all_samples_info = []
total_estimated_size = 0

# Iterate through the found parquet files to get metadata
for pq_file in tqdm(available_parquet_files, desc="Reading Parquet Metadata"):
    try:
        df_pq = pd.read_parquet(pq_file, columns=['id', 'duration_ms']) # Only load needed columns
        for index, row in df_pq.iterrows():
            duration_s = row['duration_ms'] / 1000.0
            # Estimate size of 16kHz, 16-bit mono WAV
            estimated_wav_size = duration_s * TARGET_SAMPLE_RATE * BYTES_PER_SAMPLE
            all_samples_info.append({
                'id': row['id'],
                'duration_ms': row['duration_ms'],
                'estimated_wav_size_bytes': estimated_wav_size,
                'source_parquet': Path(pq_file).name # Track source for potential debugging
            })
            total_estimated_size += estimated_wav_size
    except Exception as e:
        print(f"Warning: Failed to read or process {pq_file}: {e}")

total_estimated_gb = total_estimated_size / (1024**3)
print(f"\nFound metadata for {len(all_samples_info)} total samples across available Parquet files.")
print(f"Total estimated size if all were converted to {TARGET_SAMPLE_RATE}Hz mono WAV: {total_estimated_gb:.2f} GB")

if not all_samples_info:
    raise ValueError("No valid sample metadata could be extracted from the Parquet files.")


--- Finding available Parquet files and estimating potential WAV sizes ---
Found 231 Parquet files to process.


Reading Parquet Metadata: 100%|█| 231/231 [00:24<00:00, 


Found metadata for 431480 total samples across available Parquet files.
Total estimated size if all were converted to 16000Hz mono WAV: 177.23 GB





In [4]:
# Cell 3: Shuffle and Select Subset Based on Target Size

print(f"\n--- Selecting a random subset targeting ~{TARGET_SIZE_GB} GB ---")

# Make sure all_samples_info exists from Cell 2
if 'all_samples_info' not in locals() or not all_samples_info:
     raise NameError("Variable 'all_samples_info' not found or is empty. Ensure Cell 2 ran successfully.")

# Shuffle the list of all potential samples
random.shuffle(all_samples_info)

selected_samples_for_processing = []
current_selected_size = 0

for sample_info in tqdm(all_samples_info, desc="Selecting Samples"):
    # Add samples until we reach or exceed the target size
    if current_selected_size < TARGET_SIZE_BYTES:
        selected_samples_for_processing.append(sample_info)
        # Ensure the estimated size is treated as a number
        current_selected_size += float(sample_info.get('estimated_wav_size_bytes', 0))
    # Keep adding samples *until* the target size is met or exceeded
    # This ensures we get at least TARGET_SIZE_BYTES unless all samples are exhausted
    if current_selected_size >= TARGET_SIZE_BYTES:
         break


selected_ids_set = {s['id'] for s in selected_samples_for_processing}
final_selected_size_gb = current_selected_size / (1024**3)

print(f"\nSelected {len(selected_samples_for_processing)} samples with an estimated total size of {final_selected_size_gb:.2f} GB.")
if not selected_samples_for_processing:
      raise ValueError("Selection resulted in zero samples. Check target size and estimated sizes.")


--- Selecting a random subset targeting ~30 GB ---


Selecting Samples:  17%|▏| 72958/431480 [00:00<00:00, 10


Selected 72959 samples with an estimated total size of 30.00 GB.





In [6]:
# Cell 4: Process ONLY the Selected People's Speech Samples (Iterative Loading)

print("\n--- Loading and Preprocessing the SELECTED People's Speech Audio (Iterative Approach) ---")

# Ensure selected_ids_set exists from Cell 3
if 'selected_ids_set' not in locals() or not selected_ids_set:
    raise NameError("Variable 'selected_ids_set' not found or is empty. Ensure Cell 3 ran successfully.")
if 'available_parquet_files' not in locals() or not available_parquet_files:
     raise NameError("Variable 'available_parquet_files' not found or is empty. Ensure Cell 2 ran successfully.")

processed_metadata_list = []
preprocess_errors = []
skipped_count = 0
processed_count = 0
processed_ids_tracker = set() # Keep track of IDs we have processed/skipped

# Define the processing function (same as before)
def process_audio_sample_ps(sample):
    global processed_count, skipped_count, preprocess_errors, processed_ids_tracker

    try:
        sample_id = sample['id']
        # Construct the expected unique output filename
        output_filename = f"ps_{sample_id.replace('/', '_').replace('.', '_')}.{AUDIO_FORMAT}"
        output_path = processed_audio_dir / output_filename

        # Double check if we somehow already processed this ID in this run
        if sample_id in processed_ids_tracker:
             skipped_count +=1
             return None # Already handled

        # Check if the final WAV file already exists
        if output_path.exists():
            skipped_count += 1
            processed_ids_tracker.add(sample_id)
            # Create metadata entry for existing file
            return {
                "processed": True,
                "sample_id": sample_id,
                "speaker_id": sample_id,
                "label": 0,
                "relative_audio_path": str(output_path.relative_to(processed_ps_dir)),
                "original_corpus": "PeopleSpeech",
                "original_split": PS_SPLIT,
                "duration_ms": sample.get('duration_ms') # Duration might be missing if only checking file
            }

        # --- If WAV doesn't exist, proceed to load audio and process ---
        # Load audio array - THIS IS THE POINT WHERE datasets loads from cache/source
        # Ensure the 'audio' column exists and is loadable
        if 'audio' not in sample or sample['audio'] is None:
             raise ValueError("Audio data missing or null in sample.")

        audio_data = sample['audio']
        waveform = audio_data['array']
        original_sr = audio_data['sampling_rate']

        # Ensure waveform is numpy array
        if not isinstance(waveform, np.ndarray):
            waveform = np.array(waveform)

        # Ensure mono
        if waveform.ndim > 1:
            if waveform.shape[0] == 1: waveform = waveform[0]
            elif waveform.shape[1] == 1: waveform = waveform[:,0]
            else: waveform = waveform.mean(axis=1 if waveform.shape[1] < waveform.shape[0] else 0)

        # Resample if necessary using ffmpeg via subprocess
        resampled_waveform = waveform
        if original_sr != TARGET_SAMPLE_RATE:
            # Unique temp file names
            temp_id = output_filename.replace('.','_').replace('/','_')
            temp_input_path = processed_audio_dir / f"temp_{temp_id}_in.wav"
            temp_output_path = processed_audio_dir / f"temp_{temp_id}_out.wav"
            temp_input_path.parent.mkdir(parents=True, exist_ok=True)
            try:
                sf.write(temp_input_path, waveform, original_sr)
                command = [
                    "ffmpeg", "-i", str(temp_input_path),
                    "-ar", str(TARGET_SAMPLE_RATE), "-ac", "1",
                    "-vn", "-loglevel", "error", "-y", str(temp_output_path)
                ]
                subprocess.run(command, check=True, capture_output=True, timeout=180)
                resampled_waveform, sr_check = sf.read(temp_output_path, dtype='float32')
                if sr_check != TARGET_SAMPLE_RATE:
                     raise ValueError(f"ffmpeg post-read SR mismatch: {sr_check}")
            finally:
                 temp_input_path.unlink(missing_ok=True)
                 temp_output_path.unlink(missing_ok=True)

        # Save the processed audio
        sf.write(output_path, resampled_waveform, TARGET_SAMPLE_RATE)
        processed_count += 1
        processed_ids_tracker.add(sample_id) # Mark as processed in this run

        # Return metadata for the successfully processed file
        return {
            "processed": True,
            "sample_id": sample_id,
            "speaker_id": sample_id,
            "label": 0,
            "relative_audio_path": str(output_path.relative_to(processed_ps_dir)),
            "original_corpus": "PeopleSpeech",
            "original_split": PS_SPLIT,
            "duration_ms": sample.get('duration_ms')
         }

    except Exception as e:
        error_msg = f"Error processing sample with id {sample.get('id','N/A')}: {type(e).__name__} - {e}"
        preprocess_errors.append(error_msg)
        if 'output_path' in locals() and isinstance(output_path, Path) and output_path.exists():
            output_path.unlink(missing_ok=True)
        return {"processed": False, "error": error_msg}

# --- Iterate through available Parquet files, loading and processing selected samples ---
print(f"\nIterating through {len(available_parquet_files)} available Parquet files...")

samples_to_process_count = len(selected_ids_set)
print(f"Targeting {samples_to_process_count} selected sample IDs.")

# Wrap the loop with tqdm
with tqdm(total=samples_to_process_count, desc="Processing Selected Samples") as pbar:
    for pq_file_path in available_parquet_files:
        if len(processed_ids_tracker) >= samples_to_process_count:
             print("Already processed/found enough samples matching the selection. Stopping Parquet scan.")
             break # Optimization: stop if we already have enough

        try:
            # Load one parquet file
            ds_chunk = load_dataset("parquet", data_files={'train': str(pq_file_path)})['train']

            # Filter this chunk for IDs we care about
            chunk_ids = set(ds_chunk['id'])
            relevant_ids_in_chunk = selected_ids_set.intersection(chunk_ids)

            if relevant_ids_in_chunk:
                # Filter the dataset chunk
                ds_chunk_filtered = ds_chunk.filter(lambda x: x['id'] in relevant_ids_in_chunk, num_proc=1) # Filter first

                # Cast audio column ONLY for the filtered chunk before processing
                try:
                    ds_chunk_filtered = ds_chunk_filtered.cast_column("audio", Audio(decode=True)) # Decode audio
                except Exception as cast_e:
                     print(f"Warning: Failed to cast audio column for chunk from {Path(pq_file_path).name}: {cast_e}. Skipping chunk.")
                     continue # Skip this parquet file if casting fails

                # Process samples in this filtered chunk
                for sample in ds_chunk_filtered:
                    if sample['id'] in selected_ids_set and sample['id'] not in processed_ids_tracker:
                         result = process_audio_sample_ps(sample)
                         if result and result.get("processed"):
                              processed_metadata_list.append(result)
                              pbar.update(1) # Update progress bar per processed/skipped sample

        except Exception as e:
            print(f"ERROR processing Parquet file {Path(pq_file_path).name}: {e}")
            preprocess_errors.append(f"Parquet load/filter error: {Path(pq_file_path).name} - {e}")

        # Explicitly delete dataset chunk object to free memory
        if 'ds_chunk' in locals(): del ds_chunk
        if 'ds_chunk_filtered' in locals(): del ds_chunk_filtered
        import gc; gc.collect() # Force garbage collection


# Final Summary and Save Metadata
if processed_metadata_list:
    df_processed_controls = pd.DataFrame(processed_metadata_list)
    output_csv_path = processed_metadata_dir / "peoples_speech_controls_metadata_selected_30gb.csv"
    df_processed_controls.to_csv(output_csv_path, index=False)

    print(f"\n--- Audio Processing Summary ---")
    print(f"Selected {len(selected_ids_set)} samples based on ~{TARGET_SIZE_GB}GB target.")
    print(f"Found/Processed {len(processed_metadata_list)} corresponding audio files.")
    print(f"Processed {processed_count} new files.")
    print(f"Skipped {skipped_count} files that already existed (or processed in this run).")
    print(f"Encountered {len(preprocess_errors)} errors during processing attempts.")
    if preprocess_errors:
        print("Sample Errors:")
        for err in preprocess_errors[:20]: # Print more errors if needed
            print(f"- {err}")
    print(f"Processed control metadata (subset) saved to: {output_csv_path}")
    print("\n--- Sample Processed Control Metadata ---")
    print(df_processed_controls.head())
else:
    print("\nNo audio samples were successfully processed or found from the selected subset.")

print("\n--- People's Speech control data preparation finished ---")


--- Loading and Preprocessing the SELECTED People's Speech Audio (Iterative Approach) ---

Iterating through 231 available Parquet files...
Targeting 72959 selected sample IDs.


Processing Selected Samples:   0%| | 0/72959 [00:00<?, ?

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   0%| | 315/72959 [00:15<09

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   1%| | 611/72959 [00:26<09

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   1%| | 912/72959 [00:36<08

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   2%| | 1250/72959 [00:46<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   2%| | 1555/72959 [00:55<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   3%| | 1855/72959 [01:05<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   3%| | 2151/72959 [01:17<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   3%| | 2448/72959 [01:26<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   4%| | 2770/72959 [01:36<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   4%| | 3088/72959 [01:46<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   5%| | 3387/72959 [01:58<2

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   5%| | 3710/72959 [02:07<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   6%| | 4025/72959 [02:19<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   6%| | 4323/72959 [02:30<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   6%| | 4664/72959 [02:39<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   7%| | 4971/72959 [02:48<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   7%| | 5274/72959 [02:57<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   8%| | 5562/72959 [03:07<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   8%| | 5905/72959 [03:16<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   9%| | 6220/72959 [03:25<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   9%| | 6508/72959 [03:35<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:   9%| | 6825/72959 [03:45<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  10%| | 7142/72959 [03:56<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  10%| | 7465/72959 [04:10<1

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  11%| | 7793/72959 [04:28<5

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  11%| | 8118/72959 [04:43<1

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  12%| | 8423/72959 [04:58<1

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  12%| | 8743/72959 [05:13<1

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  12%| | 9051/72959 [05:29<1

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  13%|▏| 9347/72959 [05:45<5

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  13%|▏| 9630/72959 [05:59<1

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  14%|▏| 9923/72959 [06:11<0

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  14%|▏| 10221/72959 [06:24<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  14%|▏| 10539/72959 [06:40<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  15%|▏| 10882/72959 [06:55<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  15%|▏| 11191/72959 [07:10<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  16%|▏| 11518/72959 [07:26<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  16%|▏| 11833/72959 [07:41<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  17%|▏| 12134/72959 [07:57<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  17%|▏| 12423/72959 [08:11<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  17%|▏| 12738/72959 [08:24<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  18%|▏| 13062/72959 [08:35<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  18%|▏| 13401/72959 [08:45<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  19%|▏| 13702/72959 [08:55<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  19%|▏| 14013/72959 [09:07<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  20%|▏| 14333/72959 [09:19<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  20%|▏| 14674/72959 [09:30<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  21%|▏| 15005/72959 [09:43<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  21%|▏| 15340/72959 [09:56<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  21%|▏| 15650/72959 [10:07<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  22%|▏| 15980/72959 [10:19<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  22%|▏| 16311/72959 [10:30<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  23%|▏| 16631/72959 [10:42<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  23%|▏| 16956/72959 [10:53<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  24%|▏| 17265/72959 [11:04<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  24%|▏| 17604/72959 [11:17<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  25%|▏| 17925/72959 [11:30<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  25%|▏| 18236/72959 [11:39<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  25%|▎| 18551/72959 [11:51<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  26%|▎| 18864/72959 [12:05<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  26%|▎| 19190/72959 [12:15<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  27%|▎| 19530/72959 [12:25<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  27%|▎| 19838/72959 [12:37<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  28%|▎| 20168/72959 [12:55<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  28%|▎| 20478/72959 [13:09<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  29%|▎| 20801/72959 [13:23<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  29%|▎| 21135/72959 [13:35<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  29%|▎| 21431/72959 [13:47<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  30%|▎| 21766/72959 [14:03<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  30%|▎| 22095/72959 [14:14<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  31%|▎| 22430/72959 [14:27<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  31%|▎| 22731/72959 [14:39<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  32%|▎| 23052/72959 [14:53<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  32%|▎| 23368/72959 [15:04<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  32%|▎| 23709/72959 [15:16<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  33%|▎| 24017/72959 [15:29<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  33%|▎| 24354/72959 [15:46<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  34%|▎| 24655/72959 [15:57<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  34%|▎| 24954/72959 [16:07<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  35%|▎| 25265/72959 [16:17<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  35%|▎| 25583/72959 [16:28<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  36%|▎| 25904/72959 [16:38<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  36%|▎| 26232/72959 [16:49<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  36%|▎| 26539/72959 [16:59<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  37%|▎| 26838/72959 [17:09<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  37%|▎| 27175/72959 [17:23<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  38%|▍| 27515/72959 [17:33<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  38%|▍| 27829/72959 [17:43<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  39%|▍| 28136/72959 [17:54<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  39%|▍| 28445/72959 [18:04<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  39%|▍| 28762/72959 [18:16<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  40%|▍| 29079/72959 [18:29<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  40%|▍| 29389/72959 [18:41<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  41%|▍| 29710/72959 [18:54<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  41%|▍| 30027/72959 [19:06<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  42%|▍| 30328/72959 [19:18<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  42%|▍| 30651/72959 [19:29<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  42%|▍| 30981/72959 [19:42<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  43%|▍| 31293/72959 [19:57<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  43%|▍| 31608/72959 [20:07<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  44%|▍| 31939/72959 [20:20<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  44%|▍| 32297/72959 [20:32<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  45%|▍| 32594/72959 [20:43<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  45%|▍| 32918/72959 [20:57<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  46%|▍| 33249/72959 [21:08<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  46%|▍| 33599/72959 [21:19<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  46%|▍| 33922/72959 [21:29<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  47%|▍| 34232/72959 [21:43<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  47%|▍| 34548/72959 [21:53<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  48%|▍| 34876/72959 [22:06<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  48%|▍| 35163/72959 [22:17<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  49%|▍| 35478/72959 [22:29<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  49%|▍| 35814/72959 [22:42<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  49%|▍| 36113/72959 [22:53<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  50%|▍| 36415/72959 [23:03<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  50%|▌| 36740/72959 [23:16<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  51%|▌| 37005/72959 [23:24<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  51%|▌| 37332/72959 [23:36<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  52%|▌| 37627/72959 [23:48<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  52%|▌| 37950/72959 [23:59<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  52%|▌| 38264/72959 [24:10<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  53%|▌| 38625/72959 [24:24<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  53%|▌| 38955/72959 [24:37<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  54%|▌| 39267/72959 [24:51<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  54%|▌| 39591/72959 [25:05<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  55%|▌| 39885/72959 [25:19<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  55%|▌| 40178/72959 [25:32<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  55%|▌| 40468/72959 [25:43<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  56%|▌| 40784/72959 [25:58<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  56%|▌| 41097/72959 [26:10<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  57%|▌| 41397/72959 [26:23<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  57%|▌| 41708/72959 [26:35<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  58%|▌| 41994/72959 [26:47<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  58%|▌| 42312/72959 [26:59<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  58%|▌| 42643/72959 [27:11<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  59%|▌| 42950/72959 [27:20<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  59%|▌| 43289/72959 [27:31<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  60%|▌| 43594/72959 [27:43<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  60%|▌| 43894/72959 [27:56<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  61%|▌| 44204/72959 [28:08<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  61%|▌| 44505/72959 [28:22<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  61%|▌| 44835/72959 [28:36<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  62%|▌| 45170/72959 [28:48<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  62%|▌| 45477/72959 [28:59<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  63%|▋| 45814/72959 [29:11<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  63%|▋| 46133/72959 [29:23<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  64%|▋| 46438/72959 [29:34<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  64%|▋| 46748/72959 [29:46<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  65%|▋| 47069/72959 [29:57<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  65%|▋| 47378/72959 [30:09<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  65%|▋| 47709/72959 [30:22<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  66%|▋| 48043/72959 [30:34<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  66%|▋| 48352/72959 [30:46<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  67%|▋| 48665/72959 [31:00<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  67%|▋| 48959/72959 [31:11<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  68%|▋| 49265/72959 [31:26<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  68%|▋| 49586/72959 [31:44<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  68%|▋| 49925/72959 [31:57<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  69%|▋| 50212/72959 [32:13<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  69%|▋| 50551/72959 [32:24<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  70%|▋| 50866/72959 [32:39<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  70%|▋| 51162/72959 [32:50<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  71%|▋| 51504/72959 [33:00<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  71%|▋| 51807/72959 [33:15<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  71%|▋| 52112/72959 [33:31<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  72%|▋| 52436/72959 [33:47<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  72%|▋| 52732/72959 [34:01<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  73%|▋| 53015/72959 [34:12<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  73%|▋| 53320/72959 [34:24<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  74%|▋| 53662/72959 [34:36<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  74%|▋| 53997/72959 [34:49<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  74%|▋| 54324/72959 [35:02<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  75%|▋| 54639/72959 [35:15<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  75%|▊| 54938/72959 [35:26<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  76%|▊| 55243/72959 [35:38<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  76%|▊| 55543/72959 [35:51<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  77%|▊| 55884/72959 [36:04<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  77%|▊| 56197/72959 [36:16<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  77%|▊| 56526/72959 [36:31<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  78%|▊| 56830/72959 [36:42<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  78%|▊| 57170/72959 [36:55<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  79%|▊| 57461/72959 [37:07<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  79%|▊| 57775/72959 [37:20<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  80%|▊| 58077/72959 [37:32<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  80%|▊| 58385/72959 [37:43<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  80%|▊| 58717/72959 [37:57<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  81%|▊| 59026/72959 [38:12<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  81%|▊| 59354/72959 [38:28<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  82%|▊| 59669/72959 [38:42<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  82%|▊| 60017/72959 [38:55<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  83%|▊| 60313/72959 [39:06<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  83%|▊| 60629/72959 [39:19<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  84%|▊| 60927/72959 [39:30<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  84%|▊| 61236/72959 [39:42<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  84%|▊| 61564/72959 [39:54<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  85%|▊| 61889/72959 [40:10<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  85%|▊| 62221/72959 [40:25<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  86%|▊| 62529/72959 [40:40<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  86%|▊| 62850/72959 [40:52<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  87%|▊| 63167/72959 [41:07<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  87%|▊| 63505/72959 [41:22<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  87%|▊| 63823/72959 [41:34<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1868 [00:00<?, ? examples/s]

Processing Selected Samples:  88%|▉| 64131/72959 [41:46<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  88%|▉| 64429/72959 [41:58<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  89%|▉| 64767/72959 [42:09<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  89%|▉| 65092/72959 [42:21<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  90%|▉| 65404/72959 [42:36<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  90%|▉| 65716/72959 [42:48<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  90%|▉| 66019/72959 [43:00<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  91%|▉| 66327/72959 [43:12<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  91%|▉| 66657/72959 [43:25<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  92%|▉| 66968/72959 [43:39<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  92%|▉| 67273/72959 [43:51<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  93%|▉| 67587/72959 [44:04<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  93%|▉| 67895/72959 [44:17<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  93%|▉| 68201/72959 [44:35<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  94%|▉| 68507/72959 [44:52<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  94%|▉| 68828/72959 [45:11<

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1867 [00:00<?, ? examples/s]

Processing Selected Samples:  95%|▉| 69148/72959 [45:26<

Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00219-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00220-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00221-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00222-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00223-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00224-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00225-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00226-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00227-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00228-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

ERROR processing Parquet file train-00229-of-00804.parquet: An error occurred while generating the dataset


Generating train split: 0 examples [00:00, ? examples/s]

Processing Selected Samples:  95%|▉| 69152/72959 [45:34<

ERROR processing Parquet file train-00230-of-00804.parquet: An error occurred while generating the dataset






--- Audio Processing Summary ---
Selected 72959 samples based on ~30GB target.
Found/Processed 69152 corresponding audio files.
Processed 69151 new files.
Skipped 1 files that already existed (or processed in this run).
Encountered 17 errors during processing attempts.
Sample Errors:
- Error processing sample with id 42313BoardOfSelectmen_SLASH_4:23:13_SPACE_Board_SPACE_of_SPACE_Selectmen_DOT_mp3_00013.flac: LibsndfileError - Error opening '..\\data\\processed\\controls\\peoples_speech\\audio\\ps_42313BoardOfSelectmen_SLASH_4:23:13_SPACE_Board_SPACE_of_SPACE_Selectmen_DOT_mp3_00013_flac.wav': System error.
- Error processing sample with id 42313BoardOfSelectmen_SLASH_4:23:13_SPACE_Board_SPACE_of_SPACE_Selectmen_DOT_mp3_00014.flac: LibsndfileError - Error opening '..\\data\\processed\\controls\\peoples_speech\\audio\\ps_42313BoardOfSelectmen_SLASH_4:23:13_SPACE_Board_SPACE_of_SPACE_Selectmen_DOT_mp3_00014_flac.wav': System error.
- Error processing sample with id 42313BoardOfSelectmen_

In [7]:
# Cell 5: Combine Metadata and Create Final Splits

import pandas as pd
from pathlib import Path
from sklearn.model_selection import GroupShuffleSplit
import numpy as np

print("--- Combining and Splitting Final Dataset ---")

# --- Define Paths ---
root_dir = Path('../')
data_dir = root_dir / 'data'
processed_dir = data_dir / 'processed' # Main processed dir for outputs

# Input Metadata Paths
ps_controls_meta_path = processed_dir / 'controls' / 'peoples_speech' / 'metadata' / 'peoples_speech_controls_metadata_selected_30gb.csv'
# !! Adjust path if you saved Dementia Diaries metadata elsewhere !!
diaries_dem_meta_path = data_dir / 'raw' / 'dementia_diaries' / 'dementia_diaries_metadata.csv'
# Original DementiaNet metadata (assuming they are in processed_dir from the first notebook)
dnet_train_meta_path = processed_dir / 'train_metadata.csv'
dnet_val_meta_path = processed_dir / 'val_metadata.csv'
dnet_test_meta_path = processed_dir / 'test_metadata.csv'

# Output Paths for final splits
final_train_meta_path = processed_dir / 'train_meta_balanced.csv'
final_val_meta_path = processed_dir / 'val_meta_balanced.csv'
final_test_meta_path = processed_dir / 'test_meta_balanced.csv'

# --- Load Individual Metadata Files ---
try:
    df_ps_controls = pd.read_csv(ps_controls_meta_path)
    print(f"Loaded {len(df_ps_controls)} People's Speech control samples.")

    df_diaries_dem = pd.read_csv(diaries_dem_meta_path)
    print(f"Loaded {len(df_diaries_dem)} Dementia Diaries dementia samples.")

    df_dnet_train = pd.read_csv(dnet_train_meta_path)
    df_dnet_val = pd.read_csv(dnet_val_meta_path)
    df_dnet_test = pd.read_csv(dnet_test_meta_path)
    df_dnet_all = pd.concat([df_dnet_train, df_dnet_val, df_dnet_test], ignore_index=True)
    df_dnet_controls = df_dnet_all[df_dnet_all['label'] == 0].copy()
    df_dnet_dementia = df_dnet_all[df_dnet_all['label'] == 1].copy()
    print(f"Loaded {len(df_dnet_controls)} DementiaNet control samples.")
    print(f"Loaded {len(df_dnet_dementia)} DementiaNet dementia samples.")

except FileNotFoundError as e:
    print(f"ERROR: Could not find one of the metadata files: {e}")
    raise # Stop execution if files are missing

# --- Prepare and Standardize Columns ---
# Select relevant columns and standardize names
# People's Speech: Need 'relative_audio_path', 'speaker_id', 'label'
# Dementia Diaries: Need 'relative_audio_path', 'diarist_name' (as speaker_id), 'label'
# DementiaNet: Need 'file_path' (needs adjusting), 'speaker_id', 'label'

df_ps_controls = df_ps_controls[['relative_audio_path', 'speaker_id', 'label']].copy()
df_ps_controls['relative_audio_path'] = 'controls/peoples_speech/' + df_ps_controls['relative_audio_path'] # Make path relative to processed_dir
df_ps_controls['original_corpus'] = 'PeopleSpeech'

df_diaries_dem = df_diaries_dem[['relative_audio_path', 'diarist_name', 'label']].copy()
df_diaries_dem.rename(columns={'diarist_name': 'speaker_id'}, inplace=True)
# !! Adjust diaries relative path based on where WAVs are saved !!
# Assuming they end up in data/processed/dementia_diaries/audio/ relative to notebook root
# Path should be relative to processed_dir for consistency
# Example: if wavs are in ../data/processed/dementia_diaries/audio
df_diaries_dem['relative_audio_path'] = 'dementia_diaries/' + df_diaries_dem['relative_audio_path']
df_diaries_dem['original_corpus'] = 'Diaries'

# DementiaNet paths need fixing - they point to original sources or processed splits
# We need paths relative to the processed_dir where the FINAL WAVs reside
# Assuming DNet files are in processed/train|validation|test/dementia|nodementia/
def adjust_dnet_path(row):
    label_dir = 'dementia' if row['label'] == 1 else 'nodementia'
    # Determine split from original path (this is fragile)
    if 'train' in row['file_path']: split = 'train'
    elif 'validation' in row['file_path']: split = 'validation'
    elif 'test' in row['file_path']: split = 'test'
    else: split = 'unknown_split' # Fallback
    # Use the file_name column if it exists and is just the basename
    filename = row.get('file_name', Path(row['file_path']).name)
    return f"{split}/{label_dir}/{filename}"

df_dnet_controls['relative_audio_path'] = df_dnet_all[df_dnet_all['label'] == 0].apply(adjust_dnet_path, axis=1)
df_dnet_dementia['relative_audio_path'] = df_dnet_all[df_dnet_all['label'] == 1].apply(adjust_dnet_path, axis=1)
df_dnet_controls = df_dnet_controls[['relative_audio_path', 'speaker_id', 'label']]
df_dnet_dementia = df_dnet_dementia[['relative_audio_path', 'speaker_id', 'label']]
df_dnet_controls['original_corpus'] = 'DNetControl'
df_dnet_dementia['original_corpus'] = 'DNetDementia'

# --- Combine All Metadata ---
df_all_controls = pd.concat([df_ps_controls, df_dnet_controls], ignore_index=True)
df_all_dementia = pd.concat([df_diaries_dem, df_dnet_dementia], ignore_index=True)

print(f"\nTotal Control Samples: {len(df_all_controls)}")
print(f"Total Dementia Samples: {len(df_all_dementia)}")

# --- Undersample Dementia Data to Match Controls ---
n_control = len(df_all_controls)
n_dementia = len(df_all_dementia)

if n_dementia > n_control:
    print(f"Undersampling Dementia data from {n_dementia} to {n_control} samples...")
    df_dementia_sampled = df_all_dementia.sample(n=n_control, random_state=42)
else:
    print("Warning: Fewer Dementia samples than Control samples. Using all Dementia samples.")
    # Optionally, undersample controls instead if desired:
    # df_all_controls = df_all_controls.sample(n=n_dementia, random_state=42)
    df_dementia_sampled = df_all_dementia

df_balanced = pd.concat([df_all_controls, df_dementia_sampled], ignore_index=True)
print(f"Created balanced dataset with {len(df_balanced)} total samples ({len(df_all_controls)} controls, {len(df_dementia_sampled)} dementia).")

# Ensure unique speaker IDs across corpora if necessary (e.g., prefixing)
# Example: df_balanced['speaker_id'] = df_balanced['original_corpus'] + '_' + df_balanced['speaker_id'].astype(str)
# For now, assume IDs are unique enough or collision risk is low

# --- Split Balanced Data (Train/Val/Test using Speaker Groups) ---
print("\nSplitting balanced data into Train/Validation/Test sets (80/10/10 split) by speaker...")

if df_balanced.empty:
     raise ValueError("Balanced dataframe is empty, cannot split.")
if 'speaker_id' not in df_balanced.columns:
     raise KeyError("Column 'speaker_id' not found for GroupShuffleSplit.")


# First split: 80% train, 20% temp (val+test)
gss_train_valtest = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, valtest_idx = next(gss_train_valtest.split(df_balanced, groups=df_balanced['speaker_id']))

df_train = df_balanced.iloc[train_idx]
df_valtest = df_balanced.iloc[valtest_idx]

# Second split: 50% of temp is validation, 50% is test (gives 10% overall)
gss_val_test = GroupShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
val_idx, test_idx = next(gss_val_test.split(df_valtest, groups=df_valtest['speaker_id']))

df_val = df_valtest.iloc[val_idx]
df_test = df_valtest.iloc[test_idx]

# --- Save Final Metadata Splits ---
df_train.to_csv(final_train_meta_path, index=False)
df_val.to_csv(final_val_meta_path, index=False)
df_test.to_csv(final_test_meta_path, index=False)

print("\n--- Final Split Summary ---")
print(f"Train set: {len(df_train)} samples ({df_train['label'].sum()} dementia, {len(df_train)-df_train['label'].sum()} control) from {df_train['speaker_id'].nunique()} speakers.")
print(f"Validation set: {len(df_val)} samples ({df_val['label'].sum()} dementia, {len(df_val)-df_val['label'].sum()} control) from {df_val['speaker_id'].nunique()} speakers.")
print(f"Test set: {len(df_test)} samples ({df_test['label'].sum()} dementia, {len(df_test)-df_test['label'].sum()} control) from {df_test['speaker_id'].nunique()} speakers.")
print(f"Saved Train metadata to: {final_train_meta_path}")
print(f"Saved Validation metadata to: {final_val_meta_path}")
print(f"Saved Test metadata to: {final_test_meta_path}")

print("\n--- Data Combination and Splitting Finished ---")

--- Combining and Splitting Final Dataset ---
Loaded 69152 People's Speech control samples.
Loaded 1818 Dementia Diaries dementia samples.
Loaded 324 DementiaNet control samples.
Loaded 131 DementiaNet dementia samples.

Total Control Samples: 69476
Total Dementia Samples: 1949
Created balanced dataset with 71425 total samples (69476 controls, 1949 dementia).

Splitting balanced data into Train/Validation/Test sets (80/10/10 split) by speaker...

--- Final Split Summary ---
Train set: 57483 samples (1887 dementia, 55596 control) from 55612 speakers.
Validation set: 6976 samples (32 dementia, 6944 control) from 6951 speakers.
Test set: 6966 samples (30 dementia, 6936 control) from 6952 speakers.
Saved Train metadata to: ..\data\processed\train_meta_balanced.csv
Saved Validation metadata to: ..\data\processed\val_meta_balanced.csv
Saved Test metadata to: ..\data\processed\test_meta_balanced.csv

--- Data Combination and Splitting Finished ---


In [16]:
# from pathlib import Path
# import shutil

# src_dir = Path(r"D:\Cornell\Academic\Spring 2025\Startup Studio\MVP\neurotone\ml\data\processed\test\nodementia")
# dst_dir = Path(r"D:\Cornell\Academic\Spring 2025\Startup Studio\MVP\neurotone\ml\data\no_dementia")
# dst_dir.mkdir(parents=True, exist_ok=True)

# for file_path in src_dir.rglob('*'):
#     if not file_path.is_file():
#         continue
#     dest = dst_dir / file_path.name
#     if dest.exists():
#         # Skip duplicates by filename
#         continue
#     shutil.move(str(file_path), str(dest))