# Feature Extraction
Make sure to use the correct data. The .phn files should be in the TIMIT format and the audios as wav files

In [4]:
import os
import sys
import librosa
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import logging
import pickle
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FeatureExtractor:
    """Improved Feature Extractor with better error handling and normalization"""
    
    def __init__(self, config=None):
        # Default configuration
        self.sr = 16000
        self.n_fft = 400         # 25 ms
        self.hop_length = 160    # 10 ms  
        self.n_mfcc = 13
        self.include_delta = True
        self.include_delta2 = True
        
        # Updated paths to match actual structure
        self.base_dir = Path("datasets_original/en")
        self.wav_dir = self.base_dir / "wav"
        self.phn_dir = self.base_dir / "phn"
        self.features_out = self.base_dir / "features"
        self.labels_out = self.base_dir / "labels"
        
        # Create output directories
        self.features_out.mkdir(parents=True, exist_ok=True)
        self.labels_out.mkdir(parents=True, exist_ok=True)
        
        # Statistics for global normalization
        self.scaler = StandardScaler()
        self.is_fitted = False
        
        # Error tracking
        self.errors = []
        self.processed_files = []
        
    def validate_paths(self):
        """Validate that required directories exist"""
        if not self.wav_dir.exists():
            raise FileNotFoundError(f"WAV directory not found: {self.wav_dir}")
        if not self.phn_dir.exists():
            raise FileNotFoundError(f"PHN directory not found: {self.phn_dir}")
            
        wav_files = list(self.wav_dir.glob("*.wav"))
        phn_files = list(self.phn_dir.glob("*.phn"))
        
        logger.info(f"Found {len(wav_files)} WAV files and {len(phn_files)} PHN files")
        return len(wav_files), len(phn_files)
    
    def load_phoneme_intervals(self, phn_path):
        """Load phoneme intervals from PHN file with validation"""
        intervals = []
        try:
            with open(phn_path, "r", encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                        
                    parts = line.split()
                    if len(parts) != 3:
                        logger.warning(f"Invalid line {line_num} in {phn_path}: {line}")
                        continue
                    
                    try:
                        start_samples, end_samples, phoneme = parts
                        start_time = int(start_samples) / self.sr
                        end_time = int(end_samples) / self.sr
                        
                        # Validation
                        if start_time >= end_time:
                            logger.warning(f"Invalid time interval in {phn_path}: {start_time} >= {end_time}")
                            continue
                            
                        intervals.append((start_time, end_time, phoneme.upper()))
                    except ValueError as e:
                        logger.warning(f"Could not parse line {line_num} in {phn_path}: {e}")
                        continue
                        
        except Exception as e:
            logger.error(f"Error reading {phn_path}: {e}")
            return []
        
        return intervals
    
    def assign_labels_robust(self, num_frames, frame_times, intervals):
        """Robust label assignment with overlap handling"""
        labels = ["SIL"] * num_frames
        
        for i, frame_time in enumerate(frame_times):
            assigned = False
            for start_time, end_time, phoneme in intervals:
                # Use frame center for assignment
                if start_time <= frame_time < end_time:
                    labels[i] = phoneme
                    assigned = True
                    break
            
            # If not assigned and we have intervals, find closest
            if not assigned and intervals:
                closest_interval = min(intervals, 
                                     key=lambda x: min(abs(frame_time - x[0]), abs(frame_time - x[1])))
                # Only assign if reasonably close (within 50ms)
                if min(abs(frame_time - closest_interval[0]), abs(frame_time - closest_interval[1])) < 0.05:
                    labels[i] = closest_interval[2]
        
        return labels
    
    def extract_features_single(self, wav_path, normalize=False):
        """Extract features from a single audio file"""
        try:
            # Load audio with validation
            y, sr_actual = librosa.load(wav_path, sr=self.sr)
            
            if len(y) == 0:
                raise ValueError("Empty audio file")
            
            if sr_actual != self.sr:
                logger.warning(f"Sample rate mismatch in {wav_path}: {sr_actual} vs {self.sr}")
            
            # Extract MFCC features
            mfcc = librosa.feature.mfcc(
                y=y, sr=self.sr, n_mfcc=self.n_mfcc, 
                n_fft=self.n_fft, hop_length=self.hop_length
            )
            
            features = [mfcc]
            
            # Add delta features if requested
            if self.include_delta:
                delta = librosa.feature.delta(mfcc)
                features.append(delta)
                
            if self.include_delta2:
                delta2 = librosa.feature.delta(mfcc, order=2)
                features.append(delta2)
            
            # Combine features [n_features, n_frames] -> [n_frames, n_features]
            combined_features = np.vstack(features).T
            
            # Local normalization if global not available yet
            if normalize and not self.is_fitted:
                mean = np.mean(combined_features, axis=0)
                std = np.std(combined_features, axis=0) + 1e-8
                combined_features = (combined_features - mean) / std
            
            return combined_features, y
            
        except Exception as e:
            logger.error(f"Feature extraction failed for {wav_path}: {e}")
            return None, None

In [5]:
def process_file_robust(self, base_name):
    """Process a single file with comprehensive error handling"""
    wav_path = self.wav_dir / f"{base_name}.wav"
    phn_path = self.phn_dir / f"{base_name}.phn"
    
    # Check if files exist
    if not wav_path.exists():
        error_msg = f"WAV file missing: {wav_path}"
        self.errors.append(error_msg)
        return {"status": "missing_wav", "base": base_name, "error": error_msg}
        
    if not phn_path.exists():
        error_msg = f"PHN file missing: {phn_path}"
        self.errors.append(error_msg)
        return {"status": "missing_phn", "base": base_name, "error": error_msg}
    
    try:
        # Extract features
        features, audio = self.extract_features_single(wav_path, normalize=True)
        if features is None:
            return {"status": "feature_error", "base": base_name}
        
        # Load phoneme intervals
        intervals = self.load_phoneme_intervals(phn_path)
        if not intervals:
            error_msg = f"No valid phoneme intervals found in {phn_path}"
            self.errors.append(error_msg)
            return {"status": "no_intervals", "base": base_name, "error": error_msg}
        
        # Calculate frame times
        num_frames = features.shape[0]
        frame_times = (np.arange(num_frames) * self.hop_length + self.n_fft // 2) / self.sr
        
        # Assign labels
        labels = self.assign_labels_robust(num_frames, frame_times, intervals)
        
        # Validation: check if we have reasonable label distribution
        unique_labels = set(labels)
        if len(unique_labels) <= 1:
            error_msg = f"Only one unique label found: {unique_labels}"
            self.errors.append(error_msg)
            return {"status": "label_error", "base": base_name, "error": error_msg}
        
        # Save features and labels
        feature_file = self.features_out / f"{base_name}.npy"
        label_file = self.labels_out / f"{base_name}.txt"
        
        np.save(feature_file, features.astype(np.float32))
        with open(label_file, "w", encoding='utf-8') as f:
            f.write("\n".join(labels))
        
        # Return statistics
        return {
            "status": "success",
            "base": base_name,
            "frames": num_frames,
            "duration": len(audio) / self.sr,
            "unique_labels": len(unique_labels),
            "label_distribution": {label: labels.count(label) for label in unique_labels}
        }
        
    except Exception as e:
        error_msg = f"Processing error for {base_name}: {str(e)}"
        logger.error(error_msg)
        self.errors.append(error_msg)
        return {"status": "processing_error", "base": base_name, "error": error_msg}

def collect_statistics_pass(self, file_list, max_files=None):
    """First pass: collect statistics for global normalization"""
    logger.info("Collecting statistics for global normalization...")
    
    if max_files:
        file_list = file_list[:max_files]
    
    all_features = []
    
    with ThreadPoolExecutor(max_workers=min(cpu_count(), 8)) as executor:
        futures = []
        for base_name in file_list:
            wav_path = self.wav_dir / f"{base_name}.wav"
            phn_path = self.phn_dir / f"{base_name}.phn"
            # Only process if both files exist
            if wav_path.exists() and phn_path.exists():
                future = executor.submit(self.extract_features_single, wav_path, normalize=False)
                futures.append(future)
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Collecting stats"):
            try:
                features, _ = future.result()
                if features is not None:
                    # Sample frames if too many (for memory efficiency)
                    if features.shape[0] > 1000:
                        indices = np.random.choice(features.shape[0], 1000, replace=False)
                        features = features[indices]
                    all_features.append(features)
            except Exception as e:
                logger.warning(f"Error in statistics collection: {e}")
    
    if all_features:
        # Combine all features for global statistics
        combined = np.vstack(all_features)
        self.scaler.fit(combined)
        self.is_fitted = True
        
        # Save scaler for later use
        scaler_path = self.base_dir / "feature_scaler.pkl"
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scaler, f)
        
        logger.info(f"Global normalization fitted on {combined.shape[0]} frames")
        logger.info(f"Feature dimensions: {combined.shape[1]}")
        logger.info(f"Scaler saved to: {scaler_path}")
    else:
        logger.error("No features collected for normalization!")

def process_all_files(self, max_workers=None, collect_stats=True, max_stats_files=1000):
    """Process all files with improved pipeline - only files with both WAV and PHN"""
    try:
        # Validate paths first
        self.validate_paths()
        
        # Get all base names that have both WAV and PHN files
        wav_files = list(self.wav_dir.glob("*.wav"))
        phn_files = list(self.phn_dir.glob("*.phn"))
        
        wav_bases = {f.stem for f in wav_files}
        phn_bases = {f.stem for f in phn_files}
        
        # Only process files that have both WAV and PHN
        matching_bases = wav_bases & phn_bases
        missing_phn = wav_bases - phn_bases
        
        if not matching_bases:
            logger.error("No files found that have both WAV and PHN!")
            return
        
        logger.info(f"Found {len(wav_files)} WAV files and {len(phn_files)} PHN files")
        logger.info(f"Processing {len(matching_bases)} files that have both WAV and PHN")
        if missing_phn:
            logger.info(f"Skipping {len(missing_phn)} WAV files without corresponding PHN files")
        
        base_names = list(matching_bases)
        
        # First pass: collect statistics for normalization
        if collect_stats:
            self.collect_statistics_pass(base_names, max_stats_files)
        
        # Second pass: process all files with global normalization
        logger.info("Processing all matching files...")
        
        if max_workers is None:
            max_workers = min(cpu_count(), 8)
        
        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(self.process_file_robust, base): base for base in base_names}
            
            for future in tqdm(as_completed(futures), total=len(futures), 
                                desc="Processing files", unit="files"):
                try:
                    result = future.result()
                    results.append(result)
                except Exception as e:
                    base = futures[future]
                    error_msg = f"Unexpected error processing {base}: {e}"
                    logger.error(error_msg)
                    self.errors.append(error_msg)
                    results.append({"status": "unexpected_error", "base": base, "error": error_msg})
        
        # Process results
        self.analyze_results(results)
        return results
        
    except Exception as e:
        logger.error(f"Fatal error in processing pipeline: {e}")
        raise

def analyze_results(self, results):
    """Analyze and report processing results"""
    status_counts = {}
    successful_files = []
    
    for result in results:
        status = result["status"]
        status_counts[status] = status_counts.get(status, 0) + 1
        if status == "success":
            successful_files.append(result)
    
    # Print summary
    print("\n" + "="*50)
    print("FEATURE EXTRACTION RESULTS")
    print("="*50)
    print(f"Total files processed: {len(results)}")
    
    for status, count in status_counts.items():
        print(f"{status:.<20} {count:>6}")
    
    if successful_files:
        # Statistics on successful files
        total_frames = sum(r["frames"] for r in successful_files)
        total_duration = sum(r["duration"] for r in successful_files)
        avg_duration = total_duration / len(successful_files)
        
        print(f"\nSuccessful processing statistics:")
        print(f"Total frames: {total_frames:,}")
        print(f"Total duration: {total_duration:.2f} seconds ({total_duration/3600:.2f} hours)")
        print(f"Average duration: {avg_duration:.2f} seconds")
    
    # Save error log
    if self.errors:
        error_log_path = self.base_dir / "feature_extraction_errors.log"
        with open(error_log_path, "w", encoding='utf-8') as f:
            f.write("\n".join(self.errors))
        print(f"\nErrors logged to: {error_log_path}")
    
    print("="*50)

# Dynamically add the methods to the FeatureExtractor class
FeatureExtractor.process_file_robust = process_file_robust
FeatureExtractor.collect_statistics_pass = collect_statistics_pass  
FeatureExtractor.process_all_files = process_all_files
FeatureExtractor.analyze_results = analyze_results

print("✅ Methods successfully updated to handle WAV-PHN file matching!")

✅ Methods successfully updated to handle WAV-PHN file matching!


In [10]:
# Initialize the improved feature extractor with correct paths
import os
from pathlib import Path

# Get the correct base directory
notebook_dir = Path.cwd()
print(f"Current directory: {notebook_dir}")

# Navigate to project root
if "src" in notebook_dir.parts:
    # We're in src/data_processing, go up to project root
    project_root = notebook_dir.parent.parent
elif notebook_dir.name == "notebooks":
    project_root = notebook_dir.parent
else:
    project_root = notebook_dir

print(f"Project root: {project_root}")
print(f"Looking for data in: {project_root / 'datasets_original' / 'en'}")

# Check if the path exists
data_path = project_root / "datasets_original" / "en"
if data_path.exists():
    print(f"✅ Data directory found: {data_path}")
    wav_path = data_path / "wav"
    phn_path = data_path / "phn"
    print(f"WAV dir exists: {wav_path.exists()}")
    print(f"PHN dir exists: {phn_path.exists()}")
    
    if wav_path.exists() and phn_path.exists():
        # Initialize extractor with correct base directory
        extractor = FeatureExtractor()
        extractor.base_dir = data_path
        extractor.wav_dir = data_path / "wav"
        extractor.phn_dir = data_path / "phn"
        extractor.features_out = data_path / "features"
        extractor.labels_out = data_path / "labels"
        
        # Create output directories
        extractor.features_out.mkdir(parents=True, exist_ok=True)
        extractor.labels_out.mkdir(parents=True, exist_ok=True)
        
        print(f"✅ Extractor configured with:")
        print(f"  Base dir: {extractor.base_dir}")
        print(f"  WAV dir: {extractor.wav_dir}")
        print(f"  PHN dir: {extractor.phn_dir}")
        print(f"  Features out: {extractor.features_out}")
        print(f"  Labels out: {extractor.labels_out}")
        
        # Check if features and labels already exist
        feature_files = list(extractor.features_out.glob("*.npy"))
        label_files = list(extractor.labels_out.glob("*.txt"))
        
        print(f"\n📊 Existing Data Analysis:")
        print(f"Existing feature files: {len(feature_files)}")
        print(f"Existing label files: {len(label_files)}")
        
        if feature_files and label_files:
            print("✅ Found existing features and labels - ready for OOV cleaning!")
        else:
            print("⚠️ No existing features/labels found - run feature extraction first!")
        
    else:
        print("❌ WAV or PHN directories not found!")
        print(f"Available in {data_path}:")
        if data_path.exists():
            for item in data_path.iterdir():
                if item.is_dir():
                    print(f"  {item.name}")
else:
    print(f"❌ Data directory not found: {data_path}")
    print("Available directories in project root:")
    if project_root.exists():
        for item in project_root.iterdir():
            if item.is_dir():
                print(f"  {item.name}")

Current directory: c:\Users\sebas\Desktop\commonVoiceDataset\src\data_processing
Project root: c:\Users\sebas\Desktop\commonVoiceDataset
Looking for data in: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en
✅ Data directory found: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en
WAV dir exists: True
PHN dir exists: True
✅ Extractor configured with:
  Base dir: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en
  WAV dir: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en\wav
  PHN dir: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en\phn
  Features out: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en\features
  Labels out: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en\labels

📊 Existing Data Analysis:
Existing feature files: 26482
Existing label files: 26482
✅ Found existing features and labels - ready for OOV cleaning!


In [8]:
# Full processing with the improved pipeline
print("\n" + "="*60)
print("STARTING FULL FEATURE EXTRACTION")
print("="*60)

try:
    # Process all files
    results = extractor.process_all_files(
        max_workers=cpu_count(),  # Use all available cores
        collect_stats=True,       # Collect statistics for global normalization
        max_stats_files=1000      # Use subset for statistics (memory efficiency)
    )
    
    print("\n✅ Feature extraction completed successfully!")
    
    # Optional: Save processing report
    if results:
        report_path = extractor.base_dir / "processing_report.txt"
        with open(report_path, "w", encoding='utf-8') as f:
            f.write("Feature Extraction Report\n")
            f.write("="*50 + "\n\n")
            
            status_counts = {}
            for result in results:
                status = result["status"]
                status_counts[status] = status_counts.get(status, 0) + 1
            
            for status, count in status_counts.items():
                f.write(f"{status}: {count}\n")
        
        print(f"📄 Processing report saved to: {report_path}")
    
except Exception as e:
    print(f"❌ Feature extraction failed: {e}")
    import traceback
    traceback.print_exc()


STARTING FULL FEATURE EXTRACTION


INFO:__main__:Found 676641 WAV files and 26482 PHN files
INFO:__main__:Found 676641 WAV files and 26482 PHN files
INFO:__main__:Processing 26482 files that have both WAV and PHN
INFO:__main__:Skipping 650159 WAV files without corresponding PHN files
INFO:__main__:Collecting statistics for global normalization...
INFO:__main__:Found 676641 WAV files and 26482 PHN files
INFO:__main__:Processing 26482 files that have both WAV and PHN
INFO:__main__:Skipping 650159 WAV files without corresponding PHN files
INFO:__main__:Collecting statistics for global normalization...
Collecting stats: 100%|██████████| 1000/1000 [00:02<00:00, 403.41it/s]

INFO:__main__:Global normalization fitted on 523595 frames
INFO:__main__:Feature dimensions: 39
INFO:__main__:Scaler saved to: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en\feature_scaler.pkl
INFO:__main__:Global normalization fitted on 523595 frames
INFO:__main__:Feature dimensions: 39
INFO:__main__:Scaler saved to: c:\Users\sebas\Deskto


FEATURE EXTRACTION RESULTS
Total files processed: 26482
success.............  26482

Successful processing statistics:
Total frames: 14,237,225
Total duration: 142221.29 seconds (39.51 hours)
Average duration: 5.37 seconds

✅ Feature extraction completed successfully!
📄 Processing report saved to: c:\Users\sebas\Desktop\commonVoiceDataset\datasets_original\en\processing_report.txt


In [9]:
# Data validation and analysis functions
def validate_extracted_features(features_dir, labels_dir, sample_size=10):
    """Validate extracted features and labels"""
    print("\n" + "="*50)
    print("VALIDATING EXTRACTED DATA")
    print("="*50)
    
    features_dir = Path(features_dir)
    labels_dir = Path(labels_dir)
    
    feature_files = list(features_dir.glob("*.npy"))
    label_files = list(labels_dir.glob("*.txt"))
    
    print(f"Feature files: {len(feature_files)}")
    print(f"Label files: {len(label_files)}")
    
    # Check matching files
    feature_bases = {f.stem for f in feature_files}
    label_bases = {f.stem for f in label_files}
    
    matching = feature_bases & label_bases
    missing_labels = feature_bases - label_bases
    missing_features = label_bases - feature_bases
    
    print(f"Matching pairs: {len(matching)}")
    if missing_labels:
        print(f"Missing labels: {len(missing_labels)}")
    if missing_features:
        print(f"Missing features: {len(missing_features)}")
    
    # Sample validation
    if matching:
        sample_files = list(matching)[:sample_size]
        print(f"\nValidating {len(sample_files)} sample files...")
        
        all_phonemes = set()
        feature_shapes = []
        
        for base in sample_files:
            try:
                # Load features
                features = np.load(features_dir / f"{base}.npy")
                feature_shapes.append(features.shape)
                
                # Load labels
                with open(labels_dir / f"{base}.txt", 'r') as f:
                    labels = [line.strip() for line in f.readlines()]
                
                all_phonemes.update(labels)
                
                # Check alignment
                if features.shape[0] != len(labels):
                    print(f"⚠️ Length mismatch in {base}: features={features.shape[0]}, labels={len(labels)}")
                else:
                    print(f"✅ {base}: {features.shape[0]} frames, {len(set(labels))} unique phonemes")
                    
            except Exception as e:
                print(f"❌ Error validating {base}: {e}")
        
        print(f"\nSummary:")
        print(f"Feature dimensions: {set(feature_shapes)}")
        print(f"Total unique phonemes found: {len(all_phonemes)}")
        print(f"Phonemes: {sorted(all_phonemes)}")
        
        return {
            'total_files': len(matching),
            'feature_shapes': feature_shapes,
            'phonemes': sorted(all_phonemes),
            'missing_labels': len(missing_labels),
            'missing_features': len(missing_features)
        }
    
    return None

# Run validation
validation_results = validate_extracted_features(
    extractor.features_out, 
    extractor.labels_out, 
    sample_size=20
)


VALIDATING EXTRACTED DATA
Feature files: 26482
Label files: 26482
Matching pairs: 26482

Validating 20 sample files...
✅ 449dc4bdcb0b674ce1a0a868a1c46a4b959625fe76065922d7329e44673e727e1051c7e09e6fe923487bdbb8f8a4726845c69fb038026083e26469c8f5502c24: 697 frames, 24 unique phonemes
✅ 7edb2af1e2f4fc20864cd7ca641c5f43a77b96dc50ae9091417b39e237bf1936e1ae7a28d52dc5fb591108b9921c15c63896914fe6f6b89021ff10aba540888e: 346 frames, 12 unique phonemes
✅ 8b6f2401d6c5d31d3afa804c37559dce47696cb31735bdd213a3c6567b44c38bdd14b303110a950c71c4cd33d75da0ca43aca8efdb9dedd44ee848e6aa7f8ea6: 310 frames, 13 unique phonemes
✅ d6c3894252d0c62099fffaefb680eee43f84cd181a88ec4e219daf2da8e3b4becea0db68f946a9592434d01efe5a2a22890f86ae6fe2728a739dd718ab0dd9a1: 989 frames, 16 unique phonemes
✅ a721196a35faa57cae99a3cd0260647d5b64beef00f6d83a2ce38d5c2f8b987faca1043b04e1e3671f43a93a396a8503fdaf5eb9808a960f63524b4ad1088bdc: 274 frames, 12 unique phonemes
✅ 39bd19fcce3ff45adf9cf8e6168117f0b0cdba49df6ad802ea0cf80925aba17

## Remove OOV Frames

In [11]:
def clean_oov_data(self, remove_oov_utterances=False, segment_at_oov=True, replace_with_silence=False):
    """
    Clean OOV labels from extracted features and labels
    
    Args:
        remove_oov_utterances: Remove entire utterances that contain OOV
        segment_at_oov: Split sequences at OOV boundaries (recommended)
        replace_with_silence: Replace OOV with SIL instead of segmenting
    """
    print("\n" + "="*60)
    print("CLEANING OOV DATA")
    print("="*60)
    
    feature_files = list(self.features_out.glob("*.npy"))
    total_files = len(feature_files)
    total_segments_created = 0
    total_utterances_removed = 0
    total_oov_frames = 0
    
    # Create clean output directories
    clean_features_dir = self.base_dir / "features_clean"
    clean_labels_dir = self.base_dir / "labels_clean"
    clean_features_dir.mkdir(exist_ok=True)
    clean_labels_dir.mkdir(exist_ok=True)
    
    oov_stats = {}
    
    for feature_file in tqdm(feature_files, desc="Processing files"):
        base_name = feature_file.stem
        label_file = self.labels_out / f"{base_name}.txt"
        
        if not label_file.exists():
            continue
            
        # Load features and labels
        features = np.load(feature_file)
        with open(label_file, 'r') as f:
            labels = [line.strip() for line in f.readlines()]
        
        # Check for OOV
        oov_indices = [i for i, label in enumerate(labels) if label == 'OOV']
        
        if not oov_indices:
            # No OOV, copy as is
            np.save(clean_features_dir / f"{base_name}.npy", features)
            with open(clean_labels_dir / f"{base_name}.txt", 'w') as f:
                f.write('\n'.join(labels))
            continue
        
        total_oov_frames += len(oov_indices)
        oov_stats[base_name] = len(oov_indices)
        
        if remove_oov_utterances:
            # Skip entire utterance if it contains OOV
            total_utterances_removed += 1
            continue
        
        if replace_with_silence:
            # Replace OOV with SIL
            clean_labels = [label if label != 'OOV' else 'SIL' for label in labels]
            np.save(clean_features_dir / f"{base_name}.npy", features)
            with open(clean_labels_dir / f"{base_name}.txt", 'w') as f:
                f.write('\n'.join(clean_labels))
            continue
        
        if segment_at_oov:
            # Segment at OOV boundaries
            segments = self._create_segments_from_oov(features, labels, oov_indices)
            
            for seg_idx, (seg_features, seg_labels) in enumerate(segments):
                if len(seg_features) < 10:  # Skip very short segments
                    continue
                
                segment_name = f"{base_name}_seg{seg_idx:02d}"
                np.save(clean_features_dir / f"{segment_name}.npy", seg_features)
                with open(clean_labels_dir / f"{segment_name}.txt", 'w') as f:
                    f.write('\n'.join(seg_labels))
                total_segments_created += 1
    
    # Print statistics
    print(f"\n📊 OOV Cleaning Results:")
    print(f"Total files processed: {total_files}")
    print(f"Files with OOV: {len(oov_stats)}")
    print(f"Total OOV frames removed: {total_oov_frames}")
    
    if remove_oov_utterances:
        print(f"Utterances removed: {total_utterances_removed}")
    elif segment_at_oov:
        print(f"Segments created: {total_segments_created}")
    
    return oov_stats

def _create_segments_from_oov(self, features, labels, oov_indices):
    """Create segments by splitting at OOV boundaries"""
    segments = []
    
    # Add boundaries at start, OOV positions, and end
    boundaries = [0] + oov_indices + [len(labels)]
    boundaries = sorted(set(boundaries))  # Remove duplicates and sort
    
    for i in range(len(boundaries) - 1):
        start = boundaries[i]
        end = boundaries[i + 1]
        
        # Skip if this segment would start with OOV
        if start in oov_indices:
            # Find next non-OOV position
            while start < end and start in oov_indices:
                start += 1
        
        # Skip if this segment would end with OOV  
        if end - 1 in oov_indices:
            # Find last non-OOV position
            while end > start and (end - 1) in oov_indices:
                end -= 1
        
        if start < end and end - start >= 5:  # Minimum segment length
            seg_features = features[start:end]
            seg_labels = labels[start:end]
            
            # Verify no OOV in this segment
            if 'OOV' not in seg_labels:
                segments.append((seg_features, seg_labels))
    
    return segments

# Add method to FeatureExtractor class
FeatureExtractor.clean_oov_data = clean_oov_data
FeatureExtractor._create_segments_from_oov = _create_segments_from_oov

In [12]:
oov_stats = extractor.clean_oov_data(segment_at_oov=True)



CLEANING OOV DATA


Processing files: 100%|██████████| 26482/26482 [05:12<00:00, 84.64it/s]


📊 OOV Cleaning Results:
Total files processed: 26482
Files with OOV: 6136
Total OOV frames removed: 298734
Segments created: 13079





In [13]:
# Validate cleaned data
def validate_cleaned_data():
    """Validate the cleaned features and labels"""
    print("\n" + "="*60)
    print("VALIDATING CLEANED DATA")
    print("="*60)
    
    clean_features_dir = extractor.base_dir / "features_clean"
    clean_labels_dir = extractor.base_dir / "labels_clean"
    
    # Count files
    clean_feature_files = list(clean_features_dir.glob("*.npy"))
    clean_label_files = list(clean_labels_dir.glob("*.txt"))
    
    print(f"Clean feature files: {len(clean_feature_files)}")
    print(f"Clean label files: {len(clean_label_files)}")
    
    # Sample some files to check for OOV
    sample_files = clean_label_files[:20]
    total_oov_found = 0
    total_frames = 0
    all_phonemes = set()
    
    print(f"\nChecking {len(sample_files)} sample files for OOV...")
    
    for label_file in sample_files:
        base_name = label_file.stem
        feature_file = clean_features_dir / f"{base_name}.npy"
        
        if feature_file.exists():
            # Load labels and check for OOV
            with open(label_file, 'r') as f:
                labels = [line.strip() for line in f.readlines()]
            
            # Load features
            features = np.load(feature_file)
            
            oov_count = labels.count('OOV')
            total_oov_found += oov_count
            total_frames += len(labels)
            all_phonemes.update(labels)
            
            if oov_count > 0:
                print(f"⚠️ Found {oov_count} OOV in {base_name}")
            else:
                print(f"✅ {base_name}: {len(labels)} frames, {len(set(labels))} unique phonemes")
    
    print(f"\n📊 Validation Results:")
    print(f"Total frames checked: {total_frames}")
    print(f"Total OOV found: {total_oov_found}")
    print(f"OOV percentage: {(total_oov_found/total_frames)*100:.4f}%")
    print(f"Unique phonemes in sample: {len(all_phonemes)}")
    print(f"Phonemes: {sorted(all_phonemes)}")
    
    # Check for segment files
    segment_files = [f for f in clean_feature_files if '_seg' in f.name]
    original_files = [f for f in clean_feature_files if '_seg' not in f.name]
    
    print(f"\n📈 File Distribution:")
    print(f"Original files (no OOV): {len(original_files)}")
    print(f"Segment files (from OOV split): {len(segment_files)}")
    print(f"Total clean files: {len(clean_feature_files)}")
    
    return {
        'total_files': len(clean_feature_files),
        'original_files': len(original_files),
        'segment_files': len(segment_files),
        'oov_found': total_oov_found,
        'phonemes': sorted(all_phonemes)
    }

# Run validation
validation_results = validate_cleaned_data()


VALIDATING CLEANED DATA
Clean feature files: 33425
Clean label files: 33425

Checking 20 sample files for OOV...
✅ 0001de01fdda751c9232125a5abf13b96a40786d90edae74724c441d867fea3238a4cd28be62255ab3881695c93a8b21a5818c0fa400642493da246be69df417: 334 frames, 18 unique phonemes
✅ 000678f13f95dba9bd927bd4e955233c4168496654e5629981f21bd3bdeef10af51d761b55ca11657f4844c836d61ef8502ee18b7edeb9520e216b3e5fdad561: 435 frames, 26 unique phonemes
✅ 00088b44ab0466aa98e4515a7d44b50f8890d2bbbe457899ba549456c8273afb6da8c557c4842f19cb52cd4c1c83cf628994dcc8f4f1c7e9b4d0930d0ee13805_seg00: 155 frames, 9 unique phonemes
✅ 00088b44ab0466aa98e4515a7d44b50f8890d2bbbe457899ba549456c8273afb6da8c557c4842f19cb52cd4c1c83cf628994dcc8f4f1c7e9b4d0930d0ee13805_seg01: 12 frames, 1 unique phonemes
✅ 00088b44ab0466aa98e4515a7d44b50f8890d2bbbe457899ba549456c8273afb6da8c557c4842f19cb52cd4c1c83cf628994dcc8f4f1c7e9b4d0930d0ee13805_seg02: 229 frames, 15 unique phonemes
✅ 000948bbca46377ba9cf2d3be56a95926e7eb35bc17317dea7e5a7