In [1]:
import os
import glob
import random
import numpy as np
import librosa
import soundfile as sf
import pandas as pd
from tqdm import tqdm

# --- CONFIGURATION ---
# Root of the raw dataset. main() scans its 'train', 'val' and 'test' subfolders
# (one sub-subfolder per class) and skips any split folder that is missing.
INPUT_ROOT = "RawDataset"         # Must contain train/ test/ val subfolders
# Processed audio is written to OUTPUT_ROOT/<split>/<class>/<name>.wav
OUTPUT_ROOT = "processed_data"  # Where clean files will go
# Index CSV written by main(): file_path, label_name, split, label_idx
CSV_FILENAME = "dataset_final.csv"

# Sample rate (Hz) every file is resampled to on load.
TARGET_SR = 16000     # Audio-MAE requirement
# Per-class cap on the number of files kept in the 'train' split.
MAX_FILES_TRAIN = 1000 # Limit for TRAIN folders
# Per-class cap applied to every split other than 'train'.
MAX_FILES_TEST = 200   # Limit for TEST/VAL (usually we want fewer here)
# When True, leading/trailing silence is trimmed (librosa.effects.trim, top_db=20).
TRIM_SILENCE = True   

def preprocess_audio(file_path, output_path):
    """Load one audio file, clean it and save it as a normalized mono file.

    Steps: load as mono and resample to TARGET_SR, optionally trim leading and
    trailing silence (TRIM_SILENCE), peak-normalize to 0.9 of full scale, then
    write the result to ``output_path``.

    Args:
        file_path: Path of the source audio file.
        output_path: Destination path for the processed audio.

    Returns:
        True if the file was processed and written, False otherwise. Failures
        are reported on stdout (with the offending path) and never raised, so
        one bad file cannot abort a batch run.
    """
    try:
        # 1. Load & Resample
        y, sr = librosa.load(file_path, sr=TARGET_SR, mono=True)

        # 2. Trim Silence
        if TRIM_SILENCE:
            y, _ = librosa.effects.trim(y, top_db=20)

        # An empty signal (e.g. a clip that is silent end to end) would otherwise
        # only fail inside np.max with a cryptic "zero-size array" message.
        if y.size == 0:
            print(f"Error: '{file_path}' contains no audio after loading/trimming.")
            return False

        # 3. Peak Normalization to 0.9 of full scale (about -0.9 dBFS)
        peak = np.max(np.abs(y))
        if peak > 0:
            y = y / peak * 0.9

        # 4. Save
        sf.write(output_path, y, sr)
        return True
    except Exception as e:
        # Best-effort batch processing: report which file failed and carry on.
        print(f"Error processing '{file_path}': {e}")
        return False

def _unique_wav_name(src_name, taken):
    """Return a '.wav' file name for ``src_name`` that is not yet in ``taken``.

    Sources such as 'a.mp3' and 'a.flac' would both map to 'a.wav'. The first
    claimant keeps the plain name; later ones get the source format appended
    ('a_flac.wav'), then a counter if that is taken too. Names are compared
    case-insensitively so the result is also safe on case-insensitive
    filesystems. The chosen name is recorded in ``taken``.
    """
    stem, ext = os.path.splitext(src_name)
    suffix = ext.lstrip('.').lower()
    candidate = f"{stem}.wav"
    attempt = 1
    while candidate.lower() in taken:
        if attempt == 1:
            candidate = f"{stem}_{suffix}.wav"
        else:
            candidate = f"{stem}_{suffix}_{attempt}.wav"
        attempt += 1
    taken.add(candidate.lower())
    return candidate


def main():
    """Preprocess the raw dataset and write the CSV index of processed files.

    For each of the 'train', 'val' and 'test' folders under INPUT_ROOT, every
    class subfolder is scanned for .wav/.mp3/.flac files. Classes with more
    files than the split's cap (MAX_FILES_TRAIN for 'train', MAX_FILES_TEST
    otherwise) are randomly subsampled to the cap. Each selected file is run
    through preprocess_audio() and written to
    OUTPUT_ROOT/<split>/<class>/<name>.wav.

    The index (file_path, label_name, split, label_idx) is saved to
    CSV_FILENAME. label_idx is the position of the class name in the sorted
    list of all class names that produced at least one file.

    Returns:
        None. Returns early, after printing an error, if INPUT_ROOT is missing
        or no file could be processed.
    """
    if not os.path.exists(INPUT_ROOT):
        print(f"❌ Error: '{INPUT_ROOT}' not found.")
        return

    audio_exts = ('.wav', '.mp3', '.flac')
    processed_records = []

    # We look for these 3 specific subfolders
    for split_name in ('train', 'val', 'test'):
        split_path = os.path.join(INPUT_ROOT, split_name)

        # isdir (not exists): a stray *file* named like a split must be skipped too
        if not os.path.isdir(split_path):
            print(f"⚠️  Warning: Split folder '{split_name}' not found in '{INPUT_ROOT}'. Skipping.")
            continue

        print(f"\n📂 Processing SPLIT: {split_name.upper()}")

        # Per-class file cap for this split
        limit = MAX_FILES_TRAIN if split_name == 'train' else MAX_FILES_TEST

        # Classes are the subfolders of the split folder
        classes = sorted(
            d for d in os.listdir(split_path)
            if os.path.isdir(os.path.join(split_path, d))
        )

        for class_name in classes:
            source_dir = os.path.join(split_path, class_name)
            target_dir = os.path.join(OUTPUT_ROOT, split_name, class_name)
            os.makedirs(target_dir, exist_ok=True)

            # glob.escape keeps folder names containing '[', '*' or '?' from being
            # read as patterns; sorting makes the processing order deterministic.
            files = sorted(
                f for f in glob.glob(os.path.join(glob.escape(source_dir), "*"))
                if f.lower().endswith(audio_exts)
            )

            # --- BALANCING ---
            # NOTE: the subsample is unseeded, so a re-run may pick different files.
            if len(files) > limit:
                print(f"   🔸 {class_name}: {len(files)} -> Capping at {limit}")
                selected_files = random.sample(files, limit)
            else:
                print(f"   🔹 {class_name}: {len(files)} (Kept all)")
                selected_files = files

            # --- PROCESSING ---
            taken_names = set()  # output names already used in this class folder
            for file_path in tqdm(selected_files, desc=f"     Processing {class_name}", leave=False):
                dst_filename = _unique_wav_name(os.path.basename(file_path), taken_names)
                dst_path = os.path.join(target_dir, dst_filename)

                if preprocess_audio(file_path, dst_path):
                    processed_records.append({
                        'file_path': dst_path,
                        'label_name': class_name,
                        'split': split_name,  # Crucial: We save which split it belongs to
                    })

    # --- SAVE FINAL INDEX ---
    if not processed_records:
        # An empty DataFrame has no 'label_name' column; bail out instead of crashing.
        print("❌ Error: No audio files were processed; index not written.")
        return

    df = pd.DataFrame(processed_records)

    # Create integer mappings for classes
    unique_classes = sorted(df['label_name'].unique())
    label_map = {name: i for i, name in enumerate(unique_classes)}
    df['label_idx'] = df['label_name'].map(label_map)

    df.to_csv(CSV_FILENAME, index=False)
    print(f"\n✅ Done! Data saved to '{OUTPUT_ROOT}'")
    print(f"📝 Index saved to '{CSV_FILENAME}'")
    print(f"   Total Files: {len(df)}")
    print(df['split'].value_counts())

# Run the preprocessing pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


📂 Processing SPLIT: TRAIN
   🔹 Cysts_Structural: 342 (Kept all)


  from .autonotebook import tqdm as notebook_tqdm
                                                                                    

   🔹 Dysarthia: 662 (Kept all)


                                                                             

   🔹 Laryngitis: 672 (Kept all)


                                                                              

   🔸 Vox senilis: 1488 -> Capping at 1000


                                                                                

   🔸 parkinson: 10786 -> Capping at 1000


                                                                              

   🔹 spasmodische_dysphonie: 306 (Kept all)


                                                                                          


📂 Processing SPLIT: VAL
   🔹 Cysts_Structural: 21 (Kept all)


                                                                        

   🔹 Dysarthia: 41 (Kept all)


                                                                 

   🔹 Laryngitis: 42 (Kept all)


                                                                  

   🔹 Vox senilis: 93 (Kept all)


                                                                             

   🔸 parkinson: 674 -> Capping at 200


                                                                             

   🔹 spasmodische_dysphonie: 19 (Kept all)


                                                                              


📂 Processing SPLIT: TEST
   🔹 Cysts_Structural: 22 (Kept all)


                                                                                  

   🔹 Dysarthia: 42 (Kept all)


                                                                           

   🔹 Laryngitis: 42 (Kept all)


                                                                            

   🔹 Vox senilis: 93 (Kept all)


                                                                             

   🔸 parkinson: 675 -> Capping at 200


                                                                             

   🔹 spasmodische_dysphonie: 20 (Kept all)


                                                                                        


✅ Done! Data saved to 'processed_data'
📝 Index saved to 'dataset_final.csv'
   Total Files: 4817
split
train    3982
test      419
val       416
Name: count, dtype: int64




In [2]:
import os
import pandas as pd
import numpy as np
import parselmouth
from parselmouth.praat import call
from tqdm import tqdm

# --- CONFIGURATION ---
# Index CSV read by main(); must contain a 'file_path' column.
INPUT_CSV = "dataset_final.csv"
# Destination CSV: the index rows merged with the extracted feature columns.
OUTPUT_CSV = "sfm_features.csv"

def extract_features(file_path):
    """
    Extracts 10 clinical voice features using Praat (Parselmouth).

    Features: jitter (local, rap), shimmer (local, apq3), mean HNR, mean
    F1-F4 and mean F0. Any value Praat reports as undefined (NaN) is replaced
    with 0.0, so a 0.0 in the output may mean "could not be measured".

    Args:
        file_path: Path of the audio file to analyze.

    Returns:
        A dictionary mapping feature name to float, or None if extraction
        fails. Failures are reported on stdout and never raised.
    """
    try:
        # Load Sound
        sound = parselmouth.Sound(file_path)

        # 1. Pitch & HNR Analysis
        pitch = sound.to_pitch()
        # Glottal pulse positions; jitter and shimmer are measured on these
        pulses = call([sound, pitch], "To PointProcess (cc)")

        # HNR (Harmonics to Noise Ratio), averaged over the whole file (0, 0)
        harmonicity = sound.to_harmonicity()
        hnr = call(harmonicity, "Get mean", 0, 0)

        # 2. Jitter (Frequency Perturbation)
        # Args: time range (0, 0 = whole file), shortest period, longest period,
        # maximum period factor
        jitter_local = call(pulses, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
        jitter_rap = call(pulses, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)

        # 3. Shimmer (Amplitude Perturbation)
        # Same arguments as jitter plus the maximum amplitude factor (1.6)
        shimmer_local = call([sound, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
        shimmer_apq3 = call([sound, pulses], "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        # 4. Formants (F1, F2, F3, F4) - The "Filter"
        # We look for 5 formants up to 5500Hz.
        # NOTE(review): one ceiling is used for every speaker; confirm 5500 Hz
        # suits this dataset's speakers.
        formant = sound.to_formant_burg(time_step=0.01, max_number_of_formants=5, maximum_formant=5500.0)

        f1 = call(formant, "Get mean", 1, 0, 0, "Hertz")
        f2 = call(formant, "Get mean", 2, 0, 0, "Hertz")
        f3 = call(formant, "Get mean", 3, 0, 0, "Hertz")
        f4 = call(formant, "Get mean", 4, 0, 0, "Hertz")

        # 5. Basic Stats
        f0_mean = call(pitch, "Get mean", 0, 0, "Hertz")

        features = {
            'jitter_local': jitter_local,
            'jitter_rap': jitter_rap,
            'shimmer_local': shimmer_local,
            'shimmer_apq3': shimmer_apq3,
            'hnr': hnr,
            'f1': f1,
            'f2': f2,
            'f3': f3,
            'f4': f4,
            'f0_mean': f0_mean
        }

        # Clean NaNs (Parselmouth returns 'nan' if voice is too quiet)
        return {k: (0.0 if np.isnan(v) else v) for k, v in features.items()}

    except Exception as e:
        # Best-effort: report the failing file rather than dropping it silently.
        print(f"Warning: feature extraction failed for '{file_path}': {e}")
        return None

def main():
    """Extract clinical features for every indexed file and save the merged CSV.

    Reads the index at INPUT_CSV, runs extract_features() once per distinct
    'file_path' that exists on disk, and inner-joins the features back onto
    the index on 'file_path'. The result is written to OUTPUT_CSV. Rows whose
    file is missing or whose extraction failed are absent from the output.

    Returns:
        None. Returns early, after printing an error, if INPUT_CSV is missing
        or no features could be extracted at all.
    """
    if not os.path.exists(INPUT_CSV):
        print(f"❌ Error: '{INPUT_CSV}' not found. Run Phase 1 first.")
        return

    print("🩺 Starting Clinical Feature Extraction (This may take a moment)...")

    df = pd.read_csv(INPUT_CSV)

    # Analyze each distinct path once: duplicate index rows would otherwise be
    # extracted twice and then multiply rows in the merge below.
    paths = df['file_path'].drop_duplicates()

    extracted_data = []
    missing = 0  # paths in the index that do not exist on disk
    failed = 0   # paths for which extract_features() returned None

    for file_path in tqdm(paths, total=len(paths)):
        if not os.path.exists(file_path):
            missing += 1
            # tqdm.write prints without corrupting the active progress bar
            tqdm.write(f"⚠️ Warning: File missing {file_path}")
            continue

        feats = extract_features(file_path)
        if feats is None:
            failed += 1
            continue

        # Add file_path to link it back later
        feats['file_path'] = file_path
        extracted_data.append(feats)

    if not extracted_data:
        # An empty frame has no 'file_path' column, so the merge would raise KeyError.
        print("❌ Error: No features could be extracted; nothing written.")
        return

    # Convert to DataFrame
    sfm_df = pd.DataFrame(extracted_data)

    # Merge with original info (Labels/Splits) so we have a Single Master Training CSV
    final_df = pd.merge(df, sfm_df, on='file_path', how='inner')

    # Save
    final_df.to_csv(OUTPUT_CSV, index=False)

    print(f"\n✅ Extraction Complete!")
    print(f"   Features saved to: '{OUTPUT_CSV}'")
    print(f"   Original Files: {len(df)} -> Successfully Processed: {len(final_df)}")
    # Explain drops only when some actually happened.
    if missing:
        print(f"   ({missing} file(s) listed in the index were missing on disk)")
    if failed:
        print("   (Dropped files were likely too short or silent for Praat to analyze)")

# Run feature extraction only when this code is executed directly
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

🩺 Starting Clinical Feature Extraction (This may take a moment)...


100%|██████████| 4817/4817 [02:09<00:00, 37.07it/s]



✅ Extraction Complete!
   Features saved to: 'sfm_features.csv'
   Original Files: 4817 -> Successfully Processed: 4817
   (Dropped files were likely too short or silent for Praat to analyze)
