In [1]:
import pandas as pd
import numpy as np
import torch
import torchaudio
import os
from transformers import Wav2Vec2Processor
import pickle
import json
from tqdm import tqdm
import gc
from sklearn.model_selection import train_test_split


In [5]:
def extract_features_locally():
    print("Loading dataset...")
    df = pd.read_csv('Dataset/SEP-28k_labels.csv')
    
    # Define non-stuttering event labels to filter out
    non_stutter_events = [
        'Music', 
        'NoSpeech', 
        'Unsure', 
        'DifficultToUnderstand', 
        'PoorAudioQuality'
    ]
    
    # Filter out clips with any non-stuttering events (count > 0)
    print("Filtering out clips with non-stuttering events...")
    for event in non_stutter_events:
        if event in df.columns:
            df = df[df[event] == 0]
    
    print(f"Remaining clips after filtering: {len(df)}")
    
    # Define the dysfluency labels using actual column names
    dysfluency_labels = ['Prolongation', 'Block', 'SoundRep', 'WordRep', 'Interjection']
    
    # Create binary labels for each dysfluency type (count > 0)
    for label in dysfluency_labels:
        df[f'{label}_binary'] = (df[label] > 0).astype(int)
    
    # Create a combined repetition binary label (SoundRep OR WordRep)
    df['Repetition_binary'] = ((df['SoundRep'] > 0) | (df['WordRep'] > 0)).astype(int)
    
    # Create the fluent/dysfluent binary label
    # A clip is fluent if NoStutteredWords > 0 or if all dysfluency types have count = 0
    has_any_dysfluency = (df[['Prolongation', 'Block', 'SoundRep', 'WordRep', 'Interjection']] > 0).any(axis=1).astype(int)
    df['Fluent_binary'] = (~has_any_dysfluency.astype(bool)).astype(int)  # 1 means fluent, 0 means dysfluent
    
    # Print class distribution
    print("\nClass distribution after applying new criteria:")
    total = len(df)
    print(f"Fluent: {df['Fluent_binary'].sum()} samples ({df['Fluent_binary'].sum()/total*100:.2f}%)")
    print(f"Dysfluent: {total - df['Fluent_binary'].sum()} samples ({(total - df['Fluent_binary'].sum())/total*100:.2f}%)")
    
    print("\nDysfluency type distribution:")
    for label in dysfluency_labels:
        binary_col = f'{label}_binary'
        positive_count = df[binary_col].sum()
        print(f"{label}: {positive_count} samples ({positive_count/total*100:.2f}%)")
    print(f"Combined Repetition: {df['Repetition_binary'].sum()} samples ({df['Repetition_binary'].sum()/total*100:.2f}%)")
    
    # Create a SetID if it doesn't exist
    if 'SetID' not in df.columns:
        print("\nSetID column not found. Creating a SetID column for splitting.")
        # Group by unique clips
        unique_clips = df[['Show', 'EpId', 'ClipId']].drop_duplicates()
        
        # Assign 1-10 for train, 11-20 for val, 21-30 for test, with appropriate distributions
        num_train = int(len(unique_clips) * 0.7)
        num_val = int(len(unique_clips) * 0.15)
        
        # Shuffle the clips
        unique_clips = unique_clips.sample(frac=1, random_state=42).reset_index(drop=True)
        
        # Assign SetIDs
        unique_clips.loc[:num_train-1, 'SetID'] = np.random.randint(1, 11, size=num_train)
        unique_clips.loc[num_train:num_train+num_val-1, 'SetID'] = np.random.randint(11, 21, size=num_val)
        unique_clips.loc[num_train+num_val:, 'SetID'] = np.random.randint(21, 31, size=len(unique_clips)-num_train-num_val)
        
        # Merge back to get SetID for all rows
        df = pd.merge(df, unique_clips, on=['Show', 'EpId', 'ClipId'])
    
    # Split data according to SetID
    train_df = df[df['SetID'].isin(range(1, 11))]
    val_df = df[df['SetID'].isin(range(11, 21))]
    test_df = df[df['SetID'].isin(range(21, 31))]
    
    # If the split doesn't work, fall back to random split
    if len(train_df) == 0 or len(val_df) == 0 or len(test_df) == 0:
        print("SetID doesn't match expected pattern. Using a random split.")
        unique_clips = df[['Show', 'EpId', 'ClipId']].drop_duplicates()
        train_clips, temp_clips = train_test_split(unique_clips, test_size=0.3, random_state=42)
        val_clips, test_clips = train_test_split(temp_clips, test_size=0.5, random_state=42)
        
        # Merge back to get the full data for each split
        train_df = pd.merge(df, train_clips, on=['Show', 'EpId', 'ClipId'])
        val_df = pd.merge(df, val_clips, on=['Show', 'EpId', 'ClipId'])
        test_df = pd.merge(df, test_clips, on=['Show', 'EpId', 'ClipId'])
    
    print(f"\nTraining samples: {len(train_df)}")
    print(f"Validation samples: {len(val_df)}")
    print(f"Testing samples: {len(test_df)}")
    
    # Create directories for saving features
    os.makedirs('processed_features', exist_ok=True)
    os.makedirs('processed_features/train', exist_ok=True)
    os.makedirs('processed_features/val', exist_ok=True)
    os.makedirs('processed_features/test', exist_ok=True)
    
    # Save the label dataframes
    train_df.to_csv('processed_features/train_labels.csv', index=False)
    val_df.to_csv('processed_features/val_labels.csv', index=False)
    test_df.to_csv('processed_features/test_labels.csv', index=False)
    
    # For the combined model:
    # We'll use these labels for multi-label classification
    dysfluency_labels_for_model = ['Repetition_binary', 'Prolongation_binary', 'Block_binary', 'Interjection_binary']
    
    # Calculate class weights for multi-label classification
    weights = []
    total_samples = len(train_df)
    for label in dysfluency_labels_for_model:
        positive_count = train_df[label].sum()
        if positive_count > 0:
            weight = total_samples / (2 * positive_count)
        else:
            weight = 1.0
        weights.append(weight)
    
    # Map back to original label names for clarity
    label_names = ['Repetition', 'Prolongation', 'Block', 'Interjection']
    
    # Save the class weights
    with open('processed_features/class_weights.json', 'w') as f:
        json.dump({label: float(weight) for label, weight in zip(label_names, weights)}, f)
    
    # Load Wav2Vec2 processor
    print("\nLoading Wav2Vec2 processor...")
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    
    # Function to extract and save features for a dataframe
    def extract_features_for_split(dataframe, split_name, audio_dir):
        print(f"Extracting features for {split_name} split...")
        
        # Dictionary to store file paths and labels
        split_data = {
            'file_paths': [],
            'binary_labels': [],
            'multi_labels': []
        }
        
        for idx, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
            # Construct filename with proper underscores
            show = row['Show']
            ep_id = row['EpId']
            clip_id = row['ClipId']
            filename = f"{show}_{ep_id}_{clip_id}.wav"
            audio_path = os.path.join(audio_dir, filename)
            
            feature_path = f"processed_features/{split_name}/{show}_{ep_id}_{clip_id}.pt"
            
            # Skip if features already exist
            if os.path.exists(feature_path):
                # Add to the split data
                split_data['file_paths'].append(feature_path)
                split_data['binary_labels'].append(row['Fluent_binary'])
                split_data['multi_labels'].append([row[label] for label in dysfluency_labels_for_model])
                continue
            
            try:
                # Load audio
                waveform, sample_rate = torchaudio.load(audio_path)
                
                # Convert to mono if stereo
                if waveform.shape[0] > 1:
                    waveform = torch.mean(waveform, dim=0, keepdim=True)
                
                # Resample if needed (Wav2Vec2 expects 16kHz)
                if sample_rate != 16000:
                    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
                    waveform = resampler(waveform)
                
                # Squeeze to remove channel dimension
                waveform = waveform.squeeze(0)
                
                # Process with Wav2Vec2
                inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
                
                # Save processed features
                torch.save(inputs, feature_path)
                
                # Add to the split data
                split_data['file_paths'].append(feature_path)
                split_data['binary_labels'].append(row['Fluent_binary'])
                split_data['multi_labels'].append([row[label] for label in dysfluency_labels_for_model])
                
            except Exception as e:
                print(f"Error processing {audio_path}: {e}")
        
        # Save the split data
        with open(f'processed_features/{split_name}_data.pkl', 'wb') as f:
            pickle.dump(split_data, f)
    
    # Extract features for each split
    audio_dir = "Dataset\\SEP-28K"  # Update with your local path
    extract_features_for_split(train_df, 'train', audio_dir)
    extract_features_for_split(val_df, 'val', audio_dir)
    extract_features_for_split(test_df, 'test', audio_dir)
    
    # Save the label information for the model
    label_info = {
        'binary_label': 'Fluent_binary',
        'multi_labels': dysfluency_labels_for_model,
        'label_names': label_names
    }
    with open('processed_features/label_info.json', 'w') as f:
        json.dump(label_info, f)
    
    print("Feature extraction complete!")
    print("Upload the 'processed_features' folder to Colab to continue with model training.")

if __name__ == "__main__":
    extract_features_locally()

Loading dataset...
Filtering out clips with non-stuttering events...
Remaining clips after filtering: 20851

Class distribution after applying new criteria:
Fluent: 4434 samples (21.27%)
Dysfluent: 16417 samples (78.73%)

Dysfluency type distribution:
Prolongation: 6360 samples (30.50%)
Block: 8484 samples (40.69%)
SoundRep: 3680 samples (17.65%)
WordRep: 3625 samples (17.39%)
Interjection: 7522 samples (36.08%)
Combined Repetition: 6175 samples (29.61%)

SetID column not found. Creating a SetID column for splitting.

Training samples: 14595
Validation samples: 3127
Testing samples: 3129

Loading Wav2Vec2 processor...
Extracting features for train split...


  0%|          | 35/14595 [00:00<00:43, 335.53it/s]

Error processing Dataset\SEP-28K\HeStutters_0_0.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_2.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_3.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_4.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_5.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_6.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_7.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_8.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_10.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_12.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_14.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_15.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_22.wav: Failed to decode a

 33%|███▎      | 4795/14595 [00:54<01:18, 124.94it/s]

Error processing Dataset\SEP-28K\StrongVoices_25_0.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_1.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_2.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_3.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_6.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_9.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_10.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_11.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_13.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_14.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_15.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_18.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\S

 49%|████▉     | 7214/14595 [01:25<01:32, 79.76it/s] 

Error processing Dataset\SEP-28K\StutterTalk_59_31.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StutterTalk_59_34.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StutterTalk_59_35.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StutterTalk_59_36.wav: Failed to decode audio.


 68%|██████▊   | 9928/14595 [02:07<00:38, 121.24it/s]

Error processing Dataset\SEP-28K\WomenWhoStutter_0_31.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_36.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_39.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_45.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_46.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_49.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_50.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_53.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_55.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_71.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_72.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_77.wav: Failed to decode audio.
Erro

100%|██████████| 14595/14595 [03:21<00:00, 72.50it/s]


Extracting features for val split...


  0%|          | 14/3127 [00:00<00:25, 124.36it/s]

Error processing Dataset\SEP-28K\HeStutters_0_1.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_27.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_28.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_1_25.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_1_59.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_1_63.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_1_101.wav: Failed to decode audio.


 34%|███▍      | 1058/3127 [00:16<00:30, 68.14it/s]

Error processing Dataset\SEP-28K\StrongVoices_25_17.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_27.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_28.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_36.wav: Failed to decode audio.


 49%|████▉     | 1531/3127 [00:23<00:26, 59.20it/s]

Error processing Dataset\SEP-28K\StutterTalk_59_37.wav: Failed to decode audio.


 68%|██████▊   | 2121/3127 [00:33<00:14, 67.68it/s]

Error processing Dataset\SEP-28K\WomenWhoStutter_0_40.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_63.wav: Failed to decode audio.


100%|██████████| 3127/3127 [00:49<00:00, 62.99it/s]


Extracting features for test split...


  0%|          | 14/3129 [00:00<00:25, 122.86it/s]

Error processing Dataset\SEP-28K\HeStutters_0_9.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_11.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_13.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_0_16.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_1_32.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_1_43.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\HeStutters_1_85.wav: Failed to decode audio.


 33%|███▎      | 1026/3129 [00:13<00:27, 75.18it/s]

Error processing Dataset\SEP-28K\StrongVoices_25_8.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_12.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StrongVoices_25_23.wav: Failed to decode audio.


 49%|████▉     | 1541/3129 [00:20<00:20, 78.39it/s]

Error processing Dataset\SEP-28K\StutterTalk_59_33.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\StutterTalk_59_38.wav: Failed to decode audio.


 68%|██████▊   | 2133/3129 [00:28<00:11, 83.94it/s]

Error processing Dataset\SEP-28K\WomenWhoStutter_0_51.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_56.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_59.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_60.wav: Failed to decode audio.
Error processing Dataset\SEP-28K\WomenWhoStutter_0_112.wav: Failed to decode audio.


100%|██████████| 3129/3129 [00:41<00:00, 74.80it/s]

Feature extraction complete!
Upload the 'processed_features' folder to Colab to continue with model training.





In [6]:
import pandas as pd
import numpy as np
import torch
import torchaudio
import os
from transformers import Wav2Vec2Processor
import pickle
import json
from tqdm import tqdm
import gc
from sklearn.model_selection import train_test_split

# Create output directories
os.makedirs('fluencybank_processed_features', exist_ok=True)
os.makedirs('fluencybank_processed_features/train', exist_ok=True)
os.makedirs('fluencybank_processed_features/val', exist_ok=True)
os.makedirs('fluencybank_processed_features/test', exist_ok=True)

In [None]:
# Load FluencyBank annotations
print("Loading FluencyBank dataset...")
fluencybank_df = pd.read_csv('Dataset/fluencybank_labels.csv')

# Define non-stuttering event labels to filter out
non_stutter_events = [
    'Music', 
    'NoSpeech', 
    'Unsure', 
    'DifficultToUnderstand', 
    'PoorAudioQuality'
]

# Filter out clips with any non-stuttering events (count > 0)
print("Filtering out clips with non-stuttering events...")
for event in non_stutter_events:
    if event in fluencybank_df.columns:
        fluencybank_df = fluencybank_df[fluencybank_df[event] == 0]

print(f"Remaining clips after filtering: {len(fluencybank_df)}")

Loading FluencyBank dataset...
Filtering out clips with non-stuttering events...
Remaining clips after filtering: 3624


In [10]:
# Handle SoundRep and WordRep combination
# Create a combined 'Repetition' column if SoundRep and WordRep exist separately
if 'SoundRep' in fluencybank_df.columns and 'WordRep' in fluencybank_df.columns:
    print("Combining SoundRep and WordRep into Repetition...")
    fluencybank_df['Repetition'] = fluencybank_df[['SoundRep', 'WordRep']].max(axis=1)
    dysfluency_labels = ['Repetition', 'Prolongation', 'Block', 'Interjection']
else:
    # If they're already combined or have different names
    dysfluency_labels = [col for col in ['Repetition', 'Prolongation', 'Block', 'Interjection'] 
                        if col in fluencybank_df.columns]

# Create binary labels for each dysfluency type (count > 0)
for label in dysfluency_labels:
    fluencybank_df[f'{label}_binary'] = (fluencybank_df[label] > 0).astype(int)

# Create binary fluent/dysfluent label
has_any_dysfluency = (fluencybank_df[[f'{label}_binary' for label in dysfluency_labels]].sum(axis=1) > 0).astype(int)
fluencybank_df['Fluent_binary'] = (~has_any_dysfluency.astype(bool)).astype(int)

# Print class distribution
print("\nClass distribution after applying new criteria:")
total = len(fluencybank_df)
print(f"Fluent: {fluencybank_df['Fluent_binary'].sum()} samples ({fluencybank_df['Fluent_binary'].sum()/total*100:.2f}%)")
print(f"Dysfluent: {total - fluencybank_df['Fluent_binary'].sum()} samples ({(total - fluencybank_df['Fluent_binary'].sum())/total*100:.2f}%)")

print("\nDysfluency type distribution:")
for label in dysfluency_labels:
    binary_col = f'{label}_binary'
    positive_count = fluencybank_df[binary_col].sum()
    print(f"{label}: {positive_count} samples ({positive_count/total*100:.2f}%)")

Combining SoundRep and WordRep into Repetition...

Class distribution after applying new criteria:
Fluent: 931 samples (25.69%)
Dysfluent: 2693 samples (74.31%)

Dysfluency type distribution:
Repetition: 1424 samples (39.29%)
Prolongation: 814 samples (22.46%)
Block: 1192 samples (32.89%)
Interjection: 1484 samples (40.95%)


In [12]:
# Split data into train, val, test sets
# Create filename by combining columns with proper formatting
fluencybank_df['filename'] = fluencybank_df.apply(
    lambda row: f"FluencyBank_{str(row['EpId']).zfill(3)}_{row['ClipId']}.wav", 
    axis=1
)

# Now split using this constructed filename
unique_files = fluencybank_df[['filename']].drop_duplicates()

train_files, temp_files = train_test_split(unique_files, test_size=0.3, random_state=42)
val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

# Create dataframes for each split
train_df = pd.merge(fluencybank_df, train_files, on=['filename'])
val_df = pd.merge(fluencybank_df, val_files, on=['filename'])
test_df = pd.merge(fluencybank_df, test_files, on=['filename'])

# Save the label dataframes
train_df.to_csv('fluencybank_processed_features/train_labels.csv', index=False)
val_df.to_csv('fluencybank_processed_features/val_labels.csv', index=False)
test_df.to_csv('fluencybank_processed_features/test_labels.csv', index=False)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Testing samples: {len(test_df)}")

Training samples: 2536
Validation samples: 544
Testing samples: 544


In [13]:
# Calculate class weights for multi-label classification
weights = []
total_samples = len(train_df)
for label in dysfluency_labels:
    binary_col = f'{label}_binary'
    positive_count = train_df[binary_col].sum()
    if positive_count > 0:
        weight = total_samples / (2 * positive_count)
    else:
        weight = 1.0
    weights.append(weight)

# Save the class weights
with open('fluencybank_processed_features/class_weights.json', 'w') as f:
    json.dump({label: float(weight) for label, weight in zip(dysfluency_labels, weights)}, f)

print("Class weights:")
for label, weight in zip(dysfluency_labels, weights):
    print(f"{label}: {weight:.4f}")

Class weights:
Repetition: 1.2617
Prolongation: 2.1712
Block: 1.5259
Interjection: 1.1940


In [14]:
# Load Wav2Vec2 processor
print("\nLoading Wav2Vec2 processor...")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Function to extract and save features for a dataframe
def extract_features_for_split(dataframe, split_name, audio_dir):
    print(f"Extracting features for {split_name} split...")
    
    # Dictionary to store file paths and labels
    split_data = {
        'file_paths': [],
        'binary_labels': [],
        'multi_labels': []
    }
    
    for idx, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
        # Construct filename based on FluencyBank naming conventions
        # Adjust this part based on your actual audio file naming pattern
        filename = row['filename']  # Change to your column name for audio filenames
        audio_path = os.path.join(audio_dir, filename)
        
        feature_path = f"fluencybank_processed_features/{split_name}/{filename.replace('.wav', '.pt')}"
        
        # Skip if features already exist
        if os.path.exists(feature_path):
            # Add to the split data
            split_data['file_paths'].append(feature_path)
            split_data['binary_labels'].append(row['Fluent_binary'])
            split_data['multi_labels'].append([row[f'{label}_binary'] for label in dysfluency_labels])
            continue
        
        try:
            # Load audio
            waveform, sample_rate = torchaudio.load(audio_path)
            
            # Convert to mono if stereo
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            
            # Resample if needed (Wav2Vec2 expects 16kHz)
            if sample_rate != 16000:
                resampler = torchaudio.transforms.Resample(sample_rate, 16000)
                waveform = resampler(waveform)
            
            # Squeeze to remove channel dimension
            waveform = waveform.squeeze(0)
            
            # Process with Wav2Vec2
            inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
            
            # Save processed features
            torch.save(inputs, feature_path)
            
            # Add to the split data
            split_data['file_paths'].append(feature_path)
            split_data['binary_labels'].append(row['Fluent_binary'])
            split_data['multi_labels'].append([row[f'{label}_binary'] for label in dysfluency_labels])
            
        except Exception as e:
            print(f"Error processing {audio_path}: {e}")
    
    # Save the split data
    with open(f'fluencybank_processed_features/{split_name}_data.pkl', 'wb') as f:
        pickle.dump(split_data, f)
    
    print(f"Saved {len(split_data['file_paths'])} features for {split_name} split")


Loading Wav2Vec2 processor...


In [15]:
# Update this path to point to your local FluencyBank audio files
audio_dir = "Dataset\FluencyBank"  

# Process each split
extract_features_for_split(train_df, 'train', audio_dir)
extract_features_for_split(val_df, 'val', audio_dir)
extract_features_for_split(test_df, 'test', audio_dir)

  audio_dir = "Dataset\FluencyBank"


Extracting features for train split...


  9%|▊         | 221/2536 [00:02<00:14, 156.15it/s]

Error processing Dataset\FluencyBank\FluencyBank_019_7.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_8.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_11.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_12.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_16.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_17.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_22.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_24.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_25.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_26.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_29.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_33.wav: Failed

 66%|██████▌   | 1664/2536 [00:21<00:04, 181.83it/s]

Error processing Dataset\FluencyBank\FluencyBank_118_5.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_6.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_8.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_9.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_10.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_12.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_14.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_19.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_20.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_21.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_22.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_23.wav: Failed t

 66%|██████▋   | 1683/2536 [00:22<00:05, 148.27it/s]

Error processing Dataset\FluencyBank\FluencyBank_118_91.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_92.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_93.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_94.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_99.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_101.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_104.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_105.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_108.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_109.wav: Failed to decode audio.


100%|██████████| 2536/2536 [00:38<00:00, 65.25it/s] 


Saved 2435 features for train split
Extracting features for val split...


 12%|█▎        | 68/544 [00:01<00:05, 86.96it/s]

Error processing Dataset\FluencyBank\FluencyBank_019_9.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_10.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_23.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_27.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_28.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_30.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_31.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_42.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_46.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_47.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_49.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_54.wav: Faile

 70%|███████   | 381/544 [00:07<00:02, 60.37it/s]

Error processing Dataset\FluencyBank\FluencyBank_118_13.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_18.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_37.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_38.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_50.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_59.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_60.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_79.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_87.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_90.wav: Failed to decode audio.


100%|██████████| 544/544 [00:11<00:00, 49.08it/s]


Saved 521 features for val split
Extracting features for test split...


 10%|▉         | 54/544 [00:01<00:06, 73.51it/s]

Error processing Dataset\FluencyBank\FluencyBank_019_13.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_15.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_20.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_21.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_58.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_59.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_61.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_62.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_68.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_019_69.wav: Failed to decode audio.


 63%|██████▎   | 344/544 [00:07<00:03, 62.79it/s]

Error processing Dataset\FluencyBank\FluencyBank_118_7.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_11.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_17.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_32.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_39.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_46.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_52.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_64.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_65.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_81.wav: Failed to decode audio.
Error processing Dataset\FluencyBank\FluencyBank_118_100.wav: Failed to decode audio.


100%|██████████| 544/544 [00:12<00:00, 44.54it/s]

Saved 523 features for test split





In [16]:
# Save the label information for the model
label_info = {
    'binary_label': 'Fluent_binary',
    'multi_labels': [f'{label}_binary' for label in dysfluency_labels],
    'label_names': dysfluency_labels
}
with open('fluencybank_processed_features/label_info.json', 'w') as f:
    json.dump(label_info, f)

print("Feature extraction complete!")
print("Upload the 'fluencybank_processed_features' folder to Colab to continue with model training.")

Feature extraction complete!
Upload the 'fluencybank_processed_features' folder to Colab to continue with model training.
