In [2]:
# Import necessary libraries
import os
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import Wav2Vec2Processor, HubertForSequenceClassification, TrainingArguments, Trainer
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay



# Function to collect data from the specified directories
def collect_data(directories, label, max_files_per_speaker=100):
    """
    Traverse through the dataset directories and collect audio paths and labels.

    The layout that is walked is
    ``<base_dir>/<speaker>/<session*>/<wav_arrayMic|wav_headMic>/*.wav``.
    Every selected file is decoded once with librosa as a validity check;
    files that fail to decode are reported and left out of the result.

    Speakers, sessions and files are visited in sorted order so that the
    subset kept by the per-directory cap (and the order of the returned
    records) is the same on every run and on every filesystem.

    Args:
        directories: Iterable of base directory paths. Paths that do not
            exist are reported and skipped.
        label: Value stored under the ``'label'`` key of every record.
        max_files_per_speaker: Maximum number of ``.wav`` files taken from
            each microphone directory. NOTE: despite its name, the cap is
            applied to every ``wav_arrayMic`` / ``wav_headMic`` directory of
            every session, not once per speaker.

    Returns:
        A list of ``{'audio_path': <resolved path as str>, 'label': label}``
        dicts, one per file that could be decoded.
    """
    mic_dir_names = ('wav_arrayMic', 'wav_headMic')
    data = []
    for base_dir in directories:
        base_path = Path(base_dir)
        if not base_path.exists():
            print(f"Directory {base_dir} does not exist. Skipping.")
            continue
        for speaker in sorted(base_path.iterdir()):
            if not speaker.is_dir():
                continue
            for session in sorted(speaker.iterdir()):
                if not (session.is_dir() and session.name.lower().startswith('session')):
                    continue
                for mic_dir_name in mic_dir_names:
                    wav_dir = session / mic_dir_name
                    if not wav_dir.exists():
                        continue
                    # glob() yields entries in arbitrary filesystem order;
                    # sort before slicing so the cap keeps a stable subset.
                    wav_files = sorted(wav_dir.glob('*.wav'))[:max_files_per_speaker]
                    for wav_file in wav_files:
                        try:
                            # Decode purely as a validity check; the samples
                            # themselves are not needed here.
                            librosa.load(wav_file, sr=None)
                        except Exception as e:
                            print(f"Error loading {wav_file}: {e}. Skipping this file.")
                        else:
                            data.append({'audio_path': str(wav_file.resolve()), 'label': label})
    return data

# Directories for dysarthric and non-dysarthric speech
dysarthric_dirs = [
    'D:/datasets/TORGO male with dysarthria',
    'D:/datasets/TORGO female with dysarthria'
]
non_dysarthric_dirs = [
    'D:/datasets/TORGO male without dysarthria',
    'D:/datasets/TORGO female without dysarthria'
]

# One seed for both the shuffle and the split so a rerun reproduces the same partition.
RANDOM_SEED = 42

# Collect data (label 1 = dysarthric, label 0 = non-dysarthric)
dysarthric_data = collect_data(dysarthric_dirs, label=1, max_files_per_speaker=50)
non_dysarthric_data = collect_data(non_dysarthric_dirs, label=0, max_files_per_speaker=50)
all_data = dysarthric_data + non_dysarthric_data
if not all_data:
    # Fail here with an actionable message instead of an opaque error from the split below.
    raise ValueError("No audio files were collected; check the dataset directories.")

# Shuffle data. The shuffle must be seeded too: an unseeded shuffle changes the
# row order on every run, which makes the seeded split below irreproducible.
df = pd.DataFrame(all_data).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Split into train and test sets
# NOTE(review): the split is over individual files, so recordings of the same
# speaker can end up in both sets -- confirm this is acceptable for evaluation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

# Custom Dataset class
class SpeechDataset(Dataset):
    """Dataset that turns rows of a dataframe into fixed-length model inputs.

    Each row of ``dataframe`` is read through its ``'audio_path'`` and
    ``'label'`` columns. The audio is loaded at ``sampling_rate``, cut or
    zero-padded at the end to exactly ``max_duration`` seconds, and passed
    through ``processor``.
    """

    def __init__(self, dataframe, processor, max_duration=5.0, sampling_rate=16000):
        self.dataframe, self.processor = dataframe, processor
        self.max_duration, self.sampling_rate = max_duration, sampling_rate

    def __len__(self):
        """Return the number of rows in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the processor output for row ``idx`` with a ``'labels'`` tensor added."""
        row = self.dataframe.iloc[idx]
        waveform, _ = librosa.load(row['audio_path'], sr=self.sampling_rate)

        # Force every clip to the same number of samples.
        target_len = int(self.max_duration * self.sampling_rate)
        shortfall = target_len - len(waveform)
        if shortfall < 0:
            waveform = waveform[:target_len]
        else:
            waveform = np.pad(waveform, (0, shortfall), 'constant')

        encoded = self.processor(
            waveform,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=True,
        )
        encoded['labels'] = torch.tensor(row['label'], dtype=torch.long)
        return encoded

# Initialize processor and model
# Keep the checkpoint name in one place so processor and model always match.
MODEL_NAME = "facebook/hubert-large-ls960-ft"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = HubertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Create datasets
train_dataset = SpeechDataset(train_df, processor)
test_dataset = SpeechDataset(test_df, processor)

# Set the output directory in Google Drive
# NOTE(review): this is a Colab/Google Drive path while the dataset directories
# are Windows paths (D:/...) -- confirm which environment the script targets.
output_dir = '/content/drive/MyDrive/HuBERT_fine_tuned_model_optimized'

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on your GPU capacity
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Increase epochs for thorough learning
    weight_decay=0.01,
    save_total_limit=1,
    # Mixed precision requires a CUDA device; requesting it unconditionally
    # makes TrainingArguments raise on CPU-only machines, so enable it only
    # when a GPU is actually available.
    fp16=torch.cuda.is_available(),
)

# Data collator
def data_collator(features):
    """Combine per-example feature mappings into one batch dict.

    For every example the first row of ``'input_values'`` is taken and the
    rows are stacked along a new leading dimension; the ``'labels'`` entries
    are gathered into a single ``torch.long`` tensor. Any other keys in the
    examples are not carried over.
    """
    waveforms = []
    targets = []
    for example in features:
        waveforms.append(example['input_values'][0])
        targets.append(example['labels'])
    return {
        'input_values': torch.stack(waveforms),
        'labels': torch.tensor(targets, dtype=torch.long),
    }

# Compute metrics
def compute_metrics(pred):
    """
    Plot the confusion matrix of one evaluation pass and return its accuracy.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, one row per example); the
            predicted class is the arg-max over axis 1.

    Returns:
        ``{"accuracy": <float>}`` -- the fraction of examples whose predicted
        class equals the true class.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    # Pin the class order. Without `labels=`, a pass in which only one class
    # occurs produces a 1x1 matrix that does not match the two display labels.
    cm = confusion_matrix(labels, preds, labels=[0, 1])
    display = ConfusionMatrixDisplay(cm, display_labels=['Non-Dysarthric', 'Dysarthric'])
    display.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()
    # A new figure is created on every call; close it so figures do not
    # accumulate across repeated evaluations.
    plt.close(display.figure_)
    # Return a plain Python float rather than a NumPy scalar.
    return {"accuracy": float((preds == labels).mean())}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train and evaluate
trainer.train()
# Keep and report the final metrics instead of discarding the return value.
final_metrics = trainer.evaluate()
print(f"Final evaluation metrics: {final_metrics}")

# Save the final model and processor
trainer.save_model(output_dir)
processor.save_pretrained(output_dir)

# Visualize training and validation loss
log_history = trainer.state.log_history
train_loss = [entry['loss'] for entry in log_history if 'loss' in entry and 'eval_loss' not in entry]
eval_loss = [entry['eval_loss'] for entry in log_history if 'eval_loss' in entry]
# The two series can differ in length: the stand-alone evaluate() call above
# logs one more 'eval_loss' than there are per-epoch 'loss' entries. Trim both
# to a common length so x and y always match when plotting.
num_points = min(len(train_loss), len(eval_loss))
epochs = range(1, num_points + 1)

plt.figure(figsize=(10, 5))
plt.plot(epochs, train_loss[:num_points], label='Training Loss')
plt.plot(epochs, eval_loss[:num_points], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.show()


Error while downloading from https://cdn-lfs.hf.co/facebook/hubert-large-ls960-ft/9cf43abec3f0410ad6854afa4d376c69ccb364b48ddddfd25c4c5aa16398eab0?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1738435049&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczODQzNTA0OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9mYWNlYm9vay9odWJlcnQtbGFyZ2UtbHM5NjAtZnQvOWNmNDNhYmVjM2YwNDEwYWQ2ODU0YWZhNGQzNzZjNjljY2IzNjRiNDhkZGRkZmQyNWM0YzVhYTE2Mzk4ZWFiMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=TD4CZfmAf70TmvoHCfeVjuGqlL5%7EsGDAjG0H%7Edjb6K2%7EByyGT2efBth5WKGTrATrfBfLxDOBd67EYqX6mBcN9em0Kmitum5kcUO2m6jaBWXMftaIeh7SZm0yPQzetAGoSKp5bGQLCaRmVht67aO6qSGIYg%7EbxzAu%7E802oXI0LRNwXMgVOWlmtjLqSjOprfZaKbTMNKw4zVH7-MVzceuq5I-Wk1kJnS%7EgVUvBlQCPc4Z7YrqvFMU1NoQM58dd20VCiOQ4uqYVap3c5-tRca%7E1UyslXHTf

KeyboardInterrupt: 