In [1]:
%pip install transformers datasets torch librosa scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [8]:
import librosa
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
from sklearn.metrics import classification_report

# Assuming your data is in the form of a directory with subdirectories for each emotion
data_dir = '../database/Emotions'
emotions = ['Angry', 'Disgusted', 'Fearful', 'Happy', 'Neutral', 'Sad', 'Surprised']
label_encoder = LabelEncoder()
label_encoder.fit(emotions)

def preprocess_audio(file_path):
    # Load the audio file
    audio, sr = librosa.load(file_path, sr=16000)
    # Here you can add more preprocessing if needed, like padding or trimming silence
    return audio

# Create a dataset
class EmotionDataset(Dataset):
    def __init__(self, file_paths, labels):
        self.file_paths = file_paths
        self.labels = labels

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        audio = preprocess_audio(self.file_paths[idx])
        label = self.labels[idx]
        return torch.tensor(audio, dtype=torch.float32), torch.tensor(label, dtype=torch.int64)

# Collect all file paths and their labels
file_paths = []
labels = []
for emotion in emotions:
    emotion_dir = os.path.join(data_dir, emotion)
    for filename in os.listdir(emotion_dir):
        file_paths.append(os.path.join(emotion_dir, filename))
        labels.append(emotion)

# Encode labels to integers
labels = label_encoder.transform(labels)

# Split the dataset into training and testing
train_paths, test_paths, train_labels, test_labels = train_test_split(file_paths, labels, test_size=0.3, random_state=42)
# Further split test set into validation and test sets
val_paths, test_paths, val_labels, test_labels = train_test_split(test_paths, test_labels, test_size=0.5, random_state=42)

# Create datasets
train_dataset = EmotionDataset(train_paths, train_labels)
val_dataset = EmotionDataset(val_paths, val_labels)
test_dataset = EmotionDataset(test_paths, test_labels)

# Create data loaders
batch_size = 8  # You may need to adjust this based on your GPU memory



In [13]:
from transformers import Wav2Vec2ForSequenceClassification
from transformers import Wav2Vec2Processor

# Load the processor and the model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base-960h",
    num_labels=len(emotions),
    problem_type="single_label_classification",
)

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu


In [10]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Define a custom collate function for padding
def collate_fn(batch):
    # Assume batch is a list of (sequence, label)
    sequences, labels = zip(*batch)
    
    # Pad sequences to the maximum length of any sequence
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    
    # Convert labels to a tensor
    labels = torch.tensor(labels)
    
    return padded_sequences, labels

# Create DataLoader with custom collate_fn
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)


In [14]:
# Make sure the model is in evaluation mode
model.eval()

true_labels = []
predicted_labels = []

# Disable gradient calculation for inference
with torch.no_grad():
    for audio_inputs, labels in test_loader:
        audio_inputs = audio_inputs.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(audio_inputs)
        
        # Get the predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        # Move predictions and labels to CPU and convert to numpy for sklearn
        predicted_labels.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert encoded labels back to original labels
predicted_labels = label_encoder.inverse_transform(predicted_labels)
true_labels = label_encoder.inverse_transform(true_labels)

# Print classification report
print(classification_report(true_labels, predicted_labels, target_names=emotions))


              precision    recall  f1-score   support

       Angry       0.16      0.05      0.07       305
   Disgusted       0.00      0.00      0.00       290
     Fearful       0.00      0.00      0.00       309
       Happy       0.16      0.96      0.28       316
     Neutral       0.00      0.00      0.00       276
         Sad       0.00      0.00      0.00       324
   Surprised       0.00      0.00      0.00       100

    accuracy                           0.16      1920
   macro avg       0.05      0.14      0.05      1920
weighted avg       0.05      0.16      0.06      1920



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
from torch.optim import AdamW
from tqdm import tqdm

def train(model, train_loader, val_loader, epochs):
    optimizer = AdamW(model.parameters(), lr=1e-4)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_values = batch[0].to(device)
            labels = batch[1].to(device)
            outputs = model(input_values=input_values, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Training loss after epoch {epoch + 1}: {total_loss / len(train_loader)}")

        # Validation
        model.eval()
        total_eval_loss = 0
        for batch in tqdm(val_loader):
            with torch.no_grad():
                input_values = batch[0].to(device)
                labels = batch[1].to(device)
                outputs = model(input_values=input_values, labels=labels)
                loss = outputs.loss
                total_eval_loss += loss.item()
        print(f"Validation loss after epoch {epoch + 1}: {total_eval_loss / len(val_loader)}")

# Train the model
train(model, train_loader, val_loader, epochs=3)


 15%|█▌        | 170/1120 [30:33<2:50:44, 10.78s/it]


KeyboardInterrupt: 