In [2]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2Processor

import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd

In [9]:
model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)

config.json:   0%|          | 0.00/1.93k [00:00<?, ?B/s]



In [5]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Frenzy(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [10]:
Frenzy(config=config)

Frenzy(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): L

## model

In [3]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torchaudio
import librosa
from transformers import (
    Wav2Vec2Processor, 
    Wav2Vec2Model, 
    BertModel, 
    BertTokenizer
)
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Custom Dataset
class MultimodalEmotionDataset(Dataset):
    def __init__(self, csv_path, processor, tokenizer, max_length=128):
        # Read the CSV
        self.data = pd.read_csv(csv_path, sep='\t')
        
        # Add random text column if not exists
        if 'text' not in self.data.columns:
            self.data['text'] = [f"Random text for {name}" for name in self.data['name']]
        
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Mapping emotions to indices
        self.emotion_to_idx = {emotion: idx for idx, emotion in enumerate(self.data['emotion'].unique())}
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # Process Audio
        speech_array, sampling_rate = torchaudio.load(row['path'])
        speech_array = speech_array.squeeze().numpy()
        speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, self.processor.feature_extractor.sampling_rate)
        
        audio_inputs = self.processor(
            speech_array, 
            sampling_rate=self.processor.feature_extractor.sampling_rate, 
            return_tensors="pt"
        )
        
        # Process Text
        text_inputs = self.tokenizer(
            row['text'], 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        # Get emotion label
        label = self.emotion_to_idx[row['emotion']]
        
        return {
            'audio_input': audio_inputs.input_values.squeeze(),
            'audio_mask': audio_inputs.attention_mask.squeeze(),
            'text_input_ids': text_inputs['input_ids'].squeeze(),
            'text_attention_mask': text_inputs['attention_mask'].squeeze(),
            'label': label
        }

# Multimodal Fusion Model
class MultimodalEmotionClassifier(nn.Module):
    def __init__(self, num_labels, audio_model_path, text_model_path):
        super().__init__()
        
        # Audio Encoder (Wav2Vec2)
        self.audio_encoder = Wav2Vec2Model.from_pretrained(audio_model_path)
        
        # Text Encoder (BERT)
        self.text_encoder = BertModel.from_pretrained(text_model_path)
        
        # Freeze pretrained encoders (optional)
        for param in self.audio_encoder.parameters():
            param.requires_grad = False
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        
        # Fusion Layer
        audio_feature_dim = self.audio_encoder.config.hidden_size
        text_feature_dim = self.text_encoder.config.hidden_size
        fusion_dim = audio_feature_dim + text_feature_dim
        
        self.fusion_layers = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(fusion_dim // 2, num_labels)
        )
    
    def forward(self, audio_input, audio_mask, text_input_ids, text_attention_mask):
        # Extract audio features
        audio_outputs = self.audio_encoder(
            audio_input, 
            attention_mask=audio_mask
        )
        audio_features = torch.mean(audio_outputs.last_hidden_state, dim=1)
        
        # Extract text features
        text_outputs = self.text_encoder(
            text_input_ids, 
            attention_mask=text_attention_mask
        )
        text_features = text_outputs.pooler_output
        
        # Concatenate features
        combined_features = torch.cat([audio_features, text_features], dim=1)
        
        # Classification
        logits = self.fusion_layers(combined_features)
        
        return logits

# Training Setup
def main():
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Paths and Configurations
    audio_model_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
    text_model_path = "bert-base-uncased"
    
    # Initialize Processors
    audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_path)
    text_tokenizer = BertTokenizer.from_pretrained(text_model_path)
    
    # Create Datasets
    train_dataset = MultimodalEmotionDataset("dataset/train.csv", audio_processor, text_tokenizer)
    test_dataset = MultimodalEmotionDataset("dataset/test.csv", audio_processor, text_tokenizer)
    
    # Get number of labels
    num_labels = len(train_dataset.emotion_to_idx)
    
    # Initialize Model
    model = MultimodalEmotionClassifier(
        num_labels=num_labels, 
        audio_model_path=audio_model_path, 
        text_model_path=text_model_path
    ).to(device)
    
    print(f"Model Initialized with {num_labels} emotion classes")
    print("Emotion to Index mapping:", train_dataset.emotion_to_idx)

if __name__ == "__main__":
    #main() 
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Paths and Configurations
    audio_model_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
    text_model_path = "bert-base-uncased"
    
    # Initialize Processors
    audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_path)
    text_tokenizer = BertTokenizer.from_pretrained(text_model_path)
    
    # Create Datasets
    train_dataset = MultimodalEmotionDataset("dataset/train.csv", audio_processor, text_tokenizer)
    test_dataset = MultimodalEmotionDataset("dataset/test.csv", audio_processor, text_tokenizer)
    
    # Get number of labels
    num_labels = len(train_dataset.emotion_to_idx)
    
    # Initialize Model
    model = MultimodalEmotionClassifier(
        num_labels=num_labels, 
        audio_model_path=audio_model_path, 
        text_model_path=text_model_path
    ).to(device)
    
    print(f"Model Initialized with {num_labels} emotion classes")
    print("Emotion to Index mapping:", train_dataset.emotion_to_idx)



Model Initialized with 5 emotion classes
Emotion to Index mapping: {'happiness': 0, 'anger': 1, 'disgust': 2, 'sadness': 3, 'fear': 4}


In [4]:
model

MultimodalEmotionClassifier(
  (audio_encoder): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=T

## training

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import torchaudio
import librosa
import os
from transformers import (
    Wav2Vec2Processor, 
    Wav2Vec2Model, 
    BertModel, 
    BertTokenizer
)
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Custom Dataset
import torch.nn.functional as F

class MultimodalEmotionDataset(Dataset):
    def __init__(self, csv_path, processor, tokenizer, max_length=128, target_audio_length=16000):
        # Read the CSV
        self.data = pd.read_csv(csv_path, sep='\t')
        
        # Add random text column if not exists
        if 'text' not in self.data.columns:
            self.data['text'] = [f"Random text for {name}" for name in self.data['name']]
        
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.target_audio_length = target_audio_length
        
        # Mapping emotions to indices
        self.emotion_to_idx = {emotion: idx for idx, emotion in enumerate(self.data['emotion'].unique())}
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # Process Audio
        speech_array, sampling_rate = torchaudio.load(row['path'])
        speech_array = speech_array.squeeze().numpy()
        # speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, self.processor.feature_extractor.sampling_rate)
        speech_array = librosa.resample(y=np.asarray(speech_array), orig_sr=sampling_rate, target_sr=self.processor.feature_extractor.sampling_rate)
        # Pad or truncate audio
        if len(speech_array) > self.target_audio_length:
            speech_array = speech_array[:self.target_audio_length]
        elif len(speech_array) < self.target_audio_length:
            padding = self.target_audio_length - len(speech_array)
            speech_array = np.pad(speech_array, (0, padding), mode='constant', constant_values=0)
        
        audio_inputs = self.processor(
            speech_array, 
            sampling_rate=self.processor.feature_extractor.sampling_rate, 
            return_tensors="pt"
        )
        
        # Process Text
        text_inputs = self.tokenizer(
            row['text'], 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        # Get emotion label
        label = self.emotion_to_idx[row['emotion']]
        
        return {
            'audio_input': audio_inputs.input_values.squeeze(),
            'audio_mask': audio_inputs.attention_mask.squeeze(),
            'text_input_ids': text_inputs['input_ids'].squeeze(),
            'text_attention_mask': text_inputs['attention_mask'].squeeze(),
            'label': label
        }


# Multimodal Fusion Model
class MultimodalEmotionClassifier(nn.Module):
    def __init__(self, num_labels, audio_model_path, text_model_path):
        super().__init__()
        
        # Audio Encoder (Wav2Vec2)
        self.audio_encoder = Wav2Vec2Model.from_pretrained(audio_model_path)
        
        # Text Encoder (BERT)
        self.text_encoder = BertModel.from_pretrained(text_model_path)
        
        # Freeze pretrained encoders (optional)
        for param in self.audio_encoder.parameters():
            param.requires_grad = False
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        
        # Fusion Layer
        audio_feature_dim = self.audio_encoder.config.hidden_size
        text_feature_dim = self.text_encoder.config.hidden_size
        fusion_dim = audio_feature_dim + text_feature_dim
        
        self.fusion_layers = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(fusion_dim // 2, num_labels)
        )
    
    def forward(self, audio_input, audio_mask, text_input_ids, text_attention_mask):
        # Extract audio features
        audio_outputs = self.audio_encoder(
            audio_input, 
            attention_mask=audio_mask
        )
        audio_features = torch.mean(audio_outputs.last_hidden_state, dim=1)
        
        # Extract text features
        text_outputs = self.text_encoder(
            text_input_ids, 
            attention_mask=text_attention_mask
        )
        text_features = text_outputs.pooler_output
        
        # Concatenate features
        combined_features = torch.cat([audio_features, text_features], dim=1)
        
        # Classification
        logits = self.fusion_layers(combined_features)
        
        return logits

def train_model(model, train_loader, val_loader, device, epochs=10, learning_rate=1e-4):
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Create models directory if it doesn't exist
    os.makedirs('saved_models', exist_ok=True)
    
    best_val_accuracy = 0.0
    
    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_losses = []
        train_preds = []
        train_true = []
        
        for batch in train_loader:
            # Move data to device
            audio_input = batch['audio_input'].to(device)
            audio_mask = batch['audio_mask'].to(device)
            text_input_ids = batch['text_input_ids'].to(device)
            text_attention_mask = batch['text_attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(
                audio_input, 
                audio_mask, 
                text_input_ids, 
                text_attention_mask
            )
            
            # Compute loss
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            # Track training metrics
            train_losses.append(loss.item())
            preds = torch.argmax(outputs, dim=1)
            train_preds.extend(preds.cpu().numpy())
            train_true.extend(labels.cpu().numpy())
        
        # Validation Phase
        model.eval()
        val_losses = []
        val_preds = []
        val_true = []
        
        with torch.no_grad():
            for batch in val_loader:
                # Move data to device
                audio_input = batch['audio_input'].to(device)
                audio_mask = batch['audio_mask'].to(device)
                text_input_ids = batch['text_input_ids'].to(device)
                text_attention_mask = batch['text_attention_mask'].to(device)
                labels = batch['label'].to(device)
                
                # Forward pass
                outputs = model(
                    audio_input, 
                    audio_mask, 
                    text_input_ids, 
                    text_attention_mask
                )
                
                # Compute loss
                loss = criterion(outputs, labels)
                val_losses.append(loss.item())
                
                # Track validation metrics
                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        
        # Compute metrics
        train_accuracy = accuracy_score(train_true, train_preds)
        val_accuracy = accuracy_score(val_true, val_preds)
        
        # Print epoch summary
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {np.mean(train_losses):.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Val Loss: {np.mean(val_losses):.4f}, Val Accuracy: {val_accuracy:.4f}")
        
        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_accuracy': best_val_accuracy,
                'epoch': epoch
            }, f'saved_models/best_multimodal_model.pth')
            
            print(f"Saved new best model with validation accuracy: {best_val_accuracy:.4f}")
        
        # Optional: Print classification report for validation set
        if epoch % 5 == 0:
            print("\nValidation Classification Report:")
            print(classification_report(val_true, val_preds))
    
    return model

def main():
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Paths and Configurations
    audio_model_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
    text_model_path = "bert-base-uncased"
    
    # Initialize Processors
    audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_path)
    text_tokenizer = BertTokenizer.from_pretrained(text_model_path)
    
    # Create Datasets
    train_dataset = MultimodalEmotionDataset("dataset/train.csv", audio_processor, text_tokenizer)
    test_dataset = MultimodalEmotionDataset("dataset/test.csv", audio_processor, text_tokenizer)
    
    # Split train into train and validation
    train_dataset, val_dataset = torch.utils.data.random_split(
        train_dataset, 
        [int(len(train_dataset)*0.8), len(train_dataset)-int(len(train_dataset)*0.8)]
    )
    
    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    
    # Get number of labels
    num_labels = len(train_dataset.dataset.emotion_to_idx)
    
    # Initialize Model
    model = MultimodalEmotionClassifier(
        num_labels=num_labels, 
        audio_model_path=audio_model_path, 
        text_model_path=text_model_path
    ).to(device)
    
    print(f"Model Initialized with {num_labels} emotion classes")
    print("Emotion to Index mapping:", train_dataset.dataset.emotion_to_idx)
    
    # Train Model
    trained_model = train_model(model, train_loader, val_loader, device, epochs=100)
    
    # Optional: Load and evaluate best saved model
    best_model_path = 'saved_models/best_multimodal_model.pth'
    checkpoint = torch.load(best_model_path)
    
    # Reinitialize model and load state dict
    best_model = MultimodalEmotionClassifier(
        num_labels=num_labels, 
        audio_model_path=audio_model_path, 
        text_model_path=text_model_path
    ).to(device)
    best_model.load_state_dict(checkpoint['model_state_dict'])
    
    # Test model
    best_model.eval()
    test_preds = []
    test_true = []
    
    with torch.no_grad():
        for batch in test_loader:
            # Move data to device
            audio_input = batch['audio_input'].to(device)
            audio_mask = batch['audio_mask'].to(device)
            text_input_ids = batch['text_input_ids'].to(device)
            text_attention_mask = batch['text_attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            outputs = best_model(
                audio_input, 
                audio_mask, 
                text_input_ids, 
                text_attention_mask
            )
            
            # Track test metrics
            preds = torch.argmax(outputs, dim=1)
            test_preds.extend(preds.cpu().numpy())
            test_true.extend(labels.cpu().numpy())
    
    # Print test classification report
    print("\nTest Classification Report:")
    print(classification_report(test_true, test_preds, 
        target_names=list(train_dataset.dataset.emotion_to_idx.keys())))

if __name__ == "__main__":
    main()



Model Initialized with 5 emotion classes
Emotion to Index mapping: {'happiness': 0, 'anger': 1, 'disgust': 2, 'sadness': 3, 'fear': 4}
Epoch 1/100
Train Loss: 1.2823, Train Accuracy: 0.5648
Val Loss: 0.8227, Val Accuracy: 0.7526
Saved new best model with validation accuracy: 0.7526

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.47      0.57        17
           1       0.80      0.73      0.76        22
           2       0.95      0.76      0.84        25
           3       0.67      0.95      0.78        19
           4       0.63      0.86      0.73        14

    accuracy                           0.75        97
   macro avg       0.76      0.75      0.74        97
weighted avg       0.78      0.75      0.75        97

Epoch 2/100
Train Loss: 1.0334, Train Accuracy: 0.6399
Val Loss: 0.7040, Val Accuracy: 0.7320
Epoch 3/100
Train Loss: 0.9594, Train Accuracy: 0.6373
Val Loss: 0.6502, Val Accuracy: 0.7526
Epoch




Test Classification Report:
              precision    recall  f1-score   support

   happiness       0.05      0.04      0.05        24
       anger       0.00      0.00      0.00        25
     disgust       0.00      0.00      0.00        24
     sadness       0.07      0.12      0.09        24
        fear       0.90      0.75      0.82        24

    accuracy                           0.18       121
   macro avg       0.20      0.18      0.19       121
weighted avg       0.20      0.18      0.19       121



In [1]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths and Configurations
audio_model_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
text_model_path = "bert-base-uncased"

# Initialize Processors
audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_path)
text_tokenizer = BertTokenizer.from_pretrained(text_model_path)

# Create Datasets
train_dataset = MultimodalEmotionDataset("dataset/train.csv", audio_processor, text_tokenizer)
test_dataset = MultimodalEmotionDataset("dataset/test.csv", audio_processor, text_tokenizer)

# Split train into train and validation
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, 
    [int(len(train_dataset)*0.8), len(train_dataset)-int(len(train_dataset)*0.8)]
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Get number of labels
num_labels = len(train_dataset.dataset.emotion_to_idx)

# Initialize Model
model = MultimodalEmotionClassifier(
    num_labels=num_labels, 
    audio_model_path=audio_model_path, 
    text_model_path=text_model_path
).to(device)

print(f"Model Initialized with {num_labels} emotion classes")
print("Emotion to Index mapping:", train_dataset.dataset.emotion_to_idx)

NameError: name 'torch' is not defined

In [3]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import pandas as pd
# import numpy as np
# import torchaudio
# import librosa
# from transformers import (
#     Wav2Vec2Processor, 
#     Wav2Vec2Model, 
#     BertModel, 
#     BertTokenizer
# )
# from torch.utils.data import Dataset, DataLoader
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, accuracy_score

# # Custom Dataset (previous implementation remains the same)
# class MultimodalEmotionDataset(Dataset):
#     def __init__(self, csv_path, processor, tokenizer, max_length=128):
#         # Read the CSV
#         self.data = pd.read_csv(csv_path, sep='\t')
        
#         # Add random text column if not exists
#         if 'text' not in self.data.columns:
#             self.data['text'] = [f"Random text for {name}" for name in self.data['name']]
        
#         self.processor = processor
#         self.tokenizer = tokenizer
#         self.max_length = max_length
        
#         # Mapping emotions to indices
#         self.emotion_to_idx = {emotion: idx for idx, emotion in enumerate(self.data['emotion'].unique())}
#         self.idx_to_emotion = {idx: emotion for emotion, idx in self.emotion_to_idx.items()}
        
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         row = self.data.iloc[idx]
        
#         # Process Audio
#         speech_array, sampling_rate = torchaudio.load(row['path'])
#         speech_array = speech_array.squeeze().numpy()
#         speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, self.processor.feature_extractor.sampling_rate)
        
#         audio_inputs = self.processor(
#             speech_array, 
#             sampling_rate=self.processor.feature_extractor.sampling_rate, 
#             return_tensors="pt"
#         )
        
#         # Process Text
#         text_inputs = self.tokenizer(
#             row['text'], 
#             max_length=self.max_length, 
#             padding='max_length', 
#             truncation=True, 
#             return_tensors="pt"
#         )
        
#         # Get emotion label
#         label = self.emotion_to_idx[row['emotion']]
        
#         return {
#             'audio_input': audio_inputs.input_values.squeeze(),
#             'audio_mask': audio_inputs.attention_mask.squeeze(),
#             'text_input_ids': text_inputs['input_ids'].squeeze(),
#             'text_attention_mask': text_inputs['attention_mask'].squeeze(),
#             'label': label
#         }

# # Multimodal Fusion Model (previous implementation remains the same)
# class MultimodalEmotionClassifier(nn.Module):
#     def __init__(self, num_labels, audio_model_path, text_model_path):
#         super().__init__()
        
#         # Audio Encoder (Wav2Vec2)
#         self.audio_encoder = Wav2Vec2Model.from_pretrained(audio_model_path)
        
#         # Text Encoder (BERT)
#         self.text_encoder = BertModel.from_pretrained(text_model_path)
        
#         # Freeze pretrained encoders (optional)
#         for param in self.audio_encoder.parameters():
#             param.requires_grad = False
#         for param in self.text_encoder.parameters():
#             param.requires_grad = False
        
#         # Fusion Layer
#         audio_feature_dim = self.audio_encoder.config.hidden_size
#         text_feature_dim = self.text_encoder.config.hidden_size
#         fusion_dim = audio_feature_dim + text_feature_dim
        
#         self.fusion_layers = nn.Sequential(
#             nn.Linear(fusion_dim, fusion_dim // 2),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(fusion_dim // 2, num_labels)
#         )
    
#     def forward(self, audio_input, audio_mask, text_input_ids, text_attention_mask):
#         # Extract audio features
#         audio_outputs = self.audio_encoder(
#             audio_input, 
#             attention_mask=audio_mask
#         )
#         audio_features = torch.mean(audio_outputs.last_hidden_state, dim=1)
        
#         # Extract text features
#         text_outputs = self.text_encoder(
#             text_input_ids, 
#             attention_mask=text_attention_mask
#         )
#         text_features = text_outputs.pooler_output
        
#         # Concatenate features
#         combined_features = torch.cat([audio_features, text_features], dim=1)
        
#         # Classification
#         logits = self.fusion_layers(combined_features)
        
#         return logits

# # Training Function
# def train_model(model, train_loader, val_loader, device, epochs=10, learning_rate=1e-3):
#     # Loss and Optimizer
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(
#         [
#             {'params': model.fusion_layers.parameters(), 'lr': learning_rate},
#             # Uncomment and adjust learning rates if unfreezing more layers
#             # {'params': model.audio_encoder.parameters(), 'lr': learning_rate * 0.1},
#             # {'params': model.text_encoder.parameters(), 'lr': learning_rate * 0.1}
#         ]
#     )
    
#     best_val_accuracy = 0
    
#     for epoch in range(epochs):
#         # Training Phase
#         model.train()
#         train_loss = 0
#         train_correct = 0
#         train_total = 0
        
#         for batch in train_loader:
#             # Move data to device
#             audio_input = batch['audio_input'].to(device)
#             audio_mask = batch['audio_mask'].to(device)
#             text_input_ids = batch['text_input_ids'].to(device)
#             text_attention_mask = batch['text_attention_mask'].to(device)
#             labels = batch['label'].to(device)
            
#             # Zero the parameter gradients
#             optimizer.zero_grad()
            
#             # Forward pass
#             outputs = model(audio_input, audio_mask, text_input_ids, text_attention_mask)
#             loss = criterion(outputs, labels)
            
#             # Backward pass and optimize
#             loss.backward()
#             optimizer.step()
            
#             # Compute training metrics
#             train_loss += loss.item()
#             _, predicted = torch.max(outputs, 1)
#             train_total += labels.size(0)
#             train_correct += (predicted == labels).sum().item()
        
#         # Validation Phase
#         model.eval()
#         val_loss = 0
#         val_correct = 0
#         val_total = 0
#         val_predictions = []
#         val_true_labels = []
        
#         with torch.no_grad():
#             for batch in val_loader:
#                 # Move data to device
#                 audio_input = batch['audio_input'].to(device)
#                 audio_mask = batch['audio_mask'].to(device)
#                 text_input_ids = batch['text_input_ids'].to(device)
#                 text_attention_mask = batch['text_attention_mask'].to(device)
#                 labels = batch['label'].to(device)
                
#                 # Forward pass
#                 outputs = model(audio_input, audio_mask, text_input_ids, text_attention_mask)
#                 loss = criterion(outputs, labels)
                
#                 # Compute validation metrics
#                 val_loss += loss.item()
#                 _, predicted = torch.max(outputs, 1)
#                 val_total += labels.size(0)
#                 val_correct += (predicted == labels).sum().item()
                
#                 val_predictions.extend(predicted.cpu().numpy())
#                 val_true_labels.extend(labels.cpu().numpy())
        
#         # Print epoch summary
#         train_accuracy = 100 * train_correct / train_total
#         val_accuracy = 100 * val_correct / val_total
        
#         print(f'Epoch [{epoch+1}/{epochs}]')
#         print(f'Train Loss: {train_loss/len(train_loader):.4f}, Train Accuracy: {train_accuracy:.2f}%')
#         print(f'Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {val_accuracy:.2f}%')
        
#         # Classification Report
#         print("\nClassification Report:")
#         print(classification_report(
#             val_true_labels, 
#             val_predictions, 
#             target_names=list(train_dataset.emotion_to_idx.keys())
#         ))
        
#         # Save best model
#         if val_accuracy > best_val_accuracy:
#             best_val_accuracy = val_accuracy
#             torch.save(model.state_dict(), 'best_multimodal_emotion_model.pth')
    
#     return model

# # Inference Function
# def predict_emotion(model, audio_path, text, processor, tokenizer, device):
#     model.eval()
    
#     # Process Audio
#     speech_array, sampling_rate = torchaudio.load(audio_path)
#     speech_array = speech_array.squeeze().numpy()
#     speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, processor.feature_extractor.sampling_rate)
    
#     audio_inputs = processor(
#         speech_array, 
#         sampling_rate=processor.feature_extractor.sampling_rate, 
#         return_tensors="pt"
#     )
    
#     # Process Text
#     text_inputs = tokenizer(
#         text, 
#         max_length=128, 
#         padding='max_length', 
#         truncation=True, 
#         return_tensors="pt"
#     )
    
#     # Move to device
#     audio_input = audio_inputs.input_values.to(device).squeeze()
#     audio_mask = audio_inputs.attention_mask.to(device).squeeze()
#     text_input_ids = text_inputs['input_ids'].to(device).squeeze()
#     text_attention_mask = text_inputs['attention_mask'].to(device).squeeze()
    
#     # Predict
#     with torch.no_grad():
#         outputs = model(
#             audio_input.unsqueeze(0), 
#             audio_mask.unsqueeze(0), 
#             text_input_ids.unsqueeze(0), 
#             text_attention_mask.unsqueeze(0)
#         )
#         probabilities = torch.softmax(outputs, dim=1)
#         predicted_class = torch.argmax(probabilities, dim=1)
    
#     return {
#         'emotion': train_dataset.idx_to_emotion[predicted_class.item()],
#         'probabilities': {emotion: prob.item() for emotion, prob in zip(train_dataset.idx_to_emotion.values(), probabilities[0])}
#     }

# # Main Execution
# def main():
#     # Set device
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
#     # Paths and Configurations
#     audio_model_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
#     text_model_path = "bert-base-uncased"
    
#     # Initialize Processors
#     audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_path)
#     text_tokenizer = BertTokenizer.from_pretrained(text_model_path)
    
#     # Create Datasets
#     train_dataset = MultimodalEmotionDataset("dataset/train.csv", audio_processor, text_tokenizer)
#     test_dataset = MultimodalEmotionDataset("dataset/test.csv", audio_processor, text_tokenizer)
    
#     # DataLoaders
#     train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
#     val_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    
#     # Get number of labels
#     num_labels = len(train_dataset.emotion_to_idx)
    
#     # Initialize Model
#     model = MultimodalEmotionClassifier(
#         num_labels=num_labels, 
#         audio_model_path=audio_model_path, 
#         text_model_path=text_model_path
#     ).to(device)
    
#     # Train Model
#     trained_model = train_model(model, train_loader, val_loader, device)
    
#     # Example Inference
#     # Assuming you have a sample audio file and want to predict its emotion
#     sample_audio_path = test_dataset.data.iloc[0]['path']
#     sample_text = "Some random descriptive text"
    
#     prediction = predict_emotion(
#         trained_model, 
#         sample_audio_path, 
#         sample_text, 
#         audio_processor, 
#         text_tokenizer, 
#         device
#     )
    
#     print("\nSample Prediction:")
#     print("Predicted Emotion:", prediction['emotion'])
#     print("Emotion Probabilities:")
#     for emotion, prob in prediction['probabilities'].items():
#         print(f"{emotion}: {prob:.4f}")

# if __name__ == "__main__":
#     main()

In [10]:
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
import torch.nn.functional as F
from transformers import (
    Wav2Vec2Processor, 
    BertTokenizer
)

class MultimodalEmotionInference:
    def __init__(self, model_path, audio_model_path, text_model_path):
        # Set device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Load model
        self.model = torch.load(model_path).to(self.device)
        self.model.eval()
        
        # Load processors
        self.audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_path)
        self.text_tokenizer = BertTokenizer.from_pretrained(text_model_path)
        
        # Emotion mapping (reverse of what was used during training)
        self.idx_to_emotion = {
            0: 'happiness', 
            1: 'anger', 
            2: 'disgust', 
            3: 'sadness', 
            4: 'fear'
        }
    
    def preprocess_audio(self, audio_path):
        """Preprocess audio file"""
        speech_array, sampling_rate = torchaudio.load(audio_path)
        speech_array = speech_array.squeeze().numpy()
        speech_array = librosa.resample(
            np.asarray(speech_array), 
            sampling_rate, 
            self.audio_processor.feature_extractor.sampling_rate
        )
        
        audio_inputs = self.audio_processor(
            speech_array, 
            sampling_rate=self.audio_processor.feature_extractor.sampling_rate, 
            return_tensors="pt"
        )
        
        return audio_inputs
    
    def preprocess_text(self, text, max_length=128):
        """Preprocess text input"""
        text_inputs = self.text_tokenizer(
            text, 
            max_length=max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        return text_inputs
    
    def predict(self, audio_path, text):
        """Predict emotion for given audio and text"""
        # Preprocess inputs
        audio_inputs = self.preprocess_audio(audio_path)
        text_inputs = self.preprocess_text(text)
        
        # Move to device
        audio_input = audio_inputs.input_values.to(self.device)
        audio_mask = audio_inputs.attention_mask.to(self.device)
        text_input_ids = text_inputs['input_ids'].to(self.device)
        text_attention_mask = text_inputs['attention_mask'].to(self.device)
        
        # Inference
        with torch.no_grad():
            logits = self.model(
                audio_input, 
                audio_mask, 
                text_input_ids, 
                text_attention_mask
            )
            
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=1).cpu().numpy()[0]
        
        # Prepare results
        results = [
            {
                "emotion": self.idx_to_emotion[i], 
                "probability": float(prob)
            } 
            for i, prob in enumerate(probs)
        ]
        
        # Sort by probability in descending order
        results.sort(key=lambda x: x['probability'], reverse=True)
        
        return results
    
    def batch_predict(self, audio_paths, texts):
        """Batch prediction for multiple samples"""
        results = []
        for audio_path, text in zip(audio_paths, texts):
            result = self.predict(audio_path, text)
            results.append(result)
        return results

def main():
    # Paths
    model_path = "saved_models/best_multimodal_model.pth"  # Assuming you saved the model
    audio_model_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
    text_model_path = "bert-base-uncased"
    
    # Create inference object
    inferencer = MultimodalEmotionInference(
        model_path, 
        audio_model_path, 
        text_model_path
    )
    
    # Example usage - Single prediction
    audio_path = "Frenzy/data/anger/a01 (2).wav"
    text = "hi im angry"
    
    single_result = inferencer.predict(audio_path, text)
    print("Single Prediction:")
    for pred in single_result:
        print(f"{pred['emotion']}: {pred['probability']:.4f}")
    
    # Example usage - Batch prediction
    audio_paths = [
        "Frenzy/data/anger/a01 (2).wav", 
        "Frenzy/data/anger/a01 (3).wav"
    ]
    texts = [
        "im angry", 
        "im angry"
    ]
    
    batch_results = inferencer.batch_predict(audio_paths, texts)
    print("\nBatch Prediction:")
    for i, results in enumerate(batch_results):
        print(f"\nAudio {i+1}:")
        for pred in results:
            print(f"{pred['emotion']}: {pred['probability']:.4f}")

if __name__ == "__main__":
    main()

AttributeError: 'dict' object has no attribute 'to'