## Sentiment Analysis with wav2vec2

In [1]:
%load_ext autoreload
%autoreload 2
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torchaudio
import torch
from data_loader import *
from utils import *
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from sklearn.metrics import classification_report, confusion_matrix
import plotly.figure_factory as ff
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, classification_report, confusion_matrix
from sklearn.utils import compute_class_weight


# Sanity-check the runtime: report whether PyTorch can see a CUDA device.
print("Is GPU available?", torch.cuda.is_available())
# IPython shell escape: runs the `nvidia-smi` command and shows its output in the cell.
!nvidia-smi

Is GPU available? True
Sat Feb 15 18:03:22 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1650 w...  WDDM | 00000000:01:00.0 Off |                  N/A |
| N/A   55C    P0               12W /  N/A|      0MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                             

In [2]:
# Load the pretrained processor and the wav2vec2 encoder (same checkpoint id for both).
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# The encoder is moved to the GPU when CUDA is available, otherwise to the CPU.
wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# NOTE(review): this import sits outside the notebook's import cell.
from transformers import Wav2Vec2ForSequenceClassification

# Device used by the cells below: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Sequence-classification variant of the same checkpoint, configured for 3 labels.
# NOTE(review): `model` is not referenced by any later cell visible in this notebook — confirm it is still needed.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base-960h",
    num_labels=3
).to(device)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Build the project's dataset manager on ../Dataset and fetch the train/validation/test frames.
dataset_manager = SoundDataset("../Dataset")
df_train, df_val, df_test = dataset_manager.load_datasets()
# `.shape` is a (rows, columns) tuple, so each line prints a tuple rather than a bare sample count.
print(f"Train dataset: {df_train.shape} samples")
print(f"Validation dataset: {df_val.shape} samples")
print(f"Test dataset: {df_test.shape} samples")
# Preview the first validation rows.
df_val.head()

Train dataset: (11900, 5) samples
Validation dataset: (3400, 5) samples
Test dataset: (1700, 5) samples


Unnamed: 0,wav_id,label,audio_path,duration,text
0,1_1856,2,../Dataset/all/1_1856.wav,2.67,
1,1_1926,2,../Dataset/all/1_1926.wav,2.31,
2,1_1997,2,../Dataset/all/1_1997.wav,2.13,
3,1_14480,2,../Dataset/all/1_14480.wav,2.46,
4,1_4414,2,../Dataset/all/1_4414.wav,2.67,


wav2vec2 works best with a 16 kHz sample rate (the pretrained checkpoint was trained on 16 kHz audio).

In [4]:
# get_info_sample_rate(df_train)
# get_info_sample_rate(df_val)
# get_info_sample_rate(df_test)

In [5]:
# sample_rate_16k(df_train)
# sample_rate_16k(df_val)
# sample_rate_16k(df_test)

#### Embeddings with wav2vec2
Each embedding has 768 dimensions.

In [6]:
def extract_embeddings_for_dataset(df, wav2vec2, processor):
    """Compute one embedding per row of ``df`` and return them as a list.

    For every row, the row index is printed and the value of the row's
    ``audio_path`` column is passed, together with ``wav2vec2`` and
    ``processor``, to ``extract_single_embedding``.

    Args:
        df: DataFrame with an ``audio_path`` column.
        wav2vec2: Object forwarded unchanged to ``extract_single_embedding``.
        processor: Object forwarded unchanged to ``extract_single_embedding``.

    Returns:
        list: The values returned by ``extract_single_embedding``, in row order.
    """
    collected = []
    for row_label, record in df.iterrows():
        print(f"Idx: {row_label}")
        collected.append(
            extract_single_embedding(record['audio_path'], wav2vec2, processor)
        )
    return collected

In [7]:
# X_train_embeddings = extract_embeddings_for_dataset(df_train, wav2vec2, processor)
# np.save('../Dataset/train_embeddings.npy', X_train_embeddings)
# print("Embeddings created and saved for train dataset")

# X_test_embeddings = extract_embeddings_for_dataset(df_test, wav2vec2, processor)
# np.save('../Dataset/test_embeddings.npy', X_test_embeddings)
# print("Embeddings created and saved for test dataset")

# X_val_embeddings = extract_embeddings_for_dataset(df_val, wav2vec2, processor)
# np.save('../Dataset/val_embeddings.npy', X_val_embeddings)
# print("Embeddings created and saved for validation dataset")

Load embeddings

In [8]:
# Load the embedding arrays previously saved under ../Dataset.
# NOTE(review): `np` is not imported explicitly in the visible import cell; presumably it arrives via one of the star imports — confirm.
X_train_embeddings = np.load('../Dataset/train_embeddings.npy')
X_test_embeddings = np.load('../Dataset/test_embeddings.npy')
X_val_embeddings = np.load('../Dataset/val_embeddings.npy')

# Labels of each split as plain Python lists (used below to compute class weights).
y_train_labels = df_train['label'].tolist()  
y_test_labels = df_test['label'].tolist()
y_val_labels = df_val['label'].tolist()

Custom PyTorch dataset that takes the path to a file of precomputed embeddings and the corresponding labels

In [9]:
class AudioSentimentDataset(Dataset):
    """Map-style dataset pairing precomputed embeddings with integer labels.

    Args:
        embedding_paths: Path of a single ``.npy`` file holding all embeddings,
            indexed along the first axis (one entry per sample).
        labels: Indexable sequence of integer class ids, one per embedding.

    Raises:
        ValueError: If the number of embeddings differs from the number of labels.
    """

    def __init__(self, embedding_paths, labels):
        self.embeddings = np.load(embedding_paths)
        self.labels = labels
        # Fail fast on misaligned inputs: without this check a short label list
        # raises IndexError in the middle of an epoch, and a long one is
        # silently truncated because __len__ follows the embeddings.
        if len(self.embeddings) != len(self.labels):
            raise ValueError(
                f"Number of embeddings ({len(self.embeddings)}) does not match "
                f"number of labels ({len(self.labels)})"
            )

    def __len__(self):
        """Return the number of samples (length of the embeddings array)."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` as a float32 tensor and a long tensor."""
        embedding = torch.tensor(self.embeddings[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return embedding, label

Create the datasets and dataloaders

In [10]:
# Build the datasets and their DataLoaders (batch size 8 for every split).
# Only the training loader passes shuffle=True; validation and test do not request shuffling.
train_dataset = AudioSentimentDataset("../Dataset/train_embeddings.npy", df_train['label'].tolist())
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
print(f"Train dataloader: {len(train_dataloader)} batches")

val_dataset = AudioSentimentDataset("../Dataset/val_embeddings.npy", df_val['label'].tolist())
val_dataloader = DataLoader(val_dataset, batch_size=8) 
print(f"Validation dataloader: {len(val_dataloader)} batches")

test_dataset = AudioSentimentDataset("../Dataset/test_embeddings.npy", df_test['label'].tolist())
test_dataloader = DataLoader(test_dataset, batch_size=8) 
print(f"Test dataloader: {len(test_dataloader)} batches")

Train dataloader: 1488 batches
Validation dataloader: 425 batches
Test dataloader: 213 batches


In [11]:
def compute_class_weights(y_labels):
    """Return 'balanced' class weights for ``y_labels`` as a float32 tensor.

    The classes are the unique values found in ``y_labels``; the weights come
    from ``compute_class_weight('balanced', ...)``. The resulting tensor is
    placed on CUDA when it is available, otherwise on the CPU.

    Args:
        y_labels: Sequence of class labels.

    Returns:
        torch.Tensor: One weight per unique label, dtype ``torch.float32``.
    """
    present_classes = np.unique(y_labels)
    balanced = compute_class_weight('balanced', classes=present_classes, y=y_labels)
    if torch.cuda.is_available():
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")
    return torch.tensor(balanced, dtype=torch.float32, device=target)

In [12]:
# Show the balanced class weights computed from the training and validation labels.
print(compute_class_weights(y_train_labels))
print(compute_class_weights(y_val_labels))

tensor([4.1363, 4.1798, 0.3970], device='cuda:0')
tensor([4.1363, 4.1820, 0.3970], device='cuda:0')


In [13]:
# Hyperparameters
embedding_dim = 768  
num_classes = 3
learning_rate = 1e-4
epochs = 10

# Select the compute device (no model is created in this cell).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
def training(epochs, model, train_dataloader, val_dataloader, optimizer):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Relies on notebook-level names: ``device``, ``y_train_labels`` and
    ``y_val_labels`` (the latter two feed ``compute_class_weights`` to build a
    class-weighted cross-entropy loss for each split).

    Args:
        epochs: Number of passes over ``train_dataloader``.
        model: Module called as ``model(embeddings)`` and expected to return logits.
        train_dataloader: Iterable of ``(embeddings, labels)`` batches used for optimisation.
        val_dataloader: Iterable of ``(embeddings, labels)`` batches used for validation.
        optimizer: Optimizer stepped once per training batch.

    Returns:
        tuple[list, list]: Per-epoch mean training loss and mean validation loss
        (each the sum of batch losses divided by the number of batches).
    """
    # One weighted criterion per split, built from that split's label list.
    criterion_train = CrossEntropyLoss(weight=compute_class_weights(y_train_labels))
    criterion_val = CrossEntropyLoss(weight=compute_class_weights(y_val_labels))

    history_train = []
    history_val = []
    for epoch_idx in range(epochs):
        # ---- optimisation pass ----
        model.train()
        running_train = 0
        for batch_x, batch_y in train_dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            batch_loss = criterion_train(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()
            running_train += batch_loss.item()
        epoch_train_loss = running_train / len(train_dataloader)
        history_train.append(epoch_train_loss)

        # ---- validation pass (no gradients) ----
        model.eval()
        predicted, expected = [], []
        running_val = 0
        with torch.no_grad():
            for batch_x, batch_y in val_dataloader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                logits = model(batch_x)
                running_val += criterion_val(logits, batch_y).item()
                predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
                expected.extend(batch_y.cpu().numpy())
        epoch_val_loss = running_val / len(val_dataloader)
        history_val.append(epoch_val_loss)
        weighted_f1 = f1_score(expected, predicted, average='weighted')

        print(f"Epoch {epoch_idx+1}/{epochs} - Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, Val f1: {weighted_f1:.4f}\n")
    return history_train, history_val

In [15]:
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` in eval mode and collect predictions.

    Batches are moved to the notebook-level ``device``; the predicted class of
    each sample is the argmax of the model output along dimension 1.

    Args:
        model: Module called as ``model(embeddings)``.
        dataloader: Iterable of ``(embeddings, labels)`` batches.

    Returns:
        tuple[list, list]: ``(all_labels, all_preds)`` — the true labels and
        the predicted class ids, in iteration order.
    """
    model.eval()
    gold, predicted = [], []

    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            scores = model(batch_x)

            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            gold.extend(batch_y.cpu().numpy())

    return gold, predicted

In [16]:
### Linear probe: only a dropout and a linear layer on top of the embeddings

class AudioSentimentClassifier(nn.Module):
    """Dropout followed by a single linear layer mapping embeddings to class logits.

    Args:
        embedding_dim: Size of the last dimension of the input embeddings.
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the embeddings before the
            linear layer. Defaults to 0.3, the value previously hard-coded.
    """

    def __init__(self, embedding_dim, num_classes, dropout=0.3):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(embedding_dim, num_classes)  # e.g. 768 -> 3

    def forward(self, embeddings):
        """Return logits for ``embeddings``.

        ``squeeze(1)`` removes dimension 1 only when it has size 1.
        NOTE(review): presumably inputs arrive as (batch, 1, embedding_dim),
        which would give (batch, num_classes) logits — confirm against the
        saved embeddings.
        """
        x = self.dropout(embeddings)
        logits = self.fc(x)
        return logits.squeeze(1)

In [17]:
# Train the dropout + linear classifier with AdamW, then collect predictions on the test loader.
model_clf = AudioSentimentClassifier(embedding_dim, num_classes).to(device)
optimizer = AdamW(model_clf.parameters(), lr=learning_rate)

# Per-epoch mean losses are kept for the loss-curve plot.
train_losses, val_losses = training(epochs, model_clf, train_dataloader, val_dataloader, optimizer)

true_labels, preds = evaluate(model_clf, test_dataloader)

Epoch 1/10 - Train Loss: 1.0593, Val Loss: 1.0419, Val f1: 0.7665

Epoch 2/10 - Train Loss: 1.0523, Val Loss: 1.0332, Val f1: 0.7665

Epoch 3/10 - Train Loss: 1.0507, Val Loss: 1.0331, Val f1: 0.7662

Epoch 4/10 - Train Loss: 1.0477, Val Loss: 1.0295, Val f1: 0.7683

Epoch 5/10 - Train Loss: 1.0371, Val Loss: 1.0263, Val f1: 0.7689

Epoch 6/10 - Train Loss: 1.0359, Val Loss: 1.0239, Val f1: 0.7723

Epoch 7/10 - Train Loss: 1.0305, Val Loss: 1.0183, Val f1: 0.7685

Epoch 8/10 - Train Loss: 1.0355, Val Loss: 1.0170, Val f1: 0.7708

Epoch 9/10 - Train Loss: 1.0286, Val Loss: 1.0161, Val f1: 0.7672

Epoch 10/10 - Train Loss: 1.0300, Val Loss: 1.0173, Val f1: 0.7671



In [18]:
# Per-class metrics and confusion matrix for the most recent `true_labels` / `preds`.
print(classification_report(true_labels, preds, zero_division=1))
cm = confusion_matrix(true_labels, preds)
# NOTE(review): assumes class ids 0/1/2 correspond to Negative/Neutral/Positive — confirm against the dataset.
fig = ff.create_annotated_heatmap(z=cm, x=["Negative", "Neutral", "Positive"], y=["Negative", "Neutral", "Positive"], colorscale='Blues')
fig.update_layout(title="Confusion Matrix", xaxis_title="Predicted", yaxis_title="Actual", width=400, height=400)
fig.show()

# Training vs. validation loss per epoch (x axis is 1..epochs).
# NOTE(review): `go` is not imported explicitly in the visible import cell; presumably plotly.graph_objects arrives via a star import — confirm.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, epochs+1)), y=train_losses, mode='lines+markers', name='Train Loss'))
fig.add_trace(go.Scatter(x=list(range(1, epochs+1)), y=val_losses, mode='lines+markers', name='Validation Loss'))
fig.update_layout(
    title="Training and Validation Loss",
    xaxis_title="Epochs",
    yaxis_title="Loss",
    legend=dict(x=0, y=1),
    width=900, height=400
)
fig.show()

              precision    recall  f1-score   support

           0       0.15      0.06      0.08       137
           1       0.00      0.00      0.00       135
           2       0.84      0.97      0.90      1428

    accuracy                           0.82      1700
   macro avg       0.33      0.34      0.33      1700
weighted avg       0.72      0.82      0.76      1700



In [19]:
### Without fine-tuning: train a small neural network on the embeddings

class MLPClassifier(nn.Module):
    """Feed-forward classifier: (Linear -> ReLU -> Dropout) per hidden layer, then a Linear head.

    Args:
        embedding_dim: Size of the last dimension of the input embeddings.
        num_classes: Number of output logits.
        hidden_dims: Widths of the hidden layers, in order. Defaults to
            ``(256, 128)``, the architecture previously hard-coded.
        dropout: Dropout probability used after every hidden activation.
            Defaults to 0.3, the value previously hard-coded.
    """

    def __init__(self, embedding_dim, num_classes, hidden_dims=(256, 128), dropout=0.3):
        super().__init__()
        layers = []
        in_features = embedding_dim
        # Layers are created in input-to-output order so that, with the default
        # arguments, the Sequential indices (0, 3, 6 for the Linear layers) and
        # therefore the state_dict keys match the original fixed architecture.
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_features = width
        layers.append(nn.Linear(in_features, num_classes))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits for ``x``.

        ``squeeze(1)`` removes dimension 1 only when it has size 1.
        NOTE(review): presumably inputs arrive as (batch, 1, embedding_dim) —
        confirm against the saved embeddings.
        """
        logits = self.model(x)
        return logits.squeeze(1)

In [20]:
# Train the MLP classifier with AdamW, then collect predictions on the test loader.
# Note: this rebinds `optimizer`, `train_losses`, `val_losses`, `true_labels` and `preds`
# from the previous experiment.
model_mlp = MLPClassifier(embedding_dim, num_classes).to(device)
optimizer = AdamW(model_mlp.parameters(), lr=learning_rate)

train_losses, val_losses = training(epochs, model_mlp, train_dataloader, val_dataloader, optimizer)

true_labels, preds = evaluate(model_mlp, test_dataloader)

Epoch 1/10 - Train Loss: 1.0610, Val Loss: 1.0394, Val f1: 0.7665

Epoch 2/10 - Train Loss: 1.0552, Val Loss: 1.0331, Val f1: 0.7665

Epoch 3/10 - Train Loss: 1.0456, Val Loss: 1.0208, Val f1: 0.7665

Epoch 4/10 - Train Loss: 1.0424, Val Loss: 1.0128, Val f1: 0.7664

Epoch 5/10 - Train Loss: 1.0335, Val Loss: 1.0154, Val f1: 0.7505

Epoch 6/10 - Train Loss: 1.0314, Val Loss: 0.9999, Val f1: 0.7676

Epoch 7/10 - Train Loss: 1.0243, Val Loss: 1.0052, Val f1: 0.7596

Epoch 8/10 - Train Loss: 1.0181, Val Loss: 0.9999, Val f1: 0.7652

Epoch 9/10 - Train Loss: 1.0196, Val Loss: 1.0116, Val f1: 0.7107

Epoch 10/10 - Train Loss: 1.0205, Val Loss: 0.9962, Val f1: 0.7568



In [21]:
# Per-class metrics and confusion matrix for the most recent `true_labels` / `preds`.
# NOTE(review): this cell repeats the earlier report/plot cell verbatim; a shared helper would avoid the duplication.
print(classification_report(true_labels, preds, zero_division=1))
cm = confusion_matrix(true_labels, preds)
# NOTE(review): assumes class ids 0/1/2 correspond to Negative/Neutral/Positive — confirm against the dataset.
fig = ff.create_annotated_heatmap(z=cm, x=["Negative", "Neutral", "Positive"], y=["Negative", "Neutral", "Positive"], colorscale='Blues')
fig.update_layout(title="Confusion Matrix", xaxis_title="Predicted", yaxis_title="Actual", width=400, height=400)
fig.show()

# Training vs. validation loss per epoch (x axis is 1..epochs).
# NOTE(review): `go` is not imported explicitly in the visible import cell; presumably plotly.graph_objects arrives via a star import — confirm.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, epochs+1)), y=train_losses, mode='lines+markers', name='Train Loss'))
fig.add_trace(go.Scatter(x=list(range(1, epochs+1)), y=val_losses, mode='lines+markers', name='Validation Loss'))
fig.update_layout(
    title="Training and Validation Loss",
    xaxis_title="Epochs",
    yaxis_title="Loss",
    legend=dict(x=0, y=1),
    width=900, height=400
)
fig.show()

              precision    recall  f1-score   support

           0       0.11      0.10      0.11       137
           1       0.15      0.15      0.15       135
           2       0.85      0.86      0.86      1428

    accuracy                           0.75      1700
   macro avg       0.37      0.37      0.37      1700
weighted avg       0.74      0.75      0.74      1700

