In [1]:
import os, sys
from os.path import isfile, isdir, join
import yaml, json

import librosa 
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import random

import torch 
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from linformer import Linformer
from PIL import Image
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# from vit_pytorch.efficient import ViT
from vit_pytorch import ViT
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, classification_report
import torch.utils.data as data
import torchvision
from torchvision.transforms import ToTensor
torch.cuda.is_available()

True

In [3]:
import torch

# Check if CUDA is available
print(f"CUDA available: {torch.cuda.is_available()}")

# Check number of available GPUs
if torch.cuda.is_available():
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name()}")

CUDA available: True
Number of GPUs: 1
Current device: 0
Device name: NVIDIA GeForce RTX 4070 Laptop GPU


In [4]:
# Set random seed for reproducibility
seed = 142
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

# gamma = 0.7
# patch_size = 16
num_classes = 7

In [5]:
import torch

def get_available_gpus():
    if torch.cuda.is_available():
        return [(i, torch.cuda.get_device_name(i)) for i in range(torch.cuda.device_count())]
    else:
        return []

# Let's say you want to use device=1
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

print(device)

cuda:1


In [6]:
with open('./meld.yaml', 'r') as fp:
    meld_dict = yaml.safe_load(fp)
    
train_split = meld_dict['train']
test_split = meld_dict['test']
dev_split = meld_dict['dev']

In [7]:
for aud_key in train_split.keys():
    aud_path = f"./dataset_extracted/output_train_extracted/{aud_key}.wav"
    
     # Check if file exists, skip if not
    if not os.path.exists(aud_path):
        print(f"File not found, skipping: {aud_path}")
        continue
    
    aud_properties = train_split[aud_key]
    text = aud_properties['Utterance']
    emotion_label = aud_properties['Emotion']
    # print(aud_path)
    # print(text)
    # break 

In [8]:
for aud_key in dev_split.keys():
    aud_path = f"./dataset_extracted/output_dev_extracted/{aud_key}.wav"
    
     # Check if file exists, skip if not
    if not os.path.exists(aud_path):
        print(f"File not found, skipping: {aud_path}")
        continue
    
    aud_properties = dev_split[aud_key]
    text = aud_properties['Utterance']
    emotion_label = aud_properties['Emotion']

In [9]:
# def custom_model(nn.Module):
#     def __init__(self):
#         super(custom_model, self).__init__()
#         self.activation = nn.ReLU()
#         self.classification = nn.Linear(128*3, 7)
    
#     def forward(self, x1, x2, x3):
#           y1 = model(x1)
#           y = torch.cat([y1, y2, y3])
#           y = self.activation(y)
#           y = self.classification(y)
#           y = torch.softmax(y, dim=1)
#             return y

In [24]:
from torch.utils.data import Dataset, DataLoader

class meld_dataset(Dataset):
    def __init__(self, dataset_path='./meld.yaml', split_name='train', sr=16000, audio_seq_len=10, text_seq_len=512):
        super(meld_dataset, self).__init__()
        with open(dataset_path, 'r') as fp:
            meld_dict = yaml.safe_load(fp)
        self.train_split = meld_dict['train']
        self.test_split = meld_dict['test']
        self.dev_split = meld_dict['dev']
        self.split_name = split_name
        
        self.train_keys = list(self.train_split.keys())
        self.test_keys = list(self.test_split.keys())
        self.dev_keys = list(self.dev_split.keys())
        
        self.audio_model = None  # Placeholder for audio model initialization
        self.sr = sr
        self.audio_seq_len = audio_seq_len
        self.text_seq_len = text_seq_len
        
        self.img_transforms = transforms.Compose([
            transforms.Normalize(mean=0.4907024448714564, std=0.3857828166466927),
        ])
        
    def __len__(self):
        if self.split_name == 'train':
            return len(self.train_split)
        if self.split_name == 'test':
            return len(self.test_split)
        if self.split_name == 'dev':
            return len(self.dev_split)
    
    # For train split
    def __getitem__(self, index):
        if self.split_name == 'train':
            aud_id = self.train_keys[index]
            aud_path = f"./dataset_extracted/output_train_extracted/{aud_id}.wav"
            audio_input, _ = librosa.load(aud_path)
            
            # Trimming silence from audio
            audio_input, _ = librosa.effects.trim(audio_input, top_db=20)
            
            spectrogram_db  = librosa.feature.melspectrogram(y=audio_input)
            # spectrogram_db = librosa.amplitude_to_db(spectrogram_train, ref=np.max)
            text_feature = self.train_split[aud_id]['Utterance']
            emotion_label = self.train_split[aud_id]['Emotion']
        
        # For test split    
        elif self.split_name == 'test':
            aud_id = self.test_keys[index]
            aud_path = f"./dataset_extracted/output_test_extracted/{aud_id}.wav"
            audio_input, _ = librosa.load(aud_path, sr=self.sr)
            
            spectrogram  = librosa.feature.melspectrogram(y=audio_input)
            spectrogram_db = librosa.amplitude_to_db(spectrogram, ref=np.max)
            text_feature = self.test_split[aud_id]['Utterance']
            emotion_label = self.test_split[aud_id]['Emotion']
        
        
        # For dev split    
        elif self.split_name == 'dev':
            aud_id = self.dev_keys[index]
            aud_path = f"./dataset_extracted/output_dev_extracted/{aud_id}.wav"
            audio_input, _ = librosa.load(aud_path, sr=self.sr)
            
            # Trimming silence from audio
            audio_input, _ = librosa.effects.trim(audio_input, top_db=20)
        
            spectrogram  = librosa.feature.melspectrogram(y=audio_input)
            spectrogram_db = librosa.amplitude_to_db(spectrogram, ref=np.max)
            text_feature = self.dev_split[aud_id]['Utterance']
            emotion_label = self.dev_split[aud_id]['Emotion']
        
        # print(audio_input.shape[0]/self.sr)
        # print(spectrogram_db.shape)
        # print(spectrogram_db.shape[1]/(audio_input.shape[0]/self.sr))
        
        
        # Preprocess training data features by trimming/padding text, audio, and spectrogram 
        # to fixed sequence lengths for consistency during training.
        text_feature = text_feature[:min(len(text_feature), self.text_seq_len)] + (" "*(self.text_seq_len - len(text_feature)) if len(text_feature) < self.text_seq_len else "")
        audio_input = audio_input[:min(len(audio_input), self.audio_seq_len * self.sr)]
        audio_input = np.pad(audio_input, (0, (self.audio_seq_len * self.sr) - len(audio_input)), 'constant')
        spectrogram_db = spectrogram_db[:,:min(spectrogram_db.shape[1], int(self.audio_seq_len * 32))]
        spectrogram_db = np.pad(spectrogram_db, ((0, 0), (0, (self.audio_seq_len * 32) - spectrogram_db.shape[1])), 'constant')
        
        # Convert all training features to tensors
        spectrogram_db = spectrogram_db.astype(np.float32)
        spectrogram_db = (spectrogram_db - spectrogram_db.min())/(spectrogram_db.max() - spectrogram_db.min())
        # print(spectrogram_db.shape)
        spectrogram_db = cv2.resize(spectrogram_db, (256, 256), interpolation=cv2.INTER_NEAREST)
        # cv2.imwrite('path_to_image', spectrogram_db)
        spectrogram_tensor = torch.from_numpy(spectrogram_db[np.newaxis, ...])*2. -1.
        # spectrogram_tensor = self.img_transforms(torch.from_numpy(spectrogram_db[np.newaxis, ...])*2. -1.)
        
        audio_tensor = torch.from_numpy(audio_input)
        text_tensor = torch.from_numpy(np.array([ord(c) for c in text_feature], dtype=np.long))  # Convert text to ASCII values
        
        
        # Convert emotion label to tensor
        # You might want to create an emotion_to_idx mapping in __init__
        emotion_to_idx = {
            'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3,
            'joy': 4, 'disgust': 5, 'anger': 6
        }
        emotion_tensor = torch.tensor(emotion_to_idx[emotion_label], dtype=torch.long)

        return {
            'spectrogram': spectrogram_tensor,
            # 'spectrogram_dev': spectrogram_dev_tensor,
            'text_feature': text_tensor,
            'audio_feature': audio_tensor,
            'emotion_label': emotion_tensor
        }

In [25]:
dataset = meld_dataset(dataset_path='./meld.yaml', split_name='train')
train_loader = DataLoader(dataset, batch_size=16, shuffle=False)
inp_vals = []
inp_devs = []
for idx, data_dict in enumerate(tqdm(train_loader)):
    inps = data_dict['spectrogram']
    for b_id in range(inps.shape[0]):
        inp_vals.append(torch.sum(inps[b_id], dim=None).item()/(inps.shape[2]*inps.shape[3]))
        inp_devs.append(torch.std(inps[b_id], dim=None).item()/(inps.shape[2]*inps.shape[3]))
    # if idx == 10:
    #     break
mean, std_dev = np.mean(np.array(inp_vals)), np.std(np.array(inp_vals))
print(mean, std_dev)
mean, std_dev = np.mean(np.array(inp_devs)), np.std(np.array(inp_devs))
print(mean, std_dev)

  0%|          | 0/625 [00:00<?, ?it/s]



-0.9970135084738673 0.0019998255346162013
4.912767481488786e-07


In [26]:
mean, std_dev = np.mean(np.array(inp_devs)), np.std(np.array(inp_devs))
print(mean, std_dev)

4.912767481488786e-07 1.4220140559987843e-07


In [16]:
# Training device:

# device = 'cuda:0'

def get_available_gpus():
    if torch.cuda.is_available():
        return [(i, torch.cuda.get_device_name(i)) for i in range(torch.cuda.device_count())]
    else:
        return []
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Define spectrogram dimensions and patch size:
spec_height = 256
spec_width = 224
patch_size = 8

# Compute number of patches and sequence length (with class token)
num_patches = (spec_height // patch_size) * (spec_width // patch_size)
seq_len = num_patches + 1  # +1 for class token

# Linear Transformer (adjust seq_len):
efficient_transformer = Linformer(dim=128, seq_len=seq_len, depth=12, heads=8, k=64)

# Vision Transformer Model for audio spectrogram (using 1 channel):
model = ViT(
    image_size=spec_height,
    patch_size=patch_size,
    # num_classes=128,
    num_classes=7,
    dim = 512,
    # dim = 256,
    depth = 6,
    # depth = 4,
    #heads = 8
    heads = 4,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1,
    channels=1,
).to(device)

# Hyperparameters:
epochs = 20
lr = 1e-4

# Loss function, Optimizer and Learning Rate Scheduler:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.7, patience=3, verbose=True)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.5)

In [17]:
# Initialize metrics lists
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

dataset = meld_dataset(dataset_path='./meld.yaml', split_name='train')
val_dataset = meld_dataset(dataset_path='./meld.yaml', split_name='dev')

train_loader = DataLoader(dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

for epoch_num in range(epochs):
    
    # Training phase
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0
    
    iter_loss = 0
    print(f"Epoch: {epoch_num+1}/{epochs}")
    for idx, data_dict in enumerate(tqdm(train_loader)):
        # Zero the gradients
        optimizer.zero_grad()
        # Get the inputs
        input = data_dict['spectrogram'].to(device)
        labels = data_dict['emotion_label'].to(device)
        # Forward pass
        output = model(input)
        
        loss = criterion(output, labels)
        
        # Backward pass and optimize
        loss.backward()
        # scheduler.step(loss)
        
        # Calculate accuracy
        acc = (output.argmax(dim=1) == labels).float().mean()
        epoch_accuracy += acc / len(train_loader)
        epoch_loss += loss / len(train_loader)
        iter_loss += loss.item()
        if idx%10==0:
            print(f'ITER {idx+1} loss:', iter_loss/(idx+1))
        
    # Validation phase
    model.eval()
    with torch.no_grad():
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        
        for data_dict in val_loader:
            inputs = data_dict['spectrogram'].to(device)
            labels = data_dict['emotion_label'].to(device)
            
            val_outputs = model(inputs)
            val_loss = criterion(val_outputs, labels)
            
            acc = (val_outputs.argmax(dim=1) == labels).float().mean()
            epoch_val_accuracy += acc / len(val_loader)
            epoch_val_loss += val_loss / len(val_loader)
        
    # Store metrics
    train_losses.append(epoch_loss.item())
    val_losses.append(epoch_val_loss.item())
    train_accuracies.append(epoch_accuracy.item())
    val_accuracies.append(epoch_val_accuracy.item())
    
     # Print metrics
    print(
        f"Epoch : {epoch_num + 1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )
    # Print current learning rate
    current_lr = optimizer.param_groups[0]["lr"]
    print(f"Learning Rate = {current_lr:.6f}")
    
    # Update scheduler
    scheduler.step(epoch_val_loss.item())
    
# Plot Training and Validation Losses
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs + 1), train_losses, label='Training Loss', marker='o')
plt.plot(range(1, epochs + 1), val_losses, label='Validation Loss', marker='s')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss vs Validation Loss')
plt.legend()
plt.grid(True)
plt.show()

# Plot Training and Validation Accuracies
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs + 1), train_accuracies, label='Training Accuracy', marker='o')
plt.plot(range(1, epochs + 1), val_accuracies, label='Validation Accuracy', marker='s')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training Accuracy vs Validation Accuracy')
plt.legend()
plt.grid(True)
plt.show()

Epoch: 1/20


  0%|          | 0/625 [00:00<?, ?it/s]

ITER 1 loss: 2.2351460456848145
ITER 11 loss: 2.4405365857211025




ITER 21 loss: 2.44230656396775
ITER 31 loss: 2.4245070872768277
ITER 41 loss: 2.3960542039173407
ITER 51 loss: 2.4161359469095864
ITER 61 loss: 2.41752197312527
ITER 71 loss: 2.4202051330620136
ITER 81 loss: 2.422286804811454
ITER 91 loss: 2.418591787526896
ITER 101 loss: 2.4161921916621747
ITER 111 loss: 2.41721375568493
ITER 121 loss: 2.4192012321850487
ITER 131 loss: 2.421256507626017
ITER 141 loss: 2.4200918674468994


KeyboardInterrupt: 

In [None]:
# Load dataset and get a spectrogram
dataset = meld_dataset(dataset_path='./meld.yaml', split_name='train')
sample_0 = dataset[0]  # Fetch one sample

# Extract the spectrogram correctly
spectrogram_db_0 = sample_0['spectrogram ']  # Fix key to match dictionary

# Plot the spectrogram
plt.figure(figsize=(10, 4))
librosa.display.specshow(spectrogram_db_0, sr=16000, x_axis='time', y_axis='mel', cmap='magma')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram')
plt.tight_layout()
plt.show()

In [None]:
# Load dataset and get a spectrogram
dataset = meld_dataset(dataset_path='./meld.yaml', split_name='train')
sample_1 = dataset[1]  # Fetch one sample

# Extract the spectrogram correctly
spectrogram_db_1 = sample_1['spectrogram ']  # Fix key to match dictionary

# Plot the spectrogram
plt.figure(figsize=(10, 4))
librosa.display.specshow(spectrogram_db_1, sr=16000, x_axis='time', y_axis='mel', cmap='magma')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram')
plt.tight_layout()
plt.show()

In [None]:
pd.Series(sample_0['audio_feature']).plot(figsize=(10, 5),
                  lw=1,
                  title='Raw Audio Example',
                  color='blue')
plt.show()

In [None]:
pd.Series(sample_1['audio_feature']).plot(figsize=(10, 5),
                  lw=1,
                  title='Raw Audio Example',
                  color='blue')
plt.show()