# Audio Sentiment analysis project

In [None]:
import torch
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torchaudio.backend as backend

# Select the "sox_io" backend for torchaudio's audio I/O (this sets the
# backend; it does not list or check the available ones).
# NOTE(review): newer torchaudio releases deprecate set_audio_backend —
# confirm the call is still supported by the installed version.
torchaudio.set_audio_backend("sox_io")


In [None]:
# Root directories of the three corpora, relative to the working directory.
# NOTE(review): "RAVDEES" presumably mirrors the on-disk folder name (the
# dataset itself is called RAVDESS) — confirm before renaming anything.
datasetRAVDEES = "dataset/data/RAVDEES/"
datasetCREMAD = "dataset/data/CREMA-D/AudioWAV/"
datasetTESS = "dataset/data/TESS/TESS Toronto emotional speech set data/"

We use each dataset's file-naming scheme to sort the audio files by emotion, starting with RAVDESS.

RAVDESS is one of the most common datasets used for this exercise by others. It's well liked because of the quality of its speakers and recordings, and it has 24 actors of different genders. And there's more! You can get it in song format as well. There's something for everyone and their research project. So for convenience, here are the filename identifiers as per the official RAVDESS website:

1. Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
2. Vocal channel (01 = speech, 02 = song).
3. Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
4. Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
5. Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
6. Repetition (01 = 1st repetition, 02 = 2nd repetition).
7. Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

So, here's an example of an audio filename. 02-01-06-01-02-01-12.mp4

This means the meta data for the audio file is:

1. Video-only (02)
2. Speech (01)
3. Fearful (06)
4. Normal intensity (01)
5. Statement "dogs" (02)
6. 1st Repetition (01)
7. 12th Actor (12) - Female (as the actor ID number is even)


In [None]:
### We will use the naming schema for ravdees dataset and see if we can get the representing emotion for that specific audio file
def create_ravdess_dataframe(root_dir):
    """Build a DataFrame of RAVDESS audio files and their emotion labels.

    ``root_dir`` is scanned one level deep: every sub-directory is treated as
    an actor folder, and every ``.wav`` file inside it whose name contains
    exactly six dashes (seven dash-separated fields) is considered.  Only the
    third field, the emotion code, is used for labelling.

    Files whose emotion field is not an integer in 1-8 are skipped.  The
    previous if/elif chain left the label unassigned for such files, which
    either raised or silently reused the label of the preceding file.

    Args:
        root_dir: Path of the directory that contains the actor folders.

    Returns:
        pandas.DataFrame with columns ``file`` (path joined from ``root_dir``,
        actor folder and file name) and ``emotion`` (label string), one row
        per accepted file, in directory-listing order.
    """
    # Emotion codes as documented in the RAVDESS file-naming convention.
    emotion_labels = {
        1: 'neutral',
        2: 'calm',
        3: 'happy',
        4: 'sad',
        5: 'angry',
        6: 'fearful',
        7: 'disgust',
        8: 'surprised',
    }

    data = []
    for actor_dir in os.listdir(root_dir):
        actor_path = os.path.join(root_dir, actor_dir)
        if not os.path.isdir(actor_path):
            continue
        for filename in os.listdir(actor_path):
            # Check if filename follows the expected format
            if not (filename.endswith('.wav') and filename.count('-') == 6):
                continue
            emotion_field = filename.split('.')[0].split('-')[2]
            try:
                emotion_label = emotion_labels[int(emotion_field)]
            except (ValueError, KeyError):
                # Non-numeric or unknown emotion code: no trustworthy label.
                continue
            data.append((os.path.join(actor_path, filename), emotion_label))

    return pd.DataFrame(data, columns=['file', 'emotion'])

### test the method
df = create_ravdess_dataframe(datasetRAVDEES)
print(df.head())
# Pass the column as ``x=``: recent seaborn versions take ``data`` as the
# first positional parameter, so a bare positional Series is not read as the
# category variable.
sns.countplot(x=df['emotion'])
plt.show()



In [None]:
### Similarly, we write a method for the CREMA-D dataset and append its rows to our df
def create_cremad_dataframe(root_dir, df):
    """Append CREMA-D audio files and their emotion labels to ``df``.

    Every ``.wav`` file directly inside ``root_dir`` is labelled from the
    third underscore-separated field of its name.  Known three-letter codes
    are translated to the label strings used for the other datasets; an
    unrecognised code is kept unchanged.  File names with fewer than three
    fields are skipped (indexing the third field used to raise IndexError).

    All new rows are collected first and concatenated once, instead of
    re-copying the whole frame for every file.

    Args:
        root_dir: Directory containing the CREMA-D ``.wav`` files.
        df: Existing DataFrame with ``file`` and ``emotion`` columns.  It is
            not modified; a new frame is returned.

    Returns:
        pandas.DataFrame holding the rows of ``df`` followed by the new rows,
        with later rows that repeat an earlier ``file`` value removed.
    """
    emotion_labels = {
        'SAD': 'sad',
        'ANG': 'angry',
        'DIS': 'disgust',
        'FEA': 'fearful',
        'HAP': 'happy',
        'NEU': 'neutral',
        'SUR': 'surprised',
    }

    rows = []
    for file in os.listdir(root_dir):
        if not file.endswith('.wav'):
            continue
        name_parts = file.split('_')
        if len(name_parts) < 3:
            continue
        code = name_parts[2]
        rows.append({
            'file': os.path.join(root_dir, file),
            'emotion': emotion_labels.get(code, code),
        })

    if rows:
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

    # Remove duplicate rows based on file path
    return df.drop_duplicates(subset=['file'])


# Row count and columns before the CREMA-D rows are merged in.
print(len(df))
print(df.columns)
df = create_cremad_dataframe(datasetCREMAD, df)
# Row count after the merge.
print(len(df))

#sorted_emotions = df['emotion'].sort_values(ascending=False).index

# The string literal below is disabled plotting code; as a bare expression it
# is evaluated and discarded, so nothing is plotted or printed here.
'''sns.countplot(df['emotion'])
plt.show()
print(df['emotion'].value_counts())'''

### now similarly we do the same for the TESS dataset


In [None]:
#### Similarly we will create the dataframe for the TESS dataset
def create_tess_dataframe(root_dir, df):
    """Append TESS audio files and their emotion labels to ``df``.

    ``root_dir`` is walked recursively.  Each ``.wav`` file is labelled from
    the name of the directory that contains it: a leading ``YAF``/``OAF``
    speaker prefix (plus the separator character after it) is removed and the
    remainder is mapped to the label strings used for the other datasets.

    The lookup is case-insensitive and also covers the
    ``pleasant_surprise``/``pleasant_surprised`` spellings, so a folder such
    as ``OAF_Fear`` yields ``fearful`` rather than a raw ``Fear`` label.  A
    folder name that matches no known emotion is kept unchanged.

    All new rows are collected first and concatenated once, instead of
    re-copying the whole frame for every file.

    Args:
        root_dir: Root directory of the TESS data.
        df: Existing DataFrame with ``file`` and ``emotion`` columns.  It is
            not modified; a new frame is returned.

    Returns:
        pandas.DataFrame holding the rows of ``df`` followed by the new rows,
        with later rows that repeat an earlier ``file`` value removed.
    """
    emotion_labels = {
        'angry': 'angry',
        'disgust': 'disgust',
        'fear': 'fearful',
        'happy': 'happy',
        'neutral': 'neutral',
        'sad': 'sad',
        'ps': 'surprised',
        'pleasant_surprise': 'surprised',
        'pleasant_surprised': 'surprised',
    }

    rows = []
    for root, _dirs, files in os.walk(root_dir):
        # The label depends only on the directory, so resolve it once per
        # directory rather than once per file.
        folder = os.path.basename(root)
        if folder.startswith(('YAF', 'OAF')):
            # Drop the three-letter speaker id and the separator after it.
            folder = folder[4:]
        emotion = emotion_labels.get(folder.lower(), folder)

        for file in files:
            if file.endswith('.wav'):
                rows.append({'file': os.path.join(root, file), 'emotion': emotion})

    if rows:
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

    # Remove duplicate rows based on file path
    return df.drop_duplicates(subset=['file'])

df_final = create_tess_dataframe(datasetTESS, df)
# Report the size of the merged frame; ``df`` still holds the pre-TESS rows,
# so printing its length here would repeat the previous cell's number.
print(len(df_final))
print(df_final['emotion'].value_counts())
# Pass the column as ``x=``: recent seaborn versions take ``data`` as the
# first positional parameter.
sns.countplot(x=df_final['emotion'])

In [None]:
def combine_emotion_labels(emotion):
    """Return the canonical lower-case label for an emotion string.

    The input is lower-cased, then spelling variants are folded together:
    ``fear`` becomes ``fearful``, the ``pleasant_surprise(d)`` forms become
    ``surprised``, and ``calm`` is merged into ``neutral``.  Any other value
    is returned lower-cased and otherwise unchanged.
    """
    aliases = {
        'fear': 'fearful',
        'fearful': 'fearful',
        'sad': 'sad',
        'pleasant_surprised': 'surprised',
        'pleasant_surprise': 'surprised',
        'surprised': 'surprised',
        'calm': 'neutral',
    }
    normalized = emotion.lower()
    return aliases.get(normalized, normalized)

# Apply the function to the 'emotion' column
df_final['emotion'] = df_final['emotion'].apply(combine_emotion_labels)

# Report the merged frame's size (``df`` is the pre-TESS frame).
print(len(df_final))
print(df_final['emotion'].value_counts())
plt.figure(figsize=(10, 5))
# Pass the column as ``x=``: recent seaborn versions take ``data`` as the
# first positional parameter.
sns.countplot(x=df_final['emotion'])
plt.tight_layout()
plt.show()

As we can see, the classes are imbalanced (notably `surprised`), so we will assign class weights to help handle the imbalance. First, we preprocess the audio files: we analyse the waveforms and spectrograms and plot them in a grid, one for each emotion, and we will try to figure out ways to add noise to the audio.

In [None]:
# Distinct emotion labels (the returned array is neither stored nor printed here).
df_final['emotion'].unique()
### we will also encode the emotions to numerical values and store their mapping in a dictionary
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the emotion strings and keep the integer codes in a new 'label' column.
encoder = LabelEncoder()
df_final['label'] = encoder.fit_transform(df_final['emotion'])
# Lookup table: class name -> integer code.
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(label_mapping)

# First seven rows of the frame; the plotting cells use them as sample files.
data_to_plot = df_final.head(7)
print(data_to_plot)

def plot_waveform(file_path, emotion):
    """Load a WAV file and show its waveform in a 15x5 figure.

    Args:
        file_path: Path of the audio file, loaded with ``format='wav'``.
        emotion: Text used as the plot title.
    """
    samples, _rate = torchaudio.load(file_path, format='wav')
    _figure, axis = plt.subplots(figsize=(15, 5))
    # Transposed so each column of the plotted array is one loaded channel row.
    axis.plot(samples.t().numpy())
    axis.set_xlabel('Sample')
    axis.set_ylabel('Amplitude')
    axis.set_title(emotion)
    plt.show()
    
# Hard-coded sample file, used only for the load check below.
file_path = 'dataset/data/RAVDEES/Actor_21/03-01-06-01-02-01-21.wav'

# Try loading the audio file and catch any exceptions
# (a failure is only printed; execution continues to the plot call either way).
try:
    waveform, sample_rate = torchaudio.load(file_path)
except Exception as e:
    print("Error:", e)
    
# Plot the row labelled 0 of the sample frame ([0] is a label lookup on the Series index).
plot_waveform(data_to_plot['file'][0], data_to_plot['emotion'][0])



In [None]:
#### Waveforms of the first 7 sample files, laid out row by row in a 2x4 grid
#### (the eighth panel is left empty).

fig, axs = plt.subplots(2, 4, figsize=(20, 10))
fig.suptitle('Waveform', fontsize=16)

for i in range(7):
    # divmod(i, 4) gives the (row, column) position of panel i.
    panel = axs[divmod(i, 4)]
    waveform, sample_rate = torchaudio.load(data_to_plot['file'][i])
    panel.plot(waveform.t().numpy())
    panel.set_title(data_to_plot['emotion'][i])
    panel.set_xlabel('Sample')
    panel.set_ylabel('Amplitude')

plt.tight_layout()
plt.show()

In [None]:
import librosa
def preprocess_audio_file(filepath):
    """Load an audio file and return its mel spectrogram.

    The transform is built with the sample rate reported for the file.  The
    previous version used the transform's built-in default rate for every
    file, so the mel filter bank did not match recordings made at a different
    rate.

    Args:
        filepath: Path of the audio file to load with torchaudio.

    Returns:
        The tensor produced by ``torchaudio.transforms.MelSpectrogram`` for
        the loaded waveform (all other transform settings are the defaults).
    """
    waveform, sample_rate = torchaudio.load(filepath)
    mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)
    return mel_transform(waveform)
    
def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
    """Draw the first entry of ``specgram`` as a decibel-scaled image.

    Args:
        specgram: Indexable spectrogram container; only ``specgram[0]`` is drawn.
        title: Optional axes title; omitted when None.
        ylabel: Label for the y axis.
        ax: Axes to draw on.  When None, a new single-axes figure is created.
    """
    target = plt.subplots(1, 1)[1] if ax is None else ax
    if title is not None:
        target.set_title(title)
    target.set_ylabel(ylabel)
    decibels = librosa.power_to_db(specgram[0])
    target.imshow(decibels, origin="lower", aspect="auto", interpolation="nearest")
    
    
i = 1
specgram = preprocess_audio_file(data_to_plot['file'][i])
# plot_spectrogram() opens its own figure when no axes are given, so a
# separate plt.figure(...) call would only leave an empty 10x5 figure behind.
# Create the axes here and hand them over instead.
spec_fig, spec_ax = plt.subplots(figsize=(10, 5))
plot_spectrogram(specgram, title=data_to_plot['emotion'][i], ax=spec_ax)
plt.show()
### get the shape of the spectrogram
print(specgram.shape)

### Preprocessing the data

In [None]:
from multiprocessing import Pool
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import tqdm, time

def preprocess_data_dask(filepaths, num_cpus):
    """Compute the mel spectrogram of every file on a local Dask cluster.

    The previous version started a cluster but ran the loop in the calling
    process, so the cluster never received any work and the worker count had
    no effect.  Each file is now submitted as its own task.

    Args:
        filepaths: Iterable of audio file paths.
        num_cpus: Number of worker processes to start.  Each worker is limited
            to one thread so the worker count is the degree of parallelism.

    Returns:
        List of spectrograms in the same order as ``filepaths``.
    """
    with LocalCluster(n_workers=num_cpus, threads_per_worker=1) as cluster, Client(cluster) as client:
        futures = client.map(preprocess_audio_file, filepaths)
        # Collect in submission order so results stay aligned with the
        # inputs; the progress bar advances as each result arrives.
        specs = [future.result() for future in tqdm.tqdm(futures)]
    return specs

# Load data using Dask in parallel
filepaths = df_final['file'].values.flatten().tolist()

# Define the range of CPUs to test
#cpu_range = range(1, 9)
cpu_range = [1, 2, 4]

# Measure execution time for different numbers of CPUs.
# perf_counter() is the clock intended for measuring intervals; time.time()
# follows the system clock and can jump if that clock is adjusted.
# Each measurement includes starting and stopping the cluster.
execution_times = []
for num_cpus in cpu_range:
    print("CPU Count:", num_cpus)
    start_time = time.perf_counter()
    specs = preprocess_data_dask(filepaths, num_cpus)
    end_time = time.perf_counter()
    execution_times.append(end_time - start_time)

# Plot the speedup relative to the first entry of cpu_range
baseline = execution_times[0]
plt.plot(cpu_range, [baseline / elapsed for elapsed in execution_times], marker='o')
plt.xlabel('Number of CPUs')
plt.ylabel('Speedup')
plt.title('Speedup vs. Number of CPUs')
plt.grid(True)
plt.show()

# Preprocess all the files for easy access

We will preprocess the files and save them as a `.pkl` file.

In [None]:
import pickle
import concurrent.futures
from tqdm import tqdm

def process_audio_files_in_parallel(df, output_file_path):
    """Preprocess every row of ``df`` in worker processes and pickle the results.

    The frame is cut into single-row DataFrames by position, which is the
    input shape ``process_audio_and_save`` reads.  This replaces
    ``np.array_split`` on a DataFrame, which depends on pandas behaviour that
    is deprecated and raises for a frame with no rows.

    Args:
        df: DataFrame with the columns ``process_audio_and_save`` reads.
        output_file_path: Destination of the pickle file (overwritten).

    Returns:
        List with one ``process_audio_and_save`` result per row, in row
        order; an empty list for an empty frame.
    """
    single_rows = [df.iloc[[position]] for position in range(len(df))]

    results = []
    if single_rows:
        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = list(tqdm(executor.map(process_audio_and_save, single_rows), total=len(single_rows)))

    with open(output_file_path, 'wb') as f:
        pickle.dump(results, f)
    return results

def process_audio_and_save(df, max_length=128):
    """Turn the first row of ``df`` into a fixed-width mel spectrogram.

    Despite the name, nothing is written to disk here; the caller is
    responsible for storing the result.

    Args:
        df: DataFrame whose first row provides ``file`` (audio path) and
            ``label`` (encoded emotion).
        max_length: Number of time frames in the returned spectrogram.
            Shorter spectrograms are zero-padded on the right, longer ones
            are cut.  Defaults to 128, the value that was previously
            hard-coded.

    Returns:
        Tuple ``(spectrogram, label)`` where ``spectrogram`` is a 2-D array
        with ``max_length`` columns.
        NOTE(review): the row count comes from librosa's default mel-band
        setting, presumably 128 — confirm against the model input size.
    """
    record = df.iloc[0]
    audio_file_path = record['file']
    emotion = record['label']

    y, sr = librosa.load(audio_file_path)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)

    n_frames = spectrogram.shape[1]
    if n_frames < max_length:
        pad_width = max_length - n_frames
        spectrogram = np.pad(spectrogram, ((0, 0), (0, pad_width)), mode='constant')
    elif n_frames > max_length:
        spectrogram = spectrogram[:, :max_length]

    return spectrogram, emotion

# Cache file for the preprocessed (spectrogram, label) pairs.
fileName = "output.pkl"

# Reuse the cached pickle when it exists; otherwise preprocess every file and write the cache.
# NOTE(review): pickle.load can execute code embedded in the file — only load caches you created yourself.
# NOTE(review): an existing cache is loaded as-is, even if df_final has changed since it was written.
if os.path.exists(fileName):
    with open(fileName, 'rb') as f:
        data = pickle.load(f)
        print("Loaded data from file.")
else:
    data = process_audio_files_in_parallel(df_final, fileName)
    
## we will now plot the spectrogram for the first 7 audio files in the dataset
fig, axs = plt.subplots(2, 4, figsize=(20, 10))
fig.suptitle('Spectrogram', fontsize=16)

for i in range(7):
    specgram, emotion = data[i]
    # Row-major placement in the 2x4 grid; the eighth panel stays empty.
    axs[i // 4, i % 4].imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest")
    # Turn the integer label back into its emotion name for the title.
    emotion = encoder.inverse_transform([emotion])[0]
    axs[i // 4, i % 4].set_title(emotion)
    axs[i // 4, i % 4].set_ylabel('freq_bin')
    
plt.tight_layout()

### Modelling using Tensorflow approach

# Modelling

Now we will design our model using `pytorch` and use ResNet as a transfer layer to extract the important features. 

In [None]:
from torchvision import  models
from torch import nn
from torch.nn import functional as F

class CustomResNet50(nn.Module):
    """ResNet-50 based classifier for single-channel inputs.

    A pre-trained ResNet-50 is loaded and its parameters are frozen.  The
    first convolution (one input channel instead of three), the first batch
    norm and the final fully connected head are then replaced by freshly
    initialised layers, which stay trainable.

    The backbone is frozen *before* the new layers are attached.  Previously
    the freeze loop ran after ``conv1``/``bn1`` had been replaced, so the
    randomly initialised stem was frozen too and could never be trained.

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes=7):
        super(CustomResNet50, self).__init__()
        # Load a pre-trained ResNet-50 model.
        # NOTE(review): newer torchvision versions prefer a ``weights=``
        # argument over ``pretrained=True`` — confirm against the installed
        # version.
        self.resnet = models.resnet50(pretrained=True)

        # Freeze only the pre-trained parameters; everything assigned below
        # is created afterwards and keeps requires_grad=True.
        for param in self.resnet.parameters():
            param.requires_grad = False

        # New stem for 1-channel input (same geometry as the layer it replaces).
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Fresh batch norm to go with the fresh convolution.
        self.resnet.bn1 = nn.BatchNorm2d(64)

        # Replace the last fully connected layer
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Sequential(
            nn.Linear(num_ftrs, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Run ``x`` through the modified ResNet and return its output."""
        return self.resnet(x)
    
    
class SimpleCNN(torch.nn.Module):
    """Four conv/batch-norm stages followed by a two-layer classifier head.

    Every stage is a 3x3 convolution, batch norm, ReLU and a 2x2 max pool.
    The first linear layer expects 256*8*8 features, i.e. a 256-channel 8x8
    map after the four poolings (a 1x128x128 input produces exactly that).

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)

        # Feature extractor: channel widths 1 -> 32 -> 64 -> 128 -> 256.
        self.conv1 = nn.Conv2d(1, 32, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, **conv_kwargs)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, **conv_kwargs)
        self.bn4 = nn.BatchNorm2d(256)

        # Classifier head with dropout between the two linear layers.
        self.fc1 = nn.Linear(256 * 8 * 8, 512)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return the class scores for a batch ``x``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, norm in stages:
            x = F.max_pool2d(F.relu(norm(conv(x))), 2)

        flattened = x.view(x.size(0), -1)
        hidden = self.dropout(F.relu(self.fc1(flattened)))
        return self.fc2(hidden)
    
class Net(nn.Module):
    """CNN with four conv/batch-norm/pool stages and a two-layer head.

    Same layer sizes as ``SimpleCNN``, but the 2x2 max pools are registered
    as modules (``pool1`` .. ``pool4``) instead of being called functionally.
    The first linear layer expects 256*8*8 features (a 1x128x128 input
    produces exactly that after four poolings).

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)

        # Stage 1: 1 -> 32 channels
        self.conv1 = nn.Conv2d(1, 32, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2, 2)
        # Stage 2: 32 -> 64 channels
        self.conv2 = nn.Conv2d(32, 64, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2, 2)
        # Stage 3: 64 -> 128 channels
        self.conv3 = nn.Conv2d(64, 128, **conv_kwargs)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2, 2)
        # Stage 4: 128 -> 256 channels
        self.conv4 = nn.Conv2d(128, 256, **conv_kwargs)
        self.bn4 = nn.BatchNorm2d(256)
        self.pool4 = nn.MaxPool2d(2, 2)

        # Classifier head with dropout between the two linear layers.
        self.fc1 = nn.Linear(256 * 8 * 8, 512)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return the class scores for a batch ``x``."""
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
        )
        for conv, norm, pool in stages:
            x = pool(F.relu(norm(conv(x))))

        flattened = x.view(x.size(0), -1)
        hidden = self.dropout(F.relu(self.fc1(flattened)))
        return self.fc2(hidden)
    
### we will compile the model and check the summary

In [None]:
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler

# Module-level training history; train_model() appends one entry per epoch to
# each list and never clears them.
val_score = []    # validation accuracy per epoch
train_score = []  # training loss per epoch (despite the name)
train_accu = []   # training accuracy per epoch
epo = []          # zero-based epoch index

class AudioDataset(Dataset):
    """Dataset over a sequence of ``(spectrogram, label)`` pairs.

    Each item is returned as a float32 tensor with a new leading dimension of
    size 1 added in front of the spectrogram, together with the label as a
    long tensor.
    """

    def __init__(self, data):
        # Indexable collection of (spectrogram, label) pairs.
        self.data = data

    def __len__(self):
        """Number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, target)`` tensors for the pair at ``idx``."""
        raw_spec, raw_label = self.data[idx]
        features = torch.tensor(raw_spec, dtype=torch.float32)
        target = torch.tensor(raw_label, dtype=torch.long)
        # unsqueeze(0) adds the leading size-1 dimension.
        return features.unsqueeze(0), target
    
    
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs=10, warmup_epoch = 2):
    """Train ``model`` and report loss and accuracy after every epoch.

    Mixed precision (autocast plus gradient scaling) is switched on only when
    ``device`` is a CUDA device.  Previously both were requested
    unconditionally, although they are CUDA-specific.

    After each epoch one entry is appended to the module-level history lists
    ``epo``, ``val_score``, ``train_score`` (training loss) and
    ``train_accu``.  The lists are not cleared, so repeated calls keep
    extending them.

    Args:
        model: Network to train; expected to be on ``device`` already.
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        device: Device (or device string) the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        warmup_epoch: Currently unused; kept so existing callers keep working.

    Returns:
        None.
    """
    use_amp = torch.device(device).type == 'cuda'
    scaler = GradScaler(enabled=use_amp)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        for specgrams, labels in train_loader:
            specgrams, labels = specgrams.to(device), labels.to(device)
            optimizer.zero_grad()

            # Use autocast to enable mixed precision (no-op when disabled)
            with autocast(enabled=use_amp):
                outputs = model(specgrams)
                loss = criterion(outputs, labels)

            # With the scaler disabled these three calls reduce to a plain
            # backward pass and optimizer step.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Weight the batch loss by batch size so the epoch figure is a
            # per-sample average.
            running_loss += loss.item() * specgrams.size(0)
            _, predicted_train = torch.max(outputs, 1)
            total_train += labels.size(0)
            correct_train += (predicted_train == labels).sum().item()

        # Guard the divisions so an empty loader reports 0 instead of raising.
        n_train_samples = len(train_loader.dataset)
        epoch_loss = running_loss / n_train_samples if n_train_samples else 0.0
        train_accuracy = correct_train / total_train if total_train else 0.0

        model.eval()
        correct_val = 0
        total_val = 0
        with torch.no_grad():
            for specgrams, labels in val_loader:
                specgrams, labels = specgrams.to(device), labels.to(device)
                outputs = model(specgrams)
                _, predicted_val = torch.max(outputs, 1)
                total_val += labels.size(0)
                correct_val += (predicted_val == labels).sum().item()
        val_accuracy = correct_val / total_val if total_val else 0.0

        print(f"Epoch {epoch + 1}/{num_epochs}, "
              f"Train Loss: {epoch_loss:.4f}, "
              f"Train Accuracy: {train_accuracy:.4f}, "
              f"Validation Accuracy: {val_accuracy:.4f}")

        epo.append(epoch)
        val_score.append(val_accuracy)
        train_score.append(epoch_loss)
        train_accu.append(train_accuracy)

        scheduler.step()
        
def train_resnet50_mixed_precision(model, data_file, batch_size=32, num_epochs=10, learning_rate=0.001):
    """Train ``model`` on the pickled spectrogram data in ``data_file``.

    The data is split 80/20 into training and validation sets with a fixed
    random state, wrapped in ``AudioDataset`` loaders, and passed to
    ``train_model`` with cross-entropy loss, Adagrad and a StepLR schedule
    (step size 5, gamma 0.1).  Despite the name, any model can be passed in.

    Args:
        model: Network to train.
        data_file: Path of the pickle file with the preprocessed samples.
        batch_size: Batch size for both loaders.
        num_epochs: Number of training epochs.
        learning_rate: Initial learning rate for Adagrad.

    Returns:
        The model after it has been moved to the training device and trained.

    Raises:
        FileNotFoundError: If ``data_file`` does not exist.
    """
    if not os.path.exists(data_file):
        raise FileNotFoundError("Preprocessed data file not found.")
    with open(data_file, 'rb') as handle:
        samples = pickle.load(handle)

    train_split, val_split = train_test_split(samples, test_size=0.2, random_state=42)
    train_loader = DataLoader(AudioDataset(train_split), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(AudioDataset(val_split), batch_size=batch_size, shuffle=False)

    # Prefer the GPU when one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Loss, optimizer and learning-rate schedule.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs, warmup_epoch=5)
    return model
    


In [None]:
# Define hyperparameters
batch_size = 8
num_epochs = 20
learning_rate = 0.001

model = Net()
# Call the training method (its return value is not captured here; `model`
# is the object that was passed in).
train_resnet50_mixed_precision(model, fileName, batch_size, num_epochs, learning_rate)

# Plot the accuracy history recorded by train_model().
# NOTE(review): the history lists are module-level and are never reset, so
# re-running this cell appends to the entries of earlier runs.
plt.plot(epo, val_score, label='Validation Accuracy')
plt.plot(epo, train_accu, label='Train Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Epoch')
plt.legend()
plt.grid(True)
plt.show()

### TensorRT

Here we first convert the model to ONNX and then use TensorRT to boost the inference speed.

In [None]:
### we will convert the model to ONNX format and use tensorRT to optimize the model

import torch.onnx
import onnx
import onnxruntime as ort

def convert_to_onnx(model, output_file, input_shape=(1, 1, 128, 128)):
    """Export ``model`` to an ONNX file and return the loaded ONNX model.

    The example input used for the export is created on the same device as
    the model's parameters.  Previously it was always created on ``cuda``,
    which fails without a GPU or when the model lives on the CPU.

    Args:
        model: Module to export; it is switched to eval mode.
        output_file: Destination path of the ``.onnx`` file.
        input_shape: Shape of the example input.  Defaults to the
            ``(1, 1, 128, 128)`` shape that was previously hard-coded.

    Returns:
        The model object returned by ``onnx.load(output_file)``.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    # A module without parameters has no device of its own; fall back to CPU.
    device = first_param.device if first_param is not None else torch.device('cpu')
    dummy_input = torch.randn(*input_shape, device=device)
    torch.onnx.export(model, dummy_input, output_file, verbose=True)
    return onnx.load(output_file)

# Export the trained model. Note that `model` was created as a Net instance,
# even though the output file is named simple_cnn.onnx.
convert_to_onnx(model, 'simple_cnn.onnx')

In [None]:
### we load and optimize the model using tensorRT
# NOTE(review): nothing in this cell calls TensorRT; the session is created
# with ONNX Runtime and no execution providers are specified.
onnx_model = onnx.load('simple_cnn.onnx')
ort_session = ort.InferenceSession('simple_cnn.onnx')

def optimize_onnx_model(onnx_model, optimized_model_path):
    """Optimize an ONNX graph with ONNX Runtime and save the result.

    The ``onnx`` package has no ``onnx.optimizer.Optimizer`` class, so the
    previous implementation could not run.  ONNX Runtime, which this file
    already imports, applies its graph optimizations when a session is
    created and writes the optimized graph to
    ``SessionOptions.optimized_model_filepath``.

    Args:
        onnx_model: Loaded ONNX model (``onnx.ModelProto``).
        optimized_model_path: Destination path for the optimized model.

    Returns:
        The optimized model, loaded back from ``optimized_model_path``.
    """
    session_options = ort.SessionOptions()
    # "Extended" keeps the saved graph usable across machines; the highest
    # level can add optimizations specific to the hardware it ran on.
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    session_options.optimized_model_filepath = optimized_model_path

    # Creating the session runs the optimizer and writes the file as a side
    # effect; the session object itself is not needed afterwards.
    ort.InferenceSession(
        onnx_model.SerializeToString(),
        session_options,
        providers=['CPUExecutionProvider'],
    )
    return onnx.load(optimized_model_path)

# Optimize the exported graph and write it to a second file.
optimized_model = optimize_onnx_model(onnx_model, 'simple_cnn_optimized.onnx')

### we will compare the inference speed of the optimized model with the original model
import time

def measure_inference_speed(ort_session, num_iterations=100, input_feed=None):
    """Return the average time in seconds of one ``ort_session.run`` call.

    ``run`` needs an input feed; the previous version called ``run(None)``
    without one, which cannot execute.  When ``input_feed`` is not given, a
    random float32 array is generated for every input the session declares,
    with symbolic or unknown dimensions replaced by 1.

    Args:
        ort_session: Object providing ``get_inputs()`` and
            ``run(output_names, input_feed)``, such as an ONNX Runtime
            inference session.
        num_iterations: Number of timed runs; must be positive.
        input_feed: Optional mapping of input name to array, used for every
            run.  Needed when the model's inputs are not float32.

    Returns:
        Mean duration of the timed runs, in seconds.

    Raises:
        ValueError: If ``num_iterations`` is not positive.
    """
    if num_iterations <= 0:
        raise ValueError("num_iterations must be a positive integer")

    if input_feed is None:
        rng = np.random.default_rng(0)
        input_feed = {}
        for meta in ort_session.get_inputs():
            shape = tuple(
                dim if isinstance(dim, int) and dim > 0 else 1
                for dim in meta.shape
            )
            input_feed[meta.name] = np.asarray(rng.random(shape), dtype=np.float32)

    # One untimed warm-up run so one-off setup cost does not skew the average.
    ort_session.run(None, input_feed)

    total_time = 0.0
    for _ in range(num_iterations):
        start_time = time.perf_counter()
        ort_session.run(None, input_feed)
        total_time += time.perf_counter() - start_time
    return total_time / num_iterations

# Average run time of the session created from the original export.
original_model_speed = measure_inference_speed(ort_session)
print(f"Original model inference speed: {original_model_speed:.4f} seconds")

# Same measurement for a session created from the optimized file.
optimized_ort_session = ort.InferenceSession('simple_cnn_optimized.onnx')
optimized_model_speed = measure_inference_speed(optimized_ort_session)
print(f"Optimized model inference speed: {optimized_model_speed:.4f} seconds")