# Loading Libraries and VGGissh

In [1]:
import numpy as np
import os
import pandas as pd
from torchvggish import vggish, vggish_input
from tqdm import tqdm
import torch

model = vggish()
model.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
 

# Get Embeddings

In [2]:
# Function to get embeddings
def get_embedding(file_path, model):
    # Convert the audio file to a tensor of spectrogram
    input_tensor = vggish_input.wavfile_to_examples(file_path)
    
    # Check if there are no valid spectrogram
    if input_tensor.shape[0] == 0:
        return None
    
    # Ensure the tensor type is float
    input_tensor = input_tensor.float()
    
    # Get embeddings
    with torch.no_grad():
        embedding_batch = model(input_tensor)

    # Convert embeddings back to numpy
    embedding_batch = embedding_batch.cpu().numpy()
    
    return embedding_batch

In [3]:
import pandas as pd

audio_folder = 'mustard++/final_utterance_audios' # Location of all .wav audio files
embeddings_list = []
file_names = []

# Iterating over entire folder of .wav files
for audio_file in tqdm(os.listdir(audio_folder)):
    audio_path = os.path.join(audio_folder, audio_file)

    embedding = get_embedding(audio_path, model) # Get embeddings from .wav files
    file_names.append(audio_file) 

    # Making sure all .wav files are valid with VGGish
    if embedding is not None:
        embeddings_list.append(embedding.mean(axis=0))  # Taking the mean across time for each file to get shape (128,)
    else:
        embeddings_list.append([None] * 128) # NaN rows if invalid .wav files with VGGish

# Convert embeddings list to a DataFrame
df = pd.DataFrame({
    'file_name': file_names,
    'embedding': embeddings_list
})

# Saving DataFrame
df.to_csv('embeddings_dataframe.csv', index=False) # Not the final embeddings_dataframe.csv

100%|██████████████████████████████████████████████████████████████████████████████| 4812/4812 [14:20<00:00,  5.59it/s]


In [4]:
# Splitting the embedding column into multiple columns
expanded_embeddings = df['embedding'].apply(pd.Series)
expanded_embeddings = expanded_embeddings.rename(columns=lambda x: f'embedding_{x}')

# Drop the original embedding column and concatenate the expanded embeddings to the original df
df_new = pd.concat([df.drop('embedding', axis=1), expanded_embeddings], axis=1)

In [5]:
df_new

Unnamed: 0,file_name,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,1_10004_u.wav,158.571426,3.714286,190.000000,121.428574,193.142853,77.428574,113.142860,80.285713,117.571426,...,58.571430,208.285721,81.142860,150.428574,152.428574,192.857147,229.857147,60.285713,110.571426,255.0
1,1_10004_u.wav_add_noise.wav,156.000000,5.714286,187.428574,97.428574,207.571426,86.857140,109.142860,83.857140,122.142860,...,19.142857,236.428574,133.428574,110.714287,113.857140,186.000000,189.571426,49.428570,85.571426,255.0
2,1_10004_u.wav_pitch_shift.wav,161.714279,11.571428,189.000000,127.142860,204.142853,84.714287,100.571426,98.000000,125.428574,...,52.857143,215.428574,107.142860,140.285721,91.142860,166.428574,196.000000,76.714287,85.142860,255.0
3,1_10004_u.wav_time_stretch.wav,162.500000,12.500000,196.500000,126.166664,204.333328,84.333336,113.000000,83.333336,118.666664,...,52.000000,192.666672,74.833336,181.666672,136.833328,126.833336,159.666672,50.000000,85.333336,255.0
4,1_10009_u.wav,169.000000,10.600000,219.199997,148.199997,194.800003,61.799999,173.000000,48.400002,128.000000,...,53.400002,236.000000,1.400000,248.800003,212.000000,177.000000,232.399994,60.000000,207.000000,255.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4807,3_S06E07_272_u.wav_time_stretch.wav,123.281250,,,,,,,,,...,,,,,,,,,,
4808,Copy of 1_105_u.wav,160.166672,4.333333,188.166672,164.000000,193.166672,54.500000,93.000000,52.333332,124.833336,...,36.333332,252.666672,164.833328,152.500000,96.166664,183.166672,166.500000,35.833332,168.666672,255.0
4809,Copy of 1_105_u.wav_add_noise.wav,160.000000,4.500000,188.166672,155.000000,190.666672,58.000000,102.000000,50.500000,122.166664,...,44.666668,255.000000,173.166672,141.666672,97.000000,186.666672,196.000000,42.166668,169.666672,255.0
4810,Copy of 1_105_u.wav_pitch_shift.wav,174.000000,31.000000,193.000000,151.833328,215.166672,74.333336,68.833336,66.333336,101.166664,...,42.666668,210.666672,29.833334,143.000000,85.500000,190.666672,211.833328,116.666664,42.500000,255.0


# Processing Embeddings df 
To make the dataframe clean and instantly usable for training

In [6]:
# Ground truth labels
labels = pd.read_csv('labels_final.csv', index_col='SCENE')
labels.head()

Unnamed: 0_level_0,Sarcasm
SCENE,Unnamed: 1_level_1
1_10004,0.0
1_10009,0.0
1_1001,0.0
1_1003,1.0
1_10190,0.0


## Mapping labels to embeddings dataframe because it contain augmented files

In [7]:
# Create a dictionary from the 'labels_df' dataframe for mapping 
label_dict = labels['Sarcasm'].to_dict()

# Map 'Sarcasm' values from 'label_dict' to 'df' where 'SCENE' is contained in 'file_name'
df_new['Sarcasm'] = df_new['file_name'].apply(lambda x: next((v for k, v in label_dict.items() if k in x), None))

df_new.dropna(inplace=True)
cols_to_check = df_new.columns.difference(['Sarcasm', 'audio_file'])

# Drop rows where all values in cols_to_check are 0
df_new.drop(df_new[df_new[cols_to_check].sum(axis=1) == 0].index, inplace=True)
df_new.set_index('file_name', inplace=True)
df_new.to_csv('df_new.csv')

In [8]:
df_new

Unnamed: 0,file_name,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127,Sarcasm
0,1_10004_u.wav,158.571426,3.714286,190.000000,121.428574,193.142853,77.428574,113.142860,80.285713,117.571426,...,208.285721,81.142860,150.428574,152.428574,192.857147,229.857147,60.285713,110.571426,255.0,0.0
1,1_10004_u.wav_add_noise.wav,156.000000,5.714286,187.428574,97.428574,207.571426,86.857140,109.142860,83.857140,122.142860,...,236.428574,133.428574,110.714287,113.857140,186.000000,189.571426,49.428570,85.571426,255.0,0.0
2,1_10004_u.wav_pitch_shift.wav,161.714279,11.571428,189.000000,127.142860,204.142853,84.714287,100.571426,98.000000,125.428574,...,215.428574,107.142860,140.285721,91.142860,166.428574,196.000000,76.714287,85.142860,255.0,0.0
3,1_10004_u.wav_time_stretch.wav,162.500000,12.500000,196.500000,126.166664,204.333328,84.333336,113.000000,83.333336,118.666664,...,192.666672,74.833336,181.666672,136.833328,126.833336,159.666672,50.000000,85.333336,255.0,0.0
4,1_10009_u.wav,169.000000,10.600000,219.199997,148.199997,194.800003,61.799999,173.000000,48.400002,128.000000,...,236.000000,1.400000,248.800003,212.000000,177.000000,232.399994,60.000000,207.000000,255.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4803,3_S06E06_143_u.wav_time_stretch.wav,156.000000,10.750000,191.250000,126.750000,218.250000,82.750000,128.250000,117.250000,109.250000,...,138.250000,147.000000,189.250000,80.250000,104.000000,99.000000,8.500000,44.500000,255.0,1.0
4808,Copy of 1_105_u.wav,160.166672,4.333333,188.166672,164.000000,193.166672,54.500000,93.000000,52.333332,124.833336,...,252.666672,164.833328,152.500000,96.166664,183.166672,166.500000,35.833332,168.666672,255.0,1.0
4809,Copy of 1_105_u.wav_add_noise.wav,160.000000,4.500000,188.166672,155.000000,190.666672,58.000000,102.000000,50.500000,122.166664,...,255.000000,173.166672,141.666672,97.000000,186.666672,196.000000,42.166668,169.666672,255.0,1.0
4810,Copy of 1_105_u.wav_pitch_shift.wav,174.000000,31.000000,193.000000,151.833328,215.166672,74.333336,68.833336,66.333336,101.166664,...,210.666672,29.833334,143.000000,85.500000,190.666672,211.833328,116.666664,42.500000,255.0,1.0
