# Load and Preprocess dataframe 

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

file_path = "mustard++/dataframe.csv"

df = pd.read_csv(file_path)
df = df.iloc[:, :-5] # non relevant features
mask = df['KEY'].str[-1] == 'u'
df_new = df[mask]

import re

def clean_text(text):
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'\W+', ' ', text)  # remove all non-word characters
    return text

# apply the function to the 'SENTENCE' column
df_new.loc[:, 'SENTENCE'] = df_new['SENTENCE'].apply(clean_text)

# format time
def standardize_end_time(time_str):
    minutes, seconds = time_str.split(':')
    if len(seconds) == 1:
        seconds = seconds + '0'
    return minutes + ':' + seconds

df_new.loc[:, 'END_TIME'] = df_new['END_TIME'].apply(standardize_end_time)

# get words per minute for each speaker
def words_per_minute(row):
    num_words = len(row['SENTENCE'].split())
    seconds = float(row['END_TIME'].split(':')[-1])  
    duration_minutes = seconds / 60.0
    return num_words / duration_minutes

# create a new column 'words_per_minute'
df_new.loc[:, 'words_per_minute'] = df_new.apply(words_per_minute, axis=1)

df_new.head(25)

Unnamed: 0,SCENE,KEY,SENTENCE,END_TIME,SPEAKER,SHOW,Sarcasm,words_per_minute
5,1_10004,1_10004_u,And of those few months how long have you been...,0:07,SHELDON,BBT,0.0,120.0
14,1_10009,1_10009_u,Let the dead man talk So why do you think that,0:05,PENNY,BBT,0.0,132.0
18,1_1001,1_1001_u,What else Sell it on eBay as slightly used,0:04,RAJ,BBT,0.0,135.0
24,1_1003,1_1003_u,Good idea sit with her Hold her comfort her An...,0:08,HOWARD,BBT,1.0,165.0
31,1_10190,1_10190_u,Well now that I ve given up string theory I m ...,0:11,SHELDON,BBT,0.0,190.909091
36,1_10462,1_10462_u,You know if you re really serious about that I...,0:09,HOWARD,BBT,1.0,160.0
42,1_10495,1_10495_u,And if it turns out you don t you and I could ...,0:07,AMY,BBT,1.0,154.285714
49,1_10496,1_10496_u,You and Oh sure And while we re at it why don...,0:14,SHELDON,BBT,1.0,162.857143
56,1_105,1_105_u,I m just inferring this is a couch because the...,0:06,SHELDON,BBT,1.0,210.0
63,1_10748,1_10748_u,There they go fighting again You d never hear ...,0:07,AMY,BBT,1.0,137.142857


# Function to extract features

In [2]:
import numpy as np
import librosa

def extract_features(file_name):
    # Load the audio file
    y, sr = librosa.load(file_name, sr=22050)

    # Compute the features
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    delta_mfccs_mean = np.mean(librosa.feature.delta(mfccs), axis=1)
    delta_mfccs_median = np.median(librosa.feature.delta(mfccs), axis=1)
    delta_mfccs_std = np.std(librosa.feature.delta(mfccs), axis=1)
    mfccs_mean = np.mean(mfccs, axis=1)
    mfccs_std = np.std(mfccs, axis=1)
    mfccs_median = np.median(mfccs, axis=1)
    
    melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=20)
    delta_melspectrogram_mean = np.mean(librosa.feature.delta(melspectrogram), axis=1)
    delta_melspectrogram_median = np.median(librosa.feature.delta(melspectrogram), axis=1)
    delta_melspectrogram_std = np.std(librosa.feature.delta(melspectrogram), axis=1)
    melspectrogram_mean = np.mean(melspectrogram, axis=1)
    melspectrogram_std = np.std(melspectrogram, axis=1)
    melspectrogram_median = np.median(melspectrogram, axis=1)
    
    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
    delta_spectral_centroids_mean = np.mean(librosa.feature.delta(spectral_centroids), axis=1)
    delta_spectral_centroids_median = np.median(librosa.feature.delta(spectral_centroids), axis=1)
    delta_spectral_centroids_std = np.std(librosa.feature.delta(spectral_centroids), axis=1)
    spectral_centroids_mean = np.mean(spectral_centroids, axis=1)
    spectral_centroids_std = np.std(spectral_centroids, axis=1)
    spectral_centroids_median = np.median(spectral_centroids, axis=1)
    
    intensity_mean = np.mean(librosa.feature.rms(y=y)[0])
    intensity_median = np.median(librosa.feature.rms(y=y)[0])
    intensity_std = np.std(librosa.feature.rms(y=y)[0])
    zero_crossing_rate_mean = np.mean(librosa.feature.zero_crossing_rate(y=y), axis=1)
    zero_crossing_rate_median = np.median(librosa.feature.zero_crossing_rate(y=y), axis=1)
    zero_crossing_rate_std = np.std(librosa.feature.zero_crossing_rate(y=y), axis=1)

    # Return the features in a dictionary
    return {
        'mfccs_mean': mfccs_mean,
        'mfccs_std': mfccs_std,
        'mfccs_median': mfccs_median,
        'melspectrogram_mean': melspectrogram_mean,
        'melspectrogram_std': melspectrogram_std,
        'melspectrogram_median': melspectrogram_median,
        'spectralcentroids_mean': spectral_centroids_mean,
        'spectralcentroids_std': spectral_centroids_std,
        'spectralcentroids_median': spectral_centroids_median,
        'deltamfccs_mean': delta_mfccs_mean,
        'deltamfccs_median': delta_mfccs_median,
        'deltamfccs_std': delta_mfccs_std,
        'deltamelspectrogram_mean': delta_melspectrogram_mean,
        'deltamelspectrogram_median': delta_melspectrogram_median,
        'deltamelspectrogram_std': delta_melspectrogram_std,
        'deltaspectral_centroids_mean': delta_spectral_centroids_mean,
        'deltaspectral_centroids_median': delta_spectral_centroids_median,
        'deltaspectral_centroids_std': delta_spectral_centroids_std,
        'intensity_mean': intensity_mean,
        'intensity_median': intensity_median,
        'intensity_std': intensity_std,
        'zerocrossingrate_mean': zero_crossing_rate_mean,
        'zerocrossingrate_median': zero_crossing_rate_median,
        'zerocrossingrate_std': zero_crossing_rate_std
    }


# Load other dataframes

In [3]:
import pandas as pd

text = pd.read_csv('text_final.csv', index_col='SCENE')

audio = pd.read_csv('audio_final.csv', index_col='audio_file')
audio = audio.join(text.iloc[:, -27:])
audio = audio.fillna(150)
audio.head()

Unnamed: 0_level_0,intensity,intensity_median,intensity_std,words_per_minute,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,mfccs_5,...,PERSON1,PERSON3,PHOEBE,RACHEL,RAJ,RICHARD,ROSE,ROSS,SHELDON,STUART
audio_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_10004,0.059431,0.052855,0.04393,120.0,-243.93507,70.85715,-37.16033,19.805502,-22.752916,-1.574325,...,0,0,0,0,0,0,0,0,1,0
1_10009,0.048186,0.04419,0.024762,132.0,-199.60484,106.24419,-67.8631,5.480549,-34.111588,-4.429686,...,0,0,0,0,0,0,0,0,0,0
1_1001,0.288145,0.320339,0.182242,135.0,-77.95845,85.00533,-20.893145,31.380796,-13.386493,4.5098,...,0,0,0,0,1,0,0,0,0,0
1_1003,0.256163,0.257658,0.158541,165.0,-98.87288,110.46051,-17.9312,35.882313,-12.107592,2.541859,...,0,0,0,0,0,0,0,0,0,0
1_10190,0.039448,0.037176,0.029028,190.909091,-298.76166,64.262024,-43.251045,22.255568,-17.129074,-3.612903,...,0,0,0,0,0,0,0,0,1,0


In [4]:
labels = pd.read_csv('labels_final.csv', index_col='SCENE') # labels
labels.head()

Unnamed: 0_level_0,Sarcasm
SCENE,Unnamed: 1_level_1
1_10004,0.0
1_10009,0.0
1_1001,0.0
1_1003,1.0
1_10190,0.0


# Get features

In [5]:
import os
import pandas as pd
from tqdm import tqdm

audio_folder_path = 'mustard++/final_utterance_audios'  

# Get a list of all .wav files in the directory
audio_files = [f for f in os.listdir(audio_folder_path) if f.endswith('.wav')]

# Initialize an empty list to store the features
features = []

# Iterate over each audio file
for audio_file in tqdm(audio_files):
    audio_path = os.path.join(audio_folder_path, audio_file)

    # Extract the audio features
    audio_features = extract_features(audio_path)
    
    # Append the audio file name and features to the list
    features.append([audio_file] + list(audio_features.values()))

# Define the column names for the DataFrame
columns = [
    'audio_file',
    'mfccs',
    'mfccs_std',
    'mfccs_median',
    'melspectrogram',
    'melspectrogram_std',
    'melspectrogram_median',
    'spectralcentroids',
    'spectralcentroids_std',
    'spectralcentroids_median',
    'deltamfccs',
    'deltamfccs_median',
    'deltamfccs_std',
    'deltamelspectrogram',
    'deltamelspectrogram_median',
    'deltamelspectrogram_std',
    'deltaspectral_centroids',
    'deltaspectral_centroids_median',
    'deltaspectral_centroids_std',
    'intensity',
    'intensity_median',
    'intensity_std',
    'zerocrossingrate',
    'zerocrossingrate_median',
    'zerocrossingrate_std'
]

# Create a DataFrame from the features
df_audio_features = pd.DataFrame(features, columns=columns)

# proper formatting 
df_new['KEY'] = df_new['KEY'] + '.wav'

# Merge the dataframes based on the matching columns
merged_df = pd.merge(df_audio_features, df_new[['KEY', 'words_per_minute']], left_on='audio_file', right_on='KEY', how='left')

# Drop 'KEY' column
merged_df.drop('KEY', axis=1, inplace=True)

df_audio_features = merged_df

# Save the DataFrame to a CSV file
df_audio_features.to_csv('audio_features.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 1203/1203 [00:46<00:00, 25.87it/s]


# Apply Augmentation

In [6]:
from sklearn.model_selection import train_test_split
import soundfile as sf

# Split the data into training and test sets
train_df, test_df = train_test_split(df_audio_features, test_size=0.2, random_state=42)

# augment audio to increase data size
def augment_audio(y, sr, augment_type):
    if augment_type == 'time_stretch':
        rate = np.random.uniform(0.8, 1.2)
        y_augmented = librosa.effects.time_stretch(y, rate=rate)
    elif augment_type == 'pitch_shift':
        n_steps = np.random.randint(-2, 3)
        y_augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    elif augment_type == 'add_noise':
        noise = np.random.randn(len(y))
        y_augmented = y + 0.005 * noise
    else:
        raise ValueError("Unknown augmentation type")
    
    return y_augmented

def apply_augmentation(audio_file, augment_type):
    y, sr = librosa.load(audio_file, sr=22050)
    return augment_audio(y, sr, augment_type)

# Apply augmentation
augmented_data = []
augment_types = ['time_stretch', 'pitch_shift', 'add_noise']

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    audio_path = os.path.join(audio_folder_path, row['audio_file'])
    for augment_type in augment_types:
        try:
            y_augmented = apply_augmentation(audio_path, augment_type)
            output_file = os.path.join(audio_folder_path, f"{row['audio_file']}_{augment_type}.wav")
            sf.write(output_file, y_augmented, samplerate=22050)
            
            augmented_data.append([f"{row['audio_file']}_{augment_type}.wav"] + list(extract_features(output_file).values()))
        except Exception as e:
            print(f"Error in augmenting {row['audio_file']} with {augment_type}. Error: {str(e)}")

# Add augmented data to the training dataframe
df_augmented = pd.DataFrame(augmented_data, columns=columns)
train_df = pd.concat([train_df, df_augmented], ignore_index=True)


100%|████████████████████████████████████████████████████████████████████████████████| 962/962 [02:59<00:00,  5.35it/s]


In [7]:
train_df

Unnamed: 0,audio_file,mfccs,mfccs_std,mfccs_median,melspectrogram,melspectrogram_std,melspectrogram_median,spectralcentroids,spectralcentroids_std,spectralcentroids_median,...,deltaspectral_centroids,deltaspectral_centroids_median,deltaspectral_centroids_std,intensity,intensity_median,intensity_std,zerocrossingrate,zerocrossingrate_median,zerocrossingrate_std,words_per_minute
0,2_109_u.wav,"[-196.90884, 111.83503, -16.157011, 23.736609,...","[72.18466, 34.165287, 24.766827, 19.58982, 15....","[-195.98349, 123.20456, -13.905862, 23.60362, ...","[2.5105267, 3.987721, 3.2400048, 3.928331, 2.6...","[2.8956556, 6.6080117, 4.467386, 6.011086, 4.6...","[1.9631906, 1.8085113, 1.8844368, 0.71152675, ...",[1991.9668020982826],[752.1139183810745],[1776.5853126249613],...,[-4.19169947468945],[3.4629446315656343],[142.4346836037139],0.058492,0.064852,0.033266,[0.09207243966584158],[0.08154296875],[0.05625018447499862],210.000000
1,1_S09E04_015_u.wav,"[-380.06613, 67.622505, -20.799032, 6.2227693,...","[110.2847, 41.935616, 32.416496, 22.351, 15.43...","[-357.39172, 74.59197, -15.181493, 5.416186, -...","[0.11194358, 0.20914024, 0.343886, 0.43238857,...","[0.2736083, 0.44279093, 1.0361524, 1.1485971, ...","[0.01825831, 0.03410589, 0.031108392, 0.026846...",[2840.238899894462],[1141.087649113566],[2449.319826706136],...,[14.415422175586398],[16.287821277563275],[195.74770905588713],0.014006,0.007977,0.013335,[0.16993913746843434],[0.132080078125],[0.10842496531566415],223.243598
2,1_S11E08_428_u.wav,"[-294.01672, 128.57971, -54.649467, 1.9314408,...","[66.29139, 23.524239, 23.741444, 13.906257, 19...","[-285.4978, 131.57803, -61.675095, 0.27324158,...","[0.1594015, 0.23444289, 0.352568, 0.79607666, ...","[0.2707256, 0.25937924, 0.38441026, 1.1604184,...","[0.08648823, 0.18101576, 0.22971424, 0.3974384...",[1704.3364631727802],[386.7693784817542],[1657.5653852078126],...,[-13.43224693484664],[0.4000463595130535],[65.7695227379353],0.022311,0.023524,0.012967,[0.09622508081896551],[0.0966796875],[0.020375970100533485],157.303371
3,1_S11E10_249_u.wav,"[-315.4979, 102.2661, -16.66823, 24.343424, -9...","[122.97403, 39.358967, 35.23052, 18.697966, 17...","[-301.71155, 112.79481, -7.8687105, 25.496126,...","[0.1682688, 0.30080462, 1.0937432, 1.1899221, ...","[0.29806674, 0.5403738, 2.0080583, 2.0119379, ...","[0.06492986, 0.056276347, 0.062610164, 0.01932...",[2194.068712248989],[962.571659878655],[1842.7427044829433],...,[-2.110737595699977],[-4.952305509581439],[193.04187500341786],0.023749,0.012663,0.022242,[0.11807174041491597],[0.1015625],[0.0773283822574313],132.401618
4,1_1803_u.wav,"[-159.72514, 132.56606, -42.203346, 7.192785, ...","[96.27192, 31.983568, 29.521448, 20.196257, 18...","[-152.11081, 130.90698, -45.413193, 7.419087, ...","[2.6657176, 8.356776, 16.455145, 19.060928, 9....","[3.180691, 11.47162, 24.1563, 26.065609, 16.52...","[1.9521621, 4.54249, 7.3387794, 6.7651567, 2.1...",[1724.2506068894706],[643.281800889309],[1688.0573311920903],...,[0.6026793558632387],[1.6505585554969306],[137.76245503506823],0.114489,0.128750,0.072061,[0.09586651141826923],[0.088623046875],[0.05151990975277466],120.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3843,2_299_u.wav_pitch_shift.wav,"[-321.42868, 89.714294, -4.0556793, 35.665718,...","[52.891533, 31.065886, 14.380011, 17.59288, 14...","[-318.27753, 87.87525, -1.5888232, 33.853386, ...","[0.44460267, 0.3724543, 0.33514124, 0.03445041...","[0.67471975, 0.52755713, 0.6039184, 0.07738792...","[0.21899721, 0.17558073, 0.09636216, 0.0087244...",[2451.631805042219],[754.2685154140471],[2474.1990311091013],...,[-1.3923492317144266],[-9.345619313108239],[175.9718094005758],0.014609,0.013189,0.008867,[0.12309905772900763],[0.111328125],[0.06381414300662816],
3844,2_299_u.wav_add_noise.wav,"[-240.75491, 46.63111, 15.003235, 16.913387, 7...","[36.619263, 25.114595, 12.168673, 14.203475, 1...","[-241.46321, 46.791832, 15.640235, 13.158957, ...","[1.0256176, 0.81886977, 0.94425607, 0.15105303...","[1.4439033, 1.1677994, 1.7564273, 0.39128217, ...","[0.42410165, 0.24627993, 0.108723246, 0.020827...",[3757.060596594481],[905.2173029519156],[3790.602252636639],...,[2.713984667189526],[18.14804040957053],[181.55602404042122],0.023471,0.023033,0.013719,[0.2245721016221374],[0.21240234375],[0.11000365618362516],
3845,3_S01E01_255_u.wav_time_stretch.wav,"[-441.76843, 98.82056, -1.6545755, 41.3603, 10...","[111.68157, 36.251755, 18.221857, 18.803154, 1...","[-426.3916, 97.933365, -0.68755805, 37.934235,...","[1.8596581, 0.6134057, 0.061233584, 0.04058955...","[3.693253, 1.3535997, 0.12498018, 0.086768806,...","[0.011420857, 0.0045213364, 0.00047702424, 0.0...",[1902.2702060526922],[895.6533770604816],[1618.7017223460607],...,[-4.217642728949495],[1.4856336126145466],[196.55186574321345],0.014987,0.004802,0.019059,[0.0881491268382353],[0.052734375],[0.07834153944453244],
3846,3_S01E01_255_u.wav_pitch_shift.wav,"[-449.76132, 106.88076, -8.726253, 42.898926, ...","[105.92214, 36.853325, 17.203405, 21.12479, 20...","[-431.68164, 101.05611, -9.760225, 41.991047, ...","[1.1275443, 0.1683807, 0.06480938, 0.017758342...","[2.402499, 0.31759107, 0.13737202, 0.0353159, ...","[0.019915652, 0.0015960876, 0.00031487, 0.0003...",[1745.2068633876158],[803.506714200273],[1462.6509349384626],...,[-2.366868860097411],[-12.167593444861279],[175.1627686499497],0.011558,0.004408,0.013579,[0.08480326334635417],[0.05224609375],[0.07193997165184417],


In [8]:
df_audio_features = train_df.copy()

In [9]:
import numpy as np

# determine the columns that contain 1D array
array_cols = [col for col in df_audio_features.columns if isinstance(df_audio_features[col][0], np.ndarray)]

# create new dataframe to hold the expanded features
df_expanded = pd.DataFrame()

# for each column that contains 1D array
for col in array_cols:
    # expand the array into separate columns and add to df_expanded
    expanded_col = pd.DataFrame(df_audio_features[col].to_list(), 
                                 columns=[f'{col}_{i}' for i in range(len(df_audio_features[col][0]))])
    df_expanded = pd.concat([df_expanded, expanded_col], axis=1)

# drop the original columns that contain 1D array from df_audio_features
df_audio_features = df_audio_features.drop(columns=array_cols)

# concatenate df_audio_features and df_expanded along the columns
df_audio_features = pd.concat([df_audio_features, df_expanded], axis=1)

In [10]:
df_audio_features.head()

Unnamed: 0,audio_file,intensity,intensity_median,intensity_std,words_per_minute,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,...,deltamelspectrogram_std_16,deltamelspectrogram_std_17,deltamelspectrogram_std_18,deltamelspectrogram_std_19,deltaspectral_centroids_0,deltaspectral_centroids_median_0,deltaspectral_centroids_std_0,zerocrossingrate_0,zerocrossingrate_median_0,zerocrossingrate_std_0
0,2_109_u.wav,0.058492,0.064852,0.033266,210.0,-196.908844,111.83503,-16.157011,23.736609,-7.913743,...,0.002665,0.001124,0.000657,0.000346,-4.191699,3.462945,142.434684,0.092072,0.081543,0.05625
1,1_S09E04_015_u.wav,0.014006,0.007977,0.013335,223.243598,-380.066132,67.622505,-20.799032,6.222769,-9.419434,...,0.00035,0.000124,0.000107,8.5e-05,14.415422,16.287821,195.747709,0.169939,0.13208,0.108425
2,1_S11E08_428_u.wav,0.022311,0.023524,0.012967,157.303371,-294.016724,128.579712,-54.649467,1.931441,-24.82024,...,0.00026,0.000371,0.000183,6.1e-05,-13.432247,0.400046,65.769523,0.096225,0.09668,0.020376
3,1_S11E10_249_u.wav,0.023749,0.012663,0.022242,132.401618,-315.497894,102.266098,-16.66823,24.343424,-9.789073,...,0.004074,0.002142,0.001154,0.000431,-2.110738,-4.952306,193.041875,0.118072,0.101562,0.077328
4,1_1803_u.wav,0.114489,0.12875,0.072061,120.0,-159.725143,132.566055,-42.203346,7.192785,-17.504177,...,0.008022,0.004868,0.00306,0.002769,0.602679,1.650559,137.762455,0.095867,0.088623,0.05152


In [11]:
train_df_expanded = df_audio_features.copy()

## Augmentation on test set

In [12]:
augmented_data = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    audio_path = os.path.join(audio_folder_path, row['audio_file'])
    for augment_type in augment_types:
        try:
            y_augmented = apply_augmentation(audio_path, augment_type)
            output_file = os.path.join(audio_folder_path, f"{row['audio_file']}_{augment_type}.wav")
            sf.write(output_file, y_augmented, samplerate=22050)
            
            augmented_data.append([f"{row['audio_file']}_{augment_type}.wav"] + list(extract_features(output_file).values()))
        except Exception as e:
            print(f"Error in augmenting {row['audio_file']} with {augment_type}. Error: {str(e)}")

# Add augmented data to the training dataframe
df_augmented = pd.DataFrame(augmented_data, columns=columns)
test_df = pd.concat([test_df, df_augmented], ignore_index=True)

100%|████████████████████████████████████████████████████████████████████████████████| 241/241 [00:43<00:00,  5.60it/s]


In [13]:
df_audio_features = test_df.copy()

In [14]:
import numpy as np

# determine the columns that contain 1D array
array_cols = [col for col in df_audio_features.columns if isinstance(df_audio_features[col][0], np.ndarray)]

# create new dataframe to hold the expanded features
df_expanded = pd.DataFrame()

# for each column that contains 1D array
for col in array_cols:
    # expand the array into separate columns and add to df_expanded
    expanded_col = pd.DataFrame(df_audio_features[col].to_list(), 
                                 columns=[f'{col}_{i}' for i in range(len(df_audio_features[col][0]))])
    df_expanded = pd.concat([df_expanded, expanded_col], axis=1)

# drop the original columns that contain 1D array from df_audio_features
df_audio_features = df_audio_features.drop(columns=array_cols)

# concatenate df_audio_features and df_expanded along the columns
df_audio_features = pd.concat([df_audio_features, df_expanded], axis=1)

In [15]:
df_audio_features

Unnamed: 0,audio_file,intensity,intensity_median,intensity_std,words_per_minute,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,...,deltamelspectrogram_std_16,deltamelspectrogram_std_17,deltamelspectrogram_std_18,deltamelspectrogram_std_19,deltaspectral_centroids_0,deltaspectral_centroids_median_0,deltaspectral_centroids_std_0,zerocrossingrate_0,zerocrossingrate_median_0,zerocrossingrate_std_0
0,3_S01E03_056_u.wav,0.048176,0.046927,0.027791,182.982672,-261.690582,99.408005,-19.862806,34.142185,-0.207228,...,0.005724,0.005057,0.002997,0.000841,9.497981,2.117391,203.684945,0.102340,0.075439,0.091120
1,1_S10E10_009_u.wav,0.026234,0.025120,0.021415,179.829189,-346.629761,104.680977,-35.389511,8.592740,-11.992854,...,0.000694,0.000398,0.000071,0.000023,-6.208916,-5.007938,154.678952,0.144416,0.106201,0.096394
2,1_1722_u.wav,0.111215,0.119678,0.050773,220.000000,-179.489029,129.078339,-25.307587,29.013868,-5.873797,...,0.004248,0.002468,0.001643,0.001291,-10.945756,4.508401,185.451453,0.082136,0.062500,0.051088
3,2_222_u.wav,0.073214,0.078654,0.024775,255.000000,-169.284515,90.389061,-25.957767,5.838042,-16.583344,...,0.016681,0.016417,0.009452,0.002882,-4.340378,-7.868666,181.611836,0.120562,0.105957,0.066509
4,1_S12E03_095_u.wav,0.024447,0.019920,0.017781,166.336634,-364.537415,95.205116,-22.572809,24.469709,-0.761370,...,0.000439,0.000520,0.000560,0.000192,-14.026509,-5.239973,272.800271,0.152793,0.113525,0.120041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
959,1_S12E06_248_u.wav_pitch_shift.wav,0.020858,0.017942,0.011548,,-284.788696,129.187820,-56.304859,11.557641,-13.897083,...,0.000238,0.000152,0.000071,0.000016,0.448903,-0.066102,137.144242,0.121313,0.112305,0.052229
960,1_S12E06_248_u.wav_add_noise.wav,0.030353,0.028802,0.015240,,-197.609573,67.671120,-21.351759,-7.868472,-6.135896,...,0.000624,0.000485,0.000234,0.000082,2.637218,4.717485,150.586718,0.177479,0.148438,0.070958
961,2_357_u.wav_time_stretch.wav,0.025899,0.024135,0.023307,,-309.346649,96.477325,-25.223465,18.527857,-16.343370,...,0.000360,0.000764,0.001240,0.000714,2.444475,-0.046479,167.009674,0.131510,0.115723,0.078519
962,2_357_u.wav_pitch_shift.wav,0.025137,0.024516,0.021708,,-317.686066,112.507050,-31.243000,28.100573,-17.411545,...,0.000593,0.001065,0.000904,0.000156,2.444212,2.754003,150.692555,0.117402,0.104004,0.064858


In [16]:
test_df_expanded = df_audio_features.copy()

# Saving augmented dataframes

In [19]:
train_df_expanded.to_csv('train_df_expanded.csv', index=False)
test_df_expanded.to_csv('test_df_expanded.csv', index=False)

# Modelling

In [20]:
labels = pd.read_csv('labels_final.csv', index_col='SCENE')
labels.head()

Unnamed: 0_level_0,Sarcasm
SCENE,Unnamed: 1_level_1
1_10004,0.0
1_10009,0.0
1_1001,0.0
1_1003,1.0
1_10190,0.0


In [21]:
import pandas as pd

# Assuming df_label and df_data are your two dataframes

# Create a dictionary of original audio names to their sarcasm labels
labels_dict = labels['Sarcasm'].to_dict()

# Function to get label for an audio, whether original or augmented
def get_label(audio_name):
    for orig_name, label in labels_dict.items():
        # Check if the original audio name is a substring of the augmented audio name
        if orig_name in audio_name:
            return label

# Apply the function to the 'audio_file' column in df_data
train_df_expanded['Sarcasm'] = train_df_expanded['audio_file'].apply(get_label)
test_df_expanded['Sarcasm'] = test_df_expanded['audio_file'].apply(get_label)

In [22]:
y_train = train_df_expanded['Sarcasm']
y_test = test_df_expanded['Sarcasm']
train_df_expanded.drop('Sarcasm', inplace=True, axis=1)
test_df_expanded.drop('Sarcasm', inplace=True, axis=1)

In [23]:
train_df_expanded.set_index("audio_file", inplace=True)
test_df_expanded.set_index("audio_file", inplace=True)

In [24]:
train_df_expanded.drop('words_per_minute', inplace=True, axis=1)
test_df_expanded.drop('words_per_minute', inplace=True, axis=1)

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

labels = pd.read_csv('labels_final.csv', index_col='SCENE')

input_dim = 254

# Scale the input features
scaler = StandardScaler()
audio_scaled_train = scaler.fit_transform(train_df_expanded)
audio_scaled_test = scaler.fit_transform(test_df_expanded)
labels = labels['Sarcasm'].astype(int)

# Split data into train and test
X_train = train_df_expanded
X_test = test_df_expanded

# Define the model
model = Sequential()
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid')) # binary output 

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, epsilon=1e-07)


# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Set callback functions to early stop training and save the best model
callbacks = [EarlyStopping(monitor='val_loss', patience=15)]

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, callbacks=callbacks, validation_split=0.1)

# Predict on the test set
y_pred_prob = model.predict(X_test)
y_pred = np.round(y_pred_prob).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Accuracy: 0.5145228215767634
Precision: 0.5145228215767634
Recall: 1.0
F1 Score: 0.6794520547945205


# Conclusion:
The audio dataset through mfcc and spectrogram features are not good enough. There is very poor performance and the model hasn't learning anything which is evident by the test scores. Hence this model + dataframe will not be used and transfer learning will be done to get embeddings.