# Audio Feature Evaluation

In this notebook, we will check the viability of the traditional audio features extracted from the data. The embeddings give around 70% performance, hence we will investigate whether tradiation features perform the worst. 

In [3]:
audio = pd.read_csv('audio_final.csv', index_col='audio_file')
audio = audio.join(text.iloc[:, -27:])
audio = audio.fillna(150)
audio.head()

Unnamed: 0_level_0,intensity,intensity_median,intensity_std,words_per_minute,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,mfccs_5,...,PERSON1,PERSON3,PHOEBE,RACHEL,RAJ,RICHARD,ROSE,ROSS,SHELDON,STUART
audio_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_10004,0.059431,0.052855,0.04393,120.0,-243.93507,70.85715,-37.16033,19.805502,-22.752916,-1.574325,...,0,0,0,0,0,0,0,0,1,0
1_10009,0.048186,0.04419,0.024762,132.0,-199.60484,106.24419,-67.8631,5.480549,-34.111588,-4.429686,...,0,0,0,0,0,0,0,0,0,0
1_1001,0.288145,0.320339,0.182242,135.0,-77.95845,85.00533,-20.893145,31.380796,-13.386493,4.5098,...,0,0,0,0,1,0,0,0,0,0
1_1003,0.256163,0.257658,0.158541,165.0,-98.87288,110.46051,-17.9312,35.882313,-12.107592,2.541859,...,0,0,0,0,0,0,0,0,0,0
1_10190,0.039448,0.037176,0.029028,190.909091,-298.76166,64.262024,-43.251045,22.255568,-17.129074,-3.612903,...,0,0,0,0,0,0,0,0,1,0


In [4]:
df = audio.copy() 

# Create an empty DataFrame to store averaged features
audio_averaged = pd.DataFrame()

# Loop through unique prefixes (e.g., 'mfcc', 'melspectrogram', ...)
for feature in set(df.columns.str.split('_').str[0]):
    # Find all columns with this prefix
    cols = df.columns[df.columns.str.startswith(feature)]
    # Calculate the mean and add it to the new DataFrame
    audio_averaged[feature] = df[cols].mean(axis=1)

# Now 'audio_averaged' contains averaged features
audio_averaged.head()

Unnamed: 0_level_0,RAJ,HOWARD,CHANDLER,melspectrogram,OTHER,intensity,deltamfccs,ROSE,JOEY,ERLICH,...,RACHEL,PENNY,ROSS,GILFOYLE,SHELDON,STUART,BERNADETTE,MODERATOR,deltamelspectrogram,zerocrossingrate
audio_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_10004,0.0,0.0,0.0,1.130459,0.0,0.052072,0.976613,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.113363,0.150452
1_10009,0.0,0.0,0.0,0.58993,0.0,0.039046,0.695638,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053511,0.094552
1_1001,1.0,0.0,0.0,25.124002,0.0,0.263576,1.134144,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.685188,0.12867
1_1003,0.0,1.0,0.0,21.512324,0.0,0.224121,0.981176,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.335945,0.086266
1_10190,0.0,0.0,0.0,0.621053,0.0,0.035217,1.262856,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.083502,0.137214


In [81]:
labels = pd.read_csv('labels_final.csv', index_col='SCENE')
labels.head()

Unnamed: 0_level_0,Sarcasm
SCENE,Unnamed: 1_level_1
1_10004,0.0
1_10009,0.0
1_1001,0.0
1_1003,1.0
1_10190,0.0


In [100]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler

labels = pd.read_csv('labels_final.csv', index_col='SCENE')

input_dim = 280

# Scale the input features
scaler = StandardScaler()
audio_scaled = scaler.fit_transform(audio)
labels = labels['Sarcasm'].astype(int)

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(audio_scaled, labels, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid')) 

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-07)


# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Set callback functions to early stop training and save the best model
callbacks = [EarlyStopping(monitor='val_loss', patience=15)]

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, callbacks=callbacks, validation_split=0.1)

# Predict on the test set
y_pred_prob = model.predict(X_test)
y_pred = np.round(y_pred_prob).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Accuracy: 0.5352697095435685
Precision: 0.5202702702702703
Recall: 0.652542372881356
F1 Score: 0.5789473684210527


# Conclusion
The traditional audio features perform very bad. This sheds light on why previous works neglected the improvement of audio models and feature extraction in this area. Transfer learning provides greater performance and it shows the viability of using transfer learning to generate high-quality embeddings.