In [8]:
import pandas as pd
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv("data.csv")
scenes = data["SCENE"]
labels = data["Sarcasm"].values

# Step 1: Find the maximum duration across all audio files
def get_max_duration(scenes, sr=14500):
    max_duration = 0
    for file_path in scenes:
        n_f="audio_utterance/"+file_path+"_u.wav"
        y, _ = librosa.load(n_f, sr=sr)
        duration = librosa.get_duration(y=y, sr=sr)
        if duration > max_duration:
            max_duration = duration
        #print("done with",n_f)
    return max_duration

# Calculate the maximum duration (in seconds) across all files
max_duration = get_max_duration(scenes)
print(f"Maximum audio duration: {max_duration} seconds")

# Step 2: Convert audio to fixed-size mel spectrograms based on max duration
def audio_to_mel_spectrogram(file_path, n_mels=128, sr=14500, duration=max_duration):
    n_f="audio_utterance/"+file_path+"_u.wav"
    y, sr = librosa.load(n_f, sr=sr)
    target_length = int(duration * sr)

    # Pad or truncate the audio to match the maximum duration
    if len(y) > target_length:
        y = y[:target_length]
    elif len(y) < target_length:
        y = np.pad(y, (0, target_length - len(y)), mode='constant')

    # Convert to mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    #print("done wh",n_f)
    return mel_spec_db

# Step 3: Prepare the dataset
mel_spectrograms = []
for scene in scenes:
    mel_spec = audio_to_mel_spectrogram(scene)
    mel_spectrograms.append(mel_spec)
print("here")
# Resize spectrograms to ensure consistent input shape (e.g., width of 216)
mel_spectrograms = [librosa.util.fix_length(mel, size=216, axis=1) for mel in mel_spectrograms]
X = np.array(mel_spectrograms)
X = X[..., np.newaxis]  # Add channel dimension for CNN
y = np.array(labels)
print("split")
# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Build the CNN model
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='tanh', input_shape=(128, 216, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='tanh'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='tanh'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 5: Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Model evaluation
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")


Maximum audio duration: 20.02 seconds
here
split


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 675ms/step - accuracy: 0.5286 - loss: 8.3984 - val_accuracy: 0.4896 - val_loss: 0.6932
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 688ms/step - accuracy: 0.5132 - loss: 0.6931 - val_accuracy: 0.4896 - val_loss: 0.6932
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 716ms/step - accuracy: 0.4846 - loss: 0.6932 - val_accuracy: 0.4896 - val_loss: 0.6932
Epoch 4/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 650ms/step - accuracy: 0.4962 - loss: 0.6932 - val_accuracy: 0.4896 - val_loss: 0.6932
Epoch 5/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 646ms/step - accuracy: 0.5101 - loss: 0.6931 - val_accuracy: 0.4896 - val_loss: 0.6932
Epoch 6/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 661ms/step - accuracy: 0.5014 - loss: 0.6932 - val_accuracy: 0.4896 - val_loss: 0.6932
Epoch 7/10
[1m31/31[

Epoch 1/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 24s 644ms/step - accuracy: 0.4985 - loss: 24.7281 - val_accuracy: 0.5021 - val_loss: 0.6927
Epoch 2/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 20s 631ms/step - accuracy: 0.5644 - loss: 0.7754 - val_accuracy: 0.5560 - val_loss: 0.6913
Epoch 3/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 600ms/step - accuracy: 0.5887 - loss: 0.6592 - val_accuracy: 0.5270 - val_loss: 0.6892
Epoch 4/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 18s 595ms/step - accuracy: 0.5882 - loss: 0.6672 - val_accuracy: 0.5436 - val_loss: 0.8236
Epoch 5/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 605ms/step - accuracy: 0.5768 - loss: 0.6543 - val_accuracy: 0.5519 - val_loss: 0.7626
Epoch 6/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 607ms/step - accuracy: 0.6173 - loss: 0.6394 - val_accuracy: 0.5726 - val_loss: 0.8320
Epoch 7/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 18s 594ms/step - accuracy: 0.6221 - loss: 0.6213 - val_accuracy: 0.5436 - val_loss: 0.7776
Epoch 8/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 598ms/step - accuracy: 0.6301 - loss: 0.6056 - val_accuracy: 0.5477 - val_loss: 0.8203
Epoch 9/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 625ms/step - accuracy: 0.6308 - loss: 0.6060 - val_accuracy: 0.5602 - val_loss: 0.8401
Epoch 10/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 603ms/step - accuracy: 0.6173 - loss: 0.6087 - val_accuracy: 0.5560 - val_loss: 0.7946
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 140ms/step - accuracy: 0.5398 - loss: 0.7825
Validation Loss: 0.7945839166641235
Validation Accuracy: 0.5560166239738464

Epoch 1/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 23s 635ms/step - accuracy: 0.5001 - loss: 67.7266 - val_accuracy: 0.4896 - val_loss: 0.7114
Epoch 2/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 607ms/step - accuracy: 0.6140 - loss: 0.6625 - val_accuracy: 0.6266 - val_loss: 0.6798
Epoch 3/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 21s 614ms/step - accuracy: 0.6716 - loss: 0.6441 - val_accuracy: 0.6183 - val_loss: 0.6864
Epoch 4/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 599ms/step - accuracy: 0.7171 - loss: 0.6152 - val_accuracy: 0.6639 - val_loss: 0.6755
Epoch 5/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 596ms/step - accuracy: 0.7047 - loss: 0.6094 - val_accuracy: 0.6390 - val_loss: 0.7702
Epoch 6/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 599ms/step - accuracy: 0.6743 - loss: 0.6281 - val_accuracy: 0.6349 - val_loss: 0.7054
Epoch 7/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 20s 661ms/step - accuracy: 0.6877 - loss: 0.5917 - val_accuracy: 0.6515 - val_loss: 0.6971
Epoch 8/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 612ms/step - accuracy: 0.7227 - loss: 0.5755 - val_accuracy: 0.6639 - val_loss: 0.6896
Epoch 9/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 20s 657ms/step - accuracy: 0.7071 - loss: 0.5810 - val_accuracy: 0.6390 - val_loss: 0.7685
Epoch 10/10
31/31 ━━━━━━━━━━━━━━━━━━━━ 19s 612ms/step - accuracy: 0.7030 - loss: 0.5799 - val_accuracy: 0.6100 - val_loss: 0.9900
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 135ms/step - accuracy: 0.6262 - loss: 0.9740
Validation Loss: 0.9900058507919312
Validation Accuracy: 0.6099585294723511