### MERGING OF CONTEXT AND UTTERANCE FEATURES

In [1]:
"""
MERGING THE AUDIO EMBEDDINGS OF CONTEXT AND UTTERANCE WITH LABELS AND OTHER FEATURES
MERGING Wav2Vec2 Embeddings
"""

import pandas as pd

# Name of the merged dataset written at the end of this cell. It is defined
# once so the confirmation message always names the file that was actually
# written (the message previously claimed "final_dataset.csv").
OUTPUT_CSV = "audio_features_Wav2Vec2_base.csv"

# Load the context embeddings (csv1), the utterance embeddings (csv2) and the
# map that pairs each context file with its utterance file.
csv1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_context_Wav2Vec2_base_embeddings.csv")
csv2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_utterance_Wav2Vec2_base_embeddings.csv")
map_df = pd.read_csv("/kaggle/input/btp-audioembeddings/context_to_utterance_map.csv")

# Remove the 'audio_context/' and 'audio_utterance/' substrings from the map so
# its entries can be matched against the file-name column of the embedding CSVs.
map_df["audio_context"] = map_df["audio_context"].str.replace("audio_context/", "", regex=False)
map_df["audio_utterance"] = map_df["audio_utterance"].str.replace("audio_utterance/", "", regex=False)

# Extract features (every column except the first, which holds the file name)
features_csv1 = csv1.iloc[:, 1:].copy()  # Features from csv1 (context)
features_csv2 = csv2.iloc[:, 1:].copy()  # Features from csv2 (utterance)

# Prefix the column names so context and utterance features stay
# distinguishable after both sets end up in the same dataframe.
features_csv1.columns = [f"audio_c_feature_{col}" for col in features_csv1.columns]
features_csv2.columns = [f"audio_u_feature_{col}" for col in features_csv2.columns]

# Put the file name back as the first column; it is the merge key below.
features_csv1.insert(0, "filename", csv1.iloc[:, 0])
features_csv2.insert(0, "filename", csv2.iloc[:, 0])

# Attach the context features to the map (audio_context <-> filename).
# NOTE: how="inner" keeps only the map rows that have a matching embedding row.
merged_df = map_df.merge(features_csv1, left_on="audio_context", right_on="filename", how="inner")

# Attach the utterance features (audio_utterance <-> filename). Both sides now
# carry a "filename" column, so the suffixes turn them into
# filename_csv1 / filename_csv2.
merged_df = merged_df.merge(
    features_csv2,
    left_on="audio_utterance",
    right_on="filename",
    how="inner",
    suffixes=("_csv1", "_csv2"),
)

# Drop the two merge-key copies; they are no longer needed once the join is done.
merged_df.drop(columns=["filename_csv1", "filename_csv2"], inplace=True)

# Save the final dataset
merged_df.to_csv(OUTPUT_CSV, index=False)

print(f"Merged dataset saved as {OUTPUT_CSV}")


Merged dataset saved as final_dataset.csv


# Model Trained On WavLM Embeddings

### CNN

In [4]:
"""
CNN MODEL
"""
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Two-branch 1-D CNN on the WavLM features: the context and the utterance
# embeddings each get their own Reshape -> Conv1D -> MaxPooling1D -> Flatten
# branch, the branches are concatenated, and a small dense head ends in a
# single sigmoid unit.

DATASET_CSV = "/kaggle/input/btp-audioembeddings/audio_features_WavLM_base.csv"
BEST_WEIGHTS_PATH = "/kaggle/working/wav_lm_base_cnn_model.weights.h5"


def _feature_block(frame, prefix):
    """Return the columns of `frame` whose name starts with `prefix` as float32."""
    wanted = [name for name in frame.columns if name.startswith(prefix)]
    return np.array(frame[wanted].values, dtype=np.float32)


def _conv_branch(branch_input, width):
    """Run one embedding input through Reshape -> Conv1D -> MaxPooling1D -> Flatten."""
    tensor = layers.Reshape((width, 1))(branch_input)
    tensor = layers.Conv1D(filters=126, kernel_size=3, activation="tanh")(tensor)
    tensor = layers.MaxPooling1D(pool_size=2)(tensor)
    return layers.Flatten()(tensor)


def _hard_labels(inputs):
    """Threshold the model's sigmoid outputs at 0.5 to obtain 0/1 predictions."""
    return (model.predict(inputs) > 0.5).astype(int)


# ---- Data ----
df = pd.read_csv(DATASET_CSV)

y = np.array(df["Sarcasm"].values, dtype=np.float32)  # 0: No sarcasm, 1: Sarcasm
X_context = _feature_block(df, "audio_c_feature_")
X_utterance = _feature_block(df, "audio_u_feature_")

# 70% train / 30% held out, stratified on the label.
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3, random_state=42, stratify=y,
)

# Two thirds of the held-out 30% become the test set (20% overall); the
# remaining third is the validation set (10% overall).
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3, random_state=42, stratify=y_temp,
)

# ---- Model ----
input_dim = 768  # Number of features per input

input_context = keras.Input(shape=(input_dim,))
context_branch = _conv_branch(input_context, input_dim)

input_utterance = keras.Input(shape=(input_dim,))
utterance_branch = _conv_branch(input_utterance, input_dim)

merged = layers.Concatenate()([context_branch, utterance_branch])
merged = layers.Dense(32, activation="tanh")(merged)
output = layers.Dense(1, activation="sigmoid")(merged)  # binary classification

model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# ---- Training ----
# Only the weights of the epoch with the highest validation accuracy are kept.
checkpoint_callback = ModelCheckpoint(
    filepath=BEST_WEIGHTS_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

model.fit(
    x=[Xc_train, Xu_train],
    y=y_train,
    validation_data=([Xc_val, Xu_val], y_val),
    epochs=50,
    batch_size=32,
    callbacks=[checkpoint_callback],
)

# Evaluate with the best checkpoint rather than the last-epoch weights.
model.load_weights(BEST_WEIGHTS_PATH)
print("Loaded Best Model Weights.")

# ---- Evaluation ----
y_train_pred = _hard_labels([Xc_train, Xu_train])
y_val_pred = _hard_labels([Xc_val, Xu_val])
y_test_pred = _hard_labels([Xc_test, Xu_test])

for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} Set Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5455 - loss: 1.0161
Epoch 1: val_accuracy improved from -inf to 0.55833, saving model to /kaggle/working/wav_lm_base_cnn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 73ms/step - accuracy: 0.5470 - loss: 1.0105 - val_accuracy: 0.5583 - val_loss: 0.7093
Epoch 2/50
[1m22/27[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 8ms/step - accuracy: 0.6418 - loss: 0.6028
Epoch 2: val_accuracy improved from 0.55833 to 0.56667, saving model to /kaggle/working/wav_lm_base_cnn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6513 - loss: 0.5993 - val_accuracy: 0.5667 - val_loss: 0.6697
Epoch 3/50
[1m22/27[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 8ms/step - accuracy: 0.7400 - loss: 0.5296
Epoch 3: val_accuracy improved from 0.56667 to 0.64167, saving model to /kaggle/working/wav_lm_base_cnn_mo

### FCN

In [5]:
"""
FCN MODEL
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Two-branch fully connected network on the WavLM features: the context and
# the utterance embeddings each pass through Dense(256) -> Dense(128), the
# branches are concatenated, and a Dense(64) head ends in a single sigmoid unit.

DATASET_CSV = "/kaggle/input/btp-audioembeddings/audio_features_WavLM_base.csv"
BEST_WEIGHTS_PATH = "/kaggle/working/wav_lm_base_fcn_model.weights.h5"


def _feature_block(frame, prefix):
    """Return the columns of `frame` whose name starts with `prefix` as float32."""
    wanted = [name for name in frame.columns if name.startswith(prefix)]
    return np.array(frame[wanted].values, dtype=np.float32)


def _dense_branch(branch_input):
    """Run one embedding input through Dense(256, tanh) -> Dense(128, tanh)."""
    tensor = layers.Dense(256, activation="tanh")(branch_input)
    return layers.Dense(128, activation="tanh")(tensor)


def _hard_labels(inputs):
    """Threshold the model's sigmoid outputs at 0.5 to obtain 0/1 predictions."""
    return (model.predict(inputs) > 0.5).astype(int)


# ---- Data ----
df = pd.read_csv(DATASET_CSV)

y = np.array(df["Sarcasm"].values, dtype=np.float32)  # 0: No sarcasm, 1: Sarcasm
X_context = _feature_block(df, "audio_c_feature_")
X_utterance = _feature_block(df, "audio_u_feature_")

# 70% train / 30% held out, stratified on the label.
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3, random_state=42, stratify=y,
)

# Two thirds of the held-out 30% become the test set (20% overall); the
# remaining third is the validation set (10% overall).
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3, random_state=42, stratify=y_temp,
)

# ---- Model ----
input_dim = 768  # Number of features per input

input_context = keras.Input(shape=(input_dim,))
context_branch = _dense_branch(input_context)

input_utterance = keras.Input(shape=(input_dim,))
utterance_branch = _dense_branch(input_utterance)

merged = layers.Concatenate()([context_branch, utterance_branch])
merged = layers.Dense(64, activation="tanh")(merged)
output = layers.Dense(1, activation="sigmoid")(merged)  # binary classification

model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# ---- Training ----
# Only the weights of the epoch with the highest validation accuracy are kept.
checkpoint_callback = ModelCheckpoint(
    filepath=BEST_WEIGHTS_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

model.fit(
    x=[Xc_train, Xu_train],
    y=y_train,
    validation_data=([Xc_val, Xu_val], y_val),
    epochs=50,
    batch_size=32,
    callbacks=[checkpoint_callback],
)

# Evaluate with the best checkpoint rather than the last-epoch weights.
model.load_weights(BEST_WEIGHTS_PATH)
print("Loaded Best Model Weights.")

# ---- Evaluation ----
y_train_pred = _hard_labels([Xc_train, Xu_train])
y_val_pred = _hard_labels([Xc_val, Xu_val])
y_test_pred = _hard_labels([Xc_test, Xu_test])

for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} Set Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5385 - loss: 0.7904
Epoch 1: val_accuracy improved from -inf to 0.56667, saving model to /kaggle/working/wav_lm_base_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 67ms/step - accuracy: 0.5394 - loss: 0.7888 - val_accuracy: 0.5667 - val_loss: 0.6870
Epoch 2/50
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 21ms/step - accuracy: 0.8750 - loss: 0.5142
Epoch 2: val_accuracy improved from 0.56667 to 0.59167, saving model to /kaggle/working/wav_lm_base_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6384 - loss: 0.6417 - val_accuracy: 0.5917 - val_loss: 0.7036
Epoch 3/50
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 21ms/step - accuracy: 0.7812 - loss: 0.4878
Epoch 3: val_accuracy did not improve from 0.59167
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

# Model Trained On Wav2Vec2 Embeddings

### CNN 

In [12]:
"""
CNN MODEL
"""
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Two-branch 1-D CNN on the Wav2Vec2 features: the context and the utterance
# embeddings each get their own Reshape -> Conv1D -> MaxPooling1D -> Flatten
# branch; the concatenated branches feed the sigmoid output unit directly
# (there is no intermediate dense layer in this variant).

DATASET_CSV = "/kaggle/input/btp-audioembeddings/audio_features_Wav2Vec2_base.csv"
BEST_WEIGHTS_PATH = "/kaggle/working/wav2vec2_base_cnn_model.weights.h5"


def _feature_block(frame, prefix):
    """Return the columns of `frame` whose name starts with `prefix` as float32."""
    wanted = [name for name in frame.columns if name.startswith(prefix)]
    return np.array(frame[wanted].values, dtype=np.float32)


def _conv_branch(branch_input, width):
    """Run one embedding input through Reshape -> Conv1D -> MaxPooling1D -> Flatten."""
    tensor = layers.Reshape((width, 1))(branch_input)
    tensor = layers.Conv1D(filters=126, kernel_size=3, activation="tanh")(tensor)
    tensor = layers.MaxPooling1D(pool_size=2)(tensor)
    return layers.Flatten()(tensor)


def _hard_labels(inputs):
    """Threshold the model's sigmoid outputs at 0.5 to obtain 0/1 predictions."""
    return (model.predict(inputs) > 0.5).astype(int)


# ---- Data ----
df = pd.read_csv(DATASET_CSV)

y = np.array(df["Sarcasm"].values, dtype=np.float32)  # 0: No sarcasm, 1: Sarcasm
X_context = _feature_block(df, "audio_c_feature_")
X_utterance = _feature_block(df, "audio_u_feature_")

# 70% train / 30% held out, stratified on the label.
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3, random_state=42, stratify=y,
)

# Two thirds of the held-out 30% become the test set (20% overall); the
# remaining third is the validation set (10% overall).
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3, random_state=42, stratify=y_temp,
)

# ---- Model ----
input_dim = 768  # Number of features per input

input_context = keras.Input(shape=(input_dim,))
context_branch = _conv_branch(input_context, input_dim)

input_utterance = keras.Input(shape=(input_dim,))
utterance_branch = _conv_branch(input_utterance, input_dim)

merged = layers.Concatenate()([context_branch, utterance_branch])
output = layers.Dense(1, activation="sigmoid")(merged)  # binary classification

model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# ---- Training ----
# Only the weights of the epoch with the highest validation accuracy are kept.
checkpoint_callback = ModelCheckpoint(
    filepath=BEST_WEIGHTS_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

model.fit(
    x=[Xc_train, Xu_train],
    y=y_train,
    validation_data=([Xc_val, Xu_val], y_val),
    epochs=50,
    batch_size=32,
    callbacks=[checkpoint_callback],
)

# Evaluate with the best checkpoint rather than the last-epoch weights.
model.load_weights(BEST_WEIGHTS_PATH)
print("Loaded Best Model Weights.")

# ---- Evaluation ----
y_train_pred = _hard_labels([Xc_train, Xu_train])
y_val_pred = _hard_labels([Xc_val, Xu_val])
y_test_pred = _hard_labels([Xc_test, Xu_test])

for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} Set Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5786 - loss: 0.7476
Epoch 1: val_accuracy improved from -inf to 0.57500, saving model to /kaggle/working/wav2vec2_base_cnn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 60ms/step - accuracy: 0.5794 - loss: 0.7457 - val_accuracy: 0.5750 - val_loss: 0.6458
Epoch 2/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 7ms/step - accuracy: 0.6759 - loss: 0.6087
Epoch 2: val_accuracy improved from 0.57500 to 0.64167, saving model to /kaggle/working/wav2vec2_base_cnn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6747 - loss: 0.6095 - val_accuracy: 0.6417 - val_loss: 0.6544
Epoch 3/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 7ms/step - accuracy: 0.6904 - loss: 0.5932
Epoch 3: val_accuracy did not improve from 0.64167
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

### FCN

In [19]:
"""
FCN MODEL
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Two-branch fully connected network on the Wav2Vec2 features: the context and
# the utterance embeddings each pass through Dense(128) -> Dense(64), the
# branches are concatenated, and a Dense(64) head ends in a single sigmoid unit.

DATASET_CSV = "/kaggle/input/btp-audioembeddings/audio_features_Wav2Vec2_base.csv"
BEST_WEIGHTS_PATH = "/kaggle/working/wav2vec2_base_fcn_model.weights.h5"


def _feature_block(frame, prefix):
    """Return the columns of `frame` whose name starts with `prefix` as float32."""
    wanted = [name for name in frame.columns if name.startswith(prefix)]
    return np.array(frame[wanted].values, dtype=np.float32)


def _dense_branch(branch_input):
    """Run one embedding input through Dense(128, tanh) -> Dense(64, tanh)."""
    tensor = layers.Dense(128, activation="tanh")(branch_input)
    return layers.Dense(64, activation="tanh")(tensor)


def _hard_labels(inputs):
    """Threshold the model's sigmoid outputs at 0.5 to obtain 0/1 predictions."""
    return (model.predict(inputs) > 0.5).astype(int)


# ---- Data ----
df = pd.read_csv(DATASET_CSV)

y = np.array(df["Sarcasm"].values, dtype=np.float32)  # 0: No sarcasm, 1: Sarcasm
X_context = _feature_block(df, "audio_c_feature_")
X_utterance = _feature_block(df, "audio_u_feature_")

# 70% train / 30% held out, stratified on the label.
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3, random_state=42, stratify=y,
)

# Two thirds of the held-out 30% become the test set (20% overall); the
# remaining third is the validation set (10% overall).
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3, random_state=42, stratify=y_temp,
)

# ---- Model ----
input_dim = 768  # Number of features per input

input_context = keras.Input(shape=(input_dim,))
context_branch = _dense_branch(input_context)

input_utterance = keras.Input(shape=(input_dim,))
utterance_branch = _dense_branch(input_utterance)

merged = layers.Concatenate()([context_branch, utterance_branch])
merged = layers.Dense(64, activation="tanh")(merged)
output = layers.Dense(1, activation="sigmoid")(merged)  # binary classification

model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# ---- Training ----
# Only the weights of the epoch with the highest validation accuracy are kept.
checkpoint_callback = ModelCheckpoint(
    filepath=BEST_WEIGHTS_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

model.fit(
    x=[Xc_train, Xu_train],
    y=y_train,
    validation_data=([Xc_val, Xu_val], y_val),
    epochs=50,
    batch_size=32,
    callbacks=[checkpoint_callback],
)

# Evaluate with the best checkpoint rather than the last-epoch weights.
model.load_weights(BEST_WEIGHTS_PATH)
print("Loaded Best Model Weights.")

# ---- Evaluation ----
y_train_pred = _hard_labels([Xc_train, Xu_train])
y_val_pred = _hard_labels([Xc_val, Xu_val])
y_test_pred = _hard_labels([Xc_test, Xu_test])

for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} Set Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.5966 - loss: 0.6881
Epoch 1: val_accuracy improved from -inf to 0.55833, saving model to /kaggle/working/wav2vec2_base_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 61ms/step - accuracy: 0.5964 - loss: 0.6881 - val_accuracy: 0.5583 - val_loss: 0.6609
Epoch 2/50
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 21ms/step - accuracy: 0.5938 - loss: 0.6271
Epoch 2: val_accuracy improved from 0.55833 to 0.58333, saving model to /kaggle/working/wav2vec2_base_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6788 - loss: 0.6117 - val_accuracy: 0.5833 - val_loss: 0.6826
Epoch 3/50
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 21ms/step - accuracy: 0.5938 - loss: 0.6526
Epoch 3: val_accuracy improved from 0.58333 to 0.59167, saving model to /kaggle/working/wav2vec2_base_fcn_model.weights

# Model Trained On HuBERT Embeddings

### CNN

In [7]:
"""
CNN MODEL
"""
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Two-branch 1-D CNN on the HuBERT features: the context and the utterance
# embeddings each get their own Reshape -> Conv1D -> MaxPooling1D -> Flatten
# branch (swish activation); the concatenated branches feed the sigmoid output
# unit directly (there is no intermediate dense layer in this variant).

DATASET_CSV = "/kaggle/input/btp-audioembeddings/audio_features_hubert.csv"
BEST_WEIGHTS_PATH = "/kaggle/working/hubert_cnn_model.weights.h5"


def _feature_block(frame, prefix):
    """Return the columns of `frame` whose name starts with `prefix` as float32."""
    wanted = [name for name in frame.columns if name.startswith(prefix)]
    return np.array(frame[wanted].values, dtype=np.float32)


def _conv_branch(branch_input, width):
    """Run one embedding input through Reshape -> Conv1D -> MaxPooling1D -> Flatten."""
    tensor = layers.Reshape((width, 1))(branch_input)
    tensor = layers.Conv1D(filters=128, kernel_size=3, activation="swish")(tensor)
    tensor = layers.MaxPooling1D(pool_size=2)(tensor)
    return layers.Flatten()(tensor)


def _hard_labels(inputs):
    """Threshold the model's sigmoid outputs at 0.5 to obtain 0/1 predictions."""
    return (model.predict(inputs) > 0.5).astype(int)


# ---- Data ----
df = pd.read_csv(DATASET_CSV)

y = np.array(df["Sarcasm"].values, dtype=np.float32)  # 0: No sarcasm, 1: Sarcasm
X_context = _feature_block(df, "audio_c_feature_")
X_utterance = _feature_block(df, "audio_u_feature_")

# 70% train / 30% held out, stratified on the label.
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3, random_state=42, stratify=y,
)

# Two thirds of the held-out 30% become the test set (20% overall); the
# remaining third is the validation set (10% overall).
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3, random_state=42, stratify=y_temp,
)

# ---- Model ----
input_dim = 768  # Number of features per input

input_context = keras.Input(shape=(input_dim,))
context_branch = _conv_branch(input_context, input_dim)

input_utterance = keras.Input(shape=(input_dim,))
utterance_branch = _conv_branch(input_utterance, input_dim)

merged = layers.Concatenate()([context_branch, utterance_branch])
output = layers.Dense(1, activation="sigmoid")(merged)  # binary classification

model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# ---- Training ----
# Only the weights of the epoch with the highest validation accuracy are kept.
checkpoint_callback = ModelCheckpoint(
    filepath=BEST_WEIGHTS_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

model.fit(
    x=[Xc_train, Xu_train],
    y=y_train,
    validation_data=([Xc_val, Xu_val], y_val),
    epochs=50,
    batch_size=32,
    callbacks=[checkpoint_callback],
)

# Evaluate with the best checkpoint rather than the last-epoch weights.
model.load_weights(BEST_WEIGHTS_PATH)
print("Loaded Best Model Weights.")

# ---- Evaluation ----
y_train_pred = _hard_labels([Xc_train, Xu_train])
y_val_pred = _hard_labels([Xc_val, Xu_val])
y_test_pred = _hard_labels([Xc_test, Xu_test])

for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} Set Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.5631 - loss: 0.6716
Epoch 1: val_accuracy improved from -inf to 0.60833, saving model to /kaggle/working/hubert_cnn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 107ms/step - accuracy: 0.5643 - loss: 0.6711 - val_accuracy: 0.6083 - val_loss: 0.6563
Epoch 2/50
[1m22/27[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6580 - loss: 0.6109 
Epoch 2: val_accuracy improved from 0.60833 to 0.61667, saving model to /kaggle/working/hubert_cnn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6609 - loss: 0.6086 - val_accuracy: 0.6167 - val_loss: 0.6386
Epoch 3/50
[1m22/27[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6565 - loss: 0.6061 
Epoch 3: val_accuracy improved from 0.61667 to 0.64167, saving model to /kaggle/working/hubert_cnn_model.weights.h

### FCN

In [8]:
"""
FCN MODEL
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Two-branch fully connected network on the HuBERT features: the context and
# the utterance embeddings each pass through Dense(128) -> Dense(64), the
# branches are concatenated, and a Dense(64) head ends in a single sigmoid unit.

DATASET_CSV = "/kaggle/input/btp-audioembeddings/audio_features_hubert.csv"
BEST_WEIGHTS_PATH = "/kaggle/working/hubert_fcn_model.weights.h5"


def _feature_block(frame, prefix):
    """Return the columns of `frame` whose name starts with `prefix` as float32."""
    wanted = [name for name in frame.columns if name.startswith(prefix)]
    return np.array(frame[wanted].values, dtype=np.float32)


def _dense_branch(branch_input):
    """Run one embedding input through Dense(128, tanh) -> Dense(64, tanh)."""
    tensor = layers.Dense(128, activation="tanh")(branch_input)
    return layers.Dense(64, activation="tanh")(tensor)


def _hard_labels(inputs):
    """Threshold the model's sigmoid outputs at 0.5 to obtain 0/1 predictions."""
    return (model.predict(inputs) > 0.5).astype(int)


# ---- Data ----
df = pd.read_csv(DATASET_CSV)

y = np.array(df["Sarcasm"].values, dtype=np.float32)  # 0: No sarcasm, 1: Sarcasm
X_context = _feature_block(df, "audio_c_feature_")
X_utterance = _feature_block(df, "audio_u_feature_")

# 70% train / 30% held out, stratified on the label.
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3, random_state=42, stratify=y,
)

# Two thirds of the held-out 30% become the test set (20% overall); the
# remaining third is the validation set (10% overall).
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3, random_state=42, stratify=y_temp,
)

# ---- Model ----
input_dim = 768  # Number of features per input

input_context = keras.Input(shape=(input_dim,))
context_branch = _dense_branch(input_context)

input_utterance = keras.Input(shape=(input_dim,))
utterance_branch = _dense_branch(input_utterance)

merged = layers.Concatenate()([context_branch, utterance_branch])
merged = layers.Dense(64, activation="tanh")(merged)
output = layers.Dense(1, activation="sigmoid")(merged)  # binary classification

model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# ---- Training ----
# Only the weights of the epoch with the highest validation accuracy are kept.
checkpoint_callback = ModelCheckpoint(
    filepath=BEST_WEIGHTS_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

model.fit(
    x=[Xc_train, Xu_train],
    y=y_train,
    validation_data=([Xc_val, Xu_val], y_val),
    epochs=50,
    batch_size=32,
    callbacks=[checkpoint_callback],
)

# Evaluate with the best checkpoint rather than the last-epoch weights.
model.load_weights(BEST_WEIGHTS_PATH)
print("Loaded Best Model Weights.")

# ---- Evaluation ----
y_train_pred = _hard_labels([Xc_train, Xu_train])
y_val_pred = _hard_labels([Xc_val, Xu_val])
y_test_pred = _hard_labels([Xc_test, Xu_test])

for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} Set Classification Report:\n", classification_report(y_true, y_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.6349 - loss: 0.6403
Epoch 1: val_accuracy improved from -inf to 0.65833, saving model to /kaggle/working/hubert_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 61ms/step - accuracy: 0.6351 - loss: 0.6402 - val_accuracy: 0.6583 - val_loss: 0.6296
Epoch 2/50
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 26ms/step - accuracy: 0.8125 - loss: 0.5503
Epoch 2: val_accuracy did not improve from 0.65833
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6878 - loss: 0.5940 - val_accuracy: 0.6333 - val_loss: 0.6912
Epoch 3/50
[1m23/27[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6839 - loss: 0.5978 
Epoch 3: val_accuracy did not improve from 0.65833
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6891 - loss: 0.5933 - val_accuracy: 0.6417 - val_lo

# Model Trained On MMS Embeddings

### CNN

In [8]:
"""
CNN MODEL
"""
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Read the merged MMS feature table (labels plus context/utterance embeddings)
df = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")

# Feature groups are identified purely by their column-name prefix
context_cols = [name for name in df.columns if name.startswith("audio_c_feature_")]
utterance_cols = [name for name in df.columns if name.startswith("audio_u_feature_")]

# Feature matrices and binary target as float32 arrays
# (target: 0 = no sarcasm, 1 = sarcasm)
X_context = df[context_cols].to_numpy(dtype=np.float32)
X_utterance = df[utterance_cols].to_numpy(dtype=np.float32)
y = df["Sarcasm"].to_numpy(dtype=np.float32)

# Both split stages share one seed and are stratified on the label
split_seed = 42

# Stage 1: 70% train, 30% held out
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3,
    random_state=split_seed,
    stratify=y,
)

# Stage 2: the held-out 30% becomes validation (1/3 of it, ~10% overall)
# and test (2/3 of it, ~20% overall)
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3,
    random_state=split_seed,
    stratify=y_temp,
)

# CNN Model for Sarcasm Detection
#
# Two identical Conv1D branches (context / utterance) are flattened,
# concatenated and passed through a stack of dense layers ending in a single
# sigmoid unit.
#
# The input widths are read from the feature matrices instead of being
# hard-coded (previously 1280), so the model always matches the embedding
# size actually present in the loaded data.
input_dim = X_context.shape[1]        # number of context features per sample
utterance_dim = X_utterance.shape[1]  # number of utterance features per sample

if input_dim == 0 or utterance_dim == 0:
    # An empty matrix means no column matched the expected name prefix;
    # fail here with a clear message rather than deep inside Keras.
    raise ValueError(
        "No feature columns found: expected 'audio_c_feature_*' and "
        "'audio_u_feature_*' columns in the dataset."
    )


def _conv_branch(width):
    """Build one Conv1D branch for a flat feature vector of length `width`.

    Returns a tuple (input_tensor, flattened_feature_tensor).
    """
    branch_input = keras.Input(shape=(width,))
    # Conv1D expects a channel axis: (width,) -> (width, 1)
    x = layers.Reshape((width, 1))(branch_input)
    x = layers.Conv1D(filters=256, kernel_size=3, activation="relu")(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Flatten()(x)
    return branch_input, x


# Context Branch
input_context, context_branch = _conv_branch(input_dim)

# Utterance Branch
input_utterance, utterance_branch = _conv_branch(utterance_dim)

# Concatenation followed by the dense classification head
merged = layers.Concatenate()([context_branch, utterance_branch])
# NOTE(review): 156 is kept exactly as in the original layer stack — confirm
# it is intentional and not a typo for another width.
for units in (256, 156, 128, 64, 32, 8):
    merged = layers.Dense(units, activation="relu")(merged)
output = layers.Dense(1, activation="sigmoid")(merged)  # Sigmoid for binary classification

# Define Model
model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

# Callback that writes weights to disk only when validation accuracy reaches
# a new maximum (weights only, not the full model)
checkpoint_callback = ModelCheckpoint(
    filepath="/kaggle/working/mms_cnn_model.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train for 50 epochs on the two-input training data, evaluating on the
# validation split after every epoch
train_inputs = [Xc_train, Xu_train]
val_inputs = [Xc_val, Xu_val]
model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(val_inputs, y_val),
    callbacks=[checkpoint_callback],
)

# Restore the checkpointed weights before evaluating
best_weights_path = "/kaggle/working/mms_cnn_model.weights.h5"
model.load_weights(best_weights_path)
print("Loaded Best Model Weights.")


def _hard_labels(context_x, utterance_x):
    """Run the model on one split and threshold its sigmoid output at 0.5."""
    scores = model.predict([context_x, utterance_x])
    return (scores > 0.5).astype(int)


# Predict every split first, then print the reports
y_train_pred = _hard_labels(Xc_train, Xu_train)
y_val_pred = _hard_labels(Xc_val, Xu_val)
y_test_pred = _hard_labels(Xc_test, Xu_test)

# One classification report per split, with 4-digit precision
report_inputs = (
    ("Train Set Classification Report:\n", y_train, y_train_pred),
    ("Validation Set Classification Report:\n", y_val, y_val_pred),
    ("Test Set Classification Report:\n", y_test, y_test_pred),
)
for heading, truth, predicted in report_inputs:
    print(heading, classification_report(truth, predicted, digits=4))


Epoch 1/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 120ms/step - accuracy: 0.5342 - loss: 0.7621
Epoch 1: val_accuracy improved from -inf to 0.62500, saving model to /kaggle/working/mms_cnn_model.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 13s 312ms/step - accuracy: 0.5346 - loss: 0.7616 - val_accuracy: 0.6250 - val_loss: 0.6777
Epoch 2/50
25/27 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step - accuracy: 0.5487 - loss: 0.6851
Epoch 2: val_accuracy did not improve from 0.62500
27/27 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.5488 - loss: 0.6850 - val_accuracy: 0.5083 - val_loss: 0.6717
Epoch 3/50
25/27 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step - accuracy: 0.6034 - loss: 0.6537
Epoch 3: val_accuracy did not improve from 0.62500
27/27 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.6029 - loss: 0.6547 - val_accuracy: 0.59

### FCN

In [12]:
"""
FCN MODEL
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Read the merged MMS feature table (labels plus context/utterance embeddings)
df = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")

# Feature groups are identified purely by their column-name prefix
context_cols = [name for name in df.columns if name.startswith("audio_c_feature_")]
utterance_cols = [name for name in df.columns if name.startswith("audio_u_feature_")]

# Feature matrices and binary target as float32 arrays
# (target: 0 = no sarcasm, 1 = sarcasm)
X_context = df[context_cols].to_numpy(dtype=np.float32)
X_utterance = df[utterance_cols].to_numpy(dtype=np.float32)
y = df["Sarcasm"].to_numpy(dtype=np.float32)

# Both split stages share one seed and are stratified on the label
split_seed = 42

# Stage 1: 70% train, 30% held out
(
    Xc_train, Xc_temp,
    Xu_train, Xu_temp,
    y_train, y_temp,
) = train_test_split(
    X_context, X_utterance, y,
    test_size=0.3,
    random_state=split_seed,
    stratify=y,
)

# Stage 2: the held-out 30% becomes validation (1/3 of it, ~10% overall)
# and test (2/3 of it, ~20% overall)
(
    Xc_val, Xc_test,
    Xu_val, Xu_test,
    y_val, y_test,
) = train_test_split(
    Xc_temp, Xu_temp, y_temp,
    test_size=2/3,
    random_state=split_seed,
    stratify=y_temp,
)

# FCN Model for Sarcasm Detection
#
# Two identical fully connected branches (context / utterance) are
# concatenated and passed through one more dense layer before a single
# sigmoid output unit.
#
# The input widths are read from the feature matrices instead of being
# hard-coded (previously 1280), so the model always matches the embedding
# size actually present in the loaded data.
input_dim = X_context.shape[1]        # number of context features per sample
utterance_dim = X_utterance.shape[1]  # number of utterance features per sample

if input_dim == 0 or utterance_dim == 0:
    # An empty matrix means no column matched the expected name prefix;
    # fail here with a clear message rather than deep inside Keras.
    raise ValueError(
        "No feature columns found: expected 'audio_c_feature_*' and "
        "'audio_u_feature_*' columns in the dataset."
    )


def _dense_branch(width):
    """Build one fully connected branch for a feature vector of length `width`.

    Returns a tuple (input_tensor, branch_output_tensor).
    """
    branch_input = keras.Input(shape=(width,))
    x = branch_input
    # Progressively narrower ReLU layers: 128 -> 64 -> 32 -> 8
    for units in (128, 64, 32, 8):
        x = layers.Dense(units, activation="relu")(x)
    return branch_input, x


# Context Branch (Fully Connected Layers)
input_context, context_branch = _dense_branch(input_dim)

# Utterance Branch (Fully Connected Layers)
input_utterance, utterance_branch = _dense_branch(utterance_dim)

# Concatenation
merged = layers.Concatenate()([context_branch, utterance_branch])
merged = layers.Dense(64, activation="relu")(merged)
output = layers.Dense(1, activation="sigmoid")(merged)  # Sigmoid for binary classification

# Define Model
model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

# Callback that writes weights to disk only when validation accuracy reaches
# a new maximum (weights only, not the full model)
checkpoint_callback = ModelCheckpoint(
    filepath="/kaggle/working/mms_fcn_model.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train for 50 epochs on the two-input training data, evaluating on the
# validation split after every epoch
train_inputs = [Xc_train, Xu_train]
val_inputs = [Xc_val, Xu_val]
model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(val_inputs, y_val),
    callbacks=[checkpoint_callback],
)

# Restore the checkpointed weights before evaluating
best_weights_path = "/kaggle/working/mms_fcn_model.weights.h5"
model.load_weights(best_weights_path)
print("Loaded Best Model Weights.")


def _hard_labels(context_x, utterance_x):
    """Run the model on one split and threshold its sigmoid output at 0.5."""
    scores = model.predict([context_x, utterance_x])
    return (scores > 0.5).astype(int)


# Predict every split first, then print the reports
y_train_pred = _hard_labels(Xc_train, Xu_train)
y_val_pred = _hard_labels(Xc_val, Xu_val)
y_test_pred = _hard_labels(Xc_test, Xu_test)

# One classification report per split, with 4-digit precision
report_inputs = (
    ("Train Set Classification Report:\n", y_train, y_train_pred),
    ("Validation Set Classification Report:\n", y_val, y_val_pred),
    ("Test Set Classification Report:\n", y_test, y_test_pred),
)
for heading, truth, predicted in report_inputs:
    print(heading, classification_report(truth, predicted, digits=4))


Epoch 1/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 36ms/step - accuracy: 0.4931 - loss: 0.6886
Epoch 1: val_accuracy improved from -inf to 0.58333, saving model to /kaggle/working/mms_fcn_model.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 5s 68ms/step - accuracy: 0.4948 - loss: 0.6884 - val_accuracy: 0.5833 - val_loss: 0.6713
Epoch 2/50
26/27 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.5979 - loss: 0.6677
Epoch 2: val_accuracy did not improve from 0.58333
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.5974 - loss: 0.6682 - val_accuracy: 0.5333 - val_loss: 0.6934
Epoch 3/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.5206 - loss: 0.6932
Epoch 3: val_accuracy did not improve from 0.58333
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.5211 - loss: 0.6930 - val_accuracy: 0.5750 -