# LanguageBind + MMS
**CNN**

In [6]:
"""
each branch gets a separate cnn
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the pre-extracted audio embeddings, one CSV per encoder
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_lb.csv")  # LanguageBind (LB) features
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")  # MMS features

# Labels are read from df1 only. This assumes df2 holds the same rows in the
# same order -- TODO confirm, nothing in this cell checks the alignment.
y = df1["Sarcasm"].values  

# Select context ("audio_c_feature_*") and utterance ("audio_u_feature_*")
# columns of each dataset by column-name prefix
Xc1 = df1[[col for col in df1.columns if col.startswith("audio_c_feature_")]].values
Xu1 = df1[[col for col in df1.columns if col.startswith("audio_u_feature_")]].values
Xc2 = df2[[col for col in df2.columns if col.startswith("audio_c_feature_")]].values
Xu2 = df2[[col for col in df2.columns if col.startswith("audio_u_feature_")]].values

# Cast features and labels to float32 NumPy arrays
Xc1, Xu1, Xc2, Xu2 = map(lambda x: np.array(x, dtype=np.float32), [Xc1, Xu1, Xc2, Xu2])
y = np.array(y, dtype=np.float32)

# Train-test split (70%-30%), stratified on the label. All five arrays go
# through one call so their rows stay aligned.
Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp, Xu2_train, Xu2_temp, y_train, y_temp = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Further split the 30% hold-out: 1/3 becomes validation (10% of the data)
# and 2/3 becomes test (20% of the data)
Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test, Xu2_val, Xu2_test, y_val, y_test = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Input widths used to build the branches; expected to match the number of
# selected feature columns per dataset (not checked here)
input_dim_lb = 768  # LB feature dimension
input_dim_mms = 1280  # MMS feature dimension

# CNN Model for feature extraction
def build_cnn_branch(input_dim):
    """Build one 1-D CNN feature extractor over a flat embedding vector.

    The vector is reshaped to ``(input_dim, 1)`` so Conv1D slides along the
    embedding axis with a single channel. Two Conv1D + MaxPooling1D stages
    (126 then 64 filters, ReLU) are followed by a Flatten.

    Args:
        input_dim: Length of the input embedding vector.

    Returns:
        Tuple ``(input_tensor, feature_tensor)``: the new Keras input and the
        flattened convolutional features.
    """
    branch_input = keras.Input(shape=(input_dim,))
    features = layers.Reshape((input_dim, 1))(branch_input)
    for n_filters in (126, 64):
        features = layers.Conv1D(filters=n_filters, kernel_size=3, activation="relu")(features)
        features = layers.MaxPooling1D(pool_size=2)(features)
    return branch_input, layers.Flatten()(features)

# Apply CNN on LB inputs (768): separate branches for context and utterance
input_c1, context_cnn1 = build_cnn_branch(input_dim_lb)
input_u1, utterance_cnn1 = build_cnn_branch(input_dim_lb)

# Apply CNN on MMS inputs (1280): separate branches for context and utterance
input_c2, context_cnn2 = build_cnn_branch(input_dim_mms)
input_u2, utterance_cnn2 = build_cnn_branch(input_dim_mms)

# First fusion, per dataset: (context_cnn1 + utterance_cnn1) and
# (context_cnn2 + utterance_cnn2) are concatenated along the feature axis
fused_1 = layers.Concatenate()([context_cnn1, utterance_cnn1])
fused_2 = layers.Concatenate()([context_cnn2, utterance_cnn2])

# CNN head for fused representations (defined here but not called in this cell; build_fused_fcn is applied instead)
def build_fused_cnn(input_tensor):
    """Run a two-stage 1-D CNN over an already fused feature tensor.

    The input is reshaped to ``(-1, 1)`` (one channel), passed through two
    Conv1D + MaxPooling1D stages (64 then 32 filters, ReLU) and flattened.

    NOTE(review): this helper is not called anywhere in this cell; the fused
    tensors are fed to ``build_fused_fcn`` instead.

    Args:
        input_tensor: Keras tensor holding the fused features.

    Returns:
        The flattened convolutional feature tensor.
    """
    fused = layers.Reshape((-1, 1))(input_tensor)
    for n_filters in (64, 32):
        fused = layers.Conv1D(filters=n_filters, kernel_size=3, activation="relu")(fused)
        fused = layers.MaxPooling1D(pool_size=2)(fused)
    return layers.Flatten()(fused)

def build_fused_fcn(input_tensor):
    """Project fused features through a 128 -> 64 -> 32 dense stack (swish).

    Args:
        input_tensor: Keras tensor holding the fused features.

    Returns:
        Output tensor of the last (32-unit) dense layer.
    """
    hidden = input_tensor
    for units in (128, 64, 32):
        hidden = layers.Dense(units, activation="swish")(hidden)
    return hidden

# Per-dataset dense head on the fused CNN features. The results keep the
# names cnn1/cnn2 although they come from build_fused_fcn.
cnn1 = build_fused_fcn(fused_1)
cnn2 = build_fused_fcn(fused_2)

# Final fusion (cnn1 + cnn2)
final_fusion = layers.Concatenate()([cnn1, cnn2])

# Fully connected layers ending in one sigmoid unit for the binary label
fc = layers.Dense(64, activation="swish")(final_fusion)
fc = layers.Dense(32, activation="swish")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model. The input order here (context/utterance of
# dataset 1, then of dataset 2) is the order of the arrays passed to
# fit/predict below.
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# Model checkpoint: save weights only, and only when val_accuracy improves
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/lb+mms_cnn.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Train Model for 50 epochs; the validation set drives the checkpoint above
model.fit(
    [Xc1_train, Xu1_train, Xc2_train, Xu2_train], y_train,
    epochs=50, batch_size=32,
    validation_data=([Xc1_val, Xu1_val, Xc2_val, Xu2_val], y_val),
    callbacks=[checkpoint_callback]
)

# Load best weights so evaluation uses the checkpointed epoch, not the last one
model.load_weights("/kaggle/working/lb+mms_cnn.weights.h5")
print("Loaded Best Model Weights.")

# Predictions: threshold the sigmoid output at 0.5 to get 0/1 labels
y_train_pred = (model.predict([Xc1_train, Xu1_train, Xc2_train, Xu2_train]) > 0.5).astype(int)
y_val_pred = (model.predict([Xc1_val, Xu1_val, Xc2_val, Xu2_val]) > 0.5).astype(int)
y_test_pred = (model.predict([Xc1_test, Xu1_test, Xc2_test, Xu2_test]) > 0.5).astype(int)

# Classification Reports for each split, printed with 4 decimal places
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred, digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred, digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 0.4927 - loss: 0.7026
Epoch 1: val_accuracy improved from -inf to 0.50000, saving model to /kaggle/working/lb+mms_cnn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 160ms/step - accuracy: 0.4936 - loss: 0.7024 - val_accuracy: 0.5000 - val_loss: 0.6907
Epoch 2/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 26ms/step - accuracy: 0.5525 - loss: 0.6836
Epoch 2: val_accuracy improved from 0.50000 to 0.57500, saving model to /kaggle/working/lb+mms_cnn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.5554 - loss: 0.6829 - val_accuracy: 0.5750 - val_loss: 0.6566
Epoch 3/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 26ms/step - accuracy: 0.6350 - loss: 0.6529
Epoch 3: val_accuracy improved from 0.57500 to 0.67500, saving model to /kaggle/working/lb+mms_cnn.weights.h5
[1m27/27[0m

**FCN**

In [1]:
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the pre-extracted audio embeddings, one CSV per encoder
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_lb.csv")  # LanguageBind (LB) features
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")  # MMS features

# Extract Labels from df1 only; assumes df2 rows are in the same order -- TODO confirm
y = df1["Sarcasm"].values  

# Extract context ("audio_c_feature_*") and utterance ("audio_u_feature_*")
# features from both datasets by column-name prefix
Xc1 = df1[[col for col in df1.columns if col.startswith("audio_c_feature_")]].values
Xu1 = df1[[col for col in df1.columns if col.startswith("audio_u_feature_")]].values
Xc2 = df2[[col for col in df2.columns if col.startswith("audio_c_feature_")]].values
Xu2 = df2[[col for col in df2.columns if col.startswith("audio_u_feature_")]].values

# Convert features and labels to float32 NumPy arrays
Xc1, Xu1, Xc2, Xu2 = map(lambda x: np.array(x, dtype=np.float32), [Xc1, Xu1, Xc2, Xu2])
y = np.array(y, dtype=np.float32)

# Train-test split: 70% train / 30% hold-out, stratified on the label
Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp, Xu2_train, Xu2_temp, y_train, y_temp = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Split the hold-out into validation (1/3, i.e. 10% overall) and test (2/3, i.e. 20% overall)
Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test, Xu2_val, Xu2_test, y_val, y_test = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Define feature dimensions used as branch input widths (not checked against the data here)
input_dim_lb = 768  # LB feature dimension
input_dim_mms = 1280  # MMS feature dimension

# Fully connected network branch
def build_fcn_branch(input_dim):
    """Build one dense branch: Input -> 512 -> 256 -> 128 units (swish).

    Args:
        input_dim: Length of the input embedding vector.

    Returns:
        Tuple ``(input_tensor, feature_tensor)``: the new Keras input and the
        output of the 128-unit layer.
    """
    branch_input = keras.Input(shape=(input_dim,))
    hidden = branch_input
    for units in (512, 256, 128):
        hidden = layers.Dense(units, activation="swish")(hidden)
    return branch_input, hidden

# FCN for LB dataset: separate branches for context and utterance
input_c1, context_fcn1 = build_fcn_branch(input_dim_lb)
input_u1, utterance_fcn1 = build_fcn_branch(input_dim_lb)

# FCN for MMS dataset: separate branches for context and utterance
input_c2, context_fcn2 = build_fcn_branch(input_dim_mms)
input_u2, utterance_fcn2 = build_fcn_branch(input_dim_mms)

# First fusion: concatenate context and utterance features within each dataset
fused_1 = layers.Concatenate()([context_fcn1, utterance_fcn1])
fused_2 = layers.Concatenate()([context_fcn2, utterance_fcn2])

# Additional FCN model after fusion
def build_fcn_model(input_tensor):
    """Apply a 512 -> 256 -> 128 dense stack (swish) to fused features.

    Args:
        input_tensor: Keras tensor holding the fused features.

    Returns:
        Output tensor of the last (128-unit) dense layer.
    """
    hidden = input_tensor
    for units in (512, 256, 128):
        hidden = layers.Dense(units, activation="swish")(hidden)
    return hidden

# Per-dataset dense head on the fused branch features
fcn1 = build_fcn_model(fused_1)
fcn2 = build_fcn_model(fused_2)

# Final fusion: concatenate the two per-dataset representations
final_fusion = layers.Concatenate()([fcn1, fcn2])

# Fully connected layers ending in one sigmoid unit for the binary label
fc = layers.Dense(64, activation="swish")(final_fusion)
fc = layers.Dense(32, activation="swish")(fc)
fc = layers.Dense(8, activation="swish")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model; the input order is the order of the arrays
# passed to fit/predict below
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# Model checkpoint: save weights only, and only when val_accuracy improves
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/lb+mms_fcn.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Train Model for 50 epochs; the validation set drives the checkpoint above
model.fit(
    [Xc1_train, Xu1_train, Xc2_train, Xu2_train], y_train,
    epochs=50, batch_size=32,
    validation_data=([Xc1_val, Xu1_val, Xc2_val, Xu2_val], y_val),
    callbacks=[checkpoint_callback]
)

# Load best weights so evaluation uses the checkpointed epoch, not the last one
model.load_weights("/kaggle/working/lb+mms_fcn.weights.h5")
print("Loaded Best Model Weights.")

# Predictions: threshold the sigmoid output at 0.5 to get 0/1 labels
y_train_pred = (model.predict([Xc1_train, Xu1_train, Xc2_train, Xu2_train]) > 0.5).astype(int)
y_val_pred = (model.predict([Xc1_val, Xu1_val, Xc2_val, Xu2_val]) > 0.5).astype(int)
y_test_pred = (model.predict([Xc1_test, Xu1_test, Xc2_test, Xu2_test]) > 0.5).astype(int)

# Classification Reports for each split, printed with 4 decimal places
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred, digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred, digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.4890 - loss: 0.6936
Epoch 1: val_accuracy improved from -inf to 0.60000, saving model to /kaggle/working/lb+mms_fcn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 146ms/step - accuracy: 0.4907 - loss: 0.6933 - val_accuracy: 0.6000 - val_loss: 0.6500
Epoch 2/50
[1m16/27[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6748 - loss: 0.6212 
Epoch 2: val_accuracy improved from 0.60000 to 0.66667, saving model to /kaggle/working/lb+mms_fcn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6693 - loss: 0.6244 - val_accuracy: 0.6667 - val_loss: 0.6139
Epoch 3/50
[1m16/27[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6427 - loss: 0.6143 
Epoch 3: val_accuracy improved from 0.66667 to 0.68333, saving model to /kaggle/working/lb+mms_fcn.weights.h5
[1m27/27[0m 

# LanguageBind + XLS-R
**CNN**

In [3]:
"""
each branch gets a separate cnn
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the pre-extracted audio embeddings, one CSV per encoder
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_lb.csv")  # LanguageBind (LB) features
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_xlsr.csv")  # XLS-R features

# Extract Labels from df1 only (assuming both datasets have the same labels
# in the same row order -- TODO confirm, not checked here)
y = df1["Sarcasm"].values  

# Extract context ("audio_c_feature_*") and utterance ("audio_u_feature_*")
# features from both datasets by column-name prefix
Xc1 = df1[[col for col in df1.columns if col.startswith("audio_c_feature_")]].values
Xu1 = df1[[col for col in df1.columns if col.startswith("audio_u_feature_")]].values
Xc2 = df2[[col for col in df2.columns if col.startswith("audio_c_feature_")]].values
Xu2 = df2[[col for col in df2.columns if col.startswith("audio_u_feature_")]].values

# Convert features and labels to float32 NumPy arrays
Xc1, Xu1, Xc2, Xu2 = map(lambda x: np.array(x, dtype=np.float32), [Xc1, Xu1, Xc2, Xu2])
y = np.array(y, dtype=np.float32)

# Train-test split (70%-30%), stratified on the label
Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp, Xu2_train, Xu2_temp, y_train, y_temp = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Further split temp set into validation (10%) and test (20%)
Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test, Xu2_val, Xu2_test, y_val, y_test = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Define feature dimensions separately (not checked against the data here)
input_dim_lb = 768  # LB feature dimension
input_dim_mms = 1280  # XLS-R feature dimension (variable name kept from the MMS cells)

# CNN Model for feature extraction
def build_cnn_branch(input_dim):
    """Build one 1-D CNN feature extractor over a flat embedding vector.

    The vector is reshaped to ``(input_dim, 1)`` (one channel), then passed
    through two Conv1D + MaxPooling1D stages (126 then 64 filters, ReLU) and
    flattened.

    Args:
        input_dim: Length of the input embedding vector.

    Returns:
        Tuple ``(input_tensor, feature_tensor)``.
    """
    branch_input = keras.Input(shape=(input_dim,))
    features = layers.Reshape((input_dim, 1))(branch_input)
    for n_filters in (126, 64):
        features = layers.Conv1D(filters=n_filters, kernel_size=3, activation="relu")(features)
        features = layers.MaxPooling1D(pool_size=2)(features)
    return branch_input, layers.Flatten()(features)

# Apply CNN on LB inputs (768): separate branches for context and utterance
input_c1, context_cnn1 = build_cnn_branch(input_dim_lb)
input_u1, utterance_cnn1 = build_cnn_branch(input_dim_lb)

# Apply CNN on XLS-R inputs (1280; the variable is still named input_dim_mms)
input_c2, context_cnn2 = build_cnn_branch(input_dim_mms)
input_u2, utterance_cnn2 = build_cnn_branch(input_dim_mms)

# First fusion (context_cnn1 + utterance_cnn1) and (context_cnn2 + utterance_cnn2)
fused_1 = layers.Concatenate()([context_cnn1, utterance_cnn1])
fused_2 = layers.Concatenate()([context_cnn2, utterance_cnn2])


def build_fused_fcn(input_tensor):
    """Project fused features through a 128 -> 64 -> 32 dense stack (ReLU).

    Args:
        input_tensor: Keras tensor holding the fused features.

    Returns:
        Output tensor of the last (32-unit) dense layer.
    """
    hidden = input_tensor
    for units in (128, 64, 32):
        hidden = layers.Dense(units, activation="relu")(hidden)
    return hidden

# Per-dataset dense head on the fused CNN features. The results keep the
# names cnn1/cnn2 although they come from build_fused_fcn.
cnn1 = build_fused_fcn(fused_1)
cnn2 = build_fused_fcn(fused_2)

# Final fusion (cnn1 + cnn2)
final_fusion = layers.Concatenate()([cnn1, cnn2])

# Fully connected layers ending in one sigmoid unit for the binary label
fc = layers.Dense(64, activation="relu")(final_fusion)
fc = layers.Dense(32, activation="relu")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model; the input order is the order of the arrays
# passed to fit/predict below
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# Model checkpoint: save weights only, and only when val_accuracy improves
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/lb+xlsr_cnn.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Train Model for 50 epochs; the validation set drives the checkpoint above
model.fit(
    [Xc1_train, Xu1_train, Xc2_train, Xu2_train], y_train,
    epochs=50, batch_size=32,
    validation_data=([Xc1_val, Xu1_val, Xc2_val, Xu2_val], y_val),
    callbacks=[checkpoint_callback]
)

# Load best weights so evaluation uses the checkpointed epoch, not the last one
model.load_weights("/kaggle/working/lb+xlsr_cnn.weights.h5")
print("Loaded Best Model Weights.")

# Predictions: threshold the sigmoid output at 0.5 to get 0/1 labels
y_train_pred = (model.predict([Xc1_train, Xu1_train, Xc2_train, Xu2_train]) > 0.5).astype(int)
y_val_pred = (model.predict([Xc1_val, Xu1_val, Xc2_val, Xu2_val]) > 0.5).astype(int)
y_test_pred = (model.predict([Xc1_test, Xu1_test, Xc2_test, Xu2_test]) > 0.5).astype(int)

# Classification Reports for each split, printed with 4 decimal places
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred, digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred, digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.5481 - loss: 0.6927
Epoch 1: val_accuracy improved from -inf to 0.50000, saving model to /kaggle/working/lb+xlsr_cnn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 162ms/step - accuracy: 0.5483 - loss: 0.6926 - val_accuracy: 0.5000 - val_loss: 0.7024
Epoch 2/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 26ms/step - accuracy: 0.5521 - loss: 0.6804
Epoch 2: val_accuracy improved from 0.50000 to 0.60833, saving model to /kaggle/working/lb+xlsr_cnn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.5566 - loss: 0.6795 - val_accuracy: 0.6083 - val_loss: 0.6533
Epoch 3/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 26ms/step - accuracy: 0.6141 - loss: 0.6469
Epoch 3: val_accuracy did not improve from 0.60833
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/st

**FCN**

In [6]:
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the pre-extracted audio embeddings, one CSV per encoder
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_lb.csv")  # LanguageBind (LB) features
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_xlsr.csv")  # XLS-R features

# Extract Labels from df1 only; assumes df2 rows are in the same order -- TODO confirm
y = df1["Sarcasm"].values  

# Extract context ("audio_c_feature_*") and utterance ("audio_u_feature_*")
# features from both datasets by column-name prefix
Xc1 = df1[[col for col in df1.columns if col.startswith("audio_c_feature_")]].values
Xu1 = df1[[col for col in df1.columns if col.startswith("audio_u_feature_")]].values
Xc2 = df2[[col for col in df2.columns if col.startswith("audio_c_feature_")]].values
Xu2 = df2[[col for col in df2.columns if col.startswith("audio_u_feature_")]].values

# Convert features and labels to float32 NumPy arrays
Xc1, Xu1, Xc2, Xu2 = map(lambda x: np.array(x, dtype=np.float32), [Xc1, Xu1, Xc2, Xu2])
y = np.array(y, dtype=np.float32)

# Train-test split: 70% train / 30% hold-out, stratified on the label
Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp, Xu2_train, Xu2_temp, y_train, y_temp = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Split the hold-out into validation (1/3, i.e. 10% overall) and test (2/3, i.e. 20% overall)
Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test, Xu2_val, Xu2_test, y_val, y_test = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Define feature dimensions used as branch input widths (not checked against the data here)
input_dim_lb = 768  # LB feature dimension
input_dim_mms = 1280  # XLS-R feature dimension (variable name kept from the MMS cells)

# Fully connected network branch
def build_fcn_branch(input_dim):
    """Build one dense branch: Input -> 512 -> 256 -> 128 units (ReLU).

    Args:
        input_dim: Length of the input embedding vector.

    Returns:
        Tuple ``(input_tensor, feature_tensor)``: the new Keras input and the
        output of the 128-unit layer.
    """
    branch_input = keras.Input(shape=(input_dim,))
    hidden = branch_input
    for units in (512, 256, 128):
        hidden = layers.Dense(units, activation="relu")(hidden)
    return branch_input, hidden

# FCN for LB dataset: separate branches for context and utterance
input_c1, context_fcn1 = build_fcn_branch(input_dim_lb)
input_u1, utterance_fcn1 = build_fcn_branch(input_dim_lb)

# FCN for XLS-R dataset (the width variable is still named input_dim_mms)
input_c2, context_fcn2 = build_fcn_branch(input_dim_mms)
input_u2, utterance_fcn2 = build_fcn_branch(input_dim_mms)

# First fusion: concatenate context and utterance features within each dataset
fused_1 = layers.Concatenate()([context_fcn1, utterance_fcn1])
fused_2 = layers.Concatenate()([context_fcn2, utterance_fcn2])

# Additional FCN model after fusion
def build_fcn_model(input_tensor):
    """Apply a 512 -> 256 -> 128 dense stack (ReLU) to fused features.

    Args:
        input_tensor: Keras tensor holding the fused features.

    Returns:
        Output tensor of the last (128-unit) dense layer.
    """
    hidden = input_tensor
    for units in (512, 256, 128):
        hidden = layers.Dense(units, activation="relu")(hidden)
    return hidden

# Per-dataset dense head on the fused branch features
fcn1 = build_fcn_model(fused_1)
fcn2 = build_fcn_model(fused_2)

# Final fusion: concatenate the two per-dataset representations
final_fusion = layers.Concatenate()([fcn1, fcn2])

# Fully connected layers ending in one sigmoid unit for the binary label
fc = layers.Dense(64, activation="relu")(final_fusion)
fc = layers.Dense(32, activation="relu")(fc)
fc = layers.Dense(8, activation="relu")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model; the input order is the order of the arrays
# passed to fit/predict below
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# Model checkpoint: save weights only, and only when val_accuracy improves
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/lb+xlsr_fcn.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Train Model for 50 epochs; the validation set drives the checkpoint above
model.fit(
    [Xc1_train, Xu1_train, Xc2_train, Xu2_train], y_train,
    epochs=50, batch_size=32,
    validation_data=([Xc1_val, Xu1_val, Xc2_val, Xu2_val], y_val),
    callbacks=[checkpoint_callback]
)

# Load best weights so evaluation uses the checkpointed epoch, not the last one
model.load_weights("/kaggle/working/lb+xlsr_fcn.weights.h5")
print("Loaded Best Model Weights.")

# Predictions: threshold the sigmoid output at 0.5 to get 0/1 labels
y_train_pred = (model.predict([Xc1_train, Xu1_train, Xc2_train, Xu2_train]) > 0.5).astype(int)
y_val_pred = (model.predict([Xc1_val, Xu1_val, Xc2_val, Xu2_val]) > 0.5).astype(int)
y_test_pred = (model.predict([Xc1_test, Xu1_test, Xc2_test, Xu2_test]) > 0.5).astype(int)

# Classification Reports for each split, printed with 4 decimal places
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred, digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred, digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.5215 - loss: 0.6940
Epoch 1: val_accuracy improved from -inf to 0.64167, saving model to /kaggle/working/lb+xlsr_fcn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 93ms/step - accuracy: 0.5225 - loss: 0.6936 - val_accuracy: 0.6417 - val_loss: 0.6311
Epoch 2/50
[1m16/27[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.6880 - loss: 0.6009 
Epoch 2: val_accuracy improved from 0.64167 to 0.70000, saving model to /kaggle/working/lb+xlsr_fcn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6755 - loss: 0.6137 - val_accuracy: 0.7000 - val_loss: 0.6147
Epoch 3/50
[1m16/27[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6826 - loss: 0.5796 
Epoch 3: val_accuracy did not improve from 0.70000
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 

# Whisper + MMS
**CNN**

In [19]:
"""
each branch gets a separate cnn
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the pre-extracted audio embeddings, one CSV per encoder
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_whisper.csv")  # Whisper features
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")  # MMS features

# Extract Labels from df1 only (assuming both datasets have the same labels
# in the same row order -- TODO confirm, not checked here)
y = df1["Sarcasm"].values  

# Extract context ("audio_c_feature_*") and utterance ("audio_u_feature_*")
# features from both datasets by column-name prefix
Xc1 = df1[[col for col in df1.columns if col.startswith("audio_c_feature_")]].values
Xu1 = df1[[col for col in df1.columns if col.startswith("audio_u_feature_")]].values
Xc2 = df2[[col for col in df2.columns if col.startswith("audio_c_feature_")]].values
Xu2 = df2[[col for col in df2.columns if col.startswith("audio_u_feature_")]].values

# Convert features and labels to float32 NumPy arrays
Xc1, Xu1, Xc2, Xu2 = map(lambda x: np.array(x, dtype=np.float32), [Xc1, Xu1, Xc2, Xu2])
y = np.array(y, dtype=np.float32)

# Train-test split (70%-30%), stratified on the label
Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp, Xu2_train, Xu2_temp, y_train, y_temp = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Further split temp set into validation (10%) and test (20%)
Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test, Xu2_val, Xu2_test, y_val, y_test = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Define feature dimensions separately (not checked against the data here)
input_dim_lb = 512  # Whisper feature dimension (variable name kept from the LB cells)
input_dim_mms = 1280  # MMS feature dimension

# CNN Model for feature extraction
def build_cnn_branch(input_dim):
    """Build one 1-D CNN feature extractor over a flat embedding vector.

    The vector is reshaped to ``(input_dim, 1)`` (one channel), then passed
    through three Conv1D + MaxPooling1D stages (264, 126 and 64 filters,
    swish) and flattened.

    Args:
        input_dim: Length of the input embedding vector.

    Returns:
        Tuple ``(input_tensor, feature_tensor)``.
    """
    branch_input = keras.Input(shape=(input_dim,))
    features = layers.Reshape((input_dim, 1))(branch_input)
    for n_filters in (264, 126, 64):
        features = layers.Conv1D(filters=n_filters, kernel_size=3, activation="swish")(features)
        features = layers.MaxPooling1D(pool_size=2)(features)
    return branch_input, layers.Flatten()(features)

# Apply CNN on Whisper inputs (512; the variable is still named input_dim_lb)
input_c1, context_cnn1 = build_cnn_branch(input_dim_lb)
input_u1, utterance_cnn1 = build_cnn_branch(input_dim_lb)

# Apply CNN on MMS inputs (1280)
input_c2, context_cnn2 = build_cnn_branch(input_dim_mms)
input_u2, utterance_cnn2 = build_cnn_branch(input_dim_mms)

# First fusion (context_cnn1 + utterance_cnn1) and (context_cnn2 + utterance_cnn2)
fused_1 = layers.Concatenate()([context_cnn1, utterance_cnn1])
fused_2 = layers.Concatenate()([context_cnn2, utterance_cnn2])



def build_fused_fcn(input_tensor):
    """Project fused features through a 264 -> 126 -> 64 -> 32 -> 8 dense stack (swish).

    Args:
        input_tensor: Keras tensor holding the fused features.

    Returns:
        Output tensor of the last (8-unit) dense layer.
    """
    hidden = input_tensor
    for units in (264, 126, 64, 32, 8):
        hidden = layers.Dense(units, activation="swish")(hidden)
    return hidden

# Per-dataset dense head on the fused CNN features. The results keep the
# names cnn1/cnn2 although they come from build_fused_fcn.
cnn1 = build_fused_fcn(fused_1)
cnn2 = build_fused_fcn(fused_2)

# Final fusion (cnn1 + cnn2)
final_fusion = layers.Concatenate()([cnn1, cnn2])

# Fully connected layers ending in one sigmoid unit for the binary label
fc = layers.Dense(64, activation="swish")(final_fusion)
fc = layers.Dense(32, activation="swish")(fc)
fc = layers.Dense(8, activation="swish")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model; the input order is the order of the arrays
# passed to fit/predict below
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# Model checkpoint: save weights only, and only when val_accuracy improves
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/whisper+mms_cnn.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Train Model for 50 epochs; the validation set drives the checkpoint above.
# NOTE(review): the output recorded under this cell shows val_accuracy at
# 0.5000 and loss around 0.6931 for the first three epochs -- verify that
# this configuration actually learns before relying on its report.
model.fit(
    [Xc1_train, Xu1_train, Xc2_train, Xu2_train], y_train,
    epochs=50, batch_size=32,
    validation_data=([Xc1_val, Xu1_val, Xc2_val, Xu2_val], y_val),
    callbacks=[checkpoint_callback]
)

# Load best weights so evaluation uses the checkpointed epoch, not the last one
model.load_weights("/kaggle/working/whisper+mms_cnn.weights.h5")
print("Loaded Best Model Weights.")

# Predictions: threshold the sigmoid output at 0.5 to get 0/1 labels
y_train_pred = (model.predict([Xc1_train, Xu1_train, Xc2_train, Xu2_train]) > 0.5).astype(int)
y_val_pred = (model.predict([Xc1_val, Xu1_val, Xc2_val, Xu2_val]) > 0.5).astype(int)
y_test_pred = (model.predict([Xc1_test, Xu1_test, Xc2_test, Xu2_test]) > 0.5).astype(int)

# Classification Reports for each split, printed with 4 decimal places
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred, digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred, digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - accuracy: 0.5049 - loss: 0.6936
Epoch 1: val_accuracy improved from -inf to 0.50000, saving model to /kaggle/working/whisper+mms_cnn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 183ms/step - accuracy: 0.5045 - loss: 0.6936 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 2/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 18ms/step - accuracy: 0.4530 - loss: 0.6937
Epoch 2: val_accuracy did not improve from 0.50000
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.4528 - loss: 0.6938 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 3/50
[1m25/27[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 18ms/step - accuracy: 0.5017 - loss: 0.6931
Epoch 3: val_accuracy did not improve from 0.50000
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.5013 - loss: 0.6931 - val_accuracy: 0.

**FCN**

In [30]:
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the pre-extracted audio embeddings, one CSV per encoder
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_whisper.csv")  # Whisper features
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")  # MMS features

# Extract Labels from df1 only; assumes df2 rows are in the same order -- TODO confirm
y = df1["Sarcasm"].values  

# Extract context ("audio_c_feature_*") and utterance ("audio_u_feature_*")
# features from both datasets by column-name prefix
Xc1 = df1[[col for col in df1.columns if col.startswith("audio_c_feature_")]].values
Xu1 = df1[[col for col in df1.columns if col.startswith("audio_u_feature_")]].values
Xc2 = df2[[col for col in df2.columns if col.startswith("audio_c_feature_")]].values
Xu2 = df2[[col for col in df2.columns if col.startswith("audio_u_feature_")]].values

# Convert features and labels to float32 NumPy arrays
Xc1, Xu1, Xc2, Xu2 = map(lambda x: np.array(x, dtype=np.float32), [Xc1, Xu1, Xc2, Xu2])
y = np.array(y, dtype=np.float32)

# Train-test split: 70% train / 30% hold-out, stratified on the label
Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp, Xu2_train, Xu2_temp, y_train, y_temp = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Split the hold-out into validation (1/3, i.e. 10% overall) and test (2/3, i.e. 20% overall)
Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test, Xu2_val, Xu2_test, y_val, y_test = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Define feature dimensions used as branch input widths (not checked against the data here)
input_dim_lb = 512  # Whisper feature dimension (variable name kept from the LB cells)
input_dim_mms = 1280  # MMS feature dimension

# Fully connected network branch
def build_fcn_branch(input_dim):
    """Build one dense branch that narrows the input down to 8 units.

    Layer widths are 684, 512, 256, 128, 64, 32 and 8, all with swish.

    Args:
        input_dim: Length of the input embedding vector.

    Returns:
        Tuple ``(input_tensor, feature_tensor)``: the new Keras input and the
        output of the 8-unit layer.
    """
    branch_input = keras.Input(shape=(input_dim,))
    hidden = branch_input
    for units in (684, 512, 256, 128, 64, 32, 8):
        hidden = layers.Dense(units, activation="swish")(hidden)
    return branch_input, hidden

# FCN for Whisper dataset (the width variable is still named input_dim_lb)
input_c1, context_fcn1 = build_fcn_branch(input_dim_lb)
input_u1, utterance_fcn1 = build_fcn_branch(input_dim_lb)

# FCN for MMS dataset: separate branches for context and utterance
input_c2, context_fcn2 = build_fcn_branch(input_dim_mms)
input_u2, utterance_fcn2 = build_fcn_branch(input_dim_mms)

# First fusion: concatenate context and utterance features within each dataset
fused_1 = layers.Concatenate()([context_fcn1, utterance_fcn1])
fused_2 = layers.Concatenate()([context_fcn2, utterance_fcn2])

# Additional FCN model after fusion
def build_fcn_model(input_tensor):
    """Apply a 512 -> 256 -> 128 -> 64 -> 32 dense stack (swish) to fused features.

    Args:
        input_tensor: Keras tensor holding the fused features.

    Returns:
        Output tensor of the last (32-unit) dense layer.
    """
    hidden = input_tensor
    for units in (512, 256, 128, 64, 32):
        hidden = layers.Dense(units, activation="swish")(hidden)
    return hidden

# Per-dataset dense head on the fused branch features
fcn1 = build_fcn_model(fused_1)
fcn2 = build_fcn_model(fused_2)

# Final fusion: concatenate the two per-dataset representations
final_fusion = layers.Concatenate()([fcn1, fcn2])

# Fully connected layers ending in one sigmoid unit for the binary label
fc = layers.Dense(64, activation="swish")(final_fusion)
fc = layers.Dense(32, activation="swish")(fc)
fc = layers.Dense(8, activation="swish")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model; the input order is the order of the arrays
# passed to fit/predict below
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# Model checkpoint: save weights only, and only when val_accuracy improves
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/whisper+mms_fcn.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Train Model for 50 epochs; the validation set drives the checkpoint above.
# NOTE(review): the output recorded under this cell shows val_accuracy at
# 0.5000 and loss around 0.6931 for the first three epochs -- verify that
# this configuration actually learns before relying on its report.
model.fit(
    [Xc1_train, Xu1_train, Xc2_train, Xu2_train], y_train,
    epochs=50, batch_size=32,
    validation_data=([Xc1_val, Xu1_val, Xc2_val, Xu2_val], y_val),
    callbacks=[checkpoint_callback]
)

# Load best weights so evaluation uses the checkpointed epoch, not the last one
model.load_weights("/kaggle/working/whisper+mms_fcn.weights.h5")
print("Loaded Best Model Weights.")

# Predictions: threshold the sigmoid output at 0.5 to get 0/1 labels
y_train_pred = (model.predict([Xc1_train, Xu1_train, Xc2_train, Xu2_train]) > 0.5).astype(int)
y_val_pred = (model.predict([Xc1_val, Xu1_val, Xc2_val, Xu2_val]) > 0.5).astype(int)
y_test_pred = (model.predict([Xc1_test, Xu1_test, Xc2_test, Xu2_test]) > 0.5).astype(int)

# Classification Reports for each split, printed with 4 decimal places
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred, digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred, digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred, digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.5033 - loss: 0.6934
Epoch 1: val_accuracy improved from -inf to 0.50000, saving model to /kaggle/working/whisper+mms_fcn.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 206ms/step - accuracy: 0.5031 - loss: 0.6934 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 2/50
[1m22/27[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5190 - loss: 0.6930
Epoch 2: val_accuracy did not improve from 0.50000
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5222 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 3/50
[1m23/27[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 5ms/step - accuracy: 0.4904 - loss: 0.6933
Epoch 3: val_accuracy did not improve from 0.50000
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4903 - loss: 0.6933 - val_accuracy: 0.5000

# Whisper + XLS-R
**CNN**

In [4]:
"""
each branch gets a separate cnn
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load CSV Files
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_whisper.csv")  # Whisper embeddings
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_xlsr.csv")  # XLS-R embeddings

# Labels are read from the first file only; the rows of both files are assumed
# to describe the same samples in the same order (not checked here).
y = np.array(df1["Sarcasm"].values, dtype=np.float32)


def _prefixed_features(frame, prefix):
    """Return the columns of ``frame`` whose names start with ``prefix`` as a float32 array."""
    selected = [col for col in frame.columns if col.startswith(prefix)]
    return np.array(frame[selected].values, dtype=np.float32)


# "c" and "u" feature matrices (fed to the context / utterance branches) per source
Xc1 = _prefixed_features(df1, "audio_c_feature_")
Xu1 = _prefixed_features(df1, "audio_u_feature_")
Xc2 = _prefixed_features(df2, "audio_c_feature_")
Xu2 = _prefixed_features(df2, "audio_u_feature_")

# Train-test split (70%-30%), stratified on the label
(Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp,
 Xu2_train, Xu2_temp, y_train, y_temp) = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Further split temp set into validation (10%) and test (20%)
(Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test,
 Xu2_val, Xu2_test, y_val, y_test) = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Branch input widths are taken from the loaded data instead of being
# hard-coded, so the model inputs always match the CSV contents.
# The variable names are kept because the model-building code below uses them.
input_dim_lb = Xc1.shape[1]  # Whisper feature dimension (df1)
input_dim_mms = Xc2.shape[1]  # XLS-R feature dimension (df2)
if Xu1.shape[1] != input_dim_lb or Xu2.shape[1] != input_dim_mms:
    # Both branches of one source are built with the same width; fail early
    # with a clear message instead of a shape error inside Keras.
    raise ValueError(
        "Context and utterance features of the same source must have the same width."
    )

# CNN Model for feature extraction
def build_cnn_branch(input_dim):
    """Build one 1-D convolutional branch over a flat feature vector.

    Args:
        input_dim: Length of the input feature vector.

    Returns:
        Tuple ``(inp, x)``: the branch's ``keras.Input`` tensor and the
        flattened output of its Conv1D/MaxPooling1D stack.
    """
    inp = keras.Input(shape=(input_dim,))
    # Add a trailing channel axis so the vector can be fed to Conv1D.
    features = layers.Reshape((input_dim, 1))(inp)
    # Two conv + pool stages; wider leading stages (1280 and 512 filters)
    # were tried in the original and are left disabled.
    for n_filters in (256, 64):
        features = layers.Conv1D(filters=n_filters, kernel_size=3, activation="swish")(features)
        features = layers.MaxPooling1D(pool_size=2)(features)
    return inp, layers.Flatten()(features)

# Separate CNN branches for the df1 (Whisper) context and utterance inputs
(input_c1, context_cnn1), (input_u1, utterance_cnn1) = (
    build_cnn_branch(input_dim_lb) for _ in range(2)
)

# Separate CNN branches for the df2 (XLS-R) context and utterance inputs
(input_c2, context_cnn2), (input_u2, utterance_cnn2) = (
    build_cnn_branch(input_dim_mms) for _ in range(2)
)

# First fusion: concatenate context + utterance within each source
fused_1, fused_2 = (
    layers.Concatenate()(pair)
    for pair in (
        [context_cnn1, utterance_cnn1],
        [context_cnn2, utterance_cnn2],
    )
)



def build_fused_fcn(input_tensor):
    """Pass a fused tensor through a narrowing stack of ReLU dense layers.

    Args:
        input_tensor: Keras tensor to feed into the first dense layer.

    Returns:
        Output tensor of the last (8-unit) dense layer.
    """
    hidden = input_tensor
    for width in (512, 256, 128, 64, 32, 8):
        hidden = layers.Dense(width, activation="relu")(hidden)
    return hidden

# Each fused (context + utterance) tensor gets its own dense stack
cnn1, cnn2 = (build_fused_fcn(fused) for fused in (fused_1, fused_2))

# Final fusion (cnn1 + cnn2)
final_fusion = layers.Concatenate()([cnn1, cnn2])

# Fully connected head ending in a single sigmoid unit
fc = final_fusion
for width in (64, 32, 8):
    fc = layers.Dense(width, activation="relu")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# The same weights file is written by the checkpoint and reloaded after training
weights_path = "/kaggle/working/whisper+xlsr_cnn.weights.h5"
checkpoint_callback = ModelCheckpoint(
    weights_path,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Inputs per split, in the order of the model's input list
train_inputs = [Xc1_train, Xu1_train, Xc2_train, Xu2_train]
val_inputs = [Xc1_val, Xu1_val, Xc2_val, Xu2_val]
test_inputs = [Xc1_test, Xu1_test, Xc2_test, Xu2_test]

# Train Model
model.fit(
    train_inputs, y_train,
    epochs=100, batch_size=32,
    validation_data=(val_inputs, y_val),
    callbacks=[checkpoint_callback],
)

# Load best weights
model.load_weights(weights_path)
print("Loaded Best Model Weights.")

# Hard 0/1 predictions: sigmoid outputs strictly above 0.5 become class 1
y_train_pred, y_val_pred, y_test_pred = (
    (model.predict(split_inputs) > 0.5).astype(int)
    for split_inputs in (train_inputs, val_inputs, test_inputs)
)

# Classification reports in train / validation / test order
for heading, truth, predicted in (
    ("Train Set Classification Report:\n", y_train, y_train_pred),
    ("Validation Set Classification Report:\n", y_val, y_val_pred),
    ("Test Set Classification Report:\n", y_test, y_test_pred),
):
    print(heading, classification_report(truth, predicted, digits=4))


Epoch 1/100
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 105ms/step - accuracy: 0.4499 - loss: 0.6937
Epoch 1: val_accuracy improved from -inf to 0.50000, saving model to /kaggle/working/whisper+xlsr_cnn.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 14s 211ms/step - accuracy: 0.4507 - loss: 0.6937 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 2/100
25/27 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step - accuracy: 0.5110 - loss: 0.6931
Epoch 2: val_accuracy did not improve from 0.50000
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step - accuracy: 0.5104 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 3/100
25/27 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step - accuracy: 0.5098 - loss: 0.6931
Epoch 3: val_accuracy did not improve from 0.50000
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step - accuracy: 0.5089 - loss: 0.6931 - val_accuracy

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**FCN**

In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load CSV Files
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_whisper.csv")  # Whisper embeddings
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_xlsr.csv")  # XLS-R embeddings

# Labels are read from the first file only; the rows of both files are assumed
# to describe the same samples in the same order (not checked here).
y = np.array(df1["Sarcasm"].values, dtype=np.float32)


def _prefixed_features(frame, prefix):
    """Return the columns of ``frame`` whose names start with ``prefix`` as a float32 array."""
    selected = [col for col in frame.columns if col.startswith(prefix)]
    return np.array(frame[selected].values, dtype=np.float32)


# "c" and "u" feature matrices (fed to the context / utterance branches) per source
Xc1 = _prefixed_features(df1, "audio_c_feature_")
Xu1 = _prefixed_features(df1, "audio_u_feature_")
Xc2 = _prefixed_features(df2, "audio_c_feature_")
Xu2 = _prefixed_features(df2, "audio_u_feature_")

# Train-test split (70%-30%), stratified on the label
(Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp,
 Xu2_train, Xu2_temp, y_train, y_temp) = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Split the held-out 30% into validation (1/3 of it) and test (2/3 of it)
(Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test,
 Xu2_val, Xu2_test, y_val, y_test) = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Branch input widths are taken from the loaded data instead of being
# hard-coded, so the model inputs always match the CSV contents.
# The variable names are kept because the model-building code below uses them.
input_dim_lb = Xc1.shape[1]  # Whisper feature dimension (df1)
input_dim_mms = Xc2.shape[1]  # XLS-R feature dimension (df2)
if Xu1.shape[1] != input_dim_lb or Xu2.shape[1] != input_dim_mms:
    # Both branches of one source are built with the same width; fail early
    # with a clear message instead of a shape error inside Keras.
    raise ValueError(
        "Context and utterance features of the same source must have the same width."
    )

# Fully connected network branch
def build_fcn_branch(input_dim):
    """Build one fully connected branch over a flat feature vector.

    Args:
        input_dim: Length of the input feature vector.

    Returns:
        Tuple ``(inp, x)``: the branch's ``keras.Input`` tensor and the
        output of its last (8-unit) swish dense layer.
    """
    inp = keras.Input(shape=(input_dim,))
    hidden = inp
    for width in (256, 64, 32, 8):
        hidden = layers.Dense(width, activation="swish")(hidden)
    return inp, hidden

# Separate FCN branches for the df1 (Whisper) context and utterance inputs
(input_c1, context_fcn1), (input_u1, utterance_fcn1) = (
    build_fcn_branch(input_dim_lb) for _ in range(2)
)

# Separate FCN branches for the df2 (XLS-R) context and utterance inputs
(input_c2, context_fcn2), (input_u2, utterance_fcn2) = (
    build_fcn_branch(input_dim_mms) for _ in range(2)
)

# First fusion: concatenate context + utterance within each source
fused_1, fused_2 = (
    layers.Concatenate()(pair)
    for pair in (
        [context_fcn1, utterance_fcn1],
        [context_fcn2, utterance_fcn2],
    )
)

# Additional FCN model after fusion
def build_fcn_model(input_tensor):
    """Pass a fused tensor through a narrowing stack of swish dense layers.

    Args:
        input_tensor: Keras tensor to feed into the first dense layer.

    Returns:
        Output tensor of the last (8-unit) dense layer.
    """
    hidden = input_tensor
    for width in (128, 64, 32, 8):
        hidden = layers.Dense(width, activation="swish")(hidden)
    return hidden

# Each fused (context + utterance) tensor gets its own dense stack
fcn1, fcn2 = (build_fcn_model(fused) for fused in (fused_1, fused_2))

# Final fusion
final_fusion = layers.Concatenate()([fcn1, fcn2])

# Fully connected head ending in a single sigmoid unit
fc = final_fusion
for width in (64, 32, 8):
    fc = layers.Dense(width, activation="swish")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# The same weights file is written by the checkpoint and reloaded after training
weights_path = "/kaggle/working/whisper+xlsr_fcn.weights.h5"
checkpoint_callback = ModelCheckpoint(
    weights_path,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Inputs per split, in the order of the model's input list
train_inputs = [Xc1_train, Xu1_train, Xc2_train, Xu2_train]
val_inputs = [Xc1_val, Xu1_val, Xc2_val, Xu2_val]
test_inputs = [Xc1_test, Xu1_test, Xc2_test, Xu2_test]

# Train Model
model.fit(
    train_inputs, y_train,
    epochs=50, batch_size=32,
    validation_data=(val_inputs, y_val),
    callbacks=[checkpoint_callback],
)

# Load best weights
model.load_weights(weights_path)
print("Loaded Best Model Weights.")

# Hard 0/1 predictions: sigmoid outputs strictly above 0.5 become class 1
y_train_pred, y_val_pred, y_test_pred = (
    (model.predict(split_inputs) > 0.5).astype(int)
    for split_inputs in (train_inputs, val_inputs, test_inputs)
)

# Classification reports in train / validation / test order
for heading, truth, predicted in (
    ("Train Set Classification Report:\n", y_train, y_train_pred),
    ("Validation Set Classification Report:\n", y_val, y_val_pred),
    ("Test Set Classification Report:\n", y_test, y_test_pred),
):
    print(heading, classification_report(truth, predicted, digits=4))


Epoch 1/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 96ms/step - accuracy: 0.4868 - loss: 0.6934
Epoch 1: val_accuracy improved from -inf to 0.46667, saving model to /kaggle/working/whisper+xlsr_fcn.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 15s 161ms/step - accuracy: 0.4875 - loss: 0.6934 - val_accuracy: 0.4667 - val_loss: 0.6826
Epoch 2/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.5184 - loss: 0.6941
Epoch 2: val_accuracy improved from 0.46667 to 0.62500, saving model to /kaggle/working/whisper+xlsr_fcn.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.5200 - loss: 0.6938 - val_accuracy: 0.6250 - val_loss: 0.6670
Epoch 3/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.5594 - loss: 0.6697
Epoch 3: val_accuracy did not improve from 0.62500
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s

# MMS + XLS-R
**CNN**

In [46]:
"""
each branch gets a separate cnn
"""

from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load CSV Files
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")  # MMS embeddings
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_xlsr.csv")  # XLS-R embeddings

# Labels are read from the first file only; the rows of both files are assumed
# to describe the same samples in the same order (not checked here).
y = np.array(df1["Sarcasm"].values, dtype=np.float32)


def _prefixed_features(frame, prefix):
    """Return the columns of ``frame`` whose names start with ``prefix`` as a float32 array."""
    selected = [col for col in frame.columns if col.startswith(prefix)]
    return np.array(frame[selected].values, dtype=np.float32)


# "c" and "u" feature matrices (fed to the context / utterance branches) per source
Xc1 = _prefixed_features(df1, "audio_c_feature_")
Xu1 = _prefixed_features(df1, "audio_u_feature_")
Xc2 = _prefixed_features(df2, "audio_c_feature_")
Xu2 = _prefixed_features(df2, "audio_u_feature_")

# Train-test split (70%-30%), stratified on the label
(Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp,
 Xu2_train, Xu2_temp, y_train, y_temp) = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Further split temp set into validation (10%) and test (20%)
(Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test,
 Xu2_val, Xu2_test, y_val, y_test) = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Branch input widths are taken from the loaded data instead of being
# hard-coded, so the model inputs always match the CSV contents.
# The variable names are kept because the model-building code below uses them.
input_dim_lb = Xc1.shape[1]  # MMS feature dimension (df1)
input_dim_mms = Xc2.shape[1]  # XLS-R feature dimension (df2)
if Xu1.shape[1] != input_dim_lb or Xu2.shape[1] != input_dim_mms:
    # Both branches of one source are built with the same width; fail early
    # with a clear message instead of a shape error inside Keras.
    raise ValueError(
        "Context and utterance features of the same source must have the same width."
    )

# CNN Model for feature extraction
def build_cnn_branch(input_dim):
    """Build one 1-D convolutional branch over a flat feature vector.

    Args:
        input_dim: Length of the input feature vector.

    Returns:
        Tuple ``(inp, x)``: the branch's ``keras.Input`` tensor and the
        flattened output of its Conv1D/MaxPooling1D stack.
    """
    inp = keras.Input(shape=(input_dim,))
    # Add a trailing channel axis so the vector can be fed to Conv1D.
    features = layers.Reshape((input_dim, 1))(inp)
    # NOTE(review): 126 filters in the first stage may have been meant as 128 - confirm.
    for n_filters in (126, 64):
        features = layers.Conv1D(filters=n_filters, kernel_size=3, activation="swish")(features)
        features = layers.MaxPooling1D(pool_size=2)(features)
    return inp, layers.Flatten()(features)

# Separate CNN branches for the df1 (MMS) context and utterance inputs
(input_c1, context_cnn1), (input_u1, utterance_cnn1) = (
    build_cnn_branch(input_dim_lb) for _ in range(2)
)

# Separate CNN branches for the df2 (XLS-R) context and utterance inputs
(input_c2, context_cnn2), (input_u2, utterance_cnn2) = (
    build_cnn_branch(input_dim_mms) for _ in range(2)
)

# First fusion: concatenate context + utterance within each source
fused_1, fused_2 = (
    layers.Concatenate()(pair)
    for pair in (
        [context_cnn1, utterance_cnn1],
        [context_cnn2, utterance_cnn2],
    )
)



def build_fused_fcn(input_tensor):
    """Pass a fused tensor through a narrowing stack of swish dense layers.

    Args:
        input_tensor: Keras tensor to feed into the first dense layer.

    Returns:
        Output tensor of the last (32-unit) dense layer.
    """
    hidden = input_tensor
    for width in (128, 64, 32):
        hidden = layers.Dense(width, activation="swish")(hidden)
    return hidden

# Each fused (context + utterance) tensor gets its own dense stack
cnn1, cnn2 = (build_fused_fcn(fused) for fused in (fused_1, fused_2))

# Final fusion (cnn1 + cnn2)
final_fusion = layers.Concatenate()([cnn1, cnn2])

# Fully connected head ending in a single sigmoid unit
fc = final_fusion
for width in (64, 32):
    fc = layers.Dense(width, activation="swish")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# The same weights file is written by the checkpoint and reloaded after training
weights_path = "/kaggle/working/mms+xlsr_cnn.weights.h5"
checkpoint_callback = ModelCheckpoint(
    weights_path,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Inputs per split, in the order of the model's input list
train_inputs = [Xc1_train, Xu1_train, Xc2_train, Xu2_train]
val_inputs = [Xc1_val, Xu1_val, Xc2_val, Xu2_val]
test_inputs = [Xc1_test, Xu1_test, Xc2_test, Xu2_test]

# Train Model
model.fit(
    train_inputs, y_train,
    epochs=50, batch_size=32,
    validation_data=(val_inputs, y_val),
    callbacks=[checkpoint_callback],
)

# Load best weights
model.load_weights(weights_path)
print("Loaded Best Model Weights.")

# Hard 0/1 predictions: sigmoid outputs strictly above 0.5 become class 1
y_train_pred, y_val_pred, y_test_pred = (
    (model.predict(split_inputs) > 0.5).astype(int)
    for split_inputs in (train_inputs, val_inputs, test_inputs)
)

# Classification reports in train / validation / test order
for heading, truth, predicted in (
    ("Train Set Classification Report:\n", y_train, y_train_pred),
    ("Validation Set Classification Report:\n", y_val, y_val_pred),
    ("Test Set Classification Report:\n", y_test, y_test_pred),
):
    print(heading, classification_report(truth, predicted, digits=4))


Epoch 1/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 71ms/step - accuracy: 0.5515 - loss: 0.6912
Epoch 1: val_accuracy improved from -inf to 0.51667, saving model to /kaggle/working/mms+xlsr_cnn.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 9s 129ms/step - accuracy: 0.5515 - loss: 0.6910 - val_accuracy: 0.5167 - val_loss: 0.6872
Epoch 2/50
26/27 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.5650 - loss: 0.6791
Epoch 2: val_accuracy improved from 0.51667 to 0.64167, saving model to /kaggle/working/mms+xlsr_cnn.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 1s 25ms/step - accuracy: 0.5661 - loss: 0.6789 - val_accuracy: 0.6417 - val_loss: 0.6612
Epoch 3/50
26/27 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.5948 - loss: 0.6567
Epoch 3: val_accuracy did not improve from 0.64167
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/s

**FCN**

In [11]:
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load CSV Files
df1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_mms.csv")  # MMS embeddings
df2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_xlsr.csv")  # XLS-R embeddings

# Labels are read from the first file only; the rows of both files are assumed
# to describe the same samples in the same order (not checked here).
y = np.array(df1["Sarcasm"].values, dtype=np.float32)


def _prefixed_features(frame, prefix):
    """Return the columns of ``frame`` whose names start with ``prefix`` as a float32 array."""
    selected = [col for col in frame.columns if col.startswith(prefix)]
    return np.array(frame[selected].values, dtype=np.float32)


# "c" and "u" feature matrices (fed to the context / utterance branches) per source
Xc1 = _prefixed_features(df1, "audio_c_feature_")
Xu1 = _prefixed_features(df1, "audio_u_feature_")
Xc2 = _prefixed_features(df2, "audio_c_feature_")
Xu2 = _prefixed_features(df2, "audio_u_feature_")

# Train-test split (70%-30%), stratified on the label
(Xc1_train, Xc1_temp, Xu1_train, Xu1_temp, Xc2_train, Xc2_temp,
 Xu2_train, Xu2_temp, y_train, y_temp) = train_test_split(
    Xc1, Xu1, Xc2, Xu2, y, test_size=0.3, random_state=42, stratify=y
)

# Split the held-out 30% into validation (1/3 of it) and test (2/3 of it)
(Xc1_val, Xc1_test, Xu1_val, Xu1_test, Xc2_val, Xc2_test,
 Xu2_val, Xu2_test, y_val, y_test) = train_test_split(
    Xc1_temp, Xu1_temp, Xc2_temp, Xu2_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Branch input widths are taken from the loaded data instead of being
# hard-coded, so the model inputs always match the CSV contents.
# The variable names are kept because the model-building code below uses them.
input_dim_lb = Xc1.shape[1]  # MMS feature dimension (df1)
input_dim_mms = Xc2.shape[1]  # XLS-R feature dimension (df2)
if Xu1.shape[1] != input_dim_lb or Xu2.shape[1] != input_dim_mms:
    # Both branches of one source are built with the same width; fail early
    # with a clear message instead of a shape error inside Keras.
    raise ValueError(
        "Context and utterance features of the same source must have the same width."
    )

# Fully connected network branch
def build_fcn_branch(input_dim):
    """Build one fully connected branch over a flat feature vector.

    Args:
        input_dim: Length of the input feature vector.

    Returns:
        Tuple ``(inp, x)``: the branch's ``keras.Input`` tensor and the
        output of its last (32-unit) swish dense layer.
    """
    inp = keras.Input(shape=(input_dim,))
    hidden = inp
    for width in (512, 256, 128, 64, 32):
        hidden = layers.Dense(width, activation="swish")(hidden)
    return inp, hidden

# Separate FCN branches for the df1 (MMS) context and utterance inputs
(input_c1, context_fcn1), (input_u1, utterance_fcn1) = (
    build_fcn_branch(input_dim_lb) for _ in range(2)
)

# Separate FCN branches for the df2 (XLS-R) context and utterance inputs
(input_c2, context_fcn2), (input_u2, utterance_fcn2) = (
    build_fcn_branch(input_dim_mms) for _ in range(2)
)

# First fusion: concatenate context + utterance within each source
fused_1, fused_2 = (
    layers.Concatenate()(pair)
    for pair in (
        [context_fcn1, utterance_fcn1],
        [context_fcn2, utterance_fcn2],
    )
)

# Additional FCN model after fusion
def build_fcn_model(input_tensor):
    """Pass a fused tensor through a narrowing stack of swish dense layers.

    Args:
        input_tensor: Keras tensor to feed into the first dense layer.

    Returns:
        Output tensor of the last (128-unit) dense layer.
    """
    hidden = input_tensor
    for width in (512, 256, 128):
        hidden = layers.Dense(width, activation="swish")(hidden)
    return hidden

# Each fused (context + utterance) tensor gets its own dense stack
fcn1, fcn2 = (build_fcn_model(fused) for fused in (fused_1, fused_2))

# Final fusion
final_fusion = layers.Concatenate()([fcn1, fcn2])

# Fully connected head ending in a single sigmoid unit
fc = final_fusion
for width in (64, 32, 8):
    fc = layers.Dense(width, activation="swish")(fc)
output = layers.Dense(1, activation="sigmoid")(fc)

# Define and compile model
model = keras.Model(inputs=[input_c1, input_u1, input_c2, input_u2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# The same weights file is written by the checkpoint and reloaded after training
weights_path = "/kaggle/working/mms+xlsr_fcn.weights.h5"
checkpoint_callback = ModelCheckpoint(
    weights_path,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Inputs per split, in the order of the model's input list
train_inputs = [Xc1_train, Xu1_train, Xc2_train, Xu2_train]
val_inputs = [Xc1_val, Xu1_val, Xc2_val, Xu2_val]
test_inputs = [Xc1_test, Xu1_test, Xc2_test, Xu2_test]

# Train Model
model.fit(
    train_inputs, y_train,
    epochs=50, batch_size=32,
    validation_data=(val_inputs, y_val),
    callbacks=[checkpoint_callback],
)

# Load best weights
model.load_weights(weights_path)
print("Loaded Best Model Weights.")

# Hard 0/1 predictions: sigmoid outputs strictly above 0.5 become class 1
y_train_pred, y_val_pred, y_test_pred = (
    (model.predict(split_inputs) > 0.5).astype(int)
    for split_inputs in (train_inputs, val_inputs, test_inputs)
)

# Classification reports in train / validation / test order
for heading, truth, predicted in (
    ("Train Set Classification Report:\n", y_train, y_train_pred),
    ("Validation Set Classification Report:\n", y_val, y_val_pred),
    ("Test Set Classification Report:\n", y_test, y_test_pred),
):
    print(heading, classification_report(truth, predicted, digits=4))


Epoch 1/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 72ms/step - accuracy: 0.5209 - loss: 0.6946
Epoch 1: val_accuracy improved from -inf to 0.60000, saving model to /kaggle/working/mms+xlsr_fcn.weights.h5
27/27 ━━━━━━━━━━━━━━━━━━━━ 12s 126ms/step - accuracy: 0.5207 - loss: 0.6945 - val_accuracy: 0.6000 - val_loss: 0.6840
Epoch 2/50
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.5452 - loss: 0.6813
Epoch 2: val_accuracy did not improve from 0.60000
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.5458 - loss: 0.6812 - val_accuracy: 0.5833 - val_loss: 0.6615
Epoch 3/50
15/27 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.5784 - loss: 0.6642
Epoch 3: val_accuracy did not improve from 0.60000
27/27 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.5836 - loss: 0.6631 - val_accuracy: 0.5917 -