# Setting Up LanguageBind

In [None]:
!git clone https://github.com/PKU-YuanGroup/LanguageBind

In [None]:
!cd LanguageBind
!pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116


In [None]:
!pip install -r /kaggle/working/LanguageBind/requirements.txt

# Extracting Audio Embeddings Without Clip Type
**Audio Context**

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm  # For progress bar
from LanguageBind.languagebind import LanguageBindAudio, LanguageBindAudioTokenizer, LanguageBindAudioProcessor

# Function to extract embeddings and save to CSV
def extract_audio_embeddings(audio_folder: str, output_csv: str, batch_size=32):
    # Load the pretrained model and tokenizer once
    pretrained_ckpt = 'LanguageBind/LanguageBind_Audio_FT'
    model = LanguageBindAudio.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
    tokenizer = LanguageBindAudioTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
    audio_processor = LanguageBindAudioProcessor(model.config, tokenizer)

    # Get all audio file paths
    audio_files = [os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(".wav")]

    if not audio_files:
        print("No audio files found in the directory!")
        return

    dummy_text = [""]  # Single empty text string

    # Put model in evaluation mode **before** the loop
    model.eval()

    total_files = len(audio_files)
    print(f"Processing {total_files} audio files...")

    all_embeddings = []  # Store embeddings temporarily

    for i in tqdm(range(0, total_files, batch_size), desc="Extracting Embeddings"):
        batch_files = audio_files[i:i + batch_size]

        # Process the batch (reusing the single empty text input)
        data = audio_processor(batch_files, dummy_text, return_tensors='pt')

        # Extract embeddings
        with torch.no_grad():
            outputs = model(**data)
            audio_embeddings = outputs.image_embeds.cpu().numpy()  # Convert to NumPy array

        # Store data in memory
        for file, embedding in zip(batch_files, audio_embeddings):
            all_embeddings.append([os.path.basename(file)] + embedding.tolist())

    # Convert to DataFrame (after loop for efficiency)
    df = pd.DataFrame(all_embeddings, columns=["filename"] + [str(i) for i in range(768)])

    # Save to CSV in one go
    df.to_csv(output_csv, index=False)
    
    print(f"\nProcessing complete! Saved embeddings to {output_csv}")

# Example usage
extract_audio_embeddings("/home/iiitd/Nikhil/BTP_DATASET/audio_context", "lb_audio_context_embeddings.csv")


**Audio Utterance**

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm  # For progress bar
from LanguageBind.languagebind import LanguageBindAudio, LanguageBindAudioTokenizer, LanguageBindAudioProcessor

# Function to extract embeddings and save to CSV
def extract_audio_embeddings(audio_folder: str, output_csv: str, batch_size=32):
    # Load the pretrained model and tokenizer once
    pretrained_ckpt = 'LanguageBind/LanguageBind_Audio_FT'
    model = LanguageBindAudio.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
    tokenizer = LanguageBindAudioTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir')
    audio_processor = LanguageBindAudioProcessor(model.config, tokenizer)

    # Get all audio file paths
    audio_files = [os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(".wav")]

    if not audio_files:
        print("No audio files found in the directory!")
        return

    dummy_text = [""]  # Single empty text string

    # Put model in evaluation mode **before** the loop
    model.eval()

    total_files = len(audio_files)
    print(f"Processing {total_files} audio files...")

    all_embeddings = []  # Store embeddings temporarily

    for i in tqdm(range(0, total_files, batch_size), desc="Extracting Embeddings"):
        batch_files = audio_files[i:i + batch_size]

        # Process the batch (reusing the single empty text input)
        data = audio_processor(batch_files, dummy_text, return_tensors='pt')

        # Extract embeddings
        with torch.no_grad():
            outputs = model(**data)
            audio_embeddings = outputs.image_embeds.cpu().numpy()  # Convert to NumPy array

        # Store data in memory
        for file, embedding in zip(batch_files, audio_embeddings):
            all_embeddings.append([os.path.basename(file)] + embedding.tolist())

    # Convert to DataFrame (after loop for efficiency)
    df = pd.DataFrame(all_embeddings, columns=["filename"] + [str(i) for i in range(768)])

    # Save to CSV in one go
    df.to_csv(output_csv, index=False)
    
    print(f"\nProcessing complete! Saved embeddings to {output_csv}")

# Example usage
extract_audio_embeddings("/home/iiitd/Nikhil/BTP_DATASET/audio_utterance", "lb_audio_utterance_embeddings.csv")


**Merging Audio Features**

In [1]:
"""
MERGING THE AUDIO EMBEDDINGS OF CONTEXT AND UTTERNACE WITH LABELS AND OTHER FEATURES
MERGING LanguageBind Embeddings
"""

import pandas as pd

# Load the CSV files
csv1 = pd.read_csv("/kaggle/input/btp-audioembeddings/lb_audio_context_embeddings.csv")
csv2 = pd.read_csv("/kaggle/input/btp-audioembeddings/lb_audio_utterance_embeddings.csv")
map_df = pd.read_csv("/kaggle/input/btp-audioembeddings/context_to_utterance_map.csv")

# Remove the 'audio_context/' and 'audio_utterance/' prefixes from map.csv
map_df["audio_context"] = map_df["audio_context"].str.replace("audio_context/", "", regex=False)
map_df["audio_utterance"] = map_df["audio_utterance"].str.replace("audio_utterance/", "", regex=False)

# Extract features (excluding the first column which is file_name)
features_csv1 = csv1.iloc[:, 1:].copy()  # Features from csv1
features_csv2 = csv2.iloc[:, 1:].copy()  # Features from csv2

# Rename columns to distinguish between csv1 and csv2 features
features_csv1.columns = [f"audio_c_feature_{col}" for col in features_csv1.columns]
features_csv2.columns = [f"audio_u_feature_{col}" for col in features_csv2.columns]

# Add file_name back to features for merging
features_csv1.insert(0, "filename", csv1.iloc[:, 0])
features_csv2.insert(0, "filename", csv2.iloc[:, 0])

# Merge csv1 with map.csv using audio_context (which is file_name in csv1)
merged_df = map_df.merge(features_csv1, left_on="audio_context", right_on="filename", how="inner")

# Merge csv2 with the updated dataframe using audio_utterance (which is file_name in csv2)
merged_df = merged_df.merge(features_csv2, left_on="audio_utterance", right_on="filename", how="inner", suffixes=("_csv1", "_csv2"))

# Drop redundant filename columns from csv1 and csv2
merged_df.drop(columns=["filename_csv1", "filename_csv2"], inplace=True)

# Rename columns to keep them organized
#merged_df.rename(columns={"audio_context": "file_csv1", "audio_utterance": "file_csv2"}, inplace=True)

# Save the final dataset
merged_df.to_csv("audio_features_lb.csv", index=False)

print("Merged dataset saved.")


Merged dataset saved.


# Extracting Audio Embeddings With Clip Type
**Audio Context**

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm  # For progress bar
from LanguageBind.languagebind import LanguageBind, to_device, transform_dict, LanguageBindImageTokenizer

def extract_audio_embeddings(audio_folder: str, output_csv: str, batch_size=32):
    device = torch.device("cpu")
    clip_type = {'audio': 'LanguageBind_Audio_FT'}

    model = LanguageBind(clip_type=clip_type, cache_dir='./cache_dir').to(device)
    model.eval()

    pretrained_ckpt = 'lb203/LanguageBind_Image'
    tokenizer = LanguageBindImageTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir/tokenizer_cache_dir')
    modality_transform = {c: transform_dict[c](model.modality_config[c]) for c in clip_type.keys()}

    audio_files = [os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(".wav")]
    if not audio_files:
        print("No audio files found in the directory!")
        return

    total_files = len(audio_files)
    print(f"Processing {total_files} audio files...")

    all_embeddings = []
    language = [""]

    for i in tqdm(range(0, total_files, batch_size), desc="Extracting Embeddings"):
        batch_files = audio_files[i:i + batch_size]

        inputs = {
            'audio': to_device(modality_transform['audio'](batch_files), device),
            'language': to_device(tokenizer(language, max_length=77, padding='max_length', truncation=True, return_tensors='pt'), device)
        }

        with torch.no_grad():
            embeddings = model(inputs)
            audio_embeddings = embeddings['audio'].cpu().numpy()

        for file, embedding in zip(batch_files, audio_embeddings):
            all_embeddings.append([os.path.basename(file)] + embedding.tolist())

    df = pd.DataFrame(all_embeddings, columns=["filename"] + [str(i) for i in range(768)])
    df.to_csv(output_csv, index=False)
    print(f"\nProcessing complete! Saved embeddings to {output_csv}")

extract_audio_embeddings("/home/iiitd/Nikhil/BTP_DATASET/audio_context", "lb_audio_context_embeddings_cliptype.csv")

**Audio Utterance**

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm  # For progress bar
from LanguageBind.languagebind import LanguageBind, to_device, transform_dict, LanguageBindImageTokenizer

def extract_audio_embeddings(audio_folder: str, output_csv: str, batch_size=32):
    device = torch.device("cpu")
    clip_type = {'audio': 'LanguageBind_Audio_FT'}

    model = LanguageBind(clip_type=clip_type, cache_dir='./cache_dir').to(device)
    model.eval()

    pretrained_ckpt = 'lb203/LanguageBind_Image'
    tokenizer = LanguageBindImageTokenizer.from_pretrained(pretrained_ckpt, cache_dir='./cache_dir/tokenizer_cache_dir')
    modality_transform = {c: transform_dict[c](model.modality_config[c]) for c in clip_type.keys()}

    audio_files = [os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(".wav")]
    if not audio_files:
        print("No audio files found in the directory!")
        return

    total_files = len(audio_files)
    print(f"Processing {total_files} audio files...")

    all_embeddings = []
    language = [""]

    for i in tqdm(range(0, total_files, batch_size), desc="Extracting Embeddings"):
        batch_files = audio_files[i:i + batch_size]

        inputs = {
            'audio': to_device(modality_transform['audio'](batch_files), device),
            'language': to_device(tokenizer(language, max_length=77, padding='max_length', truncation=True, return_tensors='pt'), device)
        }

        with torch.no_grad():
            embeddings = model(inputs)
            audio_embeddings = embeddings['audio'].cpu().numpy()

        for file, embedding in zip(batch_files, audio_embeddings):
            all_embeddings.append([os.path.basename(file)] + embedding.tolist())

    df = pd.DataFrame(all_embeddings, columns=["filename"] + [str(i) for i in range(768)])
    df.to_csv(output_csv, index=False)
    print(f"\nProcessing complete! Saved embeddings to {output_csv}")

extract_audio_embeddings("/home/iiitd/Nikhil/BTP_DATASET/audio_utterance", "lb_audio_utterance_embeddings_cliptype.csv")

**Merging Audio features**

In [2]:
"""
MERGING THE AUDIO EMBEDDINGS OF CONTEXT AND UTTERNACE WITH LABELS AND OTHER FEATURES
MERGING LanguageBind Embeddings
"""

import pandas as pd

# Load the CSV files
csv1 = pd.read_csv("/kaggle/input/btp-audioembeddings/lb_audio_context_embeddings_cliptype.csv")
csv2 = pd.read_csv("/kaggle/input/btp-audioembeddings/lb_audio_utterance_embeddings_cliptype.csv")
map_df = pd.read_csv("/kaggle/input/btp-audioembeddings/context_to_utterance_map.csv")

# Remove the 'audio_context/' and 'audio_utterance/' prefixes from map.csv
map_df["audio_context"] = map_df["audio_context"].str.replace("audio_context/", "", regex=False)
map_df["audio_utterance"] = map_df["audio_utterance"].str.replace("audio_utterance/", "", regex=False)

# Extract features (excluding the first column which is file_name)
features_csv1 = csv1.iloc[:, 1:].copy()  # Features from csv1
features_csv2 = csv2.iloc[:, 1:].copy()  # Features from csv2

# Rename columns to distinguish between csv1 and csv2 features
features_csv1.columns = [f"audio_c_feature_{col}" for col in features_csv1.columns]
features_csv2.columns = [f"audio_u_feature_{col}" for col in features_csv2.columns]

# Add file_name back to features for merging
features_csv1.insert(0, "filename", csv1.iloc[:, 0])
features_csv2.insert(0, "filename", csv2.iloc[:, 0])

# Merge csv1 with map.csv using audio_context (which is file_name in csv1)
merged_df = map_df.merge(features_csv1, left_on="audio_context", right_on="filename", how="inner")

# Merge csv2 with the updated dataframe using audio_utterance (which is file_name in csv2)
merged_df = merged_df.merge(features_csv2, left_on="audio_utterance", right_on="filename", how="inner", suffixes=("_csv1", "_csv2"))

# Drop redundant filename columns from csv1 and csv2
merged_df.drop(columns=["filename_csv1", "filename_csv2"], inplace=True)

# Rename columns to keep them organized
#merged_df.rename(columns={"audio_context": "file_csv1", "audio_utterance": "file_csv2"}, inplace=True)

# Save the final dataset
merged_df.to_csv("audio_features_lb_cliptype.csv", index=False)

print("Merged dataset saved.")


Merged dataset saved.


# Model Training 

### CNN

In [7]:
"""
CNN MODEL
"""
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the final dataset
df = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_lb.csv")

# Extract labels
y = df["Sarcasm"].values  # Labels (0: No sarcasm, 1: Sarcasm)

# Extract context features (from csv1_)
X_context = df[[col for col in df.columns if col.startswith("audio_c_feature_")]].values

# Extract utterance features (from csv2_)
X_utterance = df[[col for col in df.columns if col.startswith("audio_u_feature_")]].values

# Convert to NumPy arrays
X_context = np.array(X_context, dtype=np.float32)
X_utterance = np.array(X_utterance, dtype=np.float32)
y = np.array(y, dtype=np.float32)

# First, split into train (70%) and temp (30%) 
Xc_train, Xc_temp, Xu_train, Xu_temp, y_train, y_temp = train_test_split(
    X_context, X_utterance, y, test_size=0.3, random_state=42, stratify=y
)

# Split temp (30%) into validation (10%) and test (20%)
Xc_val, Xc_test, Xu_val, Xu_test, y_val, y_test = train_test_split(
    Xc_temp, Xu_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# CNN Model for Sarcasm Detection
input_dim = 768  # Number of features per input

# Context Branch
input_context = keras.Input(shape=(input_dim,))
context_branch = layers.Reshape((input_dim, 1))(input_context)
context_branch = layers.Conv1D(filters=128, kernel_size=3, activation="swish")(context_branch)
context_branch = layers.MaxPooling1D(pool_size=2)(context_branch)
context_branch = layers.Flatten()(context_branch)

# Utterance Branch
input_utterance = keras.Input(shape=(input_dim,))
utterance_branch = layers.Reshape((input_dim, 1))(input_utterance)
utterance_branch = layers.Conv1D(filters=128, kernel_size=3, activation="swish")(utterance_branch)
utterance_branch = layers.MaxPooling1D(pool_size=2)(utterance_branch)
utterance_branch = layers.Flatten()(utterance_branch)

# Concatenation
merged = layers.Concatenate()([context_branch, utterance_branch])
#merged = layers.Dense(768, activation="relu")(merged)
#merged = layers.Dense(32, activation="swish")(merged)
output = layers.Dense(1, activation="sigmoid")(merged)  # Sigmoid for binary classification

# Define Model
model = keras.Model(inputs=[input_context, input_utterance], outputs=output)
model.summary()

# Compile Model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Checkpoint to save the best model based on validation accuracy
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/lb_cnn_model.weights.h5",
    monitor="val_accuracy",  # Monitor validation accuracy
    mode="max",  # Save when val_accuracy is maximum
    save_best_only=True,  # Keep only the best weights
    save_weights_only=True,  # Don't save full model
    verbose=1
)

# Train Model
model.fit(
    [Xc_train, Xu_train], y_train,
    epochs=50, batch_size=32,
    validation_data=([Xc_val, Xu_val], y_val),  # Use validation set
    callbacks=[checkpoint_callback]
)

# Load best model weights
model.load_weights("/kaggle/working/lb_cnn_model.weights.h5")
print("Loaded Best Model Weights.")

# Generate predictions using the best model
y_train_pred = (model.predict([Xc_train, Xu_train]) > 0.5).astype(int)
y_val_pred = (model.predict([Xc_val, Xu_val]) > 0.5).astype(int)
y_test_pred = (model.predict([Xc_test, Xu_test]) > 0.5).astype(int)

# Print classification reports for all sets
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred,digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred,digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred,digits=4))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5702 - loss: 0.6850
Epoch 1: val_accuracy improved from -inf to 0.70000, saving model to /kaggle/working/lb_cnn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - accuracy: 0.5712 - loss: 0.6847 - val_accuracy: 0.7000 - val_loss: 0.6520
Epoch 2/50
[1m23/27[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6365 - loss: 0.6491 
Epoch 2: val_accuracy did not improve from 0.70000
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6382 - loss: 0.6470 - val_accuracy: 0.6500 - val_loss: 0.6205
Epoch 3/50
[1m23/27[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6799 - loss: 0.6112 
Epoch 3: val_accuracy did not improve from 0.70000
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6806 - loss: 0.6100 - val_accuracy: 0.6500 - v

### FCN

In [21]:

"""
FCN MODEL
"""
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the final dataset
df = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_features_lb_cliptype.csv")

# Extract labels
y = df["Sarcasm"].values  # Labels (0: No sarcasm, 1: Sarcasm)

# Extract context and utterance features
X_context = df[[col for col in df.columns if col.startswith("audio_c_feature_")]].values
X_utterance = df[[col for col in df.columns if col.startswith("audio_u_feature_")]].values

# Concatenate context and utterance embeddings
X = np.concatenate((X_context, X_utterance), axis=1)

# Convert to NumPy arrays
X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)

# First, split into train (70%) and temp (30%) 
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Split temp (30%) into validation (10%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
)

# Define FCN Model
input_dim = X.shape[1]

model = keras.Sequential([
    layers.Dense(128, activation="swish", input_shape=(input_dim,)),
    #layers.BatchNormalization(),
    #layers.Dropout(0.3),
    #layers.Dense(256, activation="relu"),
    #layers.BatchNormalization(),
    #layers.Dropout(0.3),
    #layers.Dense(128, activation="relu"),
    #layers.BatchNormalization(),
    #layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid")  # Binary classification
])

model.summary()

# Compile Model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Checkpoint to save the best model based on validation accuracy
checkpoint_callback = ModelCheckpoint(
    "/kaggle/working/lb_fcn_model.weights.h5",
    monitor="val_accuracy",  # Monitor validation accuracy
    mode="max",  # Save when val_accuracy is maximum
    save_best_only=True,  # Keep only the best weights
    save_weights_only=True,  # Don't save full model
    verbose=1
)

# Train Model
model.fit(
    X_train, y_train,
    epochs=50, batch_size=32,
    validation_data=(X_val, y_val),  # Use validation set
    callbacks=[checkpoint_callback]
)

# Load best model weights
model.load_weights("/kaggle/working/lb_fcn_model.weights.h5")
print("Loaded Best Model Weights.")

# Generate predictions using the best model
y_train_pred = (model.predict(X_train) > 0.5).astype(int)
y_val_pred = (model.predict(X_val) > 0.5).astype(int)
y_test_pred = (model.predict(X_test) > 0.5).astype(int)

# Print classification reports for all sets
print("Train Set Classification Report:\n", classification_report(y_train, y_train_pred,digits=4))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred,digits=4))
print("Test Set Classification Report:\n", classification_report(y_test, y_test_pred,digits=4))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5388 - loss: 1.3863
Epoch 1: val_accuracy improved from -inf to 0.65833, saving model to /kaggle/working/lb_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.5405 - loss: 1.3714 - val_accuracy: 0.6583 - val_loss: 0.5870
Epoch 2/50
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 20ms/step - accuracy: 0.5938 - loss: 0.6341
Epoch 2: val_accuracy improved from 0.65833 to 0.68333, saving model to /kaggle/working/lb_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6676 - loss: 0.5740 - val_accuracy: 0.6833 - val_loss: 0.5531
Epoch 3/50
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 19ms/step - accuracy: 0.8750 - loss: 0.4641
Epoch 3: val_accuracy improved from 0.68333 to 0.71667, saving model to /kaggle/working/lb_fcn_model.weights.h5
[1m27/27[0m [32m━━━━━━━━━━