In [1]:
import pandas as pd
import numpy as np

# Path to the CSV file
csv_path = "C:/Users/lenovo/Downloads/data.csv"

# Load the CSV file
data = pd.read_csv(csv_path)

# Extract the 'sarcasm' colum  as array
if 'sarcasm' in data.columns:
    sarcasm_values = data['sarcasm'].values
else:
    sarcasm_values = data.iloc[:, 6].values

# Save the values to a NumPy file
np.save('Y.npy', sarcasm_values)

print("Values from the 'sarcasm' column saved to Y.npy.")
print(f"Shape of Y: {sarcasm_values.shape}, Example values: {sarcasm_values[:10]}")


Values from the 'sarcasm' column saved to Y.npy.
Shape of Y: (1202,), Example values: [0 0 0 1 0 1 1 1 1 1]


In [2]:
import os
import librosa
import numpy as np

# Directory containing audio files
base_audio_dir = 'C:/Users/lenovo/Downloads/audio_utterance/audio_utterance' 
spectrogram_dir = './spectrogram_data'  # Directory to save spectrograms
os.makedirs(spectrogram_dir, exist_ok=True)

# Sampling rate and number of mel bins
sr = 22500
n_mels = 128

# Find the maximum duration across all audio files
def get_max_duration(audio_dir, sr=sr):
    max_duration = 0
    for file_name in os.listdir(audio_dir):
        audio_path = os.path.join(audio_dir, file_name)
        if os.path.isfile(audio_path): 
            y, _ = librosa.load(audio_path, sr=sr)
            duration = librosa.get_duration(y=y, sr=sr)
            max_duration = max(max_duration, duration)
    return max_duration


# Convert audio to fixed-size mel spectrograms based on max duration
def audio_to_mel_spectrogram(audio_path, sr=sr, duration=None, n_mels=n_mels):
    y, _ = librosa.load(audio_path, sr=sr)

    if duration is not None:
        target_length = int(duration * sr)

        # Pad or truncate the audio to match the maximum duration
        if len(y) > target_length:
            y = y[:target_length]
        elif len(y) < target_length:
            y = np.pad(y, (0, target_length - len(y)), mode='constant')

    # Convert to mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Resize spectrogram to ensure consistent width
    mel_spec_db = librosa.util.fix_length(mel_spec_db, size=216, axis=1)  
    return mel_spec_db


# Prepare the dataset by converting all audio files to mel spectrograms
mel_spectrograms = []

print("Processing audio files...")
max_duration = get_max_duration(base_audio_dir, sr)  # Determine max duration
print(f"Maximum duration: {max_duration} seconds")

for file_name in os.listdir(base_audio_dir):
    audio_path = os.path.join(base_audio_dir, file_name)
    if os.path.isfile(audio_path):
        try:
            mel_spec = audio_to_mel_spectrogram(audio_path, duration=max_duration)
            mel_spectrograms.append(mel_spec)
        except Exception as e:
            print(f"Error processing file {audio_path}: {e}")

# Convert to numpy array
X = np.array(mel_spectrograms)
X = X[..., np.newaxis]  

# Save the features as a numpy array
np.save(os.path.join(spectrogram_dir, 'X.npy'), X)  # Save features

print(f"Spectrograms saved to {spectrogram_dir}.")
print(f"Shape of X: {X.shape}")


Processing audio files...
Maximum duration: 20.02 seconds
Spectrograms saved to ./spectrogram_data.
Shape of X: (1202, 128, 216, 1)


In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#  Load and preprocess data 

# Load audio data
X_audio = np.load('C:/Users/lenovo/Downloads/ipwork/spectrogram_data/X.npy')  
y_audio = np.load('C:/Users/lenovo/Downloads/ipwork/Y.npy') 

# Load text data from CSV
csv_data = pd.read_csv("C:/Users/lenovo/Downloads/data.csv")
text_data = csv_data.iloc[:, 2].values #text is in3rd column

# Tokenize and pad text data
tokenizer = Tokenizer(num_words=10000)  # Limiting to top 10,000 words
tokenizer.fit_on_texts(text_data)  # Fit tokenizer on the text data
text_sequences = tokenizer.texts_to_sequences(text_data)  # Convert text to sequences
X_text = pad_sequences(text_sequences, maxlen=100)

# Split the data into training and testing sets 
X_audio_train, X_audio_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_audio, X_text, y_audio, test_size=0.2, random_state=42)

# Building the audio model

# Input layer for audio
audio_input = layers.Input(shape=(128, 216, 1)) 

audio_model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 216, 1)),  # First Conv Layer
    layers.MaxPooling2D((2, 2)),  # Max Pooling
    layers.Conv2D(64, (3, 3), activation='relu'),  # Second Conv Layer
    layers.MaxPooling2D((2, 2)),  # Max Pooling
    layers.Conv2D(64, (3, 3), activation='relu'),  # Third Conv Layer
    layers.Flatten(),  # Flatten the output
    layers.Dense(64, activation='relu'),  # Fully connected layer
    layers.Dropout(0.4)  # Dropout to prevent overfitting
])

audio_output = audio_model(audio_input)  # Pass the input through the audio model

# Build the text model

# Input layer for text
text_input = layers.Input(shape=(100,))  # Define the input shape for text (pad_sequences output)

text_model = models.Sequential([
    layers.Embedding(input_dim=10000, output_dim=128, input_length=100),  # Embedding layer
    layers.LSTM(64),  # LSTM layer for sequence processing
    layers.Dropout(0.4),  # Dropout layer for regularization
    layers.Dense(64, activation='relu'), 
     layers.Dense(32, activation='sigmoid'),                                         # Fully connected layer
    layers.Dense(1, activation='sigmoid'),
    layers.Dropout(0.4)  # Dropout layer for regularization
])

text_output = text_model(text_input)  # Pass the input through the text model

# --- Step 4: Merge the models ---

# Concatenate the outputs of both models
combined_input = layers.concatenate([audio_output, text_output])

# Add dense layers after combining both branches
x = layers.Dense(64, activation='relu')(combined_input)
x = layers.Dropout(0.2)(x)  # Dropout layer for the combined features
x = layers.Dense(32, activation='relu')(x)  # Another Dense layer
output = layers.Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

# --- Step 5: Create the final model ---
model = models.Model(inputs=[audio_input, text_input], outputs=output)

# --- Step 6: Compile the model ---
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Use binary crossentropy for binary classification
              metrics=['accuracy'])

# Print the model summary
model.summary()

# --- Step 7: Train the model ---
history = model.fit(
    [X_audio_train, X_text_train], y_train, 
    validation_data=([X_audio_test, X_text_test], y_test),  # Validation split using test data
    epochs=10,  # Train for 10 epochs
    batch_size=32  # Set batch size
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 672ms/step - accuracy: 0.4865 - loss: 3.5411 - val_accuracy: 0.5270 - val_loss: 0.6928
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 429ms/step - accuracy: 0.5140 - loss: 0.6950 - val_accuracy: 0.5104 - val_loss: 0.6931
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 381ms/step - accuracy: 0.5210 - loss: 0.6925 - val_accuracy: 0.4896 - val_loss: 0.6933
Epoch 4/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 369ms/step - accuracy: 0.5061 - loss: 0.6937 - val_accuracy: 0.4896 - val_loss: 0.6937
Epoch 5/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 367ms/step - accuracy: 0.4936 - loss: 0.6938 - val_accuracy: 0.4896 - val_loss: 0.6939
Epoch 6/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 377ms/step - accuracy: 0.5347 - loss: 0.6921 - val_accuracy: 0.4896 - val_loss: 0.6938
Epoch 7/10
[1m31/31[

In [4]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Load and preprocess data ---

# Load audio data
X_audio = np.load('C:/Users/lenovo/Downloads/ipwork/spectrogram_data/X.npy')  
y_audio = np.load('C:/Users/lenovo/Downloads/ipwork/Y.npy') 

# Load text data from CSV
csv_data = pd.read_csv("C:/Users/lenovo/Downloads/data.csv")
text_data_7 = csv_data.iloc[:, 2].values  # 7th column
text_data_10 = csv_data.iloc[:, 9].values  # 10th column

# Tokenize and pad text data (7th column)
tokenizer_7 = Tokenizer(num_words=10000)  # Limiting to top 10,000 words
tokenizer_7.fit_on_texts(text_data_7)  # Fit tokenizer on the 7th column text data
text_sequences_7 = tokenizer_7.texts_to_sequences(text_data_7)  # Convert text to sequences
X_text_7 = pad_sequences(text_sequences_7, maxlen=100)

# Tokenize and pad text data (10th column)
tokenizer_10 = Tokenizer(num_words=10000)  # Limiting to top 10,000 words
tokenizer_10.fit_on_texts(text_data_10)  # Fit tokenizer on the 10th column text data
text_sequences_10 = tokenizer_10.texts_to_sequences(text_data_10)  # Convert text to sequences
X_text_10 = pad_sequences(text_sequences_10, maxlen=100)

# Concatenate the processed columns (7th and 10th)
X_text_combined = np.hstack([X_text_7, X_text_10])  # Combine the two text columns horizontally

# Split the data into training and testing sets
X_audio_train, X_audio_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_audio, X_text_combined, y_audio, test_size=0.2, random_state=42)

# --- Building the audio model ---

# Input layer for audio
audio_input = layers.Input(shape=(128, 216, 1)) 

audio_model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 216, 1)),  # First Conv Layer
    layers.MaxPooling2D((2, 2)),  # Max Pooling
    layers.Conv2D(64, (3, 3), activation='relu'),  # Second Conv Layer
    layers.MaxPooling2D((2, 2)),  # Max Pooling
    layers.Conv2D(64, (3, 3), activation='relu'),  # Third Conv Layer
    layers.Flatten(),  # Flatten the output
    layers.Dense(64, activation='relu'),  # Fully connected layer
])

audio_output = audio_model(audio_input)  # Pass the input through the audio model

# --- Build the text model ---

# Input layer for text
text_input = layers.Input(shape=(200,))  # Input shape is 200 after combining (100 from each column)

text_model = models.Sequential([
    layers.Embedding(input_dim=10000, output_dim=128, input_length=200),  # Embedding layer
    layers.SimpleRNN(64),  # RNN layer for sequence processing  # Dropout layer for regularization
    layers.Dense(64, activation='relu'),  # Fully connected layer
])

text_output = text_model(text_input)  # Pass the input through the text model

# --- Merge the models ---

# Concatenate the outputs of both models
combined_input = layers.concatenate([audio_output, text_output])

# Add dense layers after combining both branches
x = layers.Dense(128, activation='relu')(combined_input)  # Increased to 128 neurons
x = layers.Dense(64, activation='relu')(x)  # Next layer with 64 neurons
x = layers.Dense(32, activation='relu')(x)  # Next layer with 32 neurons Another Dropout layer
x = layers.Dense(16, activation='relu')(x)  # Next layer with 16 neurons
output = layers.Dense(1, activation='sigmoid')(x)  # Final layer with 1 neuron

# --- Create the final model ---
model = models.Model(inputs=[audio_input, text_input], outputs=output)

# --- Compile the model ---
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Use binary crossentropy for binary classification
              metrics=['accuracy'])

# Print the model summary
model.summary()

# --- Train the model ---
history = model.fit(
    [X_audio_train, X_text_train], y_train, 
    validation_data=([X_audio_test, X_text_test], y_test),  # Validation split using test data
    epochs=10,  # Train for 10 epochs
    batch_size=32  # Set batch size
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 428ms/step - accuracy: 0.4781 - loss: 2.8126 - val_accuracy: 0.4896 - val_loss: 0.6946
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 350ms/step - accuracy: 0.5105 - loss: 0.6938 - val_accuracy: 0.5975 - val_loss: 0.6919
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 410ms/step - accuracy: 0.4879 - loss: 0.6927 - val_accuracy: 0.5104 - val_loss: 0.6852
Epoch 4/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 342ms/step - accuracy: 0.5241 - loss: 0.6800 - val_accuracy: 0.6100 - val_loss: 0.6430
Epoch 5/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 420ms/step - accuracy: 0.6248 - loss: 0.6324 - val_accuracy: 0.6100 - val_loss: 0.6206
Epoch 6/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 433ms/step - accuracy: 0.5926 - loss: 0.6516 - val_accuracy: 0.6100 - val_loss: 0.6375
Epoch 7/10
[1m31/31[