In [1]:
import os
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

In [None]:
BASE_DIR = '/kaggle/input/deepfakedataset'

In [None]:
for root, dirs, files in os.walk(BASE_DIR):
    print(f"Root: {root}, Directories: {dirs}, Files: {len(files)}")

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import numpy as np

# Initialize variables to track maximum and minimum durations
max_duration = -np.inf
min_duration = np.inf
max_duration_file = None
min_duration_file = None

# Function to analyze individual audio file
def analyze_audio(file_path):
    global max_duration, min_duration, max_duration_file, min_duration_file

    # Load audio file
    audio, sr = librosa.load(file_path, sr=None)
    
    # Compute the duration
    duration = librosa.get_duration(y=audio, sr=sr)
    
    # Update maximum and minimum durations
    if duration > max_duration:
        max_duration = duration
        max_duration_file = file_path
    if duration < min_duration:
        min_duration = duration
        min_duration_file = file_path

    # Print basic audio properties
    print(f"Audio File: {file_path}")
    print(f"Sample Rate: {sr}")
    print(f"Duration: {duration:.2f} seconds")
    
    # Plot the waveform
    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(audio, sr=sr)
    plt.title('Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()

    # Compute and plot the spectrogram
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)  # Fixed typo
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram_db, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel-Spectrogram')
    plt.show()

# Loop through all files in the directory
for root, dirs, files in os.walk(BASE_DIR):
    for file in files:
        if file.endswith(('.wav', '.mp3', '.flac')):  # Check for valid audio file extensions
            file_path = os.path.join(root, file)  # Construct the full path
            analyze_audio(file_path)  # Pass the full path to the function

# Print the results for maximum and minimum durations
print("\nAnalysis Complete:")
print(f"Longest Audio: {max_duration_file} ({max_duration:.2f} seconds)")
print(f"Shortest Audio: {min_duration_file} ({min_duration:.2f} seconds)")


In [None]:
### import os
import csv

# Define the root directory (adjust this based on your Kaggle dataset path)
root_dir = "/kaggle/input/deepfakedataset/FakeMusicCaps"  
output_csv = "fake musiccaps_metadata.csv"

# Initialize a list to store file paths and labels
data = []

# Traverse through each subfolder in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    
    # Skip non-folder entries
    if not os.path.isdir(folder_path):
        continue
    
    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        
        # Check if the file is an audio file (adjust extensions if necessary)
        if file_name.endswith(('.mp3', '.wav', '.flac', '.aac')):
            data.append([file_path, folder_name])  # Add file path and folder label to data

# Write the data to a CSV file
with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['file_path', 'label'])  # Header
    writer.writerows(data)

print(f"Metadata CSV file has been saved as {output_csv}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = './fakemusiccaps_metadata.csv'  
df = pd.read_csv(file_path)

# Display basic information
print("Dataset Overview")
print(df.info())
print("\nFirst Five Rows")
print(df.head())

# Checking for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Descriptive statistics
print("\nDescriptive Statistics:")
print(df.describe())

# Exploring unique classes in the dataset
if 'label' in df.columns:  # Replace 'label' with the column name for class labels
    print("\nClass Distribution:")
    print(df['label'].value_counts())

    # Visualizing class distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='label', order=df['label'].value_counts().index, palette='viridis')
    plt.title("Class Distribution")
    plt.xlabel("Labels")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

# Exploring durations if available
if 'duration' in df.columns:  # Replace 'duration' with the actual column name
    print("\nDuration Statistics:")
    print(df['duration'].describe())

    # Visualizing durations
    plt.figure(figsize=(10, 6))
    sns.histplot(df['duration'], bins=30, kde=True, color='blue')
    plt.title("Distribution of Durations")
    plt.xlabel("Duration")
    plt.ylabel("Frequency")
    plt.show()

# Checking for duplicates
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Correlation analysis if numeric columns are available
if df.select_dtypes(include=['float64', 'int64']).shape[1] > 1:
    print("\nCorrelation Matrix:")
    correlation_matrix = df.corr()
    print(correlation_matrix)

    # Heatmap visualization
    plt.figure(figsize=(10, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()

# Any additional column-specific analysis
print("\nUnique Columns:")
for col in df.columns:
    unique_values = df[col].nunique()
    print(f"{col}: {unique_values} unique values")

    if unique_values < 15:  # Show value counts for categorical columns with fewer unique values
        print(df[col].value_counts())
        print()


CNN Architecture

In [None]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Path to CSV file
CSV_PATH = "/kaggle/working/fakemusiccaps_metadata.csv"

# Load data from CSV
def load_data_from_csv(csv_path):
    data = pd.read_csv(csv_path)
    file_paths = data['file_path'].values
    labels = data['label'].values
    return file_paths, labels

# Preprocessing function to generate spectrogram
def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=16000)  # Consistent sampling rate
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    return spectrogram_db

# Prepare dataset
def prepare_dataset(file_paths, labels, max_length=256):
    spectrograms = []
    fixed_labels = []
    
    for file_path, label in zip(file_paths, labels):
        try:
            spectrogram = extract_features(file_path)
            if spectrogram.shape[1] > max_length:
                spectrogram = spectrogram[:, :max_length]
            else:
                padding = max_length - spectrogram.shape[1]
                spectrogram = np.pad(spectrogram, ((0, 0), (0, padding)), mode='constant')
            
            spectrogram /= (np.max(spectrogram) + 1e-8)  # Normalize after padding
            spectrograms.append(spectrogram)
            fixed_labels.append(label)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    return np.array(spectrograms), np.array(fixed_labels)

# Load data
file_paths, labels = load_data_from_csv(CSV_PATH)

# Encode labels
unique_labels = np.unique(labels)
label_map = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = np.array([label_map[label] for label in labels])

# Prepare dataset
MAX_LENGTH = 256
spectrograms, encoded_labels = prepare_dataset(file_paths, encoded_labels, max_length=MAX_LENGTH)

# Add channel dimension
spectrograms = spectrograms[..., np.newaxis]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    spectrograms, encoded_labels, test_size=0.2, random_state=42
)

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

# Build CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, MAX_LENGTH, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(unique_labels), activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save the model
model.save('audio_classification_cnn.h5')

# Save the label mapping
label_mapping_df = pd.DataFrame(list(label_map.items()), columns=['Label', 'Encoded Value'])
label_mapping_df.to_csv('label_mapping.csv', index=False)


In [None]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Path to CSV file
CSV_PATH = "/kaggle/working/fakemusiccaps_metadata.csv"

# Load data from CSV
def load_data_from_csv(csv_path):
    data = pd.read_csv(csv_path)
    file_paths = data['file_path'].values
    labels = data['label'].values
    return file_paths, labels

# Preprocessing function to generate spectrogram
def extract_features(file_path, sr=22050, n_fft=2048, hop_length=512, n_mels=128):
    try:
        audio, sr = librosa.load(file_path, sr=sr)
        spectrogram = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
        )
        spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
        return spectrogram_db
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Prepare dataset
def prepare_dataset(file_paths, labels, max_length=256, sr=16000):
    spectrograms, fixed_labels = [], []
    for file_path, label in zip(file_paths, labels):
        spectrogram = extract_features(file_path, sr=sr)
        if spectrogram is not None:
            # Truncate or pad to fixed length
            if spectrogram.shape[1] > max_length:
                spectrogram = spectrogram[:, :max_length]
            else:
                padding = max_length - spectrogram.shape[1]
                spectrogram = np.pad(spectrogram, ((0, 0), (0, padding)), mode='constant')
            spectrograms.append(spectrogram)
            fixed_labels.append(label)
    return np.array(spectrograms), np.array(fixed_labels)

# Load data
file_paths, labels = load_data_from_csv(CSV_PATH)

# Encode labels
unique_labels = np.unique(labels)
label_map = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = np.array([label_map[label] for label in labels])

# Define the fixed spectrogram length
MAX_LENGTH = 256

# Prepare dataset
spectrograms, encoded_labels = prepare_dataset(file_paths, encoded_labels, max_length=MAX_LENGTH)

# Normalize spectrograms and add channel dimension
spectrograms = spectrograms / np.max(spectrograms)  # Normalize
spectrograms = spectrograms[..., np.newaxis]  # Add channel dimension

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    spectrograms, encoded_labels, test_size=0.2, random_state=42
)

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(encoded_labels),
    y=encoded_labels
)
class_weights = dict(enumerate(class_weights))

# Build CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, MAX_LENGTH, 1)),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(len(unique_labels), activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with class weights
history = model.fit(
    X_train, y_train,
    epochs=10, batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]
)

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save the model
model.save('audio_classification_cnn.h5')

# Save the label mapping for decoding predictions
label_mapping_df = pd.DataFrame(list(label_map.items()), columns=['Label', 'Encoded Value'])
label_mapping_df.to_csv('label_mapping.csv', index=False)


In [None]:
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Flatten
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Path to CSV file
CSV_PATH = "/kaggle/working/fakemusiccaps_metadata.csv"

# Load data from CSV
def load_data_from_csv(csv_path):
    data = pd.read_csv(csv_path)
    file_paths = data['file_path'].values
    labels = data['label'].values
    return file_paths, labels

# Preprocessing function to generate spectrogram
def extract_features(file_path, sr=22050, n_fft=2048, hop_length=512, n_mels=128):
    try:
        audio, sr = librosa.load(file_path, sr=sr)
        spectrogram = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
        )
        spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
        return spectrogram_db
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Prepare dataset
def prepare_dataset(file_paths, labels, max_length=256, sr=16000):
    spectrograms, fixed_labels = [], []
    for file_path, label in zip(file_paths, labels):
        spectrogram = extract_features(file_path, sr=sr)
        if spectrogram is not None:
            # Truncate or pad to fixed length
            if spectrogram.shape[1] > max_length:
                spectrogram = spectrogram[:, :max_length]
            else:
                padding = max_length - spectrogram.shape[1]
                spectrogram = np.pad(spectrogram, ((0, 0), (0, padding)), mode='constant')
            spectrograms.append(spectrogram)
            fixed_labels.append(label)
    return np.array(spectrograms), np.array(fixed_labels)

# Load data
file_paths, labels = load_data_from_csv(CSV_PATH)

# Encode labels
unique_labels = np.unique(labels)
label_map = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = np.array([label_map[label] for label in labels])

# Define the fixed spectrogram length
MAX_LENGTH = 256

# Prepare dataset
spectrograms, encoded_labels = prepare_dataset(file_paths, encoded_labels, max_length=MAX_LENGTH)

# Flatten spectrograms for ANN
X_flat = spectrograms.reshape(spectrograms.shape[0], -1)  # Flatten each spectrogram

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_flat, encoded_labels, test_size=0.2, random_state=42
)

# Normalize flattened spectrograms
X_train = X_train / np.max(X_train, axis=1, keepdims=True)  # Normalize each sample
X_test = X_test / np.max(X_test, axis=1, keepdims=True)  # Normalize each sample

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(encoded_labels),
    y=encoded_labels
)
class_weights = dict(enumerate(class_weights))

# Build ANN model
ann_model = Sequential([
    Dense(512, activation='relu', input_shape=(X_flat.shape[1],), kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(len(unique_labels), activation='softmax')  # Output layer with softmax activation
])

# Compile the ANN model
ann_model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the ANN model
ann_history = ann_model.fit(
    X_train, y_train,
    epochs=10, batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]
)

# Evaluate the ANN model
ann_test_loss, ann_test_acc = ann_model.evaluate(X_test, y_test)
print(f"Test Accuracy (ANN): {ann_test_acc:.2f}")

# Save the ANN model
ann_model.save('audio_classification_ann.keras')

# Save the label mapping for decoding predictions
label_mapping_df = pd.DataFrame(list(label_map.items()), columns=['Label', 'Encoded Value'])
label_mapping_df.to_csv('ann_label_mapping.csv', index=False)


In [None]:
import os
import csv
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the root directory (adjust this based on your Kaggle dataset path)
root_dir = "/kaggle/input/deepfakedataset/FakeMusicCaps"  # Adjust this path as needed
output_csv = "fakemusiccaps_metadata1.csv"

# Initialize a list to store file paths and labels
data = []

# Traverse through each subfolder in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    
    # Skip non-folder entries
    if not os.path.isdir(folder_path):
        continue
    
    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        
        # Check if the file is an audio file (adjust extensions if necessary)
        if file_name.endswith(('.mp3', '.wav', '.flac', '.aac')):
            data.append([file_path, folder_name])  # Add file path and folder label to data

# Write the data to a CSV file
with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['file_path', 'label'])  # Header
    writer.writerows(data)

print(f"Metadata CSV file has been saved as {output_csv}")

# Path to CSV file
CSV_PATH = "/kaggle/working/fakemusiccaps_metadata1.csv"

# Load data from CSV
def load_data_from_csv(csv_path):
    data = pd.read_csv(csv_path)
    file_paths = data['file_path'].values
    labels = data['label'].values
    return file_paths, labels

# Preprocessing function to generate MFCC features
def extract_mfcc(file_path, n_mfcc=13):
    audio, sr = librosa.load(file_path, sr=16000)  # Consistent sampling rate
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return mfcc

# Prepare dataset
def prepare_dataset(file_paths, labels, max_length=256, n_mfcc=13):
    mfcc_features = []
    fixed_labels = []
    
    for file_path, label in zip(file_paths, labels):
        try:
            mfcc = extract_mfcc(file_path, n_mfcc=n_mfcc)
            if mfcc.shape[1] > max_length:
                mfcc = mfcc[:, :max_length]
            else:
                padding = max_length - mfcc.shape[1]
                mfcc = np.pad(mfcc, ((0, 0), (0, padding)), mode='constant')
            
            mfcc /= (np.max(np.abs(mfcc)) + 1e-8)  # Normalize after padding
            mfcc_features.append(mfcc)
            fixed_labels.append(label)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    return np.array(mfcc_features), np.array(fixed_labels)

# Load data
file_paths, labels = load_data_from_csv(CSV_PATH)

# Encode labels
unique_labels = np.unique(labels)
label_map = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = np.array([label_map[label] for label in labels])

# Prepare dataset
MAX_LENGTH = 256
N_MFCC = 13
mfcc_features, encoded_labels = prepare_dataset(file_paths, encoded_labels, max_length=MAX_LENGTH, n_mfcc=N_MFCC)

# Add channel dimension
mfcc_features = mfcc_features[..., np.newaxis]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    mfcc_features, encoded_labels, test_size=0.2, random_state=42
)

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

# Build CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(N_MFCC, MAX_LENGTH, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(unique_labels), activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save the model
model.save('audio_classification_cnn_mfcc.keras')

# Save the label mapping
label_mapping_df = pd.DataFrame(list(label_map.items()), columns=['Label', 'Encoded Value'])
label_mapping_df.to_csv('label_mapping.csv', index=False)

print("Model and label mapping saved successfully.")


In [None]:
import os
import csv
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the root directory (adjust this based on your Kaggle dataset path)
root_dir = "/kaggle/input/deepfakedataset/FakeMusicCaps"  # Adjust this path as needed
output_csv = "fakemusiccaps_metadata1.csv"

# Initialize a list to store file paths and labels
data = []

# Traverse through each subfolder in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    
    # Skip non-folder entries
    if not os.path.isdir(folder_path):
        continue
    
    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        
        # Check if the file is an audio file (adjust extensions if necessary)
        if file_name.endswith(('.mp3', '.wav', '.flac', '.aac')):
            data.append([file_path, folder_name])  # Add file path and folder label to data

# Write the data to a CSV file
with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['file_path', 'label'])  # Header
    writer.writerows(data)

print(f"Metadata CSV file has been saved as {output_csv}")

# Path to CSV file
CSV_PATH = "/kaggle/working/fakemusiccaps_metadata1.csv"

# Load data from CSV
def load_data_from_csv(csv_path):
    data = pd.read_csv(csv_path)
    file_paths = data['file_path'].values
    labels = data['label'].values
    return file_paths, labels

# Preprocessing function to generate MFCC features
def extract_mfcc(file_path, n_mfcc=13):
    audio, sr = librosa.load(file_path, sr=22050)  # Consistent sampling rate
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return mfcc

# Prepare dataset
def prepare_dataset(file_paths, labels, max_length=256, n_mfcc=13):
    mfcc_features = []
    fixed_labels = []
    
    for file_path, label in zip(file_paths, labels):
        try:
            mfcc = extract_mfcc(file_path, n_mfcc=n_mfcc)
            if mfcc.shape[1] > max_length:
                mfcc = mfcc[:, :max_length]
            else:
                padding = max_length - mfcc.shape[1]
                mfcc = np.pad(mfcc, ((0, 0), (0, padding)), mode='constant')
            
            mfcc /= (np.max(np.abs(mfcc)) + 1e-8)  # Normalize after padding
            mfcc_features.append(mfcc)
            fixed_labels.append(label)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    return np.array(mfcc_features), np.array(fixed_labels)

# Load data
file_paths, labels = load_data_from_csv(CSV_PATH)

# Encode labels
unique_labels = np.unique(labels)
label_map = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = np.array([label_map[label] for label in labels])

# Prepare dataset
MAX_LENGTH = 256
N_MFCC = 13
mfcc_features, encoded_labels = prepare_dataset(file_paths, encoded_labels, max_length=MAX_LENGTH, n_mfcc=N_MFCC)

# Add channel dimension
mfcc_features = mfcc_features[..., np.newaxis]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    mfcc_features, encoded_labels, test_size=0.2, random_state=42
)

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

# Build CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(N_MFCC, MAX_LENGTH, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(unique_labels), activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save the model
model.save('audio_classification_cnn_mfcc2.keras')

# Save the label mapping
label_mapping_df = pd.DataFrame(list(label_map.items()), columns=['Label', 'Encoded Value'])
label_mapping_df.to_csv('label_mapping.csv', index=False)

print("Model and label mapping saved successfully.")


In [None]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Path to CSV file
CSV_PATH = "/kaggle/working/fakemusiccaps_metadata.csv"

# Load data from CSV
def load_data_from_csv(csv_path):
    data = pd.read_csv(csv_path)
    file_paths = data['file_path'].values
    labels = data['label'].values
    return file_paths, labels

# Preprocessing function to generate spectrogram
def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=16000)  # Consistent sampling rate
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    return spectrogram_db

# Prepare dataset
def prepare_dataset(file_paths, labels, max_length=256):
    spectrograms = []
    fixed_labels = []
    
    for file_path, label in zip(file_paths, labels):
        try:
            spectrogram = extract_features(file_path)
            if spectrogram.shape[1] > max_length:
                spectrogram = spectrogram[:, :max_length]
            else:
                padding = max_length - spectrogram.shape[1]
                spectrogram = np.pad(spectrogram, ((0, 0), (0, padding)), mode='constant')
            
            spectrogram /= (np.max(spectrogram) + 1e-8) 
            spectrograms.append(spectrogram)
            fixed_labels.append(label)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    return np.array(spectrograms), np.array(fixed_labels)

# Load data
file_paths, labels = load_data_from_csv(CSV_PATH)

# Encode labels
unique_labels = np.unique(labels)
label_map = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = np.array([label_map[label] for label in labels])

# Prepare dataset
MAX_LENGTH = 256
spectrograms, encoded_labels = prepare_dataset(file_paths, encoded_labels, max_length=MAX_LENGTH)

# Add channel dimension
spectrograms = spectrograms[..., np.newaxis]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    spectrograms, encoded_labels, test_size=0.2, random_state=42
)

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

# Build CNN model
model_eer = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(128, MAX_LENGTH, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(unique_labels), activation='softmax')
])

# Compile model
model_eer.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model_eer.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate model
test_loss, test_acc = model_eer.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save the model
model_eer.save('audio_classification_cnn.h5')

# Save the label mapping
label_mapping_df = pd.DataFrame(list(label_map.items()), columns=['Label', 'Encoded Value'])
label_mapping_df.to_csv('label_mapping.csv', index=False)


In [None]:
import sklearn.metrics

def compute_eer_multiclass(y_true, y_pred_probs):
    """
    Compute Equal Error Rate (EER) for a multiclass classification problem.
    
    :param y_true: True labels (one-hot encoded or label encoded)
    :param y_pred_probs: Predicted probabilities from the model
    :return: Average EER across all classes
    """
    num_classes = y_pred_probs.shape[1]
    eer_list = []
    
    # Compute ROC curve and EER for each class
    for i in range(num_classes):
        # Consider class 'i' as the positive class and all others as negative
        true_binary = (y_true == i).astype(int)  # Convert to binary ground truth (1 for class i, 0 otherwise)
        pred_prob = y_pred_probs[:, i]  # Predicted probabilities for class i
        
        # Calculate FPR, TPR, and thresholds for the ROC curve
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(true_binary, pred_prob)
        
        # Calculate FNR (False Negative Rate)
        fnr = 1 - tpr
        
        # Find EER by finding where FPR equals FNR
        eer_threshold = thresholds[np.nanargmin(np.absolute(fnr - fpr))]
        
        # Get EER value (mean of FPR and FNR)
        eer = (fpr[np.nanargmin(np.absolute(fnr - fpr))] + fnr[np.nanargmin(np.absolute(fnr - fpr))]) / 2
        eer_list.append(eer)
    
    # Return the average EER across all classes
    average_eer = np.mean(eer_list)
    return average_eer

# Make predictions
y_pred_probs = model_eer.predict(X_test)

# Compute EER for multiclass classification
average_eer = compute_eer_multiclass(np.argmax(y_test, axis=1), y_pred_probs)
print(f"Average Equal Error Rate (EER): {average_eer:.2f}")


In [None]:
import os
import csv
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the root directory (adjust this based on your Kaggle dataset path)
root_dir = "/kaggle/input/deepfakedataset/FakeMusicCaps"  # Adjust this path as needed
output_csv = "fakemusiccaps_metadata1.csv"

# Initialize a list to store file paths and labels
data = []

# Traverse through each subfolder in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    
    # Skip non-folder entries
    if not os.path.isdir(folder_path):
        continue
    
    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        
        # Check if the file is an audio file (adjust extensions if necessary)
        if file_name.endswith(('.mp3', '.wav', '.flac', '.aac')):
            data.append([file_path, folder_name])  # Add file path and folder label to data

# Write the data to a CSV file
with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['file_path', 'label'])  # Header
    writer.writerows(data)

print(f"Metadata CSV file has been saved as {output_csv}")

# Path to CSV file
CSV_PATH = "/kaggle/working/fakemusiccaps_metadata1.csv"

# Load data from CSV
def load_data_from_csv(csv_path):
    data = pd.read_csv(csv_path)
    file_paths = data['file_path'].values
    labels = data['label'].values
    return file_paths, labels

# Preprocessing function to generate MFCC features
def extract_mfcc(file_path, n_mfcc=13):
    audio, sr = librosa.load(file_path, sr=22050)  # Consistent sampling rate
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return mfcc

# Prepare dataset
def prepare_dataset(file_paths, labels, max_length=256, n_mfcc=13):
    mfcc_features = []
    fixed_labels = []
    
    for file_path, label in zip(file_paths, labels):
        try:
            mfcc = extract_mfcc(file_path, n_mfcc=n_mfcc)
            if mfcc.shape[1] > max_length:
                mfcc = mfcc[:, :max_length]
            else:
                padding = max_length - mfcc.shape[1]
                mfcc = np.pad(mfcc, ((0, 0), (0, padding)), mode='constant')
            
            mfcc /= (np.max(np.abs(mfcc)) + 1e-8)  # Normalize after padding
            mfcc_features.append(mfcc)
            fixed_labels.append(label)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    return np.array(mfcc_features), np.array(fixed_labels)

# Load data
file_paths, labels = load_data_from_csv(CSV_PATH)

# Encode labels
unique_labels = np.unique(labels)
label_map = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = np.array([label_map[label] for label in labels])

# Prepare dataset
MAX_LENGTH = 256
N_MFCC = 13
mfcc_features, encoded_labels = prepare_dataset(file_paths, encoded_labels, max_length=MAX_LENGTH, n_mfcc=N_MFCC)

# Add channel dimension
mfcc_features = mfcc_features[..., np.newaxis]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    mfcc_features, encoded_labels, test_size=0.2, random_state=42
)

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

# Build CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(N_MFCC, MAX_LENGTH, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(unique_labels), activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save the model
model.save('audio_classification_cnn_mfcc2.keras')

# Save the label mapping
label_mapping_df = pd.DataFrame(list(label_map.items()), columns=['Label', 'Encoded Value'])
label_mapping_df.to_csv('label_mapping.csv', index=False)

print("Model and label mapping saved successfully.")


In [7]:
!git clone https://github.com/CodeVault-girish/MFM-models.git

Cloning into 'MFM-models'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 56 (delta 14), reused 51 (delta 11), pack-reused 0 (from 0)[K
Receiving objects: 100% (56/56), 10.43 KiB | 10.43 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [2]:

!ls

import sys
sys.path.append("/kaggle/working/MFM-models")  # Adjust the path if needed




MFM-models  requirements.txt  state.db


In [4]:
import os
print(os.path.exists("/kaggle/input/deepfakedataset/FakeMusicCaps/MusicCaps"))


True


In [5]:
from MFM_extractor import model_list, extract_from
model_list()
extract_from(
    selection="1",
    folder_path="/kaggle/input/deepfakedataset/FakeMusicCaps/MusicCaps",
    output_file="/kaggle/working/output.csv",  # Save output in /kaggle/working
    device="cuda"
)


Available models:
1. MERT-v0


Some weights of the model checkpoint at m-a-p/MERT-v0 were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Processing audio files: 100%|██████████| 5373/5373 [06:42<00:00, 13.35it/s]


Saved all features to /kaggle/working/output.csv


In [6]:
from MFM_extractor import model_list, extract_from
model_list()
extract_from(
 selection="1",
 folder_path="/kaggle/input/deepfakedataset/FakeMusicCaps/MusicGen_medium",
 output_file="/kaggle/working/output2.csv",   
 device="cuda"                             
 )

Available models:
1. MERT-v0


Some weights of the model checkpoint at m-a-p/MERT-v0 were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing

Saved all features to /kaggle/working/output2.csv


In [7]:
from MFM_extractor import model_list, extract_from
model_list()
extract_from(
 selection="1",
 folder_path="/kaggle/input/deepfakedataset/FakeMusicCaps/SunoCaps",
 output_file="/kaggle/working/output3.csv",   
 device="cuda"                             
 )

Available models:
1. MERT-v0


Some weights of the model checkpoint at m-a-p/MERT-v0 were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Processing audio files: 100%|██████████| 63/63 [01:09<00:00,  1.11s/it]

Saved all features to /kaggle/working/output3.csv





In [8]:
from MFM_extractor import model_list, extract_from
model_list()
extract_from(
 selection="1",
 folder_path="/kaggle/input/deepfakedataset/FakeMusicCaps/audioldm2",
 output_file="/kaggle/working/output4.csv",   
 device="cuda"                             
 )

Available models:
1. MERT-v0


Some weights of the model checkpoint at m-a-p/MERT-v0 were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing

Saved all features to /kaggle/working/output4.csv


In [9]:
from MFM_extractor import model_list, extract_from
model_list()
extract_from(
 selection="1",
 folder_path="/kaggle/input/deepfakedataset/FakeMusicCaps/musicldm",
 output_file="/kaggle/working/output5.csv",   
 device="cuda"                             
 )

Available models:
1. MERT-v0


Some weights of the model checkpoint at m-a-p/MERT-v0 were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing

Saved all features to /kaggle/working/output5.csv


In [10]:
from MFM_extractor import model_list, extract_from
model_list()
extract_from(
 selection="1",
 folder_path="/kaggle/input/deepfakedataset/FakeMusicCaps/mustango",
 output_file="/kaggle/working/output6.csv",   
 device="cuda"                             
 )

Available models:
1. MERT-v0


Some weights of the model checkpoint at m-a-p/MERT-v0 were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing

Saved all features to /kaggle/working/output6.csv


In [11]:
from MFM_extractor import model_list, extract_from
model_list()
extract_from(
 selection="1",
 folder_path="/kaggle/input/deepfakedataset/FakeMusicCaps/stable_audio_open",
 output_file="/kaggle/working/output7.csv",   
 device="cuda"                             
 )

Available models:
1. MERT-v0


Some weights of the model checkpoint at m-a-p/MERT-v0 were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Processing audio files: 100%|██████████| 5521/5521 [07:29<00:00, 12.28it/s]


Saved all features to /kaggle/working/output7.csv


CLAP

In [1]:
pip install laion-clap

Collecting laion-clap
  Downloading laion_clap-1.1.6-py3-none-any.whl.metadata (26 kB)
Collecting numpy==1.23.5 (from laion-clap)
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting torchlibrosa (from laion-clap)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Collecting ftfy (from laion-clap)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting braceexpand (from laion-clap)
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting webdataset (from laion-clap)
  Downloading webdataset-0.2.111-py3-none-any.whl.metadata (15 kB)
Collecting wget (from laion-clap)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting progressbar (from laion-clap)
  Downloading progressbar-2.5.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading laion_clap-1.1.6-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━

In [4]:
!git clone https://github.com/LAION-AI/CLAP.git


fatal: destination path 'CLAP' already exists and is not an empty directory.


In [6]:
!pip install ffmpeg-python


Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [7]:
%cd CLAP


/kaggle/working/CLAP


In [10]:
!pip install huggingface_hub




In [11]:
from huggingface_hub import login

login("your_huggingface_token_here")  # Replace with your actual token


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


ValueError: Invalid token passed!

HUBERT

In [13]:
pip install pip==23.3.1


Collecting pip==23.3.1
  Downloading pip-23.3.1-py3-none-any.whl.metadata (3.5 kB)
Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-23.3.1
Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install fairseq


Collecting fairseq
  Using cached fairseq-0.12.2.tar.gz (9.6 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq)
  Using cached hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq)
  Using cached omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting sacrebleu>=1.4.12 (from fairseq)
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting bitarray (from fairseq)
  Using cached bitarray-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Collecting antlr4-python3-runtime==4.8 (from hydra-core<1.1,>=1.0.7->fairseq)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m2.5 MB/s[0m e

In [15]:
!git clone https://github.com/bshall/hubert.git


Cloning into 'hubert'...
remote: Enumerating objects: 141, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 141 (delta 52), reused 48 (delta 45), pack-reused 78 (from 1)[K
Receiving objects: 100% (141/141), 473.68 KiB | 5.51 MiB/s, done.
Resolving deltas: 100% (77/77), done.


In [16]:
%cd hubert


/kaggle/working/CLAP/hubert


In [19]:
!wget https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt


--2025-03-09 07:54:15--  https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 54.230.202.3, 54.230.202.7, 54.230.202.65, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|54.230.202.3|:443... connected.
HTTP request sent, awaiting response... 

  pid, fd = os.forkpty()


200 OK
Length: 1136468879 (1.1G) [application/zip]
Saving to: ‘hubert_base_ls960.pt’


2025-03-09 07:54:24 (132 MB/s) - ‘hubert_base_ls960.pt’ saved [1136468879/1136468879]



In [22]:
!pip install fairseq


  pid, fd = os.forkpty()


[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [23]:
!wget -O hubert_base_ls960.pt https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt


--2025-03-09 07:55:56--  https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 54.230.202.65, 54.230.202.3, 54.230.202.7, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|54.230.202.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1136468879 (1.1G) [application/zip]
Saving to: ‘hubert_base_ls960.pt’


2025-03-09 07:56:09 (89.0 MB/s) - ‘hubert_base_ls960.pt’ saved [1136468879/1136468879]



In [24]:
import torch
import fairseq

# Load the pretrained model
models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(["hubert_base_ls960.pt"])

# Extract the model
hubert = models[0]
hubert.eval()  # Set to evaluation mode

print("HuBERT model loaded successfully!")


  state = torch.load(f, map_location=torch.device("cpu"))


HuBERT model loaded successfully!


In [1]:
import torch
import os
import numpy as np
import torchaudio

# Function to extract embeddings
# Function to extract embeddings
def extract_embeddings(audio_path):
    waveform, sample_rate = torchaudio.load(audio_path)  # Load audio
    waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono if stereo
    if sample_rate != 16000:  # HuBERT requires 16kHz
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    # Move waveform to the same device as HuBERT
    device = next(hubert.parameters()).device
    waveform = waveform.to(device)

    with torch.no_grad():
        features = hubert.extract_features(waveform)[0]  # Extract embeddings
    return features.squeeze(0)  # Remove batch dimension


# Paths
dataset_path = "/kaggle/input/deepfakedataset/FakeMusicCaps"  # Root dataset folder
save_path = "/kaggle/working/hubert_npy"  # Where to save .npy files
os.makedirs(save_path, exist_ok=True)

# Process each folder
for folder in os.listdir(dataset_path):
    folder_path = os.path.join(dataset_path, folder)

    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder}")

        folder_embeddings = []  # Store all embeddings for the folder

        for file in os.listdir(folder_path):
            if file.endswith(".wav"):
                file_path = os.path.join(folder_path, file)
                
                # Extract embeddings
                embeddings = extract_embeddings(file_path)
                folder_embeddings.append(embeddings.cpu().numpy())  # Convert to NumPy and store

        # Save all embeddings of this folder as a single .npy file
        save_file = os.path.join(save_path, f"{folder}.npy")
        np.save(save_file, np.array(folder_embeddings, dtype=object))


        print(f"Saved embeddings for {folder} to {save_file}")


Processing folder: stable_audio_open


NameError: name 'hubert' is not defined

In [2]:
import pandas as pd
import os

# Define file paths and labels
file_paths = {
    "MusicCaps.csv": 1,  # Label 1 for MusicCaps.csv
    "MusicGen_medium.csv": 0,
    "SunoCaps.csv": 0,
    "audioldm2.csv": 0,
    "musicldm.csv": 0,
    "mustango.csv": 0,
    "stable_audio_open.csv": 0
}

# Input folder (where CSVs are stored)
input_folder = "/kaggle/input/output"  # Change this based on your dataset folder

# Output folder (where modified files will be saved)
output_folder = "/kaggle/working/"  # This is writable in Kaggle

# Process each file
for file_name, label in file_paths.items():
    input_path = os.path.join(input_folder, file_name)
    output_path = os.path.join(output_folder, file_name)  # Save to working directory

    # Check if file exists
    if os.path.exists(input_path):
        # Read CSV
        df = pd.read_csv(input_path)

        # Insert the label column as the first column
        df.insert(0, "label", label)

        # Save the modified file to Kaggle's working directory
        df.to_csv(output_path, index=False)

        print(f"✅ Processed {file_name} with label {label} and saved to {output_path}")
    else:
        print(f"⚠️ File not found: {input_path}")

print("🎯 Labeling complete! Check /kaggle/working/ for updated files.")


✅ Processed MusicCaps.csv with label 1 and saved to /kaggle/working/MusicCaps.csv
✅ Processed MusicGen_medium.csv with label 0 and saved to /kaggle/working/MusicGen_medium.csv
✅ Processed SunoCaps.csv with label 0 and saved to /kaggle/working/SunoCaps.csv
✅ Processed audioldm2.csv with label 0 and saved to /kaggle/working/audioldm2.csv
✅ Processed musicldm.csv with label 0 and saved to /kaggle/working/musicldm.csv
✅ Processed mustango.csv with label 0 and saved to /kaggle/working/mustango.csv
✅ Processed stable_audio_open.csv with label 0 and saved to /kaggle/working/stable_audio_open.csv
🎯 Labeling complete! Check /kaggle/working/ for updated files.
