In [31]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm 

In [32]:
real_dir = '/kaggle/input/in-the-wild-audio-deepfake/release_in_the_wild/real'
fake_dir = '/kaggle/input/in-the-wild-audio-deepfake/release_in_the_wild/fake'


In [33]:
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    
    # Extracting relevant features
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
    rms = np.mean(librosa.feature.rms(y=y))
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))
    
    # Extract 20 MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfccs_mean = np.mean(mfccs, axis=1)
    
    # Combine all features into a single array
    features = np.hstack([chroma_stft, rms, spectral_centroid, spectral_rolloff, zero_crossing_rate, mfccs_mean])
    return features


In [34]:
# Function to process a directory and extract features for all audio files
def process_directory(directory, label):
    feature_data = []
    files = [f for f in os.listdir(directory) if f.endswith('.wav')]
    
    # Progress bar with tqdm
    for filename in tqdm(files, desc=f"Processing {label} files"):
        file_path = os.path.join(directory, filename)
        features = extract_features(file_path)
        features = np.append(features, label)  # Add label (real or fake)
        feature_data.append(features)
    
    return feature_data


In [35]:
# Extract features from both real and fake directories
real_features = process_directory(real_dir, label='real')
fake_features = process_directory(fake_dir, label='fake')

# Combine the features into one list
all_features = real_features + fake_features

# Define column names
columns = ['chroma_stft', 'rms', 'spectral_centroid', 'spectral_rolloff', 'zero_crossing_rate'] + [f'mfcc{i}' for i in range(1, 21)] + ['label']

# Create a DataFrame
df = pd.DataFrame(all_features, columns=columns)

# Save the DataFrame to a CSV file
df.to_csv('/kaggle/working/features.csv', index=False)

print("All features extracted and saved to features.csv")

Processing real files: 100%|██████████| 19963/19963 [26:37<00:00, 12.49it/s]
Processing fake files: 100%|██████████| 11816/11816 [20:36<00:00,  9.56it/s]


All features extracted and saved to features.csv


In [36]:
df = df.sample(frac=1).reset_index(drop=True)

In [37]:
df.to_csv('/kaggle/working/features.csv', index=False)

In [48]:
df.shape

(31779, 26)

In [49]:
df.head

<bound method NDFrame.head of        chroma_stft       rms  spectral_centroid  spectral_rolloff  \
0         0.454670  0.036317         778.374716       1630.809295   
1         0.387188  0.026810        1679.154006       3148.925781   
2         0.401663  0.031832        1220.353790       2190.835674   
3         0.336937  0.089518         799.617459       1368.794643   
4         0.274407  0.026884        1604.618712       3301.200930   
...            ...       ...                ...               ...   
31774     0.327164  0.042014        1426.071709       2664.329848   
31775     0.313332  0.036872        1404.865225       2943.847656   
31776     0.392111  0.061376        1550.970244       2725.947627   
31777     0.464994  0.027045        1544.312289       3033.100329   
31778     0.394513  0.029955        1398.155276       2532.871893   

       zero_crossing_rate       mfcc1       mfcc2      mfcc3      mfcc4  \
0                0.046800 -443.403351  114.807091  -2.051513  25.7