# Deepfake Audio Detection using Deep Neural Network

This notebook implements a Deep Neural Network (DNN) model using PyTorch for detecting Urdu deepfake audio.

In [None]:
import numpy as np
import pandas as pd
import os
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import joblib
import sys
sys.path.append('..')
from utils.audio_features import extract_features

# Seed NumPy and PyTorch so repeated runs draw the same random numbers
np.random.seed(42)
torch.manual_seed(42)

# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# The 'datasets' library is optional: record whether it imported so that
# later cells can fall back to local data instead of failing outright.
try:
    from datasets import load_dataset
except ImportError:
    DATASETS_AVAILABLE = False
    print("Warning: 'datasets' library not available. Will use local data if provided.")
else:
    DATASETS_AVAILABLE = True

## Define the DNN Model

In [None]:
class DeepfakeDetectionDNN(nn.Module):
    """Fully connected binary classifier with two hidden layers.

    Each hidden layer is Linear -> ReLU -> Dropout. The output layer is a
    single unit followed by a sigmoid, so ``forward`` yields one score per
    input row that can be fed directly to ``nn.BCELoss``.

    Args:
        input_size: Number of features in each input vector.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, hidden_size1=128, hidden_size2=64, dropout_rate=0.3):
        super().__init__()

        # Layers are listed in network order and wrapped in one Sequential
        # stored as ``self.model``, so parameter names are ``model.<index>.*``.
        layers = [
            nn.Linear(input_size, hidden_size1),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size1, hidden_size2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size2, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid scores."""
        scores = self.model(x)
        return scores

## Load and Preprocess Data

We'll load the Urdu deepfake detection dataset and extract features from the audio files.

In [None]:
# Function to extract features from audio files
def extract_audio_features(audio_path, sr=22050, duration=5):
    """Summarise one audio file as a fixed-length feature vector.

    The file is loaded with ``librosa.load`` using the given ``sr`` and
    ``duration``. The returned vector is laid out as:

    * mean of each of 13 MFCCs, then std of each of 13 MFCCs,
    * (mean, std) of the spectral centroid,
    * (mean, std) of the spectral bandwidth,
    * (mean, std) of the spectral rolloff,
    * (mean, std) of the zero-crossing rate,

    for 34 values in total.

    Returns:
        The feature vector as a NumPy array, or ``None`` if loading or
        feature extraction raised; the error is printed, not re-raised.
    """
    try:
        # Rebinding ``sr`` picks up the sample rate librosa reports back
        signal, sr = librosa.load(audio_path, sr=sr, duration=duration)

        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)

        # Each call returns a 2-D array; row 0 is the per-frame track
        frame_tracks = (
            librosa.feature.spectral_centroid(y=signal, sr=sr)[0],
            librosa.feature.spectral_bandwidth(y=signal, sr=sr)[0],
            librosa.feature.spectral_rolloff(y=signal, sr=sr)[0],
            librosa.feature.zero_crossing_rate(signal)[0],
        )

        # MFCC statistics first, then one (mean, std) pair per track
        pieces = [np.mean(mfcc, axis=1), np.std(mfcc, axis=1)]
        pieces.extend([np.mean(track), np.std(track)] for track in frame_tracks)

        return np.concatenate(pieces)
    except Exception as e:
        print(f"Error extracting features from {audio_path}: {e}")
        return None

In [None]:
# Load the dataset and turn every training clip into a feature vector.
# Any failure in this branch (download, unexpected schema, no usable clips)
# is reported and flips DATASETS_AVAILABLE so the fallback below runs instead.
if DATASETS_AVAILABLE:
    try:
        print("Loading the Urdu deepfake detection dataset...")
        ds = load_dataset("CSALT/deepfake_detection_dataset_urdu")
        print(f"Dataset loaded successfully. Available splits: {ds.keys()}")

        features = []
        labels = []

        # NOTE(review): assumes each record exposes 'audio_path' and 'label'
        # with the string 'deepfake' marking fakes — confirm against the
        # dataset schema.
        for item in ds['train']:
            audio_features = extract_audio_features(item['audio_path'])
            if audio_features is None:
                # Extraction failed and was already reported; skip the clip
                continue
            features.append(audio_features)
            labels.append(1 if item['label'] == 'deepfake' else 0)  # 1: deepfake, 0: real

        # Fail with a clear message instead of an IndexError on X.shape[1]
        if not features:
            raise ValueError("No audio features could be extracted from the dataset.")

        # Convert to numpy arrays
        X = np.array(features)
        y = np.array(labels)

        print(f"Processed {len(X)} audio files with {X.shape[1]} features per file.")
        print(f"Class distribution: {np.bincount(y)}")

    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Falling back to local data if available...")
        DATASETS_AVAILABLE = False

# Fallback to local data if datasets library is not available or loading failed
if not DATASETS_AVAILABLE:
    print("Using local data...")
    # Here you would load local audio files and extract features.
    # For demonstration, we create random dummy data. The width matches the
    # vector built by extract_audio_features (13 MFCC means + 13 MFCC stds +
    # 4 spectral/temporal tracks x (mean, std) = 34), so a model and scaler
    # produced from this path have the same input size as real features.
    X = np.random.rand(100, 34)  # 100 samples, 34 features
    y = np.random.randint(0, 2, 100)  # Binary labels (0: real, 1: deepfake)
    print(f"Created dummy data with {X.shape[0]} samples and {X.shape[1]} features.")
    print(f"Class distribution: {np.bincount(y)}")

## Preprocess and Split Data

In [None]:
# Split the raw features into train (70%), validation (15%), and test (15%)
# sets BEFORE scaling. Fitting the scaler on the full dataset would leak
# validation/test statistics into training and inflate the reported metrics.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Standardize features: fit on the training split only, then apply the same
# transform to the held-out splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Full feature matrix under the train-fitted scaling; kept so the name
# X_scaled remains defined for any code that still refers to it.
X_scaled = scaler.transform(X)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

## Prepare PyTorch Datasets and DataLoaders

In [None]:
# Turn each split into float32 tensors. Labels get a trailing axis so they
# have shape (N, 1), matching the single-unit output of the network.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.float32)[:, None]
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)[:, None]
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)[:, None]

# Pair features with labels
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training loader shuffles; validation and test keep dataset order
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Initialize and Train the DNN Model

In [None]:
# Build the network sized to the training features and move it to the device
input_size = X_train.shape[1]
model = DeepfakeDetectionDNN(input_size=input_size)
model = model.to(device)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch history, consumed by the plotting cell
num_epochs = 20
train_losses, val_losses, val_accuracies = [], [], []

for epoch in range(num_epochs):
    # ---- Training pass (dropout active) ----
    model.train()
    loss_sum = 0.0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight by batch size so the
        # epoch figure is a true per-sample average even with a short
        # final batch.
        loss_sum += batch_loss.item() * batch_x.size(0)

    epoch_train_loss = loss_sum / len(train_loader.dataset)
    train_losses.append(epoch_train_loss)

    # ---- Validation pass (dropout off, no gradients) ----
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            probs = model(batch_x)
            loss_sum += criterion(probs, batch_y).item() * batch_x.size(0)

            # Threshold the sigmoid output at 0.5 to get hard predictions
            hits = (probs > 0.5).float() == batch_y
            n_seen += batch_y.size(0)
            n_correct += hits.sum().item()

    epoch_val_loss = loss_sum / len(val_loader.dataset)
    epoch_val_acc = n_correct / n_seen

    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_acc)

    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"Train Loss: {epoch_train_loss:.4f} - "
          f"Val Loss: {epoch_val_loss:.4f} - "
          f"Val Acc: {epoch_val_acc:.4f}")

## Visualize Training Progress

In [None]:
# Two side-by-side panels: loss curves on the left, validation accuracy on
# the right, both indexed by 1-based epoch number.
epoch_axis = range(1, num_epochs+1)
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(epoch_axis, train_losses, label='Training Loss')
loss_ax.plot(epoch_axis, val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

acc_ax.plot(epoch_axis, val_accuracies, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Validation Accuracy')
acc_ax.legend()
acc_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Evaluate Model on Test Set

In [None]:
# Score the held-out test set with dropout disabled and gradients off
model.eval()
prob_chunks = []
label_chunks = []

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_probs = model(batch_x.to(device))
        prob_chunks.append(batch_probs.cpu().numpy())
        label_chunks.append(batch_y.cpu().numpy())

# Stitch the per-batch (B, 1) arrays together and flatten to 1-D
y_prob = np.concatenate(prob_chunks).reshape(-1)
y_true = np.concatenate(label_chunks).reshape(-1)

# Hard predictions: sigmoid score above 0.5 counts as class 1
y_pred = (y_prob > 0.5).astype(np.float32)

# Summary metrics; AUC uses the raw scores rather than the hard predictions
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
auc_roc = roc_auc_score(y_true, y_prob)

# Per-class breakdown
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=['Bonafide', 'Deepfake']))

# Headline numbers
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC: {auc_roc:.4f}")

## Visualize ROC Curve

In [None]:
# ROC curve of the test-set scores, with the diagonal as the chance baseline
fpr, tpr, _ = roc_curve(y_true, y_prob)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'DNN (AUC = {auc_roc:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve for Deepfake Detection')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)
plt.show()

## Save Model

In [None]:
# Save model weights, the fitted scaler, and the architecture metadata
model_path = '../models/dnn_deepfake.pt'
scaler_path = '../models/dnn_deepfake_scaler.pkl'
info_path = '../models/dnn_deepfake_info.pkl'

# Create the output directory up front; saving fails if it does not exist
os.makedirs(os.path.dirname(model_path), exist_ok=True)

torch.save(model.state_dict(), model_path)
joblib.dump(scaler, scaler_path)

print(f"Model saved to {model_path}")
print(f"Scaler saved to {scaler_path}")

# Save model architecture information for later loading. The sizes are read
# from the trained model itself (layer 0 and 3 are the hidden Linear layers,
# layer 2 is the first Dropout) so the metadata cannot drift from the
# weights if the constructor arguments are changed.
model_info = {
    'input_size': input_size,
    'hidden_size1': model.model[0].out_features,
    'hidden_size2': model.model[3].out_features,
    'dropout_rate': model.model[2].p
}
joblib.dump(model_info, info_path)
print(f"Model info saved to {info_path}")