# In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import soundfile as sf

# ---------------- Model Definition ----------------
class DeepfakeDetectorCNN(nn.Module):
    """Convolutional classifier over single-channel spectrogram images.

    Four Conv -> BatchNorm -> ReLU -> MaxPool -> Dropout stages (32, 64, 128
    and 256 channels) feed a three-layer fully connected head that emits
    ``num_classes`` raw logits (no softmax is applied).

    Input is ``(batch, 1, height, width)``. Each of the four max-pool layers
    halves both spatial dimensions, so height and width must be at least 16.
    A 128x128 input reaches the head as an 8x8 grid; any other size is
    average-pooled onto that same 8x8 grid, so the network is not tied to a
    single spectrogram shape.

    Args:
        num_classes: Number of output logits per sample.
    """

    # Spatial grid that the first Linear layer (256 * 8 * 8 inputs) expects.
    _FEATURE_GRID = (8, 8)

    def __init__(self, num_classes=2):
        super().__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.4),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.4),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.5),

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.5),
        )

        # Parameter-free, so it adds no state_dict entries. For inputs that
        # already arrive as an 8x8 grid every output cell averages exactly one
        # value, which leaves the features unchanged.
        self.spatial_pool = nn.AdaptiveAvgPool2d(self._FEATURE_GRID)

        grid_h, grid_w = self._FEATURE_GRID
        self.fc_layers = nn.Sequential(
            nn.Linear(256 * grid_h * grid_w, 512),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = self.conv_layers(x)
        x = self.spatial_pool(x)
        # Collapse channels and the spatial grid into one feature vector.
        x = x.flatten(1)
        x = self.fc_layers(x)
        return x

# ---------------- Feature Extractor ----------------
class AudioFeatureExtractor:
    """Convert an audio file into a fixed-width log-mel spectrogram.

    Args:
        sample_rate: Rate (Hz) the audio is resampled to when loaded.
        n_mels: Number of mel bands, i.e. rows of the returned array.
        max_len: Number of time frames, i.e. columns of the returned array.
        duration: Maximum number of seconds read from each file.
        hop_length: Hop, in samples, between consecutive spectrogram frames.
    """

    def __init__(self, sample_rate=22050, n_mels=128, max_len=128, duration=5.0, hop_length=512):
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.max_len = max_len
        self.duration = duration
        self.hop_length = hop_length

    def extract_mel_spectrogram(self, file_path):
        """Load ``file_path`` and return its log-mel spectrogram.

        Args:
            file_path: Path of the audio file to read.

        Returns:
            ``float32`` array of shape ``(n_mels, max_len)`` holding dB values
            relative to the clip's own peak.

        Raises:
            ValueError: If no audio samples could be read from the file.
        """
        y, sr = librosa.load(file_path, sr=self.sample_rate, mono=True, duration=self.duration)
        if y.size == 0:
            # Fail with the offending path rather than deep inside the STFT.
            raise ValueError(f"No audio samples could be read from {file_path!r}")

        mel_spec = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=self.n_mels, hop_length=self.hop_length
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        return self._fit_to_length(mel_spec_db)

    def _fit_to_length(self, mel_spec_db):
        """Right-pad or crop the time axis to ``max_len`` frames and cast to float32."""
        n_frames = mel_spec_db.shape[1]
        if n_frames < self.max_len:
            # Values are dB relative to the peak, so 0 is the loudest possible
            # level. Pad with the quietest level present instead of 0 so the
            # padding reads as silence rather than full-scale energy.
            floor_db = mel_spec_db.min()
            mel_spec_db = np.pad(
                mel_spec_db,
                ((0, 0), (0, self.max_len - n_frames)),
                mode="constant",
                constant_values=floor_db,
            )
        else:
            mel_spec_db = mel_spec_db[:, :self.max_len]

        return mel_spec_db.astype(np.float32)

# ---------------- Dataset ----------------
class AudioDataset(Dataset):
    """Dataset that pairs audio files with integer class labels.

    Features are computed lazily: every item access calls
    ``extractor.extract_mel_spectrogram`` on the corresponding path.

    Args:
        file_paths: Indexable collection of audio file paths.
        labels: Indexable collection of class labels, aligned with ``file_paths``.
        extractor: Object exposing ``extract_mel_spectrogram(path)``.
    """

    def __init__(self, file_paths, labels, extractor):
        self.extractor = extractor
        self.file_paths = file_paths
        self.labels = labels

    def __len__(self):
        """Return the number of audio files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the item at ``idx``.

        ``features`` is the extractor output with a leading channel axis of
        size one added; ``label`` is a scalar ``torch.long`` tensor.
        """
        path = self.file_paths[idx]
        features = torch.tensor(self.extractor.extract_mel_spectrogram(path))
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        # Indexing with None prepends the channel dimension.
        return features[None], target

# ---------------- Training Loop ----------------
def train_model(train_loader, val_loader, model, criterion, optimizer, device, epochs=20,
                save_path="sound_deepfake_detector.pth"):
    """Train ``model`` and checkpoint the weights with the best validation accuracy.

    Each epoch runs one optimisation pass over ``train_loader``, scores the
    model on ``val_loader`` with ``evaluate_model`` and prints a summary line.
    Whenever the validation accuracy beats the best value seen so far, the
    model's ``state_dict`` is written to ``save_path``.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        model: Network to optimise; it is modified in place.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimiser bound to the model's parameters.
        device: Device every batch is moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.
        save_path: Destination file for the best checkpoint.

    Returns:
        The best validation accuracy observed (``0.0`` if it never rose above zero).

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    best_acc = 0.0
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0
        # Counted by hand so loaders without __len__ are supported as well.
        num_batches = 0

        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            num_batches += 1

        if total == 0:
            # Fail with a clear message instead of an opaque ZeroDivisionError.
            raise ValueError("train_loader yielded no samples; cannot compute training metrics")

        train_acc = correct / total
        avg_loss = running_loss / num_batches
        val_acc = evaluate_model(val_loader, model, device)

        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")

        # Save best model
        if val_acc > best_acc:
            torch.save(model.state_dict(), save_path)
            best_acc = val_acc
            print("✅ Model saved!")

    return best_acc

def evaluate_model(loader, model, device):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    The model is switched to evaluation mode (and left in it) and gradients
    are disabled while scoring. The predicted class of a sample is the index
    of its largest output along dimension 1.

    Args:
        loader: Iterable of ``(inputs, labels)`` batches.
        model: Network to score.
        device: Device every batch is moved to before the forward pass.

    Returns:
        Accuracy as a float between 0 and 1.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            # max over dim 1 returns (values, indices); only the indices matter.
            predictions = model(batch_inputs).max(1)[1]
            seen += batch_labels.size(0)
            hits += (predictions == batch_labels).sum().item()
    return hits / seen

# ---------------- Main ----------------
if __name__ == "__main__":
    # Expected layout: DATASET_DIR/real/* and DATASET_DIR/fake/*.
    DATASET_DIR = "/fake_or_real_dataset"

    files_by_class = {}
    for class_name in ("real", "fake"):
        class_dir = os.path.join(DATASET_DIR, class_name)
        # os.listdir order is filesystem-dependent; sorting makes the seeded
        # split below reproducible. Hidden entries (e.g. .DS_Store) and
        # sub-directories are skipped because they cannot be loaded as audio.
        files_by_class[class_name] = [
            os.path.join(class_dir, name)
            for name in sorted(os.listdir(class_dir))
            if not name.startswith(".") and os.path.isfile(os.path.join(class_dir, name))
        ]
        if len(files_by_class[class_name]) < 2:
            # A stratified split needs at least two samples per class.
            raise ValueError(f"Need at least 2 audio files in {class_dir}, found {len(files_by_class[class_name])}")

    real_files = files_by_class["real"]
    fake_files = files_by_class["fake"]

    # Label convention: 0 = real, 1 = fake.
    X = real_files + fake_files
    y = [0] * len(real_files) + [1] * len(fake_files)

    # Stratify so train and validation keep the same real/fake ratio.
    train_files, val_files, train_labels, val_labels = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    extractor = AudioFeatureExtractor()
    train_dataset = AudioDataset(train_files, train_labels, extractor)
    val_dataset = AudioDataset(val_files, val_labels, extractor)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=2)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = DeepfakeDetectorCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    train_model(train_loader, val_loader, model, criterion, optimizer, device, epochs=20)
