<a href="https://colab.research.google.com/github/Twinkle-gawri/DeepFake-Speech-Detection/blob/main/DeepFake_Speech_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install Kaggle
from google.colab import files
files.upload()  # This will prompt you to upload your kaggle.json
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/   # <-- using "cp" (copy), not "mv"
!chmod 600 ~/.kaggle/kaggle.json



Saving kaggle.json to kaggle.json


In [None]:
!kaggle datasets download -d mohammedabdeldayem/scenefake
!unzip scenefake.zip -d /content/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/train/fake/A_7524_15_B.wav  
  inflating: /content/train/fake/A_7525_20_B.wav  
  inflating: /content/train/fake/A_7526_20_B.wav  
  inflating: /content/train/fake/A_7527_0_B.wav  
  inflating: /content/train/fake/A_7528_20_B.wav  
  inflating: /content/train/fake/A_7529_5_B.wav  
  inflating: /content/train/fake/A_7530_5_B.wav  
  inflating: /content/train/fake/A_7531_05_B.wav  
  inflating: /content/train/fake/A_7532_0_B.wav  
  inflating: /content/train/fake/A_7533_20_B.wav  
  inflating: /content/train/fake/A_7534_10_B.wav  
  inflating: /content/train/fake/A_7535_15_B.wav  
  inflating: /content/train/fake/A_7536_5_B.wav  
  inflating: /content/train/fake/A_7537_15_B.wav  
  inflating: /content/train/fake/A_7538_20_B.wav  
  inflating: /content/train/fake/A_7539_05_B.wav  
  inflating: /content/train/fake/A_7540_15_B.wav  
  inflating: /content/train/fake/A_7541_10_B.wav  
  inflating: /content/

In [None]:
import os
import random
import shutil

def copy_balanced_subset(source_root, target_root, percentage=0.75, seed=42):
    """Copy an equally sized random subset of each class into ``target_root``.

    ``source_root`` must contain a ``fake`` and a ``real`` sub-directory. From
    each, ``int(min_class_size * percentage)`` audio files (``.mp3``, ``.wav``,
    ``.m4a``) are sampled and copied to ``target_root/<class>``, where
    ``min_class_size`` is the audio-file count of the smaller class.

    Args:
        source_root: Directory holding the ``fake`` and ``real`` class folders.
        target_root: Destination root; class folders are created if missing.
        percentage: Fraction (0..1) of the smaller class to sample per class.
        seed: Seed of the private random generator used for sampling.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 1]``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage}")

    audio_extensions = ('.mp3', '.wav', '.m4a')
    class_names = ('fake', 'real')

    # A private generator keeps the sampling reproducible without reseeding
    # the global `random` module for the rest of the notebook.
    rng = random.Random(seed)

    # os.listdir returns entries in arbitrary order, so sort them: otherwise the
    # same seed could select different files on different machines/runs.
    dir_entries = {}
    audio_files = {}
    for cls in class_names:
        entries = sorted(os.listdir(os.path.join(source_root, cls)))
        dir_entries[cls] = entries
        audio_files[cls] = [f for f in entries if f.endswith(audio_extensions)]

    # Size the sample from the smaller class so both classes end up equal
    min_class_size = min(len(audio_files[cls]) for cls in class_names)
    sample_size = int(min_class_size * percentage)

    # Sample 'fake' first, then 'real', from the same generator
    selected = {cls: rng.sample(audio_files[cls], sample_size) for cls in class_names}

    # Copy files
    for cls in class_names:
        files = selected[cls]
        source_dir = os.path.join(source_root, cls)
        target_dir = os.path.join(target_root, cls)
        os.makedirs(target_dir, exist_ok=True)
        for f in files:
            shutil.copy(os.path.join(source_dir, f), os.path.join(target_dir, f))
        print(f"✅ {cls}: Copied {len(files)} of {len(dir_entries[cls])} files from {source_root}")

# Build the balanced subset for each split, in train → dev → eval order
for split in ("train", "dev", "eval"):
    copy_balanced_subset(f"/content/{split}", f"/content/{split}_balanced_75")

✅ fake: Copied 1893 of 10660 files from /content/train
✅ real: Copied 1893 of 2525 files from /content/train
✅ fake: Copied 1911 of 10295 files from /content/dev
✅ real: Copied 1911 of 2548 files from /content/dev
✅ fake: Copied 4750 of 26412 files from /content/eval
✅ real: Copied 4750 of 6334 files from /content/eval


In [None]:
import shutil
import os

# Define folders to keep
keep_folders = {'train_balanced_75', 'dev_balanced_75', 'eval_balanced_75'}

# List all items in /content (snapshot taken before anything is removed)
all_items = os.listdir('/content')

# Delete every plain directory that is neither kept nor protected
for item in all_items:
    item_path = os.path.join('/content', item)
    # Never rmtree hidden entries (e.g. the runtime's own .config), symlinks,
    # or mount points (e.g. a mounted drive): that would destroy data that
    # does not belong to this experiment.
    protected = (
        item in keep_folders
        or item.startswith('.')
        or os.path.islink(item_path)
        or os.path.ismount(item_path)
    )
    if os.path.isdir(item_path) and not protected:
        shutil.rmtree(item_path)
        print(f"🗑️ Deleted folder: {item}")
    else:
        print(f"✅ Kept: {item}")

🗑️ Deleted folder: .config
✅ Kept: dev_balanced_75
✅ Kept: train_balanced_75
🗑️ Deleted folder: dev
✅ Kept: scenefake.zip
✅ Kept: eval_balanced_75
✅ Kept: kaggle.json
✅ Kept: README.txt
🗑️ Deleted folder: train
🗑️ Deleted folder: eval
🗑️ Deleted folder: sample_data


In [None]:
import os
import random
import shutil
import numpy as np
import torch
from transformers import WhisperFeatureExtractor, WhisperModel
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

# ✅ Run on CUDA when it is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ✅ Load the feature extractor and keep only the encoder half of the model,
#    moved to the selected device and switched to eval mode
WHISPER_CHECKPOINT = "openai/whisper-small"
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)
encoder = WhisperModel.from_pretrained(WHISPER_CHECKPOINT).encoder.to(device).eval()

# ✅ Extract average encoder embedding
def extract_whisper_embeddings(audio_path, sr=16000):
    """Return the averaged Whisper encoder state for one audio file.

    The file is loaded with librosa at ``sr`` Hz, converted to model inputs by
    the module-level ``feature_extractor`` and passed through the module-level
    ``encoder`` on ``device`` without gradient tracking. The first item of
    ``last_hidden_state`` is averaged over its first dimension and returned as
    a NumPy array.
    """
    waveform, _ = librosa.load(audio_path, sr=sr)
    model_inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        input_features = model_inputs["input_features"].to(device)
        hidden_states = encoder(input_features=input_features).last_hidden_state
        pooled = hidden_states[0].mean(dim=0)
    return pooled.cpu().numpy()

# ✅ Dataset loading and processing
def process_dataset(folder_path):
    """Embed every audio file under ``folder_path/fake`` and ``folder_path/real``.

    Files are visited class by class ('fake' first, then 'real') in
    ``os.listdir`` order; only ``.mp3``, ``.wav`` and ``.m4a`` names are
    considered. A file whose embedding extraction raises is reported with
    ``print`` and skipped.

    Returns:
        A pair ``(X, y)`` of NumPy arrays: the collected embeddings and the
        matching class-name strings.
    """
    audio_suffixes = ('.mp3', '.wav', '.m4a')
    embeddings, labels = [], []
    for label in ('fake', 'real'):
        class_dir = os.path.join(folder_path, label)
        for file in os.listdir(class_dir):
            if not file.endswith(audio_suffixes):
                continue
            try:
                emb = extract_whisper_embeddings(os.path.join(class_dir, file))
            except Exception as e:
                # Best effort: report the failure and move on to the next file
                print(f"⚠️ Error processing {file}: {e}")
            else:
                embeddings.append(emb)
                labels.append(label)
    return np.array(embeddings), np.array(labels)

print("🔄 Extracting features...")
# One (embeddings, labels) pair per split, processed in train → dev → eval order
(X_train, y_train), (X_dev, y_dev), (X_eval, y_eval) = [
    process_dataset(f"/content/{split}_balanced_75")
    for split in ("train", "dev", "eval")
]

# ✅ Fit the label encoder on the training labels, then reuse it for dev and eval
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_dev_enc, y_eval_enc = (le.transform(labels) for labels in (y_dev, y_eval))


🔄 Extracting features...


In [None]:
# Save each feature/label array as /content/<name>.npy
arrays_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_dev': X_dev,
    'y_dev': y_dev,
    'X_eval': X_eval,
    'y_eval': y_eval,
}
for array_name, array_values in arrays_to_save.items():
    np.save(f'/content/{array_name}.npy', array_values)


In [None]:
# # Load
# X_train = np.load('/content/X_train.npy')
# y_train = np.load('/content/y_train.npy')
# X_dev = np.load('/content/X_dev.npy')
# y_dev = np.load('/content/y_dev.npy')
# X_eval = np.load('/content/X_eval.npy')
# y_eval = np.load('/content/y_eval.npy')


We tried data augmentation techniques such as SMOTE, but the results were not good; the results below are without data augmentation.

We tried various techniques such as a DNN, LGBMClassifier, and LogisticRegression. We also tried a CNN (50% accuracy) and less optimised versions of the DNN and the LGBM classifier, which gave accuracies below 90%. In the end, we observed that the simple DNN gave the best accuracy so far, so we built an enhanced version of the DNN, which gives the best accuracy: about 95%!

In [None]:
class DNN(nn.Module):
    """Fully connected classifier: input → 256 → 64 → 2 logits.

    ReLU follows both hidden layers and dropout (p=0.3) follows the first one.
    The layers live in ``self.net`` (an ``nn.Sequential``).
    """

    def __init__(self, input_dim):
        super().__init__()
        first_hidden, second_hidden, n_classes = 256, 64, 2
        layers = [
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, n_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the two-class logits for a batch ``x``."""
        logits = self.net(x)
        return logits

def train_dnn(model, train_loader, val_loader, epochs=10):
    """Train ``model`` with Adam and cross-entropy, printing accuracy per epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free accuracy pass over ``val_loader``. The model and every batch
    are moved to the module-level ``device``. Nothing is returned; the model is
    updated in place.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=0.001)
    model.to(device)

    for epoch in range(epochs):
        # --- optimisation pass ---
        model.train()
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)
            adam.zero_grad()
            batch_loss = loss_fn(model(features), targets)
            batch_loss.backward()
            adam.step()

        # --- accuracy pass on the validation loader ---
        model.eval()
        n_correct = 0
        n_seen = 0
        with torch.no_grad():
            for features, targets in val_loader:
                features = features.to(device)
                targets = targets.to(device)
                predicted = model(features).argmax(1)
                n_correct += (predicted == targets).sum().item()
                n_seen += len(targets)
        print(f"📈 Epoch {epoch+1}: Val Accuracy = {n_correct / n_seen:.4f}")

# Prepare dataloaders
def _as_tensor_dataset(features, labels):
    """Wrap NumPy features/labels as a float32 / int64 TensorDataset."""
    return TensorDataset(torch.tensor(features, dtype=torch.float32),
                         torch.tensor(labels, dtype=torch.long))

# Train and dev are merged into a single training pool
X_all_train = np.concatenate([X_train, X_dev])
y_all_train = np.concatenate([y_train_enc, y_dev_enc])
train_ds = _as_tensor_dataset(X_all_train, y_all_train)
eval_ds = _as_tensor_dataset(X_eval, y_eval_enc)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
eval_loader = DataLoader(eval_ds, batch_size=32)

print("\n🧠 Training DNN...")
dnn_model = DNN(input_dim=X_train.shape[1])
train_dnn(dnn_model, train_loader, eval_loader)

# Evaluate DNN: collect the arg-max class of every eval batch
dnn_model.eval()
all_preds = []
with torch.no_grad():
    for xb, _ in eval_loader:
        batch_pred = dnn_model(xb.to(device)).argmax(1)
        all_preds.extend(batch_pred.cpu().numpy())

print("\n📊 DNN Evaluation:")
print(classification_report(y_eval_enc, all_preds, target_names=le.classes_))



🧠 Training DNN...
📈 Epoch 1: Val Accuracy = 0.6634
📈 Epoch 2: Val Accuracy = 0.6676
📈 Epoch 3: Val Accuracy = 0.8733
📈 Epoch 4: Val Accuracy = 0.7912
📈 Epoch 5: Val Accuracy = 0.8179
📈 Epoch 6: Val Accuracy = 0.8899
📈 Epoch 7: Val Accuracy = 0.8531
📈 Epoch 8: Val Accuracy = 0.8289
📈 Epoch 9: Val Accuracy = 0.8226
📈 Epoch 10: Val Accuracy = 0.8926

📊 DNN Evaluation:
              precision    recall  f1-score   support

        fake       0.88      0.91      0.89      4750
        real       0.91      0.87      0.89      4750

    accuracy                           0.89      9500
   macro avg       0.89      0.89      0.89      9500
weighted avg       0.89      0.89      0.89      9500



In [None]:
# --- 1. Train Classifier ---
from sklearn.linear_model import LogisticRegression

# Fit on the training split only (string labels); lbfgs, up to 1000 iterations
clf = LogisticRegression(C=1.0, solver="lbfgs", max_iter=1000).fit(X_train, y_train)

# --- 2. Evaluate All Sets ---
def evaluate(name, X, y):
    """Print accuracy and a classification report of ``clf`` on one split.

    Args:
        name: Label printed in front of the accuracy (e.g. "Train").
        X: Feature matrix passed to ``clf.predict``.
        y: True labels compared against the predictions.
    """
    # The notebook's visible import cells only bring in classification_report,
    # so import accuracy_score here; without it this function raises NameError
    # on a fresh kernel.
    from sklearn.metrics import accuracy_score

    pred = clf.predict(X)
    acc = accuracy_score(y, pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y, pred, target_names=['fake', 'real']))

print("\n=== Results ===")
# Report the classifier on each split, in train → dev → eval order
for split_name, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Dev", X_dev, y_dev),
    ("Eval", X_eval, y_eval),
):
    evaluate(split_name, split_X, split_y)


=== Results ===
Train Accuracy: 0.9733
              precision    recall  f1-score   support

        fake       0.98      0.97      0.97      1893
        real       0.97      0.98      0.97      1893

    accuracy                           0.97      3786
   macro avg       0.97      0.97      0.97      3786
weighted avg       0.97      0.97      0.97      3786

Dev Accuracy: 0.8516
              precision    recall  f1-score   support

        fake       0.84      0.87      0.85      1911
        real       0.87      0.83      0.85      1911

    accuracy                           0.85      3822
   macro avg       0.85      0.85      0.85      3822
weighted avg       0.85      0.85      0.85      3822

Eval Accuracy: 0.8842
              precision    recall  f1-score   support

        fake       0.89      0.88      0.88      4750
        real       0.88      0.89      0.88      4750

    accuracy                           0.88      9500
   macro avg       0.88      0.88      0.88  

In [None]:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardise the features, then classify them with LightGBM
lgbm = LGBMClassifier(n_estimators=100, max_depth=7, random_state=42)
pipeline = make_pipeline(StandardScaler(), lgbm)

# Fit on the training split, then report accuracy on every split
pipeline.fit(X_train, y_train)
for split_name, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Dev", X_dev, y_dev),
    ("Eval", X_eval, y_eval),
):
    print(f"{split_name} Accuracy:", pipeline.score(split_X, split_y))



[LightGBM] [Info] Number of positive: 1893, number of negative: 1893
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 3786, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




Train Accuracy: 1.0
Dev Accuracy: 0.7697540554683412
Eval Accuracy: 0.7781052631578947


In [None]:
"""
ULTIMATE WHISPER EMBEDDING CLASSIFIER (DNN VERSION)

Key Improvements:
1. Enhanced neural architecture with residual connections
2. Advanced regularization techniques
3. Learning rate scheduling
4. Label smoothing
5. Mixup augmentation and gradient clipping for stability (ensembling is not implemented in this cell)
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Device configuration: CUDA when available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---------------------------------------------------------------
# IMPROVED MODEL ARCHITECTURE
# ---------------------------------------------------------------
class EnhancedDNN(nn.Module):
    """Two dense stages followed by two residual blocks and a linear head.

    ``self.base`` maps ``input_dim`` → 512 → 256 (each stage is
    Linear → BatchNorm1d → SiLU → Dropout) and then through
    ``ResidualBlock(256, 128)`` and ``ResidualBlock(128, 64)``;
    ``self.head`` turns the 64 features into 2 logits.
    """

    def __init__(self, input_dim):
        super().__init__()

        def dense_stage(in_features, out_features, drop_prob):
            # Linear → BatchNorm → SiLU → Dropout, in that order
            return [
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features),
                nn.SiLU(),
                nn.Dropout(drop_prob),
            ]

        self.base = nn.Sequential(
            *dense_stage(input_dim, 512, 0.4),
            *dense_stage(512, 256, 0.3),
            ResidualBlock(256, 128),
            ResidualBlock(128, 64),
        )
        self.head = nn.Linear(64, 2)

    def forward(self, x):
        """Return the two-class logits for a batch ``x``."""
        return self.head(self.base(x))

class ResidualBlock(nn.Module):
    """Two Linear+BatchNorm layers with a skip connection.

    The main path is Linear → BatchNorm → SiLU → Linear → BatchNorm → Dropout(0.2).
    The skip path is the identity when ``in_dim == out_dim`` and a linear
    projection otherwise. SiLU is applied to the sum of both paths.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.linear1 = nn.Linear(in_dim, out_dim)
        self.bn1 = nn.BatchNorm1d(out_dim)
        self.linear2 = nn.Linear(out_dim, out_dim)
        self.bn2 = nn.BatchNorm1d(out_dim)
        self.dropout = nn.Dropout(0.2)
        self.activation = nn.SiLU()
        # Project the input only when the feature width changes
        self.shortcut = nn.Identity() if in_dim == out_dim else nn.Linear(in_dim, out_dim)

    def forward(self, x):
        """Return ``activation(main_path(x) + shortcut(x))``."""
        skip = self.shortcut(x)
        hidden = self.activation(self.bn1(self.linear1(x)))
        hidden = self.dropout(self.bn2(self.linear2(hidden)))
        return self.activation(hidden + skip)

# ---------------------------------------------------------------
# ADVANCED TRAINING SETUP
# ---------------------------------------------------------------
def train_enhanced(model, train_loader, val_loader, epochs=20,
                   checkpoint_path='best_model.pth', mixup_epochs=15,
                   mixup_alpha=0.4, run_device=None):
    """Train ``model`` with AdamW, label smoothing, mixup and LR scheduling.

    Each epoch runs one optimisation pass over ``train_loader`` (gradients are
    clipped to norm 1.0) and one gradient-free accuracy pass over
    ``val_loader``. The validation accuracy drives ``ReduceLROnPlateau`` and
    checkpoint selection: whenever it improves, the model's ``state_dict`` is
    written to ``checkpoint_path``.

    Args:
        model: Network to train; moved to the training device in place.
        train_loader: Yields ``(features, labels)`` batches for optimisation.
        val_loader: Yields ``(features, labels)`` batches for accuracy.
        epochs: Number of epochs to run.
        checkpoint_path: File that receives the best ``state_dict``.
        mixup_epochs: Mixup is applied while ``epoch < mixup_epochs``
            (0-based); later epochs train on the unmixed batches.
        mixup_alpha: Both parameters of the Beta distribution that the mixing
            coefficient is drawn from.
        run_device: Device to train on; defaults to the module-level ``device``.

    Returns:
        The best validation accuracy seen, or ``None`` if no epoch ran.
    """
    target_device = device if run_device is None else run_device

    # Label smoothing
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    # 'max' mode: the monitored quantity (validation accuracy) should increase
    scheduler = ReduceLROnPlateau(optimizer, 'max', patience=3, factor=0.5)

    # None (rather than 0) guarantees the first epoch always writes a
    # checkpoint, even when its accuracy is 0.
    best_acc = None
    model.to(target_device)

    for epoch in range(epochs):
        # Training
        model.train()
        use_mixup = epoch < mixup_epochs
        for xb, yb in train_loader:
            xb, yb = xb.to(target_device), yb.to(target_device)
            optimizer.zero_grad()

            if use_mixup:
                # Mixup augmentation: blend each sample with a randomly chosen
                # partner from the same batch and blend the two losses alike.
                lam = float(np.random.beta(mixup_alpha, mixup_alpha))
                # Build the permutation on the batch's device to avoid
                # indexing a device tensor with a CPU index.
                idx = torch.randperm(xb.size(0), device=xb.device)
                outputs = model(lam * xb + (1 - lam) * xb[idx])
                loss = lam * criterion(outputs, yb) + (1 - lam) * criterion(outputs, yb[idx])
            else:
                loss = criterion(model(xb), yb)

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Validation
        model.eval()
        val_correct = 0
        total = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(target_device), yb.to(target_device)
                outputs = model(xb)
                val_correct += (outputs.argmax(1) == yb).sum().item()
                total += yb.size(0)

        val_acc = val_correct / total
        scheduler.step(val_acc)

        # Save best model
        if best_acc is None or val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)

        print(f"Epoch {epoch+1}/{epochs} | Val Acc: {val_acc:.4f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

    return best_acc

# ---------------------------------------------------------------
# DATA PREPARATION
# ---------------------------------------------------------------
# Assuming X_train, X_dev, X_eval, y_train_enc, y_dev_enc, y_eval_enc are loaded
from torch.utils.data import random_split

# Seed the RNGs the training run draws from (weight init, batch shuffling,
# mixup's np.random.beta) so that re-running the cell reproduces the result.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Combine train and dev into one training pool
X_all = np.concatenate([X_train, X_dev])
y_all = np.concatenate([y_train_enc, y_dev_enc])

# Create datasets
pool_ds = TensorDataset(torch.FloatTensor(X_all), torch.LongTensor(y_all))
eval_ds = TensorDataset(torch.FloatTensor(X_eval), torch.LongTensor(y_eval_enc))

# Hold out part of the pool for validation. train_enhanced uses its validation
# loader for LR scheduling and best-checkpoint selection, so it must not be the
# eval set: otherwise the final report is measured on data that was used to
# pick the model.
VAL_FRACTION = 0.1
n_val = max(1, int(len(pool_ds) * VAL_FRACTION))
train_ds, val_ds = random_split(
    pool_ds,
    [len(pool_ds) - n_val, n_val],
    generator=torch.Generator().manual_seed(SEED),
)

# Create loaders (pinned host memory is only requested when training on CUDA)
use_pinned_memory = device.type == "cuda"
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, pin_memory=use_pinned_memory)
val_loader = DataLoader(val_ds, batch_size=128, pin_memory=use_pinned_memory)
eval_loader = DataLoader(eval_ds, batch_size=128, pin_memory=use_pinned_memory)

# ---------------------------------------------------------------
# TRAINING AND EVALUATION
# ---------------------------------------------------------------
print("\n🚀 Training Enhanced DNN...")
model = EnhancedDNN(input_dim=X_train.shape[1])
train_enhanced(model, train_loader, val_loader, epochs=25)

# Load best model (map_location keeps the load working on a CPU-only runtime)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()

# Final evaluation: the eval set is touched only here, once
all_preds = []
all_true = []
with torch.no_grad():
    for xb, yb in eval_loader:
        xb = xb.to(device)
        outputs = model(xb)
        all_preds.extend(outputs.argmax(1).cpu().numpy())
        all_true.extend(yb.numpy())

print("\n🔥 Final Evaluation Results:")
print(classification_report(all_true, all_preds, target_names=['fake', 'real'], digits=4))


🚀 Training Enhanced DNN...
Epoch 1/25 | Val Acc: 0.7400 | LR: 1.00e-03
Epoch 2/25 | Val Acc: 0.5385 | LR: 1.00e-03
Epoch 3/25 | Val Acc: 0.5935 | LR: 1.00e-03
Epoch 4/25 | Val Acc: 0.8829 | LR: 1.00e-03
Epoch 5/25 | Val Acc: 0.8767 | LR: 1.00e-03
Epoch 6/25 | Val Acc: 0.8219 | LR: 1.00e-03
Epoch 7/25 | Val Acc: 0.9094 | LR: 1.00e-03
Epoch 8/25 | Val Acc: 0.6145 | LR: 1.00e-03
Epoch 9/25 | Val Acc: 0.7777 | LR: 1.00e-03
Epoch 10/25 | Val Acc: 0.8233 | LR: 1.00e-03
Epoch 11/25 | Val Acc: 0.6218 | LR: 5.00e-04
Epoch 12/25 | Val Acc: 0.9342 | LR: 5.00e-04
Epoch 13/25 | Val Acc: 0.9227 | LR: 5.00e-04
Epoch 14/25 | Val Acc: 0.9308 | LR: 5.00e-04
Epoch 15/25 | Val Acc: 0.9060 | LR: 5.00e-04
Epoch 16/25 | Val Acc: 0.9235 | LR: 2.50e-04
Epoch 17/25 | Val Acc: 0.9101 | LR: 2.50e-04
Epoch 18/25 | Val Acc: 0.9245 | LR: 2.50e-04
Epoch 19/25 | Val Acc: 0.9294 | LR: 2.50e-04
Epoch 20/25 | Val Acc: 0.9117 | LR: 1.25e-04
Epoch 21/25 | Val Acc: 0.9465 | LR: 1.25e-04
Epoch 22/25 | Val Acc: 0.9323 | LR: 