In [4]:
# %% [markdown]
# # Part 1: Urdu Deepfake Audio Detection  
# Binary classification with SVM, Logistic Regression, Perceptron, and a small DNN.

# %%
# 0. (Optional) Inspect / override HF cache location
import os

# To override the default cache, uncomment the assignment below. It is placed
# ahead of the `datasets` import on purpose: the library resolves
# HF_DATASETS_CACHE when it is first imported, so setting the variable after
# the import does not change `config.HF_DATASETS_CACHE`. If `datasets` is
# already loaded in this kernel, restart the kernel first.
# os.environ["HF_DATASETS_CACHE"] = r"D:\6th Semester\Data Science\Assignmentno4\AudioData\hf_cache"

from datasets import config, load_dataset

# Show which cache directory the library actually resolved.
print("HF cache dir:", config.HF_DATASETS_CACHE)

# %%
# 1. Imports and setup
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import pdb      # Python debugger
import joblib
# Single seed handed to the train/test split and to every estimator in this
# notebook, so repeated runs use the same random state throughout.
RANDOM_SEED = 42

# %%
# 2. Load & split dataset from local clone
# The dataset location defaults to the original local clone. Set the
# URDU_DEEPFAKE_DATASET_DIR environment variable to point at your own copy
# instead of editing this cell.
DATASET_DIR = os.environ.get(
    "URDU_DEEPFAKE_DATASET_DIR",
    "D:/6th Semester/Data Science/Assignmentno4/deepfake_detection_dataset_urdu",
)
TEST_FRACTION = 0.2  # share of examples held out for evaluation

print("Loading dataset from local path…")
ds = load_dataset(DATASET_DIR, "default")  # second argument is the config name
full_train = ds["train"]
# Seeded so the train/test partition is the same on every run.
split      = full_train.train_test_split(test_size=TEST_FRACTION, seed=RANDOM_SEED)
train_ds   = split["train"]
test_ds    = split["test"]
print(f"Train size: {len(train_ds)}, Test size: {len(test_ds)}")

# %%
# 3. Feature extraction: mean+variance of 40 MFCCs
def extract_features(audio_array, sr, n_mfcc=40):
    """Summarise a waveform as per-coefficient MFCC statistics.

    Computes `n_mfcc` MFCCs with librosa, then reduces the result along
    axis 1 to one mean and one variance per coefficient row.

    Args:
        audio_array: audio samples passed to librosa as `y`.
        sr: sampling rate of `audio_array`.
        n_mfcc: number of MFCC coefficients to compute.

    Returns:
        The means followed by the variances, concatenated into one array.
    """
    coeffs = librosa.feature.mfcc(y=audio_array, sr=sr, n_mfcc=n_mfcc)
    stats = (coeffs.mean(axis=1), coeffs.var(axis=1))
    return np.concatenate(stats)

# %%
# 4. Build X / y arrays (derive y from file path)
def build_xy(dataset, name="dataset"):
    """Turn a dataset of audio examples into a feature matrix and label vector.

    Each example must expose ex["audio"] with "array", "sampling_rate" and
    "path" entries. Features come from extract_features(); the label is
    derived from the file path: 0 when the lower-cased path contains
    "bonafide", 1 otherwise.

    Args:
        dataset: iterable of examples as described above.
        name: tag used only in the progress messages.

    Returns:
        (X, y): NumPy arrays holding one feature row and one label per example.

    Raises:
        ValueError: if an example has no file path, because its label cannot
            be derived (previously this surfaced as an AttributeError on
            None, or an empty path was silently labelled 1).
    """
    X, y = [], []
    print(f"Building features for {name}…")
    for idx, ex in enumerate(dataset):
        audio = ex["audio"]
        path = audio["path"]
        # Resolve the label first so a bad example fails before the (costly)
        # MFCC computation rather than after it.
        if not path:
            raise ValueError(
                f"{name}: example {idx} has no audio path, so its label cannot be derived"
            )
        # NOTE(review): the match is against the whole path, so a parent
        # directory containing "bonafide" would label every file 0 — confirm
        # the dataset layout if it is relocated.
        label = 0 if "bonafide" in path.lower() else 1
        X.append(extract_features(audio["array"], audio["sampling_rate"]))
        y.append(label)
    X_arr = np.array(X)
    y_arr = np.array(y)
    print(f"{name} shapes: X={X_arr.shape}, y={y_arr.shape}")
    return X_arr, y_arr

# Same pipeline for both splits; `name` only tags the progress messages.
X_train, y_train = build_xy(train_ds, name="train_ds")
X_test,  y_test  = build_xy(test_ds, name="test_ds")

# %%
# 5. Scale features
print("Scaling features…")
# The scaler is fitted on the training split only; the fitted scaler is then
# applied to both splits.
scaler         = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)
print(f"Scaled shapes: X_train={X_train_scaled.shape}, X_test={X_test_scaled.shape}")

# %%
# 6. Train classical classifiers
# Display name -> unfitted estimator; all share RANDOM_SEED.
classifiers = dict([
    ("SVM",        SVC(random_state=RANDOM_SEED, probability=True, kernel="rbf")),
    ("Logistic",   LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)),
    ("Perceptron", Perceptron(random_state=RANDOM_SEED, max_iter=1000)),
])

# Fit every estimator in place on the scaled training features.
for name, clf in classifiers.items():
    print(f"Training {name}…")
    clf.fit(X_train_scaled, y_train)

# %%
# 7. Train a “deep” MLP via sklearn
print("Training Deep Neural Network (MLPClassifier)…")
# Two hidden layers (64 then 32 units), trained for at most 50 iterations.
# fit() returns the estimator itself, so dnn_clf is the fitted model.
dnn_clf = MLPClassifier(
    solver="adam",
    activation="relu",
    hidden_layer_sizes=(64, 32),
    max_iter=50,
    random_state=RANDOM_SEED,
).fit(X_train_scaled, y_train)

# %%
# 8. Evaluation helper
def eval_model(name, y_true, y_pred, y_score):
    """Print a metric report for one model.

    Args:
        name: heading shown above the metrics.
        y_true: reference labels.
        y_pred: predicted labels, used for accuracy, precision, recall and F1.
        y_score: per-example scores, used only for AUC-ROC.
    """
    print(f"\n--- {name} ---")
    # Metrics computed from predicted labels, in the order they are printed.
    label_metrics = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1-Score :", f1_score),
    )
    for caption, metric in label_metrics:
        print(caption, metric(y_true, y_pred))
    # AUC is the only metric computed from the scores rather than the labels.
    print("AUC-ROC  :", roc_auc_score(y_true, y_score))

# %%
# 9. Evaluate classical models (handle missing predict_proba)
for name, clf in classifiers.items():
    print(f"\nEvaluating {name}…")
    # Use the second predict_proba column when the estimator offers one,
    # otherwise fall back to the decision_function output.
    y_score = (
        clf.predict_proba(X_test_scaled)[:, 1]
        if hasattr(clf, "predict_proba")
        else clf.decision_function(X_test_scaled)
    )
    y_pred = clf.predict(X_test_scaled)
    eval_model(name, y_test, y_pred, y_score)

# %%
# 10. Evaluate the “deep” MLP
print("\nEvaluating Deep Neural Network (MLPClassifier)…")
# Scores are the second predict_proba column; labels come from predict().
y_score_dnn  = dnn_clf.predict_proba(X_test_scaled)[:, 1]
y_pred_dnn   = dnn_clf.predict(X_test_scaled)
eval_model(
    name="Deep Neural Network",
    y_true=y_test,
    y_pred=y_pred_dnn,
    y_score=y_score_dnn,
)


# %% 
# 11. Save scaler and models to disk for Streamlit app
# Each fitted object is written to its own .pkl file; the file names are
# relative, so they land in the current working directory.
artifacts = {
    "scaler.pkl":  scaler,
    "svm_clf.pkl": classifiers["SVM"],
    "log_clf.pkl": classifiers["Logistic"],
    "per_clf.pkl": classifiers["Perceptron"],
    "dnn_clf.pkl": dnn_clf,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

print("All models and scaler saved as .pkl files.")



Repo card metadata block was not found. Setting CardData to empty.


HF cache dir: C:\Users\LAPTOP MART\.cache\huggingface\datasets
Loading dataset from local path…


Resolving data files:   0%|          | 0/6794 [00:00<?, ?it/s]

Train size: 5435, Test size: 1359
Building features for train_ds…
train_ds shapes: X=(5435, 80), y=(5435,)
Building features for test_ds…
test_ds shapes: X=(1359, 80), y=(1359,)
Scaling features…
Scaled shapes: X_train=(5435, 80), X_test=(1359, 80)
Training SVM…
Training Logistic…
Training Perceptron…
Training Deep Neural Network (MLPClassifier)…

Evaluating SVM…

--- SVM ---
Accuracy : 0.9977924944812362
Precision: 0.9970674486803519
Recall   : 0.9985315712187959
F1-Score : 0.9977989728539985
AUC-ROC  : 0.9995365136295314

Evaluating Logistic…

--- Logistic ---
Accuracy : 0.9359823399558499
Precision: 0.9230769230769231
Recall   : 0.9515418502202643
F1-Score : 0.9370932754880694
AUC-ROC  : 0.9781078493799246

Evaluating Perceptron…

--- Perceptron ---
Accuracy : 0.8800588668138337
Precision: 0.861731843575419
Recall   : 0.9060205580029369
F1-Score : 0.8833214030064424
AUC-ROC  : 0.9551912639316639

Evaluating Deep Neural Network (MLPClassifier)…

--- Deep Neural Network ---
Accuracy :