# 🎧 AI-Morphed Voice Detection System
Using CNN + LSTM on RGB Spectrograms


This notebook demonstrates how to build a deep learning model to detect AI-generated (fake) voices from real ones using RGB spectrograms and a hybrid CNN-LSTM architecture.

**Steps Covered:**
- Load and preprocess image data
- Reshape each image into a sequence of equal chunks (time steps) for the LSTM layers
- Build CNN + LSTM model
- Train and evaluate
- Visualize accuracy, loss, and confusion matrix


In [None]:

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv2D, MaxPooling2D, Flatten, Dense,
                                     Dropout, LSTM, TimeDistributed, Reshape,
                                     BatchNormalization, InputLayer)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping


In [None]:

# Parameters
IMG_HEIGHT, IMG_WIDTH = 224, 224  # every image is resized to this size on load
BATCH_SIZE = 32
EPOCHS = 50
NUM_CLASSES = 2  # 'real' and 'fake'
# Number of equal chunks each image is cut into to form the LSTM sequence.
TIME_STEPS = 28
# The chunks only line up with whole image rows when the height divides evenly.
assert IMG_HEIGHT % TIME_STEPS == 0, "IMG_HEIGHT must be a multiple of TIME_STEPS"
# Values per time step: the flattened image (height * width * 3 colour channels)
# spread evenly over TIME_STEPS, i.e. a band of IMG_HEIGHT // TIME_STEPS rows.
FEATURES = (IMG_HEIGHT // TIME_STEPS) * IMG_WIDTH * 3


In [None]:

def load_dataset(folder_path, seed=None):
    """Load the 'real' and 'fake' image folders found under ``folder_path``.

    Every readable image is resized to (IMG_HEIGHT, IMG_WIDTH) and labelled
    0 for 'real' and 1 for 'fake'. Files that cannot be decoded are reported
    and skipped.

    Args:
        folder_path: Directory containing the 'real' and 'fake' subfolders.
        seed: Optional ``random_state`` forwarded to ``sklearn.utils.shuffle``.
            Leave as None to get a different order on every call.

    Returns:
        An ``(images, labels)`` pair of NumPy arrays shuffled in unison.

    Raises:
        ValueError: If no image could be read from either subfolder.
    """
    data, labels = [], []
    for label, subfolder in enumerate(['real', 'fake']):
        path = os.path.join(folder_path, subfolder)
        # os.listdir order is arbitrary; sorting makes a fixed seed reproducible.
        for img_name in sorted(os.listdir(path)):
            img_path = os.path.join(path, img_name)
            try:
                # cv2.imread reports an unreadable file by returning None rather
                # than raising, so that case has to be handled explicitly.
                # NOTE: OpenCV decodes colour images in BGR channel order.
                img = cv2.imread(img_path)
                if img is None:
                    print(f"Skipping unreadable image {img_path}")
                    continue
                data.append(cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT)))
                labels.append(label)
            except Exception as e:
                print(f"Error loading image {img_path}: {e}")
    if not data:
        # Fail here with a clear message instead of a shape error further down.
        raise ValueError(f"No readable images found under {folder_path}")
    return shuffle(np.array(data), np.array(labels), random_state=seed)


In [None]:

train_path = 'dataset/training'
val_path = 'dataset/validation'
test_path = 'dataset/testing'

X_train, y_train = load_dataset(train_path)
X_val, y_val = load_dataset(val_path)
X_test, y_test = load_dataset(test_path)

# Scale pixels to [0, 1] in float32. Dividing the integer image arrays directly
# would promote them to float64 and double the memory held by every split.
X_train = X_train.astype(np.float32) / 255.0
X_val = X_val.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

# One-hot encode the integer class labels (0 = real, 1 = fake).
y_train = to_categorical(y_train, NUM_CLASSES)
y_val = to_categorical(y_val, NUM_CLASSES)
y_test = to_categorical(y_test, NUM_CLASSES)

def reshape_for_lstm(X, time_steps=None):
    """Split every sample of ``X`` into ``time_steps`` equal, flattened chunks.

    Args:
        X: NumPy array whose first axis indexes samples.
        time_steps: Number of sequence steps per sample. Defaults to the
            notebook-level ``TIME_STEPS`` when omitted.

    Returns:
        Array of shape ``(n_samples, time_steps, values_per_step)``.

    Raises:
        ValueError: If a sample's size is not divisible by ``time_steps``
            (raised by NumPy's reshape).
    """
    if time_steps is None:
        time_steps = TIME_STEPS
    return X.reshape((X.shape[0], time_steps, -1))

# Convert each split, in train / validation / test order, into the
# (samples, TIME_STEPS, values) layout returned by reshape_for_lstm.
X_train_seq, X_val_seq, X_test_seq = (
    reshape_for_lstm(images) for images in (X_train, X_val, X_test)
)


In [None]:

def build_cnn_lstm_model():
    """Build and compile the CNN + LSTM classifier.

    The network consumes the ``(TIME_STEPS, values)`` sequences produced by
    ``reshape_for_lstm``. Images are stored as (height, width, channels), so
    each time step is a horizontal band of ``IMG_HEIGHT // TIME_STEPS`` rows.
    Every band goes through the same small CNN, and the resulting per-band
    feature vectors are fed to the LSTM in order.

    Returns:
        A compiled ``Sequential`` model ending in a softmax over NUM_CLASSES.

    Raises:
        ValueError: If IMG_HEIGHT is not a multiple of TIME_STEPS.
    """
    rows_per_step, leftover = divmod(IMG_HEIGHT, TIME_STEPS)
    if leftover:
        raise ValueError(
            f"IMG_HEIGHT ({IMG_HEIGHT}) must be a multiple of TIME_STEPS ({TIME_STEPS})"
        )
    channels = 3
    # Size of one flattened band; this is what each sequence step actually holds.
    step_features = rows_per_step * IMG_WIDTH * channels

    model = Sequential()
    model.add(InputLayer(input_shape=(TIME_STEPS, step_features)))
    # TimeDistributed(Conv2D) needs a 5-D tensor
    # (batch, time, rows, cols, channels); restore each step to a 2-D patch.
    model.add(Reshape((TIME_STEPS, rows_per_step, IMG_WIDTH, channels)))

    # Per-step CNN feature extractor (weights shared across time steps).
    model.add(TimeDistributed(Conv2D(32, (3, 3), activation='relu', padding='same')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(BatchNormalization()))
    model.add(TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same')))
    model.add(TimeDistributed(MaxPooling2D((2, 2))))
    model.add(TimeDistributed(BatchNormalization()))
    model.add(TimeDistributed(Flatten()))

    # Sequence model over the per-step features, followed by the classifier head.
    model.add(LSTM(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

model = build_cnn_lstm_model()
model.summary()


In [None]:

# Write a checkpoint only when validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    "best_model.h5",
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop when validation loss stalls for 8 epochs and restore the best weights.
earlystop = EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True,
)

history = model.fit(
    x=X_train_seq,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val_seq, y_val),
    callbacks=[checkpoint, earlystop],
    verbose=1,
)


In [None]:

# Loss and accuracy on the test sequences.
test_loss, test_acc = model.evaluate(X_test_seq, y_test, verbose=1)
print(f"Test Accuracy: {test_acc*100:.2f}%")

# Collapse the softmax outputs and the one-hot targets back to class indices.
y_pred = model.predict(X_test_seq)
y_pred_classes = y_pred.argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Per-class precision / recall / F1.
print(
    classification_report(
        y_true,
        y_pred_classes,
        target_names=['Real', 'Fake'],
    )
)


In [None]:

# One figure per metric: training curve against validation curve, saved to
# disk and then displayed. Accuracy first, loss second.
for metric, name, filename in (
    ('accuracy', 'Accuracy', 'accuracy_plot.png'),
    ('loss', 'Loss', 'loss_plot.png'),
):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label=f'Train {name}')
    ax.plot(history.history[f'val_{metric}'], label=f'Validation {name}')
    ax.set_title(f'Model {name}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(name)
    ax.legend()
    fig.savefig(filename)
    plt.show()


In [None]:

# Counts of true vs. predicted classes for the test predictions, drawn as a heatmap.
cm = confusion_matrix(y_true, y_pred_classes)
disp = ConfusionMatrixDisplay(cm, display_labels=["Real", "Fake"])
disp.plot(cmap="Blues")
# plot() stores the figure and axes it drew on; title and save go through them.
disp.ax_.set_title("Confusion Matrix")
disp.figure_.savefig("confusion_matrix.png")
plt.show()
