In [None]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
import sounddevice as sd
import wave
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GaussianNoise
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from pydub import AudioSegment
from pydub.playback import play

# Define dataset path.
# The default is the original author's local folder. Set the
# VOICE_GENDER_DATA_PATH environment variable to point at the dataset on
# another machine without editing this file (an empty value is ignored).
data_path = (
    os.environ.get("VOICE_GENDER_DATA_PATH")
    or "C:\\Users\\tsnte\\Downloads\\archive\\data"
)

# Load and extract MFCC features
def extract_features(file_path, max_pad_len=100, sample_rate=22050, n_mfcc=40,
                     silence_threshold=0.015):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load``.
        max_pad_len: Number of frames (columns) in the result. Shorter clips
            are zero-padded on the right; longer clips are truncated.
        sample_rate: Sampling rate requested from ``librosa.load``.
        n_mfcc: Number of MFCC coefficients (rows) to compute.
        silence_threshold: Clips whose peak absolute amplitude is below this
            value are treated as silence.

    Returns:
        A 2-D array with ``n_mfcc`` rows and ``max_pad_len`` columns, or
        ``None`` when the clip is empty or classified as silence.
    """
    # Keep the rate librosa actually used so the MFCC call stays consistent
    # with the loaded samples.
    audio, sample_rate = librosa.load(file_path, sr=sample_rate)

    # A zero-length clip has no peak: np.max would raise ValueError on an
    # empty array, so reject it the same way as silence.
    if audio.size == 0 or np.max(np.abs(audio)) < silence_threshold:
        return None

    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

    # Force a fixed number of frames so every sample has the same shape.
    pad_width = max_pad_len - mfccs.shape[1]
    if pad_width > 0:
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfccs = mfccs[:, :max_pad_len]
    return mfccs

# Prepare dataset: each class lives in its own sub-folder of data_path.
X, y = [], []
labels = {'male': 0, 'female': 1}

for label, class_index in labels.items():
    folder_path = os.path.join(data_path, label)
    # os.listdir returns entries in arbitrary order. Sorting keeps the sample
    # order (and therefore the seeded train/test split) reproducible.
    for file in sorted(os.listdir(folder_path)):
        # Case-insensitive match so ".WAV" files are not silently skipped.
        if not file.lower().endswith(".wav"):
            continue
        file_path = os.path.join(folder_path, file)
        features = extract_features(file_path)
        if features is not None:  # None means the clip was rejected
            X.append(features)
            y.append(class_index)

# Fail early with a clear message instead of an obscure error further down.
if not X:
    raise ValueError(f"No usable .wav files found under {data_path!r}")

X = np.array(X)
y = np.array(y)

# Reshape for CNN input: append a trailing channel axis.
X = X[..., np.newaxis]

# Split dataset (stratified so both splits keep the male/female ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Compute class weights (handle imbalance) from the training labels only, so
# the held-out split does not influence training. This must happen before
# one-hot encoding because compute_class_weight needs the integer labels.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced", classes=train_classes, y=y_train
)
# Key the weights by the actual class label rather than by array position.
class_weights = {
    int(cls): float(weight) for cls, weight in zip(train_classes, class_weights)
}

# Convert labels to categorical (one-hot)
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

# Build CNN model: input noise, three Conv -> BatchNorm -> MaxPool -> Dropout
# stages with growing filter counts and dropout rates, then a dense head.
cnn_layers = [
    GaussianNoise(0.1, input_shape=(40, 100, 1)),  # noise for better generalization
]

for n_filters, drop_rate in ((32, 0.3), (64, 0.4), (128, 0.5)):
    cnn_layers += [
        Conv2D(n_filters, (3, 3), activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(drop_rate),
    ]

cnn_layers += [
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(2, activation='softmax'),  # one output per class
]

model = Sequential(cnn_layers)

# Compile model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train model with class weights; the held-out split is also used as the
# per-epoch validation data.
fit_options = dict(
    epochs=20,
    batch_size=16,
    validation_data=(X_test, y_test),
    class_weight=class_weights,
)
history = model.fit(X_train, y_train, **fit_options)

# Evaluate model on the held-out split
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc * 100:.2f}%")

# Save the model
model.save("voice_gender_model.h5")

# Function to predict gender
def predict_gender(audio_path):
    """Classify the speaker in an audio file and print the outcome.

    Prints the male and female probabilities followed by the predicted
    gender, or a notice when feature extraction rejects the clip. Always
    returns ``None``.
    """
    mfccs = extract_features(audio_path)
    if mfccs is None:
        print("Result: Silence or Noise detected")
        return

    # Add a leading batch axis and a trailing channel axis in one step.
    batch = mfccs[np.newaxis, ..., np.newaxis]
    male_prob, female_prob = model.predict(batch)[0]

    print(f"Male Probability: {male_prob * 100:.2f}%")
    print(f"Female Probability: {female_prob * 100:.2f}%")
    if male_prob > female_prob:
        verdict = "Male"
    else:
        verdict = "Female"
    print("Predicted Gender:", verdict)

# Function to record audio
def record_audio(filename="recorded.wav", duration=3, fs=22050):
    """Record a mono clip with sounddevice and save it as a 16-bit WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        fs: Sampling rate in Hz, used for both recording and the WAV header.

    Returns:
        ``filename``, so the result can be passed straight to a consumer.
    """
    print("Recording...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')
    sd.wait()  # block until the recording has finished
    print("Recording complete.")

    # The context manager closes the file even if writing the header or the
    # frames raises, which the manual open()/close() pair did not guarantee.
    with wave.open(filename, 'wb') as wavefile:
        wavefile.setnchannels(1)
        # Bytes per sample taken from the recorded dtype (2 for int16), so
        # the header always matches the data.
        wavefile.setsampwidth(audio.dtype.itemsize)
        wavefile.setframerate(fs)
        wavefile.writeframes(audio.tobytes())
    return filename

# Function to play recorded audio
def play_audio(filename="recorded.wav"):
    """Load a WAV file with pydub and play it back."""
    play(AudioSegment.from_wav(filename))

# Real-time demo: record a clip, then play it back and classify it, in that
# order, using the same file for both steps.
recorded_file = record_audio()
for demo_step in (play_audio, predict_gender):
    demo_step(recorded_file)

# Generate predictions and collapse both the softmax outputs and the one-hot
# targets to class indices.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Display names shared by the report and the heatmap axes.
class_names = ['Male', 'Female']

# Print classification report
print(classification_report(y_true_classes, y_pred_classes, target_names=class_names))

# Confusion matrix
conf_matrix = confusion_matrix(y_true_classes, y_pred_classes)

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# Plot training accuracy & loss side by side, one panel per metric.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))
training_log = history.history

# Accuracy graph (left panel)
acc_ax.plot(training_log['accuracy'], label='Train Accuracy')
acc_ax.plot(training_log['val_accuracy'], label='Validation Accuracy')
acc_ax.set_xlabel("Epochs")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend()
acc_ax.set_title("Model Accuracy")

# Loss graph (right panel)
loss_ax.plot(training_log['loss'], label='Train Loss')
loss_ax.plot(training_log['val_loss'], label='Validation Loss')
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.set_title("Model Loss")

plt.show()
