In [1]:
# Mount Google Drive into the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Root folder of the audio dataset on the mounted Drive.
# The next cell assigns the same value again before walking this folder for audio files.
base_dir = '/content/drive/MyDrive/audio_speech_actors_01-24'

In [3]:
##Predicting voice samples using CNN and MFCC
import os
import numpy as np
import pandas as pd
import librosa
import torch
from torch.utils.data import DataLoader, random_split, TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch import nn
import torch.nn.functional as F

# 1. Load data paths and labels
def collect_labeled_wavs(root_dir):
    """Walk `root_dir` and return `(paths, labels)` for every parsable .wav file.

    The label is the third dash-separated field of the file name, read as an
    int (for example "03-01-05-01-01-01-01.wav" -> 5).

    * Files without a .wav extension (case-insensitive) are ignored, so stray
      files such as ".DS_Store" no longer crash the label parsing.
    * .wav files whose name does not follow the pattern are skipped with a
      warning instead of aborting the whole scan.
    * Directories and files are visited in sorted order, so the resulting
      lists do not depend on the filesystem's listing order.

    Args:
        root_dir: Directory to scan recursively.

    Returns:
        A tuple `(paths, labels)` of two equally long lists: full file paths
        and the integer label parsed from each file name.
    """
    import warnings

    found_paths = []
    found_labels = []
    for dirname, dirnames, filenames in os.walk(root_dir):
        # Sorting in place controls the order in which os.walk descends.
        dirnames.sort()
        for filename in sorted(filenames):
            stem, ext = os.path.splitext(filename)
            if ext.lower() != ".wav":
                continue
            full_path = os.path.join(dirname, filename)
            parts = stem.split("-")
            try:
                label = int(parts[2])
            except (IndexError, ValueError):
                warnings.warn(
                    f"Skipping {full_path!r}: cannot parse a label from the file name"
                )
                continue
            found_paths.append(full_path)
            found_labels.append(label)
    return found_paths, found_labels


base_dir = '/content/drive/MyDrive/audio_speech_actors_01-24'
paths, labels = collect_labeled_wavs(base_dir)

# 2. Tabulate the samples and give every distinct raw label an integer class id
encode = LabelEncoder()
df = pd.DataFrame({"speech": paths, "labels": labels})
# `encode` stays fitted, so `encode.classes_` can be used to size the classifier later.
df = df.assign(encoded_labels=encode.fit_transform(df["labels"]))

# 3. Preprocess MFCC
def preprocess_mfcc(file_path, n_mfcc=40, max_len=360):
    """Load an audio file and return a fixed-size, standardized MFCC tensor.

    Args:
        file_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute per frame.
        max_len: Number of frames in the output; shorter clips are
            zero-padded on the right, longer clips are truncated.

    Returns:
        A float32 tensor of shape (1, n_mfcc, max_len); the leading axis is
        the single input channel expected by the CNN.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    signal, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    # Standardize with the global mean/std of the padded matrix (the zero
    # padding is part of the statistics, as before).
    std = np.std(mfcc)
    if std == 0:
        # A constant matrix (e.g. a silent clip) would otherwise produce 0/0 = NaN
        # everywhere; dividing by 1 yields an all-zero feature map instead.
        std = 1.0
    mfcc = (mfcc - np.mean(mfcc)) / std
    return torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0)

# 4. Prepare dataset: one MFCC tensor and one scalar class-id tensor per file
features = [preprocess_mfcc(wav_path) for wav_path in df["speech"]]
targets = [
    torch.tensor(class_id, dtype=torch.long)
    for class_id in df["encoded_labels"]
]

# 5. Create DataLoader
dataset = list(zip(features, targets))  # list of (mfcc_tensor, label_tensor) tuples
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
# Give the test split whatever is left: random_split raises ValueError unless
# the lengths add up to len(dataset), and int() truncation can drop samples.
test_size = len(dataset) - train_size - val_size
# Seed the split so the same files land in train/val/test on every run.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data, test_data = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# Only the training batches are shuffled; evaluation order does not matter.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

# 6. Defining CNN model
# Two conv -> ReLU -> 2x2 max-pool stages on the single-channel MFCC map.
conv_stages = [
    nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
]
# Classifier head. 22528 = 32 channels * 8 * 88, the flattened size for a
# (1, 40, 360) input (the preprocess_mfcc defaults):
# 40x360 -> conv 38x358 -> pool 19x179 -> conv 17x177 -> pool 8x88.
classifier_head = [
    nn.Flatten(),
    nn.Linear(in_features=22528, out_features=120),
    nn.ReLU(),
    nn.Linear(in_features=120, out_features=len(encode.classes_)),
]
model = nn.Sequential(*conv_stages, *classifier_head)

# 7. Training setup
# Use the GPU when the runtime provides one; fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Batches are moved to `device` in the loops below, so the model must live there too.
model = model.to(device)
criterion = nn.CrossEntropyLoss()
# Build the optimizer after the move so it tracks the on-device parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 8. Training loop
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0
    # Batch variables get their own names so they do not overwrite the
    # module-level `labels` list built while scanning the dataset.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# 9. Validation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Batch variables get their own names so they do not overwrite the
    # module-level `labels` list built while scanning the dataset.
    for batch_inputs, batch_labels in val_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        # The predicted class is the index of the largest logit.
        predicted = model(batch_inputs).argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

if total:
    print(f"Validation Accuracy: {100 * correct / total:.2f}%")
else:
    # A very small dataset leaves the 10% validation split empty; avoid dividing by zero.
    print("Validation Accuracy: n/a (validation set is empty)")

Epoch 1, Loss: 2.0675
Epoch 2, Loss: 1.8035
Epoch 3, Loss: 1.5028
Epoch 4, Loss: 1.2695
Epoch 5, Loss: 1.0766
Epoch 6, Loss: 0.8975
Epoch 7, Loss: 0.8317
Epoch 8, Loss: 0.6280
Epoch 9, Loss: 0.5079
Epoch 10, Loss: 0.4164
Validation Accuracy: 62.50%
