In [None]:
import parselmouth

In [None]:
# Colab-only step: attach the user's Google Drive under /content/drive so the
# dataset folder referenced by `base_dir` is reachable through the filesystem.
# NOTE(review): this cell only works inside Google Colab; running the notebook
# elsewhere requires pointing `base_dir` at a local copy instead.
from google.colab import drive
drive.mount('/content/drive')


In [7]:
# Root folder of the audio corpus on the mounted Google Drive.
# NOTE(review): the same assignment is repeated in the following cell, so this
# cell is redundant when the notebook is run top to bottom.
base_dir = '/content/drive/MyDrive/audio_speech_actors_01-24'

In [27]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import parselmouth
from torch.utils.data import DataLoader, random_split, TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch import nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler

# Root folder of the audio corpus on the mounted Google Drive.
base_dir = '/content/drive/MyDrive/audio_speech_actors_01-24'

# Index every clip under `base_dir` and derive its integer label from the file
# name. Names are expected to be hyphen-separated numeric fields ending in
# ".wav"; the third field is used as the class label
# (presumably the RAVDESS emotion code -- TODO confirm against the dataset docs).
paths = []
labels = []
skipped = []
for dirname, dirnames, filenames in os.walk(base_dir):
    # Sorting in place fixes the traversal order, which os.walk otherwise
    # leaves up to the filesystem; a stable row order is needed for any
    # seeded split further down to be reproducible.
    dirnames.sort()
    for filename in sorted(filenames):
        stem, ext = os.path.splitext(filename)
        if ext.lower() != ".wav":
            continue  # ignore stray non-audio files (.DS_Store, notes, ...)
        parts = stem.split("-")
        if len(parts) < 3 or not parts[2].isdigit():
            # A .wav file that does not follow the naming scheme cannot be
            # labelled; remember it instead of crashing on parts[2] / int().
            skipped.append(os.path.join(dirname, filename))
            continue
        paths.append(os.path.join(dirname, filename))
        labels.append(int(parts[2]))

if skipped:
    print(f"Skipped {len(skipped)} .wav file(s) whose name does not match the expected scheme")

if not paths:
    # Fail early with an actionable message (typically: Drive not mounted or
    # wrong folder) rather than with an opaque error during feature stacking.
    raise FileNotFoundError(f"No labelled .wav files found under {base_dir!r}")

# One row per clip: file path, raw label from the file name, and a contiguous
# 0..K-1 class id suitable for nn.CrossEntropyLoss.
df = pd.DataFrame({"speech": paths, "labels": labels})
le = LabelEncoder()
df["encoded_labels"] = le.fit_transform(df["labels"])

def extract_mfcc(path, n_mfcc=40, max_len=360):
    """Load an audio file and return a fixed-width, standardized MFCC matrix.

    Args:
        path: Audio file path, handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa (rows).
        max_len: Number of frames (columns) in the result. Shorter clips are
            right-padded with zeros, longer clips are truncated.

    Returns:
        A 2-D array with ``max_len`` columns, standardized with the mean and
        standard deviation of the whole padded/truncated matrix (so the zero
        padding takes part in the statistics). If the matrix is constant
        (zero standard deviation) the centered matrix -- all zeros -- is
        returned instead of dividing by zero and producing NaNs.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    signal, sr = librosa.load(path, sr=None)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    n_frames = mfcc.shape[1]
    if n_frames < max_len:
        # Pad only on the right of the time axis.
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, max_len - n_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]

    centered = mfcc - np.mean(mfcc)
    std = np.std(mfcc)
    if std == 0:
        # Constant input: nothing to scale, and dividing would yield NaN.
        return centered
    return centered / std

def extract_acoustic_features(path):
    """Compute three Praat-based voice descriptors for one audio file.

    Args:
        path: Audio file path, handed to ``parselmouth.Sound``.

    Returns:
        A float32 array ``[jitter, shimmer, speaking_rate]`` where jitter and
        shimmer are Praat's "local" measures and ``speaking_rate`` is the
        number of detected periodic pulses divided by the clip duration
        (pulses per second, 0 for a zero-length clip). Praat reports
        undefined measures as NaN (e.g. for unvoiced clips); any non-finite
        value is replaced by 0.0 so it cannot propagate into the scaler and
        turn the training loss into NaN.
    """
    snd = parselmouth.Sound(path)
    # 75 / 500 are the pitch floor and ceiling (Hz) for pulse detection.
    point_process = parselmouth.praat.call(snd, "To PointProcess (periodic, cc)", 75, 500)

    # Arguments: time range (0, 0 = whole clip), shortest period, longest
    # period, maximum period factor; shimmer adds a maximum amplitude factor.
    jitter = parselmouth.praat.call([point_process], "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    shimmer = parselmouth.praat.call([snd, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    duration = snd.get_total_duration()
    num_pulses = parselmouth.praat.call(point_process, "Get number of points")
    speaking_rate = num_pulses / duration if duration > 0 else 0

    features = np.array([jitter, shimmer, speaking_rate], dtype=np.float32)
    return np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)



# Per-clip features, collected in DataFrame row order:
#   - MFCC matrix with a leading channel axis (input "image" for the CNN),
#   - acoustic descriptor vector,
#   - integer class id as a 0-d long tensor.
mfcc_features = []
acoustic_features = []
targets = []

for speech_path, encoded_label in zip(df["speech"], df["encoded_labels"]):
    mfcc_matrix = extract_mfcc(speech_path)
    acoustic_vector = extract_acoustic_features(speech_path)

    mfcc_features.append(torch.tensor(mfcc_matrix, dtype=torch.float32).unsqueeze(0))
    acoustic_features.append(torch.tensor(acoustic_vector))
    targets.append(torch.tensor(encoded_label, dtype=torch.long))

# Stack along a new leading sample axis.
mfcc_tensor = torch.stack(mfcc_features)
acoustic_tensor = torch.stack(acoustic_features)
target_tensor = torch.stack(targets)

# NORMALIZE ACOUSTIC FEATURES
# Standardize each acoustic column and convert the result back to a float32
# tensor.
# NOTE(review): the scaler is fitted on every sample, before the
# train/val/test split performed later, so statistics of held-out clips leak
# into the training features -- consider fitting on the training rows only.
scaler = StandardScaler()
acoustic_scaled = scaler.fit_transform(acoustic_tensor.numpy())
acoustic_tensor = torch.tensor(acoustic_scaled, dtype=torch.float32)

# CNN FEATURE EXTRACTOR
# Two conv -> ReLU -> max-pool stages over the single-channel MFCC maps,
# flattened to one vector per clip.
# In the code shown here this network is never trained: it is put in eval mode
# and only run under no_grad, i.e. it acts as a fixed, randomly initialized
# feature map. Seed the RNG first so those random weights -- and therefore the
# extracted features -- are the same on every run.
torch.manual_seed(0)
cnn_model = nn.Sequential(
    nn.Conv2d(1, 16, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(16, 32, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Flatten()
)

# EXTRACT CNN OUTPUT FEATURES
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model.to(device)
cnn_model.eval()

# Push the clips through in chunks rather than as one giant batch: the output
# is identical (no batch-dependent layers), but peak activation memory stays
# bounded regardless of dataset size.
with torch.no_grad():
    cnn_output = torch.cat(
        [cnn_model(chunk.to(device)).cpu() for chunk in mfcc_tensor.split(128)],
        dim=0,
    )

# CONCATENATE WITH ACOUSTIC FEATURES
# One row per clip: flattened CNN features followed by the acoustic features.
combined_features = torch.cat([cnn_output, acoustic_tensor], dim=1)

# FINAL FFNN CLASSIFIER
# Derive the input width from the data instead of hard-coding it, so the
# classifier keeps matching if n_mfcc, max_len, the CNN or the number of
# acoustic features changes (it is 22531 for the settings used so far).
input_dim = combined_features.shape[1]
model = nn.Sequential(
    nn.Linear(input_dim, 512),
    nn.ReLU(),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, len(le.classes_))  # one logit per encoded class
).to(device)

# CREATE DATASET AND LOADERS
# 80 / 10 / 10 split; the test split takes whatever remains after integer
# truncation so the three sizes always add up to len(dataset).
dataset = TensorDataset(combined_features, target_tensor)
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# A dedicated, seeded generator makes the split identical on every run, so the
# validation and test clips stay held out across re-runs and results are
# comparable.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data, test_data = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)
# NOTE(review): the split is per clip, not per speaker -- clips of one actor
# can land on both sides; confirm that this is intended.

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
# Loader for the held-out test split, which was previously created but unused.
test_loader = DataLoader(test_data, batch_size=32)

# TRAINING SETUP
# Cross-entropy over the class logits, optimized with Adam on the classifier's
# parameters only.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# TRAINING LOOP
# Ten passes over the training loader; after each pass, report the mean of the
# per-batch losses.
for epoch in range(10):
    model.train()
    batch_losses = []
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# VALIDATION
# Top-1 accuracy of the classifier on the validation loader, computed without
# gradient tracking.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for x, y in val_loader:
        x = x.to(device)
        y = y.to(device)
        pred = model(x).argmax(dim=1)
        correct += int((pred == y).sum())
        total += len(y)

print(f"Validation Accuracy: {100 * correct / total:.2f}%")

Epoch 1, Loss: 2.0522
Epoch 2, Loss: 1.9033
Epoch 3, Loss: 1.7121
Epoch 4, Loss: 1.5918
Epoch 5, Loss: 1.5408
Epoch 6, Loss: 1.4909
Epoch 7, Loss: 1.4138
Epoch 8, Loss: 1.4037
Epoch 9, Loss: 1.3607
Epoch 10, Loss: 1.2797
Validation Accuracy: 44.44%
