In [1]:
# ===============================================================
# üéµ Audio Classification with RNN/LSTM (GPU Safe)
# ===============================================================
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.cuda.amp import autocast, GradScaler

# ===============================================================
# ‚öôÔ∏è Device setup (GPU / CPU)
# ===============================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# ===============================================================
# üì¶ Load metadata
# ===============================================================
def load_audio_metadata(csv_path):
    df = pd.read_csv(csv_path)
    return df["File Path"].tolist(), df["Classification"].tolist()

# ===============================================================
# üéõ Feature Extraction (Time-sliced MFCC + LPC)
# ===============================================================
def extract_mfcc_lpc_sequence(paths, labels, n_mfcc=20, sr=22050,
                              slice_duration=1.0, overlap=0.5, lpc_order=12):
    X, y = [], []
    for path, label in zip(paths, labels):
        try:
            y_audio, _ = librosa.load(path, sr=sr)
            slice_len = int(slice_duration * sr)
            hop_len = int(slice_len * (1 - overlap))
            seq_features = []

            for start in range(0, len(y_audio) - slice_len + 1, hop_len):
                slice_y = y_audio[start:start + slice_len]
                if len(slice_y) < lpc_order + 1:
                    continue

                mfcc = librosa.feature.mfcc(y=slice_y, sr=sr, n_mfcc=n_mfcc)
                mfcc_mean = np.mean(mfcc.T, axis=0)
                mfcc_std = np.std(mfcc.T, axis=0)

                lpc = librosa.lpc(slice_y, order=lpc_order)
                lpc_feats = np.concatenate([
                    lpc, np.abs(lpc), np.log1p(np.abs(lpc))
                ])

                feat_vec = np.concatenate([mfcc_mean, mfcc_std, lpc_feats])
                seq_features.append(feat_vec)

            if len(seq_features) > 0:
                X.append(np.array(seq_features))
                y.append(label)

        except Exception as e:
            print(f"Error: {path} ‚Äî {e}")
    print(f"Extracted {len(X)} valid audio sequences.")
    return X, y

# ===============================================================
# ‚öôÔ∏è Pad + Scale
# ===============================================================
def pad_and_scale(X):
    max_len = max(seq.shape[0] for seq in X)
    feat_dim = X[0].shape[1]
    padded = np.zeros((len(X), max_len, feat_dim))
    for i, seq in enumerate(X):
        padded[i, :seq.shape[0], :] = seq

    scaler = StandardScaler()
    reshaped = padded.reshape(-1, feat_dim)
    scaled = scaler.fit_transform(reshaped)
    scaled = scaled.reshape(len(X), max_len, feat_dim)
    return scaled

# ===============================================================
# üß© Dataset + DataLoader
# ===============================================================
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# ===============================================================
# üß† Model: LSTM / BiLSTM
# ===============================================================
class AudioRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes,
                 num_layers=2, bidirectional=True, dropout=0.3):
        super(AudioRNN, self).__init__()
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_dim, hidden_dim,
                            num_layers=num_layers,
                            dropout=dropout,
                            bidirectional=bidirectional,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])  # last timestep
        return self.fc(out)

# ===============================================================
# üöÄ Training Loop (with AMP + no leaks)
# ===============================================================
def train_model(model, train_loader, val_loader, epochs=20, lr=1e-3):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()  # mixed precision
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()

            with autocast():
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}")

    # validation
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            pred = torch.argmax(outputs, dim=1)
            preds.extend(pred.cpu().numpy())
            labels.extend(y_batch.numpy())

    acc = accuracy_score(labels, preds)
    print("\nClassification Report:\n", classification_report(labels, preds))
    print(f" Validation Accuracy: {acc:.4f}")

    torch.cuda.empty_cache()  # release memory
    return model

# ===============================================================
# üß∞ Full Pipeline
# ===============================================================
def run_pipeline(csv_path):
    print("Loading metadata...")
    paths, labels = load_audio_metadata(csv_path)

    print("\nExtracting MFCC + LPC features (sequences)...")
    X, y = extract_mfcc_lpc_sequence(paths, labels)

    print("\nNormalizing & Padding...")
    X_scaled = pad_and_scale(X)

    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    train_loader = DataLoader(AudioDataset(X_train, y_train), batch_size=8, shuffle=True)
    val_loader = DataLoader(AudioDataset(X_val, y_val), batch_size=8)

    model = AudioRNN(input_dim=X_scaled.shape[2],
                     hidden_dim=128,
                     num_classes=len(le.classes_)).to(device)

    print("\nStarting training...")
    trained_model = train_model(model, train_loader, val_loader, epochs=25, lr=1e-3)

    print("\n Training complete. Best model saved in memory.")
    return trained_model, le

# ===============================================================
# ‚ñ∂Ô∏è Run
# ===============================================================
if __name__ == "__main__":
    model, label_encoder = run_pipeline("classified_audios_auto.csv")



Using device: cuda
GPU: NVIDIA GeForce RTX 3050 4GB Laptop GPU
Loading metadata...

Extracting MFCC + LPC features (sequences)...
Extracted 2775 valid audio sequences.

Normalizing & Padding...

Starting training...


  scaler = GradScaler()  # mixed precision
  with autocast():


Epoch [1/25] - Loss: 0.9920
Epoch [2/25] - Loss: 0.9909
Epoch [3/25] - Loss: 0.9889
Epoch [4/25] - Loss: 0.9835
Epoch [5/25] - Loss: 0.9862
Epoch [6/25] - Loss: 0.9871
Epoch [7/25] - Loss: 0.9895
Epoch [8/25] - Loss: 0.9861
Epoch [9/25] - Loss: 0.9863
Epoch [10/25] - Loss: 0.9862
Epoch [11/25] - Loss: 0.9853
Epoch [12/25] - Loss: 0.9867
Epoch [13/25] - Loss: 0.9866
Epoch [14/25] - Loss: 0.9885
Epoch [15/25] - Loss: 0.9871
Epoch [16/25] - Loss: 0.9852
Epoch [17/25] - Loss: 0.9865
Epoch [18/25] - Loss: 0.9843
Epoch [19/25] - Loss: 0.9868
Epoch [20/25] - Loss: 0.9857
Epoch [21/25] - Loss: 0.9873
Epoch [22/25] - Loss: 0.9854
Epoch [23/25] - Loss: 0.9857
Epoch [24/25] - Loss: 0.9864
Epoch [25/25] - Loss: 0.9869


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        71
           1       0.48      1.00      0.65       266
           2       0.00      0.00      0.00       218

    accuracy                           0.48       555
   macro avg       0.16      0.33      0.22       555
weighted avg       0.23      0.48      0.31       555

 Validation Accuracy: 0.4793

 Training complete. Best model saved in memory.
