In [1]:
import numpy as np
import pandas as pd
import librosa
import scipy.signal as sg
import os 

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score

from hmmlearn import hmm
from tqdm import tqdm



In [2]:
base_path = r"E:\ExameEET"   
meta_path = base_path + r"\UrbanSound8K.csv"

metadata = pd.read_csv(meta_path)

# criar pasta results
results_path = os.path.join(base_path, "results")
os.makedirs(results_path, exist_ok=True)

print("Pasta base:", base_path)
print("Pasta results:", results_path)
print(metadata.head())

Pasta base: E:\ExameEET
Pasta results: E:\ExameEET\results
      slice_file_name    fsID  start        end  salience  fold  classID  \
0    100032-3-0-0.wav  100032    0.0   0.317551         1     5        3   
1  100263-2-0-117.wav  100263   58.5  62.500000         1     5        2   
2  100263-2-0-121.wav  100263   60.5  64.500000         1     5        2   
3  100263-2-0-126.wav  100263   63.0  67.000000         1     5        2   
4  100263-2-0-137.wav  100263   68.5  72.500000         1     5        2   

              class  
0          dog_bark  
1  children_playing  
2  children_playing  
3  children_playing  
4  children_playing  


In [3]:
# ============================
# Dufaux Detection
# ============================
def dufaux_detection(signal, sr, frame_ms=100, median_size=11, threshold_scale=1.5):
    frame_size = int(sr * frame_ms / 1000)
    energies = []

    for i in range(0, len(signal), frame_size):
        frame = signal[i:i+frame_size]
        energies.append(np.sum(frame**2))
    energies = np.array(energies)

    if len(energies) < median_size:
        k = max(3, len(energies)//2 * 2 + 1)
        median_size = k

    median_filtered = sg.medfilt(energies, kernel_size=median_size)
    diff = energies - median_filtered
    diff_norm = diff / (np.std(diff) + 1e-6)

    detection = diff_norm > threshold_scale
    return detection, energies, diff_norm


# ============================
# Mel Spectrogram (mean)
# ============================
def extract_mel_spectrogram(signal, sr, n_mels=64):
    mel = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    return np.mean(mel_db, axis=1)


# ============================
# MFCC + Delta + Delta² (mean)
# ============================
def extract_mfcc_features(signal, sr, n_mfcc=20):
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    n_frames = mfcc.shape[1]
    width = min(9, n_frames if n_frames % 2 != 0 else n_frames - 1)
    if width < 3: width = 3

    delta = librosa.feature.delta(mfcc, width=width)
    delta2 = librosa.feature.delta(mfcc, order=2, width=width)

    return np.concatenate([
        np.mean(mfcc, axis=1),
        np.mean(delta, axis=1),
        np.mean(delta2, axis=1)
    ])


In [4]:
def extract_features(row):
    file_path = f"{base_path}/fold{row['fold']}/{row['slice_file_name']}"
    audio, sr = librosa.load(file_path, sr=None)

    detection, energies, diff_norm = dufaux_detection(audio, sr)
    dufaux_stats = np.array([
        np.mean(energies),
        np.std(energies),
        np.max(diff_norm),
        np.mean(diff_norm),
        np.sum(detection)
    ])

    mel_features = extract_mel_spectrogram(audio, sr)
    mfcc_full = extract_mfcc_features(audio, sr)

    return np.concatenate([dufaux_stats, mel_features, mfcc_full])


In [5]:
def load_fold_gmm(fold):
    train_df = metadata[metadata['fold'] != fold]
    test_df  = metadata[metadata['fold'] == fold]

    X_train, y_train = [], []
    X_test,  y_test  = [], []

    for _, row in train_df.iterrows():
        X_train.append(extract_features(row))
        y_train.append(row['classID'])

    for _, row in test_df.iterrows():
        X_test.append(extract_features(row))
        y_test.append(row['classID'])

    return (
        np.array(X_train), np.array(y_train),
        np.array(X_test),  np.array(y_test)
    )


def train_gmm(X, y, n_components=8):
    models = {}
    for c in np.unique(y):
        gm = GaussianMixture(
            n_components=n_components,
            covariance_type='diag',
            max_iter=200,
            random_state=0
        )
        gm.fit(X[y == c])
        models[c] = gm
    return models


def gmm_predict(models, X):
    preds = []
    for x in X:
        scores = {c: model.score([x]) for c, model in models.items()}
        preds.append(max(scores, key=scores.get))
    return np.array(preds)


In [6]:
def extract_sequence_features(row, n_mfcc=20):
    file_path = f"{base_path}/fold{row['fold']}/{row['slice_file_name']}"
    audio, sr = librosa.load(file_path, sr=None)

    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    n_frames = mfcc.shape[1]

    width = min(9, n_frames if n_frames % 2 != 0 else n_frames - 1)
    if width < 3: width = 3

    delta = librosa.feature.delta(mfcc, width=width)

    return np.vstack([mfcc, delta]).T  # (T, 2*n_mfcc)


def train_hmm_models(fold, n_states=3, n_mfcc=20):
    train_df = metadata[metadata['fold'] != fold]
    models = {}

    for c in np.unique(metadata['classID']):
        df_c = train_df[train_df['classID'] == c]

        seqs = []
        lengths = []

        for _, row in df_c.iterrows():
            X = extract_sequence_features(row, n_mfcc)
            seqs.append(X)
            lengths.append(len(X))

        X_concat = np.vstack(seqs)

        model = hmm.GaussianHMM(
            n_components=n_states,
            covariance_type='diag',
            n_iter=50,
            random_state=0
        )
        model.fit(X_concat, lengths)
        models[c] = model

    return models


def hmm_predict(models, test_df, n_mfcc=20):
    preds = []
    for _, row in test_df.iterrows():
        X = extract_sequence_features(row, n_mfcc)
        scores = {c: models[c].score(X) for c in models.keys()}
        preds.append(max(scores, key=scores.get))
    return np.array(preds)


In [7]:
def cross_validate_gmm_hmm(use_pca=True, pca_dim=40):
    acc_gmm = []
    acc_hmm = []

    os.makedirs(results_path, exist_ok=True)

    print("\n\n===== INICIANDO CROSS-VALIDATION (10 FOLDS) =====")

    for fold in tqdm(range(1, 11), desc="Folds (1 a 10)"):
        print(f"\n============================")
        print(f"FOLD {fold}")
        print(f"============================")

        # --------------------
        # GMM
        # --------------------
        X_train, y_train, X_test, y_test = load_fold_gmm(fold)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test  = scaler.transform(X_test)

        pca = None
        if use_pca:
            pca = PCA(n_components=pca_dim, random_state=0)
            X_train = pca.fit_transform(X_train)
            X_test  = pca.transform(X_test)

        gmm_models = train_gmm(X_train, y_train)
        gmm_preds = gmm_predict(gmm_models, X_test)

        acc_g = accuracy_score(y_test, gmm_preds)
        acc_gmm.append(acc_g)
        print(f"GMM accuracy: {acc_g:.4f}")

        np.save(os.path.join(results_path, f"gmm_preds_fold{fold}.npy"), gmm_preds)
        np.save(os.path.join(results_path, f"y_test_fold{fold}.npy"), y_test)

        # --------------------
        # HMM (agora com tqdm interno)
        # --------------------
        hmm_models = train_hmm_models(fold)  

        test_df = metadata[metadata['fold'] == fold]

        hmm_preds = []

        for _, row in tqdm(test_df.iterrows(), 
                           total=len(test_df),
                           desc=f"HMM Fold {fold} - classificando"):
            X = extract_sequence_features(row)
            scores = {c: hmm_models[c].score(X) for c in hmm_models}
            hmm_preds.append(max(scores, key=scores.get))

        hmm_preds = np.array(hmm_preds)
        acc_h = accuracy_score(y_test, hmm_preds)
        acc_hmm.append(acc_h)
        print(f"HMM accuracy: {acc_h:.4f}")

        np.save(os.path.join(results_path, f"hmm_preds_fold{fold}.npy"), hmm_preds)

    # -------------------------------
    # Salvar resultados finais
    # -------------------------------
    np.save(os.path.join(results_path, "gmm_fold_accuracy.npy"), acc_gmm)
    np.save(os.path.join(results_path, "hmm_fold_accuracy.npy"), acc_hmm)

    summary = {
        "mean_gmm": float(np.mean(acc_gmm)),
        "mean_hmm": float(np.mean(acc_hmm)),
        "std_gmm": float(np.std(acc_gmm)),
        "std_hmm": float(np.std(acc_hmm)),
        "gmm_fold_results": [float(x) for x in acc_gmm],
        "hmm_fold_results": [float(x) for x in acc_hmm],
    }

    import json
    with open(os.path.join(results_path, "summary.json"), "w") as f:
        json.dump(summary, f, indent=4)

    print("\nResultados salvos em:", results_path)

    return acc_gmm, acc_hmm


In [8]:
acc_gmm, acc_hmm = cross_validate_gmm_hmm()

print("\n===== RESULTADOS FINAIS =====")
print("GMM:", acc_gmm)
print("HMM:", acc_hmm)

print("GMM Média:", np.mean(acc_gmm))
print("HMM Média:", np.mean(acc_hmm))

# Apenas confirma que os arquivos foram escritos
print(os.listdir(results_path))




===== INICIANDO CROSS-VALIDATION (10 FOLDS) =====


Folds (1 a 10):   0%|          | 0/10 [00:00<?, ?it/s]


FOLD 1


  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,
  median_filtered = sg.medfilt(energies, kernel_size=median_size)
  median_filtered = sg.medfilt(energies, kernel_size=median_size)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.5155


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 1 - classificando: 100%|██████████| 873/873 [00:24<00:00, 34.97it/s]
  median_filtered = sg.medfilt(energies, kernel_size=median_size)


HMM accuracy: 0.5578

FOLD 2


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.4245


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 2 - classificando: 100%|██████████| 888/888 [00:23<00:00, 38.33it/s]
  median_filtered = sg.medfilt(energies, kernel_size=median_size)


HMM accuracy: 0.5101

FOLD 3


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.4411


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 3 - classificando: 100%|██████████| 925/925 [00:22<00:00, 40.46it/s]
  median_filtered = sg.medfilt(energies, kernel_size=median_size)


HMM accuracy: 0.5005

FOLD 4


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.4677


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 4 - classificando: 100%|██████████| 990/990 [00:27<00:00, 36.44it/s]
Folds (1 a 10):  40%|████      | 4/10 [47:33<1:10:40, 706.77s/it]

HMM accuracy: 0.5182

FOLD 5


  median_filtered = sg.medfilt(energies, kernel_size=median_size)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.4573


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 5 - classificando: 100%|██████████| 936/936 [00:25<00:00, 36.80it/s]
  median_filtered = sg.medfilt(energies, kernel_size=median_size)


HMM accuracy: 0.5192

FOLD 6


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.4702


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 6 - classificando: 100%|██████████| 823/823 [00:21<00:00, 38.43it/s]
  median_filtered = sg.medfilt(energies, kernel_size=median_size)


HMM accuracy: 0.4775

FOLD 7


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.5227


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 7 - classificando: 100%|██████████| 838/838 [00:22<00:00, 37.10it/s]
  median_filtered = sg.medfilt(energies, kernel_size=median_size)


HMM accuracy: 0.4093

FOLD 8


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.4913


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 8 - classificando: 100%|██████████| 806

HMM accuracy: 0.4578

FOLD 9


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.5748


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 9 - classificando: 100%|██████████| 816/816 [00:23<00:00, 34.22it/s]
  median_filtered = sg.medfilt(energies, kernel_size=median_size)


HMM accuracy: 0.5527

FOLD 10


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


GMM accuracy: 0.5305


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
HMM Fold 10 - classificando: 100%|██████████| 837/837 [00:22<00:00, 36.47it/s]
Folds (1 a 10): 100%|██████████| 10/10 [2:00:57<00:00, 725.73s/it]

HMM accuracy: 0.6045

Resultados salvos em: E:\ExameEET\results

===== RESULTADOS FINAIS =====
GMM: [0.5154639175257731, 0.42454954954954954, 0.4410810810810811, 0.4676767676767677, 0.45726495726495725, 0.47023086269744835, 0.522673031026253, 0.4913151364764268, 0.5747549019607843, 0.5304659498207885]
HMM: [0.5578465063001146, 0.5101351351351351, 0.5005405405405405, 0.5181818181818182, 0.5192307692307693, 0.4775212636695018, 0.40930787589498807, 0.45781637717121587, 0.5526960784313726, 0.6045400238948626]
GMM Média: 0.4895476155079829
HMM Média: 0.5107816388450319
['gmm_fold_accuracy.npy', 'gmm_preds_fold1.npy', 'gmm_preds_fold10.npy', 'gmm_preds_fold2.npy', 'gmm_preds_fold3.npy', 'gmm_preds_fold4.npy', 'gmm_preds_fold5.npy', 'gmm_preds_fold6.npy', 'gmm_preds_fold7.npy', 'gmm_preds_fold8.npy', 'gmm_preds_fold9.npy', 'hmm_fold_accuracy.npy', 'hmm_preds_fold1.npy', 'hmm_preds_fold10.npy', 'hmm_preds_fold2.npy', 'hmm_preds_fold3.npy', 'hmm_preds_fold4.npy', 'hmm_preds_fold5.npy', 'hmm_pre


