### Import Libraries

In [1]:
### Import Libraries
# Adicionamos as bibliotecas do TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv2D, BatchNormalization, Activation, 
    MaxPooling2D, GlobalAveragePooling2D, Dense, Dropout
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

# Bibliotecas Scikit-learn (CPU)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report, confusion_matrix as sklearn_confusion_matrix

# Bibliotecas de Extração e Plotagem
import numpy as np
import pandas as pd
import librosa
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
from tqdm import tqdm
import time
import utils # Reutiliza o utils.py do seu projeto

2025-11-11 04:56:27.959744: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-11 04:56:28.045291: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762851388.073751   13549 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762851388.082616   13549 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762851388.162612   13549 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

### Audio Preprocessing

In [2]:
# --- Configuration ---
# Input locations (FMA metadata, per-genre audio folders) and the cache files
# where the extracted mel-spectrogram features, labels and groups are stored.
METADATA_DIR = '../fma_metadata'
AUDIO_DIR_GENRES = '../fma_datasets/fma_small_genres'
FEATURE_FILE_X = '../preprocessed_features/fma_small_mel_spec_X.npy'
FEATURE_FILE_y = '../preprocessed_features/fma_small_mel_spec_y.npy'
FEATURE_FILE_groups = '../preprocessed_features/fma_small_mel_spec_groups.npy'

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Configurado memory growth para {len(gpus)} GPU(s).")
    except RuntimeError as e:
        # Best effort: report the problem and keep going with the default allocator.
        print(e)

# --- Reproducibility ---
# Seeds Python's `random`, NumPy and TensorFlow so that weight initialisation,
# dropout and batch shuffling are repeatable on Restart & Run All.
# Done after the GPU configuration above so that configuration still runs first.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# --- Extraction parameters ---
SR = 22050              # sampling rate used when loading audio
WINDOW_SIZE_SEC = 3     # length of each analysis window, in seconds
OVERLAP_PERCENT = 0.25  # fraction of a window shared with the next one
N_MELS = 96             # number of mel bands (spectrogram height)
N_FFT = 2048
HOP_LENGTH = 512
TARGET_STEPS = 128      # number of time frames kept per window (spectrogram width)

# --- Load metadata ---
# NOTE(review): assumes utils.load returns a DataFrame with two-level columns
# such as ('set', 'subset') and ('track', 'genre_top') — confirm against utils.py.
tracks = utils.load(f'{METADATA_DIR}/tracks.csv')
small_mask = tracks[('set', 'subset')] == 'small'
y_all_labels_pd = tracks.loc[small_mask, ('track', 'genre_top')]
splits_pd = tracks.loc[small_mask, ('set', 'split')]

# --- Encode genres (labels) ---
# Integer class ids first, then the one-hot form used as the training target.
label_encoder = LabelEncoder()
y_all_encoded_np = label_encoder.fit_transform(y_all_labels_pd).astype(np.int32)
y_all_onehot_np = to_categorical(y_all_encoded_np, num_classes=len(label_encoder.classes_))

# One row per track, indexed like y_all_labels_pd (and therefore in the same
# order as y_all_encoded_np / y_all_onehot_np).
track_metadata = pd.DataFrame({
    'genre_top': y_all_labels_pd,
    'genre_encoded': y_all_encoded_np,
    'split': splits_pd
}, index=y_all_labels_pd.index)

class_names = label_encoder.classes_
NUM_CLASSES = len(class_names)

print(f"Metadados carregados para {track_metadata.shape[0]} faixas 'small'.")
print(f"Gêneros: {class_names}")

Configurado memory growth para 1 GPU(s).
Metadados carregados para 8000 faixas 'small'.
Gêneros: ['Electronic' 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental'
 'International' 'Pop' 'Rock']


In [3]:
def extract_melspectrogram_windowed(file_path, sr=SR, window_size_sec=WINDOW_SIZE_SEC, 
                                  overlap_percent=OVERLAP_PERCENT, n_mels=N_MELS, 
                                  n_fft=N_FFT, hop_length=HOP_LENGTH, target_steps=TARGET_STEPS):
    """
    Extract fixed-size log-mel spectrograms from overlapping windows of an audio file.

    The file is loaded as mono at `sr`, cut into windows of `window_size_sec`
    seconds that overlap by `overlap_percent`, and each window is converted to a
    log-mel spectrogram (dB relative to that window's peak) whose time axis is
    padded or truncated to exactly `target_steps` frames.

    Args:
        file_path: Path of the audio file to load.
        sr: Sampling rate passed to `librosa.load`.
        window_size_sec: Window length in seconds.
        overlap_percent: Fraction (0-1) of each window shared with the next one.
        n_mels: Number of mel bands.
        n_fft: FFT size for the mel spectrogram.
        hop_length: Hop, in samples, between spectrogram frames.
        target_steps: Number of time frames in every returned spectrogram.

    Returns:
        A list of arrays of shape (n_mels, target_steps), one per window;
        an empty list if the audio is shorter than one window;
        None if loading or processing raised an exception (the error is printed).
    """

    all_window_specs = []

    try:
        # 1. Load the audio clip (resampled to `sr`, downmixed to mono).
        y, _ = librosa.load(file_path, mono=True, sr=sr)

        # 2. Window geometry, in samples.
        samples_per_window = int(window_size_sec * sr)
        hop_size = int(samples_per_window * (1.0 - overlap_percent)) 

        if len(y) < samples_per_window:
            return []

        # y_frames has shape [n_windows, samples_per_window].
        y_frames = librosa.util.frame(y, frame_length=samples_per_window, hop_length=hop_size, axis=0)

        # 3. One spectrogram per window.
        for y_window in y_frames:
            S = librosa.feature.melspectrogram(y=y_window, sr=sr, n_mels=n_mels, 
                                               n_fft=n_fft, hop_length=hop_length)

            # Power -> dB, referenced to this window's own maximum (peak == 0 dB).
            log_S = librosa.power_to_db(S, ref=np.max)

            # 4. Force a fixed number of time frames (the CNN needs a fixed input size).
            n_frames = log_S.shape[1]
            if n_frames < target_steps:
                # Pad with the quietest value present. Because 0 dB is the peak
                # under ref=np.max, zero-padding would append maximum-energy
                # frames instead of silence.
                log_S = np.pad(log_S, ((0, 0), (0, target_steps - n_frames)),
                               mode='constant', constant_values=log_S.min())
            else:
                log_S = log_S[:, :target_steps]

            all_window_specs.append(log_S)

    except Exception as e:
        # Best effort: a single unreadable file must not abort the whole extraction.
        # The exception type is printed because some decoder errors carry an empty message.
        print(f"Erro ao processar {file_path}: {type(e).__name__}: {e}")
        return None 

    return all_window_specs

# --- Loop de Extração e Cache (Similar ao v2) ---

# Extract the features only when the cache is incomplete; otherwise load it.
feature_cache_paths = (FEATURE_FILE_X, FEATURE_FILE_y, FEATURE_FILE_groups)

if not all(os.path.exists(path) for path in feature_cache_paths):
    
    print(f"Arquivos de features (mel-spec) não encontrados. Iniciando extração...")
    
    # One entry per track that produced at least one window:
    # (track_id, file_path, number_of_windows). Pass 2 walks this list, so the
    # window counts can never get out of step with the tracks they belong to
    # (missing files and failed tracks simply never enter the plan).
    extraction_plan = []
    track_ids_list = []
    labels_list = []

    # Pass 1 only counts windows so X can be pre-allocated once, instead of
    # holding every spectrogram in a Python list and copying it afterwards.
    print("Passo 1/2: Contando janelas (Dry Run)...")
    for row_pos, (track_id, row) in enumerate(
            tqdm(track_metadata.iterrows(), total=track_metadata.shape[0])):
        genre_top = row['genre_top']
        file_path = f"{AUDIO_DIR_GENRES}/{genre_top}/{track_id:06d}.mp3"
        
        if not os.path.exists(file_path):
            continue
            
        window_specs = extract_melspectrogram_windowed(file_path)
        
        # None -> processing error, [] -> audio shorter than one window.
        if not window_specs:
            continue
        
        num_windows_for_track = len(window_specs)
        extraction_plan.append((track_id, file_path, num_windows_for_track))
        
        # track_metadata shares its row order with y_all_onehot_np, so the row
        # position selects this track's one-hot label directly.
        label_onehot = y_all_onehot_np[row_pos]
        
        track_ids_list.extend([track_id] * num_windows_for_track)
        labels_list.extend([label_onehot] * num_windows_for_track)

    # --- Pre-allocate the output arrays ---
    total_windows = sum(count for _, _, count in extraction_plan)
    X_np = np.empty((total_windows, N_MELS, TARGET_STEPS), dtype=np.float32)
    y_onehot_np = np.array(labels_list, dtype=np.float32)
    groups_np = np.array(track_ids_list, dtype=np.int32)
    
    print(f"Total de janelas a serem extraídas: {total_windows}")
    print("Passo 2/2: Preenchendo arrays (Extração Real)...")
    
    current_window_index = 0
    
    for track_id, file_path, expected_windows in tqdm(extraction_plan):
        # Extract the windows again, this time keeping them.
        window_specs = extract_melspectrogram_windowed(file_path)
        found_windows = len(window_specs) if window_specs else 0
        
        if found_windows != expected_windows:
            # Skipping here would leave uninitialised rows in X_np and shift
            # every later track relative to y/groups, so fail loudly instead
            # of caching corrupt features.
            raise RuntimeError(
                f"Faixa {track_id}: {found_windows} janelas extraídas, "
                f"mas {expected_windows} eram esperadas (dry run)."
            )
        
        next_window_index = current_window_index + expected_windows
        X_np[current_window_index:next_window_index] = np.stack(window_specs)
        current_window_index = next_window_index

    print(f"\nExtração concluída. Shape de X: {X_np.shape}. Shape de y: {y_onehot_np.shape}")
    
    # Save the cache files.
    os.makedirs(os.path.dirname(FEATURE_FILE_X), exist_ok=True)
    np.save(FEATURE_FILE_X, X_np)
    np.save(FEATURE_FILE_y, y_onehot_np)
    np.save(FEATURE_FILE_groups, groups_np)

else:
    print(f"Carregando features (mel-spec) cacheadas de {FEATURE_FILE_X}...")
    X_np = np.load(FEATURE_FILE_X)
    y_onehot_np = np.load(FEATURE_FILE_y) # labels are stored already one-hot encoded
    groups_np = np.load(FEATURE_FILE_groups)
    print("Features e grupos carregados.")

# --- Sanity check ---
if X_np.size == 0:
    print("\nERRO: O array 'X_np' está vazio.")
else:
    # Integer class ids, used for the CV split and the reports.
    y_encoded_np = np.argmax(y_onehot_np, axis=1) 
    print(f"Total de amostras (janelas): {X_np.shape[0]}")
    print(f"Shape de X: {X_np.shape}")
    print(f"Shape de y (one-hot): {y_onehot_np.shape}")
    print(f"Total de grupos (faixas únicas): {len(np.unique(groups_np))}")

Arquivos de features (mel-spec) não encontrados. Iniciando extração...
Passo 1/2: Contando janelas (Dry Run)...


  6%|▌         | 489/8000 [00:53<13:27,  9.30it/s][src/libmpg123/layer3.c:INT123_do_layer3():1948] error: dequantization failed!
 11%|█▏        | 901/8000 [01:38<12:55,  9.16it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
 15%|█▍        | 1181/8000 [02:07<12:00,  9.46it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
 28%|██▊       | 2265/8000 [04:01<10:36,  9.02it/s][src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3360) too large for available bit count (3240)
 28%|██▊       | 2267/8000 [04:01<09:20, 10.23it/s][src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3328) too large for available bit count (3240)
 55%|█████▌    | 4423/8000 [07:45<06:15,  9.53it/s]Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not

Erro ao processar ../fma_datasets/fma_small_genres/Electronic/099134.mp3: 


  y, sr_loaded = librosa.load(file_path, mono=True, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Erro ao processar ../fma_datasets/fma_small_genres/Rock/108925.mp3: 


  y, sr_loaded = librosa.load(file_path, mono=True, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
 87%|████████▋ | 6967/8000 [12:07<01:34, 10.89it/s]

Erro ao processar ../fma_datasets/fma_small_genres/Experimental/133297.mp3: 


100%|██████████| 8000/8000 [13:53<00:00,  9.60it/s]


Total de janelas a serem extraídas: 99278
Passo 2/2: Preenchendo arrays (Extração Real)...


  6%|▌         | 490/8000 [00:50<12:42,  9.85it/s][src/libmpg123/layer3.c:INT123_do_layer3():1948] error: dequantization failed!
 11%|█▏        | 901/8000 [01:30<13:12,  8.96it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
 15%|█▍        | 1180/8000 [01:57<11:06, 10.24it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
 28%|██▊       | 2265/8000 [03:47<10:04,  9.49it/s][src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3360) too large for available bit count (3240)
 28%|██▊       | 2267/8000 [03:47<08:25, 11.35it/s][src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3328) too large for available bit count (3240)
100%|██████████| 8000/8000 [13:25<00:00,  9.93it/s]



Extração concluída. Shape de X: (99278, 96, 128). Shape de y: (99278, 8)
Total de amostras (janelas): 99278
Shape de X: (99278, 96, 128)
Shape de y (one-hot): (99278, 8)
Total de grupos (faixas únicas): 7994


In [None]:
def build_model(input_shape, num_classes):
    """
    Build and compile a small 2D CNN that classifies spectrogram images.

    Architecture: three convolutional stages (32, 32 -> 64, 64 -> 128 filters),
    each convolution followed by batch normalisation and each stage closed by
    2x2 max pooling; then global average pooling, a 128-unit dense layer with
    dropout 0.5 and a softmax output of `num_classes` units.

    Args:
        input_shape: Shape of one input sample, passed to `Input(shape=...)`.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The model, compiled with Adam (learning rate 1e-3), categorical
        cross-entropy loss and the accuracy metric.
    """

    def conv_bn(tensor, filters):
        # 3x3 'same' convolution with ReLU, followed by batch normalisation.
        tensor = Conv2D(filters, kernel_size=(3, 3), padding='same', activation='relu')(tensor)
        return BatchNormalization()(tensor)

    # (filters, number of conv+BN pairs) for each stage, in order.
    stage_specs = ((32, 2), (64, 2), (128, 1))

    inputs = Input(shape=input_shape)

    features = inputs
    for filters, n_convs in stage_specs:
        for _ in range(n_convs):
            features = conv_bn(features, filters)
        features = MaxPooling2D(pool_size=(2, 2))(features)

    # Collapse each feature map to a single value before the classifier head.
    features = GlobalAveragePooling2D()(features)

    # Classifier head; dropout acts as regularisation.
    features = Dense(128, activation='relu')(features)
    features = Dropout(0.5)(features)
    outputs = Dense(num_classes, activation='softmax')(features)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Input shape fed to the CNN: (height, width, channels)
# = (mel bands, time frames, a single channel).
INPUT_SHAPE = (N_MELS, TARGET_STEPS, 1) 

# Smoke test: build the model once and print its summary. The model is never
# bound to a name, so it is released as soon as the statement finishes.
build_model(INPUT_SHAPE, NUM_CLASSES).summary()

I0000 00:00:1762853039.339511   13549 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3584 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


: 

### Treino dos Modelos

In [None]:
# 1. Cross-validation strategy: GroupKFold with the track id as group, so every
#    window of a track lands in the same fold.
n_splits = 10
gkf = GroupKFold(n_splits=n_splits)

# Inner splitter that carves a validation set (1/n_val_splits of the training
# tracks) out of each training fold. Early stopping monitors this set, not the
# test fold: picking the best epoch on the test fold would let test data drive
# model selection and inflate the reported scores.
n_val_splits = 10
val_gkf = GroupKFold(n_splits=n_val_splits)

# 2. Result containers (track level)
cv_scores_track = {}
out_of_fold_preds_track = {}

print(f"\nIniciando CV {n_splits}-Fold para CNN 2D (GPU)...")
start_model_time = time.time()

fold_scores_acc = []
all_preds_for_model = []
all_true_for_model = []

# 3. Run the cross-validation.
# y_encoded_np (class ids) is passed to the splitter; y_onehot_np is the training target.
for fold, (train_idx, test_idx) in enumerate(gkf.split(X_np, y_encoded_np, groups=groups_np)):
    
    print(f"\n--- Processando Fold {fold+1}/{n_splits} ---")
    
    # --- 4.1. Fold data (window level) ---
    
    # Split the training windows into fit / validation by track.
    fit_pos, val_pos = next(val_gkf.split(train_idx, groups=groups_np[train_idx]))
    fit_idx = train_idx[fit_pos]
    val_idx = train_idx[val_pos]
    
    # Fancy indexing copies, so the in-place scaling below never touches X_np.
    X_train_np_fold = X_np[fit_idx]
    y_train_onehot_fold = y_onehot_np[fit_idx]
    
    X_val_np_fold = X_np[val_idx]
    y_val_onehot_fold = y_onehot_np[val_idx]
    
    X_test_np_fold = X_np[test_idx]
    y_true_window_np = y_encoded_np[test_idx] # class ids (0..NUM_CLASSES-1) for the reports
    
    track_ids_test_fold = groups_np[test_idx]

    # --- 4.2. Standardisation ---
    # The scaler is fitted ONLY on this fold's fit data. copy=False asks it to
    # scale in place rather than keep a second full-size scaled copy of each array.
    scaler = StandardScaler(copy=False)
    
    # The scaler needs 2D input (samples, features); reshape back to
    # (samples, height, width, channels) for the CNN afterwards.
    X_train_fold = scaler.fit_transform(
        X_train_np_fold.reshape(X_train_np_fold.shape[0], -1)
    ).reshape(-1, N_MELS, TARGET_STEPS, 1)
    X_val_fold = scaler.transform(
        X_val_np_fold.reshape(X_val_np_fold.shape[0], -1)
    ).reshape(-1, N_MELS, TARGET_STEPS, 1)
    X_test_fold = scaler.transform(
        X_test_np_fold.reshape(X_test_np_fold.shape[0], -1)
    ).reshape(-1, N_MELS, TARGET_STEPS, 1)

    # --- 4.3. Training ---
    
    # Fresh model and fresh callback for every fold, so no state carries over.
    model = build_model(INPUT_SHAPE, NUM_CLASSES)
    # Stop when the validation loss has not improved for 3 epochs and restore
    # the best weights seen.
    early_stopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
    
    model.fit(
        X_train_fold, 
        y_train_onehot_fold,
        batch_size=16, # tune to the available VRAM (32, 64, 128)
        epochs=20,     # upper bound; EarlyStopping usually stops sooner
        callbacks=[early_stopper],
        validation_data=(X_val_fold, y_val_onehot_fold),
        verbose=2
    )
    
    # --- 4.4. Prediction (window level) ---
    y_pred_probs = model.predict(X_test_fold)
    y_pred_window_np = np.argmax(y_pred_probs, axis=1) # probabilities -> class id

    # --- 4.5. Aggregation: majority vote of the windows of each track ---
    
    df_fold = pd.DataFrame({
        'track_id': track_ids_test_fold,
        'y_true_window': y_true_window_np,
        'y_pred_window': y_pred_window_np
    })

    grouped = df_fold.groupby('track_id')
    y_true_track = grouped['y_true_window'].first()
    # keepdims=True keeps the result indexable as [0][0] across SciPy versions.
    y_pred_track = grouped['y_pred_window'].apply(lambda x: stats.mode(x, keepdims=True)[0][0])
    
    acc_fold_track = np.mean(y_true_track == y_pred_track)
    fold_scores_acc.append(acc_fold_track)
    
    all_preds_for_model.append(y_pred_track.values)
    all_true_for_model.append(y_true_track.values)
    
    # Release the large per-fold arrays and the Keras graph before the next fold.
    del X_train_np_fold, X_val_np_fold, X_test_np_fold
    del X_train_fold, X_val_fold, X_test_fold
    del y_train_onehot_fold, y_val_onehot_fold, model
    tf.keras.backend.clear_session()


# --- 5. Final results (track level) ---

cv_scores_track["CNN 2D (GPU)"] = {
    'Acurácia (Nível Faixa)': np.array(fold_scores_acc)
}

y_true_final = np.concatenate(all_true_for_model)
y_pred_final = np.concatenate(all_preds_for_model)

out_of_fold_preds_track["CNN 2D (GPU)"] = {
    'y_true': y_true_final,
    'y_pred': y_pred_final
}

end_model_time = time.time()
print(f"\nTreinamento da CNN 2D concluído em {end_model_time - start_model_time:.2f} segundos.")


Iniciando CV 10-Fold para CNN 2D (GPU)...

--- Processando Fold 1/10 ---


In [None]:
print("\n--- Treinamento e Avaliação de CV (Nível Faixa) Concluídos ---")

# 4. Per-fold metric summary (track level).
for model_name, metrics in cv_scores_track.items():
    print(f"\n========= Resultados {model_name} ({n_splits}-Fold CV) - Nível Faixa (Votação Majoritária) ==========")
    
    for metric_name, scores_cpu in metrics.items():
        print(f"--- {metric_name} ---")
        print(f"  Média        : {scores_cpu.mean():.4f}")
        print(f"  Desv. Padrão : {scores_cpu.std():.4f}")
        print(f"  Scores (por Fold): {np.round(scores_cpu, 3)}")

# 5. Confusion matrices and classification reports (track level), built from
#    the out-of-fold predictions of all folds together.
print("\n\n========= Matrizes de Confusão (Agregadas da CV) - Nível Faixa ==========")

# Explicit class ids keep the matrix and the report at one row per known class
# even if some class is missing from the aggregated labels; otherwise the tick
# labels would be misaligned and classification_report would reject target_names.
class_ids = np.arange(len(class_names))

for model_name, results in out_of_fold_preds_track.items():
    print(f"--- {model_name} ---")
    
    y_true_agg = results['y_true']
    y_pred_agg = results['y_pred']
    
    cm_cpu = sklearn_confusion_matrix(y_true_agg, y_pred_agg, labels=class_ids)
    
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm_cpu, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'Matriz de Confusão (Nível Faixa) - {model_name}')
    ax.set_ylabel('Rótulo Verdadeiro')
    ax.set_xlabel('Rótulo Previsto')
    plt.show()
    
    print("\n--- Relatório de Classificação (Nível Faixa) ---")
    # zero_division=0 reports 0 (without warnings) for classes that were never predicted.
    print(classification_report(y_true_agg, y_pred_agg, labels=class_ids,
                                target_names=class_names, zero_division=0))