In [8]:
# =========================================================
# 0) Instalação 
# =========================================================
# !pip install -U torch torchaudio pytorch-lightning torchmetrics transformers scikit-learn

In [9]:
# =========================================================
# 1) Imports, checagem de GPU e configs globais
# =========================================================
import os
import json
import warnings
from pathlib import Path

import numpy as np
import torch
import torchaudio
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
import torchmetrics as tm
from torch.utils.data import Dataset, DataLoader

from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

warnings.filterwarnings("ignore")

# Reprodutibilidade
SEED = 42
seed_everything(SEED, workers=True)

# >>> CAMINHO DO DATASET <<<
BASE_PUMP_DIR = r"D:/dataset pump/-6_dB_pump/pump"

# Hiperparâmetros principais
TARGET_SR       = 16000          # Wav2Vec2 base trabalha em 16 kHz
MAX_DURATION_S  = 10.0           # audios de 10s 
MAX_LENGTH      = int(TARGET_SR * MAX_DURATION_S)  # em amostras
BATCH_SIZE      = 4              
LR              = 1e-4           # LR padrão
MAX_EPOCHS      = 30
PATIENCE_ES     = 10             # early stopping
NUM_WORKERS     = 0              
PRECISION       = "16-mixed"     # mude para 32 se sua GPU/driver der erro

# Checagem rápida de GPU
print("CUDA:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


Seed set to 42


CUDA: True
GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU


In [10]:
# =========================================================
# 2) Listagem dos arquivos e rótulos (0=normal, 1=abnormal)
# =========================================================
def list_all_ids(base_pump_dir: str):
    X, y = [], []
    base = Path(base_pump_dir)
    for id_dir in sorted(base.glob("id_*")):
        normal_files   = sorted(map(str, (id_dir / "normal").glob("*.wav")))
        abnormal_files = sorted(map(str, (id_dir / "abnormal").glob("*.wav")))
        X.extend(normal_files + abnormal_files)
        y.extend([0] * len(normal_files) + [1] * len(abnormal_files))
    return X, y

X_ALL, Y_ALL = list_all_ids(BASE_PUMP_DIR)
print(f"Total: {len(X_ALL)} | Normais: {Y_ALL.count(0)} | Anormais: {Y_ALL.count(1)}")
print("Exemplo:", Y_ALL[0], X_ALL[0])

# Pesos das classes (desbalanceamento)
class_weights_np = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(Y_ALL),
    y=Y_ALL
)
class_weights = torch.tensor(class_weights_np, dtype=torch.float32)
print("Class weights:", class_weights.tolist())

Total: 4205 | Normais: 3749 | Anormais: 456
Exemplo: 0 D:\dataset pump\-6_dB_pump\pump\id_00\normal\00000000.wav
Class weights: [0.56081622838974, 4.610745429992676]


In [11]:
# =========================================================
# 3) Processor do Wav2Vec2
# =========================================================
# facebook/wav2vec2-base espera áudio mono 16 kHz -> produz tensores prontos para o modelo
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base") # Garante que o input é compatível com o modelo

In [None]:
# =========================================================
# 4) Dataset de áudio bruto + DataModule 
# =========================================================
class RawAudioDataset(Dataset):
    """
    - Carrega waveform do arquivo
    - Converte para mono
    - Faz resample para 16 kHz (se preciso)
    - Retorna apenas waveform (1D) e label -> padding/truncation ficam no collate_fn
    """
    def __init__(self, files, labels, target_sr=16000):
        self.files = files
        self.labels = labels
        self.target_sr = target_sr

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        path  = self.files[idx]
        label = int(self.labels[idx])

        waveform, sr = torchaudio.load(path)  # [C, T]
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)  # mono [1, T]
        if sr != self.target_sr:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.target_sr)(waveform)

        return {"waveform": waveform.squeeze(0), "label": label}


class Wav2Vec2DataModule(pl.LightningDataModule):
    
    # Split: 70/15/15
  
    def __init__(self, X, y, processor, batch_size=4, num_workers=0, target_sr=16000, max_length=None):
        super().__init__()
        self.X = X
        self.y = y
        self.processor = processor
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.target_sr = target_sr
        self.max_length = max_length

    def setup(self, stage=None):
        X_train, X_temp, y_train, y_temp = train_test_split(
            self.X, self.y, test_size=0.3, random_state=SEED, stratify=self.y
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
        )
        self.train_ds = RawAudioDataset(X_train, y_train, self.target_sr)
        self.val_ds   = RawAudioDataset(X_val,   y_val,   self.target_sr)
        self.test_ds  = RawAudioDataset(X_test,  y_test,  self.target_sr)
        print(f"Tamanhos -> train: {len(self.train_ds)}, val: {len(self.val_ds)}, test: {len(self.test_ds)}")

    def collate_fn(self, batch):
        waveforms = [b["waveform"].squeeze(0).numpy() for b in batch]
        labels    = torch.tensor([b["label"] for b in batch], dtype=torch.long)

        inputs = self.processor(
            waveforms,
            sampling_rate=self.target_sr,
            return_tensors="pt",
            padding="max_length" if self.max_length is not None else "longest",
            truncation=True if self.max_length is not None else False,
            max_length=self.max_length
        )

        return {
            "input_values":  inputs["input_values"],                     # [B, T]
            "attention_mask":inputs.get("attention_mask", None),         
            "labels":        labels
        }



    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True,
                          num_workers=self.num_workers, collate_fn=self.collate_fn, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size, shuffle=False,
                          num_workers=self.num_workers, collate_fn=self.collate_fn, pin_memory=True)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size, shuffle=False,
                          num_workers=self.num_workers, collate_fn=self.collate_fn, pin_memory=True)


datamodule = Wav2Vec2DataModule(
    X_ALL, Y_ALL,
    processor=processor,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    target_sr=TARGET_SR,
    max_length=MAX_LENGTH
)
datamodule.setup()

Tamanhos -> train: 2943, val: 631, test: 631


In [13]:
# =========================================================
# 5) LightningModule com Wav2Vec2ForSequenceClassification
# =========================================================
class LitWav2Vec2Classifier(LightningModule):
    def __init__(self, lr=1e-4, class_weights=None, model_name="facebook/wav2vec2-base"):
        super().__init__()
        self.save_hyperparameters(ignore=["class_weights"])
        self.model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=2)


        try:
            self.model.gradient_checkpointing_enable()
        except Exception:
            pass

        # class weights 
        if class_weights is not None:
            self.register_buffer("class_weights", class_weights)
        else:
            self.class_weights = None

        self.loss_fn = torch.nn.CrossEntropyLoss(weight=self.class_weights)

        # Métricas
        self.train_acc = tm.classification.MulticlassAccuracy(num_classes=2)
        self.val_acc   = tm.classification.MulticlassAccuracy(num_classes=2)
        self.val_auc   = tm.classification.MulticlassAUROC(num_classes=2)
        self.test_acc  = tm.classification.MulticlassAccuracy(num_classes=2)
        self.test_auc  = tm.classification.MulticlassAUROC(num_classes=2)

    def forward(self, input_values, attention_mask=None):
        input_values = self._ensure_bt(input_values)

        if attention_mask is not None:
            attention_mask = self._ensure_bt(attention_mask)
            return self.model(input_values=input_values, attention_mask=attention_mask)
        else:
            return self.model(input_values=input_values)


    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)

    def shared_step(self, batch, stage):
        outputs = self(
            input_values=batch["input_values"],
            attention_mask=batch.get("attention_mask", None)  # pode ser None
        )

        logits = outputs.logits
        loss   = self.loss_fn(logits, batch["labels"])
        preds  = torch.argmax(logits, dim=1)

        if stage == "train":
            self.train_acc.update(preds, batch["labels"])
            self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        elif stage == "val":
            self.val_acc.update(preds, batch["labels"])
            probs = torch.softmax(logits, dim=1)
            self.val_auc.update(probs, batch["labels"])
            self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        else:  # test
            self.test_acc.update(preds, batch["labels"])
            probs = torch.softmax(logits, dim=1)
            self.test_auc.update(probs, batch["labels"])
            self.log("test_loss", loss, on_step=False, on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.shared_step(batch, "train")

    def on_train_epoch_end(self):
        self.log("train_acc", self.train_acc.compute(), prog_bar=True)
        self.train_acc.reset()

    def validation_step(self, batch, batch_idx):
        return self.shared_step(batch, "val")

    def on_validation_epoch_end(self):
        self.log("val_acc", self.val_acc.compute(), prog_bar=True)
        self.log("val_auc", self.val_auc.compute(), prog_bar=True)
        self.val_acc.reset()
        self.val_auc.reset()

    @staticmethod
    def _ensure_bt(x):
        # Remove dimensões extras tipo [1, B, T] -> [B, T]
        if x.dim() > 2 and x.size(0) == 1:
            x = x.squeeze(0)
        if x.dim() == 3 and x.size(1) == 1:  # [B,1,T] -> [B,T]
            x = x.squeeze(1)
        return x
    

    def test_step(self, batch, batch_idx):
        return self.shared_step(batch, "test")

    def on_test_epoch_end(self):
        self.log("test_acc", self.test_acc.compute())
        self.log("test_auc", self.test_auc.compute())
        self.test_acc.reset()
        self.test_auc.reset()

In [14]:
# =========================================================
# 6) Treinador, callbacks e treino
# =========================================================
checkpoint_cb = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    filename="wav2vec2-best-{epoch:02d}-{val_loss:.4f}"
)
earlystop_cb = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=PATIENCE_ES
)
lr_monitor = LearningRateMonitor(logging_interval="epoch")

model = LitWav2Vec2Classifier(
    lr=LR,
    class_weights=class_weights,
    model_name="facebook/wav2vec2-base"
)

trainer = Trainer(
    max_epochs=MAX_EPOCHS,
    deterministic=True,
    callbacks=[checkpoint_cb, earlystop_cb, lr_monitor],
    accelerator="auto",   # usa GPU se disponível
    devices="auto",
    precision=PRECISION,  
    log_every_n_steps=10
)

trainer.fit(model, datamodule=datamodule)
print("Melhor checkpoint:", checkpoint_cb.best_model_path)

# Carrega o melhor modelo e testa
best_model = LitWav2Vec2Classifier.load_from_checkpoint(
    checkpoint_cb.best_model_path,
    lr=LR,
    model_name="facebook/wav2vec2-base",
    class_weights=class_weights
)
trainer.test(best_model, datamodule=datamodule)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Tamanhos -> train: 2943, val: 631, test: 631



  | Name      | Type                              | Params | Mode 
------------------------------------------------------------------------
0 | model     | Wav2Vec2ForSequenceClassification | 94.6 M | eval 
1 | loss_fn   | CrossEntropyLoss                  | 0      | train
2 | train_acc | MulticlassAccuracy                | 0      | train
3 | val_acc   | MulticlassAccuracy                | 0      | train
4 | val_auc   | MulticlassAUROC                   | 0      | train
5 | test_acc  | MulticlassAccuracy                | 0      | train
6 | test_auc  | MulticlassAUROC                   | 0      | train
------------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)
6         Modules in train mode
223       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


Melhor checkpoint: d:\dataset pump\lightning_logs\version_26\checkpoints\wav2vec2-best-epoch=25-val_loss=0.5996.ckpt


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Tamanhos -> train: 2943, val: 631, test: 631


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.5845102071762085, 'test_acc': 0.5, 'test_auc': 0.5}]