In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import pytorch_lightning as pl

from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import HubertForSequenceClassification, AutoFeatureExtractor, AutoConfig
from torch.optim import AdamW
import bitsandbytes as bnb



In [2]:
# Constants
DATA_DIR = ''  # Adjust this path as necessary
PREPROC_DIR = './preproc'
SUBMISSION_DIR = './submission'
MODEL_DIR = './model'
SAMPLING_RATE = 16000
SEED = 42
N_FOLD = 20
BATCH_SIZE = 2
NUM_LABELS = 2
AUDIO_MODEL_NAME = 'abhishtagatya/hubert-base-960h-asv19-deepfake'

In [3]:
# Utility functions
def accuracy(preds, labels):
    return (preds == labels).float().mean()

In [4]:
def getAudios(df):
    audios = []
    valid_indices = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        try:
            audio, _ = librosa.load(row['path'], sr=SAMPLING_RATE)
            audios.append(audio)
            valid_indices.append(idx)
        except FileNotFoundError:
            print(f"File not found: {row['path']}. Skipping.")
        except Exception as e:
            print(f"Error loading {row['path']}: {e}. Skipping.")
    return audios, valid_indices

In [5]:
class MyDataset(Dataset):
    def __init__(self, audio, audio_feature_extractor, labels=None):
        if labels is None:
            labels = [[0] * NUM_LABELS for _ in range(len(audio))]
        self.labels = np.array(labels).astype(np.float32)
        self.audio = audio
        self.audio_feature_extractor = audio_feature_extractor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        audio = self.audio[idx]
        audio_feature = self.audio_feature_extractor(raw_speech=audio, return_tensors='np', sampling_rate=SAMPLING_RATE)
        audio_values, audio_attn_mask = audio_feature['input_values'][0], audio_feature['attention_mask'][0]

        item = {
            'label': label,
            'audio_values': audio_values,
            'audio_attn_mask': audio_attn_mask,
        }

        return item

In [6]:
def collate_fn(samples):
    batch_labels = []
    batch_audio_values = []
    batch_audio_attn_masks = []

    for sample in samples:
        batch_labels.append(sample['label'])
        batch_audio_values.append(torch.tensor(sample['audio_values']))
        batch_audio_attn_masks.append(torch.tensor(sample['audio_attn_mask']))

    batch_labels = np.array(batch_labels)
    batch_labels = torch.tensor(batch_labels)
    batch_audio_values = pad_sequence(batch_audio_values, batch_first=True)
    batch_audio_attn_masks = pad_sequence(batch_audio_attn_masks, batch_first=True)

    batch = {
        'label': batch_labels,
        'audio_values': batch_audio_values,
        'audio_attn_mask': batch_audio_attn_masks,
    }

    return batch

In [7]:
class MyLitModel(pl.LightningModule):
    def __init__(self, audio_model_name, num_labels, n_layers=1, projector=True, classifier=True, dropout=0.07,
                 lr_decay=1):
        super(MyLitModel, self).__init__()
        self.config = AutoConfig.from_pretrained(audio_model_name, num_labels=num_labels)
        self.config.activation_dropout = dropout
        self.config.attention_dropout = dropout
        self.config.final_dropout = dropout
        self.config.hidden_dropout = dropout
        self.config.hidden_dropout_prob = dropout
        self.audio_model = HubertForSequenceClassification.from_pretrained(audio_model_name, config=self.config)
        self.lr_decay = lr_decay
        self._do_reinit(n_layers, projector, classifier)

    def forward(self, audio_values, audio_attn_mask):
        logits = self.audio_model(input_values=audio_values, attention_mask=audio_attn_mask).logits
        return logits

    def training_step(self, batch, batch_idx):
        audio_values = batch['audio_values']
        audio_attn_mask = batch['audio_attn_mask']
        labels = batch['label']

        logits = self(audio_values, audio_attn_mask)
        loss = nn.BCEWithLogitsLoss()(logits, labels)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        audio_values = batch['audio_values']
        audio_attn_mask = batch['audio_attn_mask']
        labels = batch['label']

        logits = self(audio_values, audio_attn_mask)
        loss = nn.BCEWithLogitsLoss()(logits, labels)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        audio_values = batch['audio_values']
        audio_attn_mask = batch['audio_attn_mask']

        logits = self(audio_values, audio_attn_mask)
        probs = torch.sigmoid(logits)

        return probs

    def configure_optimizers(self):
        lr = 1e-5
        layer_decay = self.lr_decay
        weight_decay = 0.01
        llrd_params = self._get_llrd_params(lr=lr, layer_decay=layer_decay, weight_decay=weight_decay)
        optimizer = bnb.optim.AdamW8bit(llrd_params)  # optimizer 을 8bit 로 하여 계산 속도 향상 및 vram 사용량 감축
        return optimizer

    def _get_llrd_params(self, lr, layer_decay, weight_decay):
        n_layers = self.audio_model.config.num_hidden_layers
        llrd_params = []
        for name, value in list(self.named_parameters()):
            if ('bias' in name) or ('layer_norm' in name):
                llrd_params.append({"params": value, "lr": lr, "weight_decay": 0.0})
            elif ('emb' in name) or ('feature' in name):
                llrd_params.append(
                    {"params": value, "lr": lr * (layer_decay ** (n_layers + 1)), "weight_decay": weight_decay})
            elif 'encoder.layer' in name:
                for n_layer in range(n_layers):
                    if f'encoder.layer.{n_layer}' in name:
                        llrd_params.append(
                            {"params": value, "lr": lr * (layer_decay ** (n_layer + 1)), "weight_decay": weight_decay})
            else:
                llrd_params.append({"params": value, "lr": lr, "weight_decay": weight_decay})
        return llrd_params

    def _do_reinit(self, n_layers=0, projector=True, classifier=True):
        if projector:
            self.audio_model.projector.apply(self._init_weight_and_bias)
        if classifier:
            self.audio_model.classifier.apply(self._init_weight_and_bias)

        for n in range(n_layers):
            self.audio_model.hubert.encoder.layers[-(n + 1)].apply(self._init_weight_and_bias)

    def _init_weight_and_bias(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.audio_model.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

In [8]:
seed_everything(SEED)

# 사운드 특징 추출
audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_NAME)
audio_feature_extractor.return_attention_mask = True

# 데이터 로드
train_df = pd.read_csv('./train_final.csv')
print(f"Train DataFrame shape: {train_df.shape}")

Seed set to 42


Train DataFrame shape: (59999, 2)


In [9]:
train_df['path'] = train_df['path'].apply(lambda x: os.path.join(DATA_DIR, x))

In [10]:
# 싱글 라벨을 멀티 라벨로 변환
train_df['label'] = train_df['label'].apply(
    lambda x: [1, 0] if x in [0, 4] else (
        [0, 1] if x in [1, 5] else (
            [1, 1] if x == 2 else [0, 0]
        )
    )
)

In [11]:
train_audios, valid_indices = getAudios(train_df)
print(f"Number of valid train audios: {len(train_audios)}")
train_df = train_df.iloc[valid_indices].reset_index(drop=True)
train_labels = np.array(train_df['label'].tolist())


  0%|          | 0/59999 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [13]:
skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
for fold_idx, (train_indices, val_indices) in enumerate(
        skf.split(train_labels, train_labels.argmax(axis=1))):

    if fold_idx < 7 :
        continue
    
    print(
        f"Fold {fold_idx}: Train indices length: {len(train_indices)}, Validation indices length: {len(val_indices)}")
    train_fold_audios = [train_audios[train_index] for train_index in train_indices]
    val_fold_audios = [train_audios[val_index] for val_index in val_indices]

    train_fold_labels = train_labels[train_indices]
    val_fold_labels = train_labels[val_indices]
    train_fold_ds = MyDataset(train_fold_audios, audio_feature_extractor, train_fold_labels)
    val_fold_ds = MyDataset(val_fold_audios, audio_feature_extractor, val_fold_labels)
    train_fold_dl = DataLoader(train_fold_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    val_fold_dl = DataLoader(val_fold_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    checkpoint_acc_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath=MODEL_DIR,
        filename=f'fold_{fold_idx}' + '_{epoch:02d}-{val_loss:.4f}-{train_loss:.4f}',
        save_top_k=30,
        mode='max'
    )

    my_lit_model = MyLitModel(
        audio_model_name=AUDIO_MODEL_NAME,
        num_labels=NUM_LABELS,
        n_layers=1, projector=True, classifier=True, dropout=0.07, lr_decay=0.8
    )

    trainer = pl.Trainer(
        accelerator='cuda',
        max_epochs=1,
        precision='16',
        val_check_interval=0.1,
        callbacks=[checkpoint_acc_callback],
        accumulate_grad_batches=2
        # batch_size * accumulate_grad_batches = 가 실질적인 배치 사이즈임. (vram 은 batch_size 기준으로 소모함.)
    )

    print(f"Starting training for fold {fold_idx}...")
    trainer.fit(my_lit_model, train_fold_dl, val_fold_dl)
    print(f"Training completed for fold {fold_idx}.")

    del my_lit_model

Fold 0: Train indices length: 56999, Validation indices length: 3000


C:\Users\shsmc\Downloads\wavemotion\venv\lib\site-packages\lightning_fabric\connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
C:\Users\shsmc\Downloads\wavemotion\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Ti') that has Tensor 

Starting training for fold 0...



  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

C:\Users\shsmc\Downloads\wavemotion\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
C:\Users\shsmc\Downloads\wavemotion\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |                                                                                                   …

Validation: |                                                                                                 …

C:\Users\shsmc\Downloads\wavemotion\venv\lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


Training completed for fold 0.
Fold 1: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 1...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 1.
Fold 2: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 2...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 2.
Fold 3: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 3...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 3.
Fold 4: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 4...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Training completed for fold 4.
Fold 5: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 5...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 5.
Fold 6: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 6...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 6.
Fold 7: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 7...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 7.
Fold 8: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 8...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 8.
Fold 9: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 9...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 9.
Fold 10: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 10...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 10.
Fold 11: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 11...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 11.
Fold 12: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 12...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 12.
Fold 13: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 13...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 13.
Fold 14: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 14...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 14.
Fold 15: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 15...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 15.
Fold 16: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 16...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 16.
Fold 17: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 17...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 17.
Fold 18: Train indices length: 56999, Validation indices length: 3000


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Starting training for fold 18...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Training completed for fold 18.
Fold 19: Train indices length: 57000, Validation indices length: 2999


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Starting training for fold 19...



  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequenceClassification | 94.6 M | eval
-----------------------------------------------------------------------
94.6 M    Trainable params
0         Non-trainable params
94.6 M    Total params
378.276   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Training completed for fold 19.
