In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio

import os
import sys

In [None]:
# Mount Google Drive under /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sat Mar  1 00:12:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   50C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%capture
# Install the Hugging Face libraries used below; %%capture hides the installer output.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different releases.
!pip install transformers
!pip install datasets

In [None]:
%%capture
!sudo apt-get install git-lfs


In [None]:
%%capture
!git lfs install
!--system
!--skip-repo

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the training setup below uses push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

# Comma-separated split files (the corrected CSVs, not the original ones).
data_files = dict(
    train='/content/train_dm.csv',
    valid='/content/valid_dm.csv',
)

# The "csv" builder reads comma-separated files, so no delimiter is passed.
dataset = load_dataset("csv", data_files=data_files)
train_data, valid_data = dataset['train'], dataset['valid']

# Expected columns: ['file', 'label', 'path']
print(train_data.column_names)



['file', 'label', 'path']


In [None]:
# Used as the Trainer output directory further down.
repo_name = "wav2vec2-large-xls-r-300m-dm32"

In [None]:
# Dataset column that holds the path to each audio file.
input_col = 'path'
# Dataset column that holds the class label.
output_col = 'label'
# Crop length in seconds, passed to `random_subsample` by the first `speech_to_array` definition.
audio_len = 32

In [None]:
# Sanity check: show the training split's columns before preprocessing.
print(train_data.column_names)


['file', 'label', 'path']


In [None]:
# Distinct labels of the training split in sorted order; a label's position is its class id.
label_list = sorted(train_data.unique(output_col))
num_classes = len(label_list)
print(f"Number of classes: {num_classes}")
print(f"Classes: {label_list}")

Number of classes: 2
Classes: ['dementia', 'nodementia']


In [None]:
from random import randint

def random_subsample(wav: np.ndarray, max_length: float, sample_rate: int = 16000):
    """Return a random crop of at most `max_length` seconds from `wav`.

    Args:
        wav: Audio samples; measured with `len()` and sliced along the first axis.
        max_length: Crop duration in seconds.
        sample_rate: Samples per second, used to convert `max_length` to a sample count.

    Returns:
        `wav` itself when it holds no more samples than the requested crop,
        otherwise a contiguous slice of exactly the requested number of samples
        starting at a uniformly random offset.
    """
    sample_length = int(round(sample_rate * max_length))
    if len(wav) <= sample_length:
        return wav
    # `randint` includes both endpoints, so the largest valid start offset is
    # `len(wav) - sample_length`; subtracting one more would make the final
    # window unreachable.
    random_offset = randint(0, len(wav) - sample_length)
    return wav[random_offset : random_offset + sample_length]

In [None]:
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

In [None]:
# Pretrained checkpoint used for the config, the feature extractor and the model weights.
model_name = "facebook/wav2vec2-xls-r-300m"
# Reduction applied over dim 1 of the encoder output; the classifier accepts "mean", "max" or "sum".
pooling_mode = "mean"

In [None]:
# Start from the checkpoint's configuration and attach the label mapping for this task.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_classes,
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)

# Extra attribute read by Wav2Vec2ForSpeechClassification.__init__.
config.pooling_mode = pooling_mode

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Feature extractor matching the checkpoint; its sampling rate is reused during preprocessing.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [None]:

def speech_to_array(path):
    """Load an audio file, resample it to 16 kHz and return a random crop.

    Only the first channel of the resampled signal is kept. The crop length
    is `audio_len` seconds (see `random_subsample`).

    NOTE: a later cell in this notebook redefines `speech_to_array`; once that
    cell has run, this version is no longer the one being called.
    """
    waveform, source_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(source_rate, 16000)
    first_channel = resampler(waveform)[0]
    samples = first_channel.numpy().squeeze()
    return random_subsample(samples, max_length=audio_len)


def label_to_id(label, label_list):
    """Map `label` to its position in `label_list`.

    Returns the label unchanged when `label_list` is empty, and -1 when the
    label does not occur in a non-empty `label_list`.
    """
    if not len(label_list):
        return label
    try:
        return label_list.index(label)
    except ValueError:
        # Label not present in the list.
        return -1

def preprocess_fn(examples):
    """Turn a batch of dataset rows into feature-extractor outputs plus label ids.

    Reads audio paths from `examples[input_col]` and labels from
    `examples[output_col]`, and returns the feature extractor's result with an
    added 'labels' entry.

    NOTE: a later cell in this notebook redefines `preprocess_fn`.
    """
    waveforms = []
    for audio_path in examples[input_col]:
        waveforms.append(speech_to_array(audio_path))

    label_ids = []
    for raw_label in examples[output_col]:
        label_ids.append(label_to_id(raw_label, label_list))

    batch = feature_extractor(waveforms, sampling_rate=target_sampling_rate)
    batch['labels'] = label_ids

    return batch


In [None]:
import torchaudio
import torch

# Per-split counters of audio files that could not be loaded, updated by `preprocess_fn`.
# Any label other than "dementia" is counted under "non-dementia".
missing_files = {"train": {"dementia": 0, "non-dementia": 0}, "valid": {"dementia": 0, "non-dementia": 0}}



def speech_to_array(path):
    """Load an audio file as a mono 16 kHz NumPy array.

    Args:
        path: Location of the audio file, passed to `torchaudio.load`.

    Returns:
        The resampled, channel-averaged samples as a squeezed NumPy array, or
        `None` when loading or resampling raises. Callers use `None` to detect
        unusable files, so that contract is kept; the failure is now reported
        through a warning instead of being dropped silently.

    NOTE(review): unlike the earlier definition of this function, no
    `random_subsample(..., max_length=audio_len)` crop is applied, so the
    full-length recording is returned — confirm this is intended.
    """
    import warnings

    try:
        speech, sr = torchaudio.load(path)
        # Average the channels to obtain a single-channel signal.
        speech = torch.mean(speech, dim=0)
        transform = torchaudio.transforms.Resample(sr, 16000)
        return transform(speech).numpy().squeeze()
    except Exception as exc:
        # Best-effort loading: report which file failed and why, then signal
        # the failure to the caller with None.
        warnings.warn(f"Could not load audio {path!r}: {exc}")
        return None


def preprocess_fn(examples, dataset_type="train"):
    """Convert a batch of dataset rows into model inputs and label ids.

    Args:
        examples: Batch of rows; audio paths are read from `examples[input_col]`
            and labels from `examples[output_col]`.
        dataset_type: Key of `missing_files` whose counters are incremented
            when an audio file cannot be loaded ("train" or "valid").

    Returns:
        The feature extractor's output for the batch with an added "labels"
        entry containing the label ids.

    Notes:
        * The counters are plain module-level state. When `Dataset.map` runs
          this function in worker processes (`num_proc` > 1), the increments
          happen inside the workers and are not visible in the notebook kernel.
        * NOTE(review): unreadable files are replaced by one second of silence
          but keep their real label. If missing files are unevenly spread
          across classes, the model can learn "silence" as a class cue —
          consider dropping those rows instead.
    """
    global missing_files  # counters live at notebook level

    speech_list = []
    target_list = []

    for path, label in zip(examples[input_col], examples[output_col]):
        speech = speech_to_array(path)

        if speech is None:  # file could not be loaded
            bucket = "dementia" if label == "dementia" else "non-dementia"
            missing_files[dataset_type][bucket] += 1

            # Stand in for the missing audio with one second of silence.
            speech = np.zeros(target_sampling_rate, dtype=np.float32)

        # The feature extractor accepts NumPy arrays directly, so no
        # array -> list -> array round trip is needed.
        speech_list.append(np.asarray(speech, dtype=np.float32))
        target_list.append(label_to_id(label, label_list))

    result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
    result["labels"] = target_list

    return result


In [None]:
# Run the preprocessing function over both splits (batches of 8, 4 worker processes).
train_data = train_data.map(
    preprocess_fn,
    fn_kwargs={"dataset_type": "train"},
    batched=True,
    batch_size=8,
    num_proc=4,
)
valid_data = valid_data.map(
    preprocess_fn,
    fn_kwargs={"dataset_type": "valid"},
    batched=True,
    batch_size=8,
    num_proc=4,
)

# Sanity check: the mapped rows should carry 'input_values' and 'labels',
# and 'input_values' should come back as a list of floats.
print(train_data[0].keys())
print(type(train_data[0]["input_values"]))


dict_keys(['file', 'label', 'path', 'input_values', 'attention_mask', 'labels'])
<class 'list'>


In [None]:
# Apply preprocessing while tracking missing files.
# The counters in `missing_files` are incremented inside `preprocess_fn`. With
# `num_proc=4` that happens in worker processes and the kernel's copy stays at
# zero, so this pass runs in the notebook process. The cache is bypassed so the
# function is really executed, and the counters are reset so re-running the
# cell does not double count.
for split_counts in missing_files.values():
    for bucket in split_counts:
        split_counts[bucket] = 0

train_data = train_data.map(
    preprocess_fn,
    fn_kwargs={"dataset_type": "train"},
    batched=True,
    batch_size=8,
    num_proc=None,
    load_from_cache_file=False,
)
valid_data = valid_data.map(
    preprocess_fn,
    fn_kwargs={"dataset_type": "valid"},
    batched=True,
    batch_size=8,
    num_proc=None,
    load_from_cache_file=False,
)

# Print Missing File Counts
print("Missing Files Summary:")
print(f"Training Set - Dementia: {missing_files['train']['dementia']}, Non-Dementia: {missing_files['train']['non-dementia']}")
print(f"Validation Set - Dementia: {missing_files['valid']['dementia']}, Non-Dementia: {missing_files['valid']['non-dementia']}")


Map (num_proc=4):   0%|          | 0/227 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/48 [00:00<?, ? examples/s]

Missing Files Summary:
Training Set - Dementia: 0, Non-Dementia: 0
Validation Set - Dementia: 0, Non-Dementia: 0


In [None]:
import os

# Count, per split and class, the rows whose audio path does not exist on disk.
missing_files_check = {
    split: {"dementia": 0, "non-dementia": 0} for split in ("train", "valid")
}

# The loop variables are deliberately not called `dataset`: that name holds the
# DatasetDict loaded earlier and must not be overwritten by a split name.
for split_name, split_data in (("train", train_data), ("valid", valid_data)):
    for path, label in zip(split_data["path"], split_data["label"]):
        if os.path.exists(path):
            continue
        # Any label other than "dementia" is counted as non-dementia.
        bucket = "dementia" if label == "dementia" else "non-dementia"
        missing_files_check[split_name][bucket] += 1

# Print actual missing files count
print("Manually Verified Missing Files:")
print(f"Training Set - Dementia: {missing_files_check['train']['dementia']}, Non-Dementia: {missing_files_check['train']['non-dementia']}")
print(f"Validation Set - Dementia: {missing_files_check['valid']['dementia']}, Non-Dementia: {missing_files_check['valid']['non-dementia']}")


Manually Verified Missing Files:
Training Set - Dementia: 2, Non-Dementia: 79
Validation Set - Dementia: 0, Non-Dementia: 19


In [None]:
# Show which columns the first preprocessed training row contains.
train_data[0].keys()

dict_keys(['file', 'label', 'path', 'input_values', 'attention_mask', 'labels'])

In [None]:

#seq_len = [train_data[i]['path'] for i in range(len(train_data)) if len(train_data[i]['input_values']) < 128000]

In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierModelOutput(ModelOutput):
    """Output container returned by the speech classifier when `return_dict` is true.

    All fields default to None.
    """
    # Classification loss; the model sets it only when labels are passed.
    loss: Optional[torch.FloatTensor] = None
    # Output of the classification head.
    logits: Optional[torch.FloatTensor] = None
    # Forwarded unchanged from the wav2vec2 encoder output.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Forwarded unchanged from the wav2vec2 encoder output.
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [None]:
import torch
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (Wav2Vec2PreTrainedModel, Wav2Vec2Model)

class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head: dense -> dropout -> tanh -> dropout -> output projection.

    Layer sizes and the dropout probability come from `config.hidden_size`,
    `config.final_dropout` and `config.num_labels`.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Return class logits for `features`; extra keyword arguments are ignored."""
        hidden = self.dropout(self.dense(features))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """wav2vec2 encoder followed by pooling over dim 1 and a classification head.

    The pooling mode is read from `config.pooling_mode` ("mean", "max" or "sum").
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Call `_freeze_parameters()` on the encoder's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merge_strategy(self, hidden_states, mode="mean"):
        """Reduce `hidden_states` over dim 1 using `mode`.

        Raises:
            ValueError: if `mode` is not "mean", "max" or "sum".

        NOTE(review): the reduction covers every position along dim 1; the
        attention mask is not consulted here, so padded positions of a batch
        contribute to the pooled vector — confirm this is acceptable.
        """
        if mode == "sum":
            return hidden_states.sum(dim=1)
        if mode == "max":
            return hidden_states.max(dim=1).values
        if mode == "mean":
            return hidden_states.mean(dim=1)
        raise ValueError(f"Unknown merge strategy: {mode}")

    def forward(
        self,
        input_values,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        labels=None,
    ):
        """Encode, pool, classify and (when `labels` is given) compute cross-entropy.

        Returns a `SpeechClassifierModelOutput` when `return_dict` resolves to
        true, otherwise a tuple `(loss?, logits, *encoder_outputs[2:])` where
        the loss is included only if it was computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First element of the encoder output is pooled and fed to the head.
        pooled = self.merge_strategy(encoder_outputs[0], self.pooling_mode)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return SpeechClassifierModelOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        output = (logits,) + encoder_outputs[2:]
        return output if loss is None else (loss,) + output

In [None]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate preprocessed examples into one padded batch of PyTorch tensors.

    Attributes:
        feature_extractor: Object whose ``pad`` method is used to pad the
            ``input_values`` of the examples.
        padding: Forwarded to ``feature_extractor.pad`` as ``padding``
            (defaults to ``True``).
        max_length: Forwarded to ``feature_extractor.pad`` as ``max_length``.
        max_length_labels: Accepted for interface compatibility; not read by
            ``__call__``.
        pad_to_multiple_of: Forwarded to ``feature_extractor.pad`` as
            ``pad_to_multiple_of``.
        pad_to_multiple_of_labels: Accepted for interface compatibility; not
            read by ``__call__``.
    """
    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the ``input_values`` of ``features`` and attach a ``labels`` tensor.

        Labels become ``torch.long`` when the first label is a Python ``int``
        and ``torch.float`` otherwise.
        """
        audio_inputs = [{"input_values": example["input_values"]} for example in features]
        labels = [example["labels"] for example in features]

        # Integer labels -> class indices; anything else is treated as float targets.
        label_dtype = torch.float
        if isinstance(labels[0], int):
            label_dtype = torch.long

        batch = self.feature_extractor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch['labels'] = torch.tensor(labels, dtype=label_dtype)

        return batch

In [None]:
# Collator that pads each batch via the feature extractor (padding=True).
data_collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

In [None]:
import numpy as np
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    """Return the accuracy of the arg-max predictions against `p.label_ids`.

    When `p.predictions` is a tuple, its first element is used as the scores.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted_ids = np.argmax(scores, axis=1)

    hits = (predicted_ids == p.label_ids).astype(np.float32)
    return {'accuracy': hits.mean().item()}

In [None]:
#from transformers import Wav2Vec2ForSequenceClassification
# Load the checkpoint into the custom classifier defined above, using the task config.
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name, config=config)

Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Calls `_freeze_parameters()` on the encoder's feature extractor (see the model class).
model.freeze_feature_extractor()

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration.
# Per-device batch size 2 with 8 accumulation steps gives 16 examples per
# optimizer step, the same as the earlier 8 x 2 setup, at a lower GPU memory cost.
training_args = TrainingArguments(
    output_dir=repo_name,
    group_by_length=True,
    per_device_train_batch_size=2,  # Reduce from 8 → 2 (Saves GPU memory)
    per_device_eval_batch_size=2,  # Reduce from 8 → 2
    gradient_accumulation_steps=8,  # Increase to compensate for small batch size
    evaluation_strategy="steps",
    gradient_checkpointing=True,
    num_train_epochs=15,  # Reduce from 22 → 15 (Faster training)
    save_steps=110,
    eval_steps=34,
    logging_steps=110,
    learning_rate=1e-4,
    save_total_limit=2,
    fp16=True,
    push_to_hub=True,
)





In [None]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """`Trainer` subclass with a hand-written `training_step`.

    NOTE: this class is not used by the notebook; the trainer created in the
    following cell is an instance of the stock `Trainer`.

    NOTE(review): `training_step` reads `self.use_amp`, `self.scaler`,
    `self.use_apex` and `self.deepspeed`; confirm the installed `Trainer`
    still provides them before switching to this class.
    """
    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The detached training loss on this batch, already divided by
            ``gradient_accumulation_steps`` when that value is greater than 1.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)

        # Forward pass, under autocast when AMP is enabled.
        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        # Scale the loss so gradients accumulated over several steps average out.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # Backward pass through whichever backend is active.
        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()

In [None]:
# Build the trainer. The feature extractor is passed as `processing_class`:
# the `tokenizer` argument is deprecated in the installed transformers release
# and triggers a deprecation warning on construction and during training.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data,
    processing_class=feature_extractor,
)

  trainer = Trainer(


In [None]:
# Upgrade accelerate, then create an Accelerator and call free_memory() on it.
# NOTE(review): this runs after the Trainer was built; an upgrade of an already
# imported package presumably only takes effect after a runtime restart — confirm.
!pip install --upgrade accelerate
from accelerate import Accelerator
Accelerator().free_memory()


Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0

[]

In [None]:
from accelerate import Accelerator

# Initialize Accelerator
# NOTE(review): repeats the Accelerator()/free_memory() call of the previous cell,
# this time keeping the instance in `accelerator`.
accelerator = Accelerator()

# Reset state to avoid issues
accelerator.free_memory()


[]

In [None]:
import torch
# Release cached CUDA memory held by PyTorch before starting the training run.
torch.cuda.empty_cache()
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmahendraranadeep[0m ([33mmahendraranadeep-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
34,No log,0.400425,0.8125
68,No log,0.407041,0.791667
102,No log,0.436093,0.8125
136,0.431400,0.389001,0.791667
170,0.431400,0.407808,0.8125
204,0.431400,0.391912,0.729167


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=210, training_loss=0.39791038149879093, metrics={'train_runtime': 5252.983, 'train_samples_per_second': 0.648, 'train_steps_per_second': 0.04, 'total_flos': 3.7234577125987256e+18, 'train_loss': 0.39791038149879093, 'epoch': 14.0})

In [None]:
# The first positional parameter of `Trainer.push_to_hub` is the commit message,
# not a repository name (passing `repo_name` made it the commit message).
# The target repository comes from the training arguments.
trainer.push_to_hub(commit_message="Upload fine-tuned dementia classifier")

CommitInfo(commit_url='https://huggingface.co/RanadeepMahendra/wav2vec2-large-xls-r-300m-dm32/commit/80f92feef2ccac118bd571300a6623977b3739d4', commit_message='wav2vec2-large-xls-r-300m-dm32', commit_description='', oid='80f92feef2ccac118bd571300a6623977b3739d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/RanadeepMahendra/wav2vec2-large-xls-r-300m-dm32', endpoint='https://huggingface.co', repo_type='model', repo_id='RanadeepMahendra/wav2vec2-large-xls-r-300m-dm32'), pr_revision=None, pr_num=None)