## Installation of Libraries

In [None]:
!pip install  -U transformers
!pip install -U accelerate
!apt install ffmpeg
!pip install -U datasets
from datasets import load_dataset

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.0
    Uninstalling transformers-4.40.0:
      Successfully uninstalled transformers-4.40.0
Successfully installed transformers-4.40.1
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 

## Extracting Features Using Hugging Face Transformers

In [None]:
from transformers import AutoFeatureExtractor

# Hub checkpoint used for the feature extractor (and later for the model).
model_id = "ntu-spml/distilhubert"

# Options forwarded to the extractor: normalize inputs and emit attention masks.
extractor_options = dict(do_normalize=True, return_attention_mask=True)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

## Setting Sampling Rate

In [None]:
# Read the sampling rate the loaded feature extractor is configured for and
# display it as the cell output.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

## Preprocessing Audio Dataset with Hugging Face

In [None]:
from datasets import load_dataset
from datasets import Audio

# Load the dataset from the CSV file; it provides the 'audio_file' and 'label' columns.
my_dataset = load_dataset('csv', data_files='/content/drive/MyDrive/dataset.csv')

# Resample to the rate the feature extractor expects instead of a hard-coded
# literal, so the audio and the extractor cannot drift apart.
sampling_rate = feature_extractor.sampling_rate

# Cast the 'audio_file' column to the Audio type with the specified sampling rate
my_dataset = my_dataset.cast_column("audio_file", Audio(sampling_rate=sampling_rate))

# The 'audio_file' column now yields the audio data at the specified sampling rate


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the loaded DatasetDict to check its splits, columns, and row counts.
my_dataset

DatasetDict({
    train: Dataset({
        features: ['audio_file', 'label'],
        num_rows: 815
    })
})

## Inspecting a Sample from the Training Split of my_dataset

In [None]:
# Look at a single example (index 790) of the train split to check the decoded
# audio entry and its label.
my_dataset["train"][790]

{'audio_file': {'path': '/content/drive/MyDrive/Blocks/REC20240427111925.wav',
  'array': array([-4.32009983e-12,  5.91171556e-12,  5.45696821e-12, ...,
          5.64008532e-03,  6.91546034e-03,  7.45811919e-03]),
  'sampling_rate': 16000},
 'label': 'blocks'}

## Splitting my_dataset into Training (90%) and Test (10%) Sets


In [None]:
# Hold out 10% of the data as a test split (fixed seed for reproducibility).
# The guard keeps the cell idempotent: `my_dataset` is rebound to the split
# result, so re-running without it would split the already-reduced train split
# again and silently shrink the training data.
if "test" not in my_dataset:
    my_dataset = my_dataset["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)
my_dataset


DatasetDict({
    train: Dataset({
        features: ['audio_file', 'label'],
        num_rows: 733
    })
    test: Dataset({
        features: ['audio_file', 'label'],
        num_rows: 82
    })
})

## Calculating Mean and Variance of Audio Sample Array

In [None]:
import numpy as np

# First training example's audio entry; reused by the next cell.
sample = my_dataset["train"][0]["audio_file"]

# Statistics of the raw waveform, before any feature extraction.
sample_mean = np.mean(sample["array"])
sample_variance = np.var(sample["array"])
print(f"Mean: {sample_mean:.3}, Variance: {sample_variance:.3}")

Mean: -6.69e-06, Variance: 0.0166


## Extracting Features from Audio Sample Array and Calculating Mean/Variance of Input Values

In [None]:
# Run the feature extractor on the single sample at its own sampling rate.
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

# Statistics of the extractor's output, for comparison with the raw waveform.
extracted_mean = np.mean(inputs["input_values"])
extracted_variance = np.var(inputs["input_values"])
print(f"Mean: {extracted_mean:.3}, Variance: {extracted_variance:.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: -3e-09, Variance: 1.0


## Preprocessing Function for Extracting Features from Audio Samples and Handling Padding/Truncation

In [None]:
def preprocess_function(examples, max_duration=10.0):
    """Turn a batch of audio examples into feature-extractor outputs.

    Args:
        examples: Batch mapping whose "audio_file" entry is a list of audio
            dicts, each providing the waveform under the "array" key.
        max_duration: Longest clip length to keep, in seconds. Clips longer
            than this are truncated. Defaults to 10.0, the value previously
            hard-coded here.

    Returns:
        The output of ``feature_extractor`` for the batch, requested with an
        attention mask.
    """
    audio_arrays = [audio["array"] for audio in examples["audio_file"]]

    # Convert the duration limit into a number of samples at the extractor's rate.
    target_rate = feature_extractor.sampling_rate
    max_length = int(target_rate * max_duration)

    return feature_extractor(
        audio_arrays,
        sampling_rate=target_rate,
        max_length=max_length,
        truncation=True,  # only affects clips longer than max_length
        padding="longest",  # pad each batch to its longest (possibly truncated) clip
        return_attention_mask=True,
    )


## Encoding Dataset with Preprocessed Features and Initializing Audio Classification Model

In [None]:
# Apply the preprocessing to every split in batches of 100 on a single process,
# dropping the raw audio column once features have been extracted.
map_options = dict(
    remove_columns=["audio_file"],
    batched=True,
    batch_size=100,
    num_proc=1,
)
my_dataset_encoded = my_dataset.map(preprocess_function, **map_options)
my_dataset_encoded

Map:   0%|          | 0/733 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 733
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 82
    })
})

In [None]:
from transformers import AutoModelForAudioClassification

# Build the label <-> id maps from the label values present in the encoded
# splits. They were previously referenced without being defined anywhere,
# which raised NameError on a fresh kernel.
label_names = sorted(
    {label for split in my_dataset_encoded.values() for label in split["label"]}
)
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for name, idx in label2id.items()}
# NOTE(review): the 'label' column appears to hold strings (see the sample
# output earlier in the notebook); confirm the labels are converted to these
# integer ids before training.

num_labels = len(id2label)

# Load the checkpoint with a classification head sized to the label set and
# with human-readable label names stored in the model config.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

In [None]:
from transformers import AutoModelForAudioClassification

# Size of the classification head: 4 classes (stutter types and non-stutter).
num_labels = 4

# Load the pretrained checkpoint with a classification head of that size.
model = AutoModelForAudioClassification.from_pretrained(model_id, num_labels=num_labels)

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import accelerate


In [None]:
pip install -U accelerate>=0.21.0


In [None]:
pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
from accelerate import Accelerator

## Setting Up Training Configuration for Fine-Tuning Audio Classification Model

In [None]:
# Create an Accelerator and keep a reference to the device it reports.
accelerator = Accelerator()
device = accelerator.device

In [None]:
from transformers import TrainingArguments

# Run naming and the main knobs a reader may want to tune.
model_name = "distilhubert"

batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-stutterdetection",
    # Optimization schedule
    learning_rate=5e-5,
    warmup_ratio=0.1,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then reload the most accurate checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging and publishing
    logging_steps=5,
    push_to_hub=True,
)

In [None]:
pip install -U evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


## Installing the Hugging Face Hub Library

In [None]:
! pip install huggingface_hub



In [None]:
pip install -U transformers[torch]


Collecting transformers[torch]
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch-

## Authenticating with Hugging Face Hub for Notebook Integration

In [None]:
from huggingface_hub import notebook_login
# Show the interactive Hub login widget; the training configuration sets
# push_to_hub=True and the model is pushed to the Hub later in this notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Initializing Trainer for Fine-Tuning Audio Classification Model


In [None]:
import evaluate
import numpy as np

# Accuracy metric used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score predictions with the accuracy metric.

    The predicted class for each row is the arg-max of ``eval_pred.predictions``
    along axis 1; it is compared against ``eval_pred.label_ids``.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=reference_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import Trainer

# Wire together the model, the training configuration, the encoded splits, the
# feature extractor, and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=my_dataset_encoded["train"],
    eval_dataset=my_dataset_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

## Training the Audio Classification Model

In [None]:
# Run fine-tuning with the configured Trainer; the returned TrainOutput summary
# is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8357,0.781211,0.865854
2,0.2951,0.367988,0.890244
3,0.097,0.400044,0.865854
4,0.0872,0.395312,0.902439
5,0.4557,0.490421,0.902439
6,0.0368,0.497242,0.902439
7,0.0074,0.540826,0.914634
8,0.0039,0.545965,0.902439
9,0.0036,0.568433,0.902439
10,0.0035,0.571686,0.902439


TrainOutput(global_step=920, training_loss=0.23715027312181242, metrics={'train_runtime': 1855.0119, 'train_samples_per_second': 3.951, 'train_steps_per_second': 0.496, 'total_flos': 1.66700806848e+17, 'train_loss': 0.23715027312181242, 'epoch': 10.0})

## Pushing the Fine-Tuned Model to the Hugging Face Hub

In [None]:
# Metadata passed to trainer.push_to_hub().
# `model_name` now uses the same "-finetuned-stutterdetection" suffix as the
# TrainingArguments output directory; it previously said "stutteringdetection",
# which did not match the name the model is trained and pushed under.
kwargs = {
    "dataset_tags": "HareemFatima/stutteringdetection",
    "dataset": "stuttering",
    "model_name": f"{model_name}-finetuned-stutterdetection",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}


In [None]:
# Push the trained model to the Hub, forwarding the metadata defined in `kwargs`.
trainer.push_to_hub(**kwargs)


CommitInfo(commit_url='https://huggingface.co/HareemFatima/distilhubert-finetuned-stutterdetection/commit/404d59de9d8db5d8750e9ca04f19866d6b011428', commit_message='End of training', commit_description='', oid='404d59de9d8db5d8750e9ca04f19866d6b011428', pr_url=None, pr_revision=None, pr_num=None)