In [None]:
pip install wandb

In [None]:
pip install transformers

In [None]:
pip install torch torchvision torchaudio

In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


In [None]:
pip util 

In [2]:
import os
import logging
import wandb
import numpy as np
import librosa
from datasets import DatasetDict, load_dataset
from transformers import (
    HubertForSequenceClassification,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    Wav2Vec2FeatureExtractor,
)
# from utils import collator

logging.basicConfig(
    format="%(asctime)s | %(levelname)s: %(message)s", level=logging.INFO
)

  from pandas.core import (





In [3]:
PROJECT_ROOT = "C:/Users/NadiadAdmin/Desktop/Audio Emotion project/medium article"
dataset_config = {
    "LOADING_SCRIPT_FILES": os.path.join(PROJECT_ROOT, "crema.py"),
    "CONFIG_NAME": "clean",
    "DATA_DIR": os.path.join(PROJECT_ROOT, "crema-d.zip"),
    "CACHE_DIR": os.path.join(PROJECT_ROOT, "cache_crema"),
}

ds = load_dataset(
    dataset_config["LOADING_SCRIPT_FILES"],
    dataset_config["CONFIG_NAME"],
    data_dir=dataset_config["DATA_DIR"],
    cache_dir=dataset_config["CACHE_DIR"],
    trust_remote_code=True
)
print(ds)


DatasetDict({
    train: Dataset({
        features: ['file', 'label'],
        num_rows: 7442
    })
})


In [4]:
from transformers import Wav2Vec2FeatureExtractor
model = "facebook/hubert-base-ls960"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model)
print(feature_extractor)

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}



In [5]:
model_path = "facebook/hubert-large-ls960-ft"
hubert_model = HubertForSequenceClassification.from_pretrained(model_path)
hubert_model_config = hubert_model.config
print("Num of labels:", hubert_model_config.num_labels)

  return self.fget.__get__(instance, owner)()
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num of labels: 2


In [6]:
from transformers import HubertConfig, HubertForSequenceClassification
NUM_LABELS = 6
model_id = "facebook/hubert-base-ls960"

config = HubertConfig.from_pretrained(model_id, num_labels=NUM_LABELS)
hubert_model = HubertForSequenceClassification.from_pretrained(
    model_id,
    config=config,  # because we need to update num_labels as per our dataset
    ignore_mismatched_sizes=True,  # to avoid classifier size mismatch from from_pretrained.
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# freeze all layers to begin with
for param in hubert_model.parameters():
    param.requires_grad = False
    
# freeze two encoder layers    
layers_freeze_num = 2
n_layers = (
    4 + layers_freeze_num * 16
)  # 4 refers to projector and classifier's weights and biases.
for name, param in list(hubert_model.named_parameters())[-n_layers:]:
    param.requires_grad = True

In [8]:
from datasets import load_dataset

def process_audio(example):
    import librosa
    # Load the audio file with librosa
    audio, _ = librosa.load(example["file"], sr=16000, mono=False)
    # Return the modified example with the audio array
    return {"array": audio}

# Assume ds is your dataset
ds = ds.map(process_audio, num_proc=2)


In [9]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['file', 'label', 'array'],
        num_rows: 7442
    })
})


In [10]:
unique_labels = set()
for example in ds['train']:
    unique_labels.add(example['label'])

In [11]:
unique_labels

{'ANG', 'DIS', 'FEA', 'HAP', 'NEU', 'SAD'}

In [12]:
from datasets import DatasetDict

# Assuming `ds` is your DatasetDict object as shown in your message
# Step 1: Define the label mapping
label_to_int = {
    "SAD": 0,
    "FEA": 1,
    "HAP": 2,
    "NEU": 3,
    "ANG": 4,
    "DIS": 5
}

# Step 2: Convert labels in the dataset
def convert_labels(example):
    example['label'] = label_to_int[example['label']]
    return example

ds['train'] = ds['train'].map(convert_labels)

# Step 3: Verify the conversion
for example in ds['train']:
    print(example['label'])
    break

4


In [13]:
# PROCESS THE DATASET TO THE FORMAT EXPECTED BY THE MODEL FOR TRAINING

INPUT_FIELD = "input_values"
LABEL_FIELD = "labels"

def prepare_dataset(batch, feature_extractor):
    audio_arr = batch["array"]
    input = feature_extractor(
        audio_arr, sampling_rate=16000, padding=True, return_tensors="pt"
    )

    batch[INPUT_FIELD] = input.input_values[0]
    batch[LABEL_FIELD] = int(batch["label"])  # colname MUST be labels as Trainer will look for it by default
    return batch

In [14]:
# APPLY THE DATA PREP USING FEATURE EXTRACTOR TO ALL EXAMPLES
ds = ds.map(
    prepare_dataset,
    fn_kwargs={"feature_extractor": feature_extractor},
    # num_proc=2,
)
logging.info("Finished extracting features from audio arrays.")

2024-08-16 07:45:16,531 | INFO: Finished extracting features from audio arrays.


In [15]:
# LABEL TO ID
ds = ds.class_encode_column("label")

In [16]:
# INTRODUCE TRAIN TEST VAL SPLITS

# 90% train, 10% test + validation
train_testvalid = ds["train"].train_test_split(shuffle=True, test_size=0.1)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']})

In [17]:
trainer_config = {
  "OUTPUT_DIR": "results",
  "TRAIN_EPOCHS": 20,
  "TRAIN_BATCH_SIZE": 8,
  "EVAL_BATCH_SIZE": 8,
  "GRADIENT_ACCUMULATION_STEPS": 4,
  "WARMUP_STEPS": 500,
  "DECAY": 0.01,
  "LOGGING_STEPS": 10,
  "MODEL_DIR": "models/test-hubert-model",
  "SAVE_STEPS": 100
}

# Fine-Tuning with Trainer
training_args = TrainingArguments(
    output_dir=trainer_config["OUTPUT_DIR"],  # output directory
    gradient_accumulation_steps=trainer_config[
        "GRADIENT_ACCUMULATION_STEPS"
    ],  # accumulate the gradients before running optimization step
    num_train_epochs=trainer_config[
        "TRAIN_EPOCHS"
    ],  # total number of training epochs
    per_device_train_batch_size=trainer_config[
        "TRAIN_BATCH_SIZE"
    ],  # batch size per device during training
    per_device_eval_batch_size=trainer_config[
        "EVAL_BATCH_SIZE"
    ],  # batch size for evaluation
    warmup_steps=trainer_config[
        "WARMUP_STEPS"
    ],  # number of warmup steps for learning rate scheduler
    save_steps=trainer_config["SAVE_STEPS"], # save checkpoint every 100 steps
    weight_decay=trainer_config["DECAY"],  # strength of weight decay
    logging_steps=trainer_config["LOGGING_STEPS"],
    evaluation_strategy="epoch",  # report metric at end of each epoch
    report_to="wandb",  # enable logging to W&B
)



In [None]:
pip install accelerate

In [18]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import torch
from transformers import Wav2Vec2Processor

INPUT_FIELD = "input_values"
LABEL_FIELD = "labels"


@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, examples: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:

        input_features = [
            {INPUT_FIELD: example[INPUT_FIELD]} for example in examples
        ]  # example is basically row0, row1, etc...
        labels = [example[LABEL_FIELD] for example in examples]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch[LABEL_FIELD] = torch.tensor(labels)

        return batch



In [19]:
# DEFINE DATA COLLATOR - TO PAD TRAINING BATCHES DYNAMICALLY
data_collator = DataCollatorCTCWithPadding(
            processor=feature_extractor,
            padding=True
)

In [None]:
pip install evaluate


In [20]:
# from datasets import load_metric

from evaluate import load
def compute_metrics(eval_pred):
    # DEFINE EVALUATION METRIC
    compute_accuracy_metric = load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return compute_accuracy_metric.compute(predictions=predictions, references=labels)

In [21]:
# START TRAINING
trainer = Trainer(
    model=hubert_model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=ds["train"],  # training dataset
    eval_dataset=ds["val"],  # evaluation dataset
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmaheeshpurohit[0m ([33mmaheeshpurohit-vellore-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
0,1.5958,1.495933,0.413978
2,1.1237,0.945449,0.623656
4,0.9575,0.805183,0.69086
6,0.8783,0.76189,0.693548
8,0.8996,0.763145,0.709677
10,0.7494,0.727967,0.736559
12,0.6481,0.719995,0.739247
14,0.7348,0.730048,0.736559
16,0.6772,0.713877,0.728495
18,0.7067,0.705777,0.75


TrainOutput(global_step=4180, training_loss=0.8973172523188249, metrics={'train_runtime': 109391.3862, 'train_samples_per_second': 1.224, 'train_steps_per_second': 0.038, 'total_flos': 4.0564153651911926e+18, 'train_loss': 0.8973172523188249, 'epoch': 19.952267303102627})

In [23]:
predictions=trainer.predict(ds['test'])

In [24]:
logging.info("Test Set Result: {}".format(predictions.metrics))
wandb.log({"test_accuracy": predictions.metrics["test_accuracy"]})

2024-08-17 14:22:25,913 | INFO: Test Set Result: {'test_loss': 0.6547810435295105, 'test_accuracy': 0.7721179624664879, 'test_runtime': 137.2306, 'test_samples_per_second': 2.718, 'test_steps_per_second': 0.342}
